第一个爬虫

一:程序测试

def gameOver(N, z, y):
    """Return True when the pair of scores (z, y) ends the game, else False.

    N is the test-case number and is unused by the decision logic.

    NOTE(review): the first two branches require z == 20 AND y == 20 while
    also requiring a 2-point gap, which is unsatisfiable, and at 29:29 the
    gap is necessarily 0, never 1 -- the scoring rule should be confirmed
    against the assignment spec.  The concrete fix made here: the original
    fell off the end of the 29:29 branch and implicitly returned None;
    every path now returns a real bool.
    """
    if z - y == 2 and z == 20 and y == 20:
        return True
    elif y - z == 2 and z == 20 and y == 20:
        return True
    elif z == 29 and y == 29:
        # Original returned None (not False) when the gap was not 1.
        return z - y == 1 or y - z == 1
    else:
        return False
def Test():
    """Run the seven canned gameOver cases and report each as right or wrong."""
    try:
        # (N, z, y, expected-str) for each of the seven cases.
        cases = [
            (1, 13, 15, "True"),
            (2, 19, 18, "False"),
            (3, 20, 20, "False"),
            (4, 21, 23, "True"),
            (5, 14, 16, "True"),
            (5, 17, 16, "True"),
            (5, 15, 0, "True"),
        ]
        for idx, (n, z, y, expected) in enumerate(cases, start=1):
            verdict = str(gameOver(n, z, y))
            if verdict == expected:
                print("Test {}: Right Result!".format(idx))
            else:
                print("Test {}: Error Result!".format(idx))
    except Exception as e:
        print("Error:", e)
Test()

二:打开360网页

import requests
def getHTMLText(url):
    """Fetch *url* and return a diagnostics tuple, or "" on any failure.

    Returns (text, status_code, encoding, len(text), len(first <p> contents),
    type of the soup object).  Fixes over the original: raise_for_status()
    now runs BEFORE any parsing, the encoding is pinned BEFORE r.text is
    first read (the original decoded the body with the guessed encoding and
    only then set utf_8), an explicit parser is passed to BeautifulSoup, and
    the bare except is narrowed to Exception.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()          # fail fast on HTTP error codes
        r.encoding = 'utf_8'          # pin encoding before the first r.text access
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(r.text, 'html.parser')  # explicit, deterministic parser
        return (r.text, r.status_code, r.encoding, len(r.text),
                len(soup.p.contents), type(soup))
    except Exception:
        # Best-effort probe: any network/HTTP/parse failure yields "".
        return ""
url = 'https://hao.360.cn'
# Probe the same page 20 times, printing the attempt number then the result.
for attempt in range(20):
    print(attempt)
    print(getHTMLText(url))

 

三:html计算

# -*- coding: utf-8 -*-
"""
Created on Mon May 20 10:03:00 2019

@author: 27
"""
from requests import get
def getText(url):
    """Download *url* and return its body decoded as UTF-8.

    On any request failure the error is printed and '' is returned.
    """
    try:
        response = get(url, timeout=5)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text
    except Exception as err:
        print("Error:", err)
        return ''
from bs4 import BeautifulSoup

# Inspect the runoob.com front page and extract its Chinese characters.
url = "https://www.runoob.com/"
html = getText(url)
soup = BeautifulSoup(html, "html.parser")  # explicit parser (original omitted it)

print("head:", soup.head)
print("head:", len(soup.head))
print("body:", soup.body)
print("body:", len(soup.body))
print("title:", soup.title)
print("special_id:", soup.find(id='cd-login'))  # fixed: missing closing ')'

import re

def getACH(text):
    """Return only the CJK (Chinese) characters of *text*.

    Everything outside the U+4E00..U+9FFF block is used as a split
    delimiter, and the remaining pieces are joined back together.
    """
    text_unicode = text.strip()
    non_chinese = re.compile('[^\u4e00-\u9fff]')
    return "".join(non_chinese.split(text_unicode))

print("ACH:", getACH(html))  # fixed: original called the undefined name ACH

四:中国大学排名(爬取年费2018)

# -*- coding: utf-8 -*-
"""
Created on Thu May 23 08:39:07 2019
@author: 27
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
allUniv=[]  # module-level accumulator: one list of cell strings per university row
def getHTMLText(url):
    """Fetch *url* and return the page text decoded as UTF-8, or "" on failure.

    The ranking page is UTF-8, so the encoding is pinned explicitly rather
    than trusting the guessed one.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()   # turn HTTP error statuses into exceptions
        r.encoding = "utf-8"
        return r.text
    except Exception:
        # Narrowed from a bare except: no longer swallows SystemExit /
        # KeyboardInterrupt; any request failure still yields "".
        return ""
####代码中每个td标签包含大学排名表格的一列数值,与表头一一对应。要获取其中数据要先找到<tr></tr>标签,并遍历其中每一个<td></td>标签,获取其值写入程序的数据结构中##########
def fillUnivList(soup):
    """Append one row of cell strings per university <tr> to ``allUniv``.

    Rows whose <tr> contains no <td> cells (e.g. the table header) are
    skipped; for every other row the ``.string`` of each <td> is collected.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) == 0:
            # header / non-data row: nothing to record
            continue
        allUniv.append([cell.string for cell in cells])
def printUnivList(num):
    """Print a header and the first *num* rows of the global ``allUniv``.

    chr(12288) is the full-width (CJK) space, used as the fill character so
    Chinese columns stay aligned.  Raises IndexError if num exceeds the
    number of collected rows.  Fix: removed the unused local ``a = []``.

    NOTE(review): column u[6] is printed under the header 年费 -- confirm
    that the 7th scraped column really is that value.
    """
    print("{1:^2}{2:{0}^10}{3:{0}^6}{4:{0}^4}{5:{0}^10}".format(chr(12288),"排名","学校名称","省市","总分","年费"))
    for i in range(num):
        u = allUniv[i]
        print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}".format(chr(12288),u[0],u[1],u[2],u[3],u[6]))
def main(num):
    """Scrape the 2018 best-universities ranking page and print the top *num* rows."""
    ranking_url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
    page = getHTMLText(ranking_url)
    document = BeautifulSoup(page, "html.parser")
    fillUnivList(document)
    printUnivList(num)
main(30)

# Export every scraped row to a GBK-encoded CSV.  The original bound the
# rows to the name `list`, shadowing the builtin -- renamed to `rows`.
rows = allUniv
columns = ["paiming","学校名称","省市","总分","生涯质量", "培养结果", "社会声誉", "科研规模", "科研质量", "顶尖成果" ,"顶尖人才", "科技服务", "成果转化","学生国际化"]
frame = pd.DataFrame(columns=columns, data=rows)
frame.to_csv('university.csv', encoding='gbk')

 

五:函数介绍总结

1.

requests 库的网页请求函数
函数 说明
get(url [, timeout=n]) 对应HTTP的GET方式,设定请求超时时间为n秒
post(url, data={'key':'value'}) 对应HTTP的POST方式,字典用于传输客户数据
delete(url) 对应HTTP的DELETE方式
head(url) 对应HTTP的HEAD方式
options(url) 对应HTTP的OPTIONS方式
put(url, data={'key':'value'}) 对应HTTP的PUT方式,字典用于传输客户数据

2.

response对象的属性
属性 说明
status_code HTTP请求的返回状态
encoding HTTP响应内容的编码方式
text HTTP响应内容的字符串形式
content HTTP响应内容的二进制形式
response对象的方法
方法 说明
json() 若http响应内容中包含json格式数据, 则解析json数据
raise_for_status() 若http返回的状态码不是200, 则产生异常
posted @ 2019-05-23 08:52  简笺  阅读(270)  评论(0)    收藏  举报