My First Web Crawler
Part 1: Program Testing
def gameOver(N, z, y):
    # Rule reconstructed from the test data below (the original conditions,
    # e.g. "z - y == 2 and z == 20 and y == 20", could never be satisfied):
    # games 1-4 end when one side reaches 15 points with a 2-point lead;
    # the deciding game (N == 5) ends as soon as either side reaches 15.
    if N == 5:
        return z >= 15 or y >= 15
    return (z >= 15 or y >= 15) and abs(z - y) >= 2

def Test():
    try:
        N = [1, 2, 3, 4, 5, 5, 5]
        z = [13, 19, 20, 21, 14, 17, 15]
        y = [15, 18, 20, 23, 16, 16, 0]
        result = ["True", "False", "False", "True", "True", "True", "True"]
        for i in range(7):
            if str(gameOver(N[i], z[i], y[i])) == result[i]:
                print("Test {}: Right Result!".format(i + 1))
            else:
                print("Test {}: Error Result!".format(i + 1))
    except Exception as e:
        print("Error:", e)

Test()
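With gameOver fixed, here is a minimal usage sketch (my addition, not part of the original assignment): simulate one game by awarding rally points at random until gameOver reports the game has ended. The win probability probA is an illustrative parameter I chose.

import random

def simOneGame(N=1, probA=0.55):
    # Award points one rally at a time until gameOver says the game is done.
    scoreA = scoreB = 0
    while not gameOver(N, scoreA, scoreB):
        if random.random() < probA:
            scoreA += 1
        else:
            scoreB += 1
    return scoreA, scoreB

print(simOneGame())   # e.g. (15, 11)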
Part 2: Opening the 360 Homepage
import requests
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()                      # fail fast on HTTP errors
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        return (r.text, r.status_code, r.encoding, len(r.text),
                len(soup.p.contents), type(soup))
    except Exception as e:
        print("Error:", e)
        return ""

url = 'https://hao.360.cn'
for i in range(20):                               # repeat the request 20 times
    print(i)
    print(getHTMLText(url))
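One fragile spot above is the hard-coded 'utf-8'. As a hedged variant (not in the original post), requests can guess the charset from the page body via the apparent_encoding attribute:

import requests

def getHTMLTextGuess(url):
    # Variant of getHTMLText that detects the page encoding instead of
    # hard-coding 'utf-8'.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding   # charset sniffed from the body
        return r.text
    except requests.RequestException as e:
        print("Error:", e)
        return ""

print(len(getHTMLTextGuess('https://hao.360.cn')))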
Part 3: HTML Parsing
# -*- coding: utf-8 -*-
"""
Created on Mon May 20 10:03:00 2019
@author: 27
"""
import re
from requests import get
from bs4 import BeautifulSoup

def getText(url):
    try:
        r = get(url, timeout=5)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print("Error:", e)
        return ''
url = "https://www.runoob.com/"
html = getText(url)
soup = BeautifulSoup(html, 'html.parser')
print("head:", soup.head)
print("head:", len(soup.head))            # number of direct children of <head>
print("body:", soup.body)
print("body:", len(soup.body))            # number of direct children of <body>
print("title:", soup.title)
print("special_id:", soup.find(id='cd-login'))

def getACH(text):
    # Strip everything that is not a Chinese (CJK) character.
    text_unicode = text.strip()
    string = re.compile('[^\u4e00-\u9fff]')
    ACH = "".join(string.split(text_unicode))
    return ACH

print("ACH:", getACH(html))
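To see what the regex in getACH actually does, here is a quick check on a made-up sample string: everything outside the CJK range \u4e00-\u9fff is stripped.

sample = "Python 爬虫 123 入门!"
print(getACH(sample))   # -> 爬虫入门: only the Chinese characters survive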
Part 4: Chinese University Rankings (scraping the 2018 table)
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 08:39:07 2019
"""
Created on Thu May 23 08:39:07 2019
@author: 27
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
allUniv = []                                 # one row per university

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except Exception:
        return ""
# Each <td> tag holds one cell of the ranking table, matching the header
# column by column. To extract the data, first locate the <tr> tags, then
# walk every <td> inside each row and store its value in our data structure.
def fillUnivList(soup):
    data = soup.find_all('tr')               # all table rows
    for tr in data:
        ltd = tr.find_all('td')              # all cells in this row
        if len(ltd) == 0:                    # skip header rows without <td>
            continue
        singleUniv = []                      # data for one university
        for td in ltd:
            singleUniv.append(td.string)     # text content of the cell
        allUniv.append(singleUniv)
def printUnivList(num):
    # chr(12288) is the fullwidth space, so Chinese text lines up in columns.
    fmt = "{1:{0}^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}"
    print(fmt.format(chr(12288), "排名", "学校名称", "省市", "总分", "社会声誉"))
    for i in range(num):
        u = allUniv[i]
        print(fmt.format(chr(12288), u[0], u[1], u[2], u[3], u[6]))
def main(num):
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(num)

main(30)
columns = ["排名", "学校名称", "省市", "总分", "生涯质量", "培养结果", "社会声誉",
           "科研规模", "科研质量", "顶尖成果", "顶尖人才", "科技服务", "成果转化",
           "学生国际化"]
df = pd.DataFrame(columns=columns, data=allUniv)
df.to_csv('university.csv', encoding='gbk')
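As a quick sanity check (my addition, not in the original script), the CSV can be read back with pandas using the same gbk encoding:

check = pd.read_csv('university.csv', encoding='gbk', index_col=0)
print(check.head())   # first five universities, one column per indicator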
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
allUniv=[]
def getHTMLText(url):
try:
r=requests.get(url,timeout=30)
r.raise_for_status()
r.encoding="utf-8"
return r.text
except:
return ""
####代码中每个td标签包含大学排名表格的夜歌列数值,与表头一一对应。要获取其中数据要先找到<tr></tr>标签,并遍历其中每一个<td></td>标签,获取其值写入程序的数据结构中##########
def fillUnivList(soup):
data=soup.find_all('tr') #找到所有的tr标签
for tr in data:
ltd=tr.find_all('td') #在每个tr标签中找到所有的td标签
if len(ltd)==0:
continue
singleUniv=[] #创建空列表对象,储存当前<tr>标签表示大学的数据
for td in ltd:
singleUniv.append(td.string) #提取td标签中的信息
allUniv.append(singleUniv)
def printUnivList(num):
print("{1:^2}{2:{0}^10}{3:{0}^6}{4:{0}^4}{5:{0}^10}".format(chr(12288),"排名","学校名称","省市","总分","年费"))
a=[]
for i in range(num):
u=allUniv[i]
print("{1:^4}{2:{0}^10}{3:{0}^5}{4:{0}^8}{5:{0}^10}".format(chr(12288),u[0],u[1],u[2],u[3],u[6]))
def main(num):
url='http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
html=getHTMLText(url)
soup=BeautifulSoup(html,"html.parser")
fillUnivList(soup)
printUnivList(num)
main(30)
list=allUniv
name=["paiming","学校名称","省市","总分","生涯质量", "培养结果", "社会声誉", "科研规模", "科研质量", "顶尖成果" ,"顶尖人才", "科技服务", "成果转化","学生国际化"]
test=pd.DataFrame(columns=name,data=list)
test.to_csv('university.csv',encoding='gbk')
Part 5: Summary of Functions
1. Request functions in the requests library:

| Function | Description |
| --- | --- |
| get(url[, timeout=n]) | HTTP GET request; timeout aborts the request after n seconds |
| post(url, data={'key': 'value'}) | HTTP POST request; the dict carries the client form data |
| delete(url) | HTTP DELETE request |
| head(url) | HTTP HEAD request |
| options(url) | HTTP OPTIONS request |
| put(url, data={'key': 'value'}) | HTTP PUT request; the dict carries the client form data |
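A short, hedged demo of these functions against httpbin.org, a public echo service (any call may fail without network access):

import requests

r = requests.get('https://httpbin.org/get', timeout=5)
print(r.status_code)                 # 200 on success

r = requests.post('https://httpbin.org/post', data={'key': 'value'})
print(r.json()['form'])              # {'key': 'value'}

r = requests.head('https://httpbin.org/get')
print(r.headers['Content-Type'])     # headers only, no body is fetched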
2. Attributes and methods of the Response object:

| Attribute | Description |
| --- | --- |
| status_code | HTTP status code of the response |
| encoding | Encoding used to decode the response body |
| text | Response body as a string |
| content | Response body as raw bytes |

| Method | Description |
| --- | --- |
| json() | Parses the response body as JSON, if it contains JSON data |
| raise_for_status() | Raises an exception if the status code indicates an error (4xx/5xx) |
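And a minimal sketch exercising the attributes and methods above (again assuming httpbin.org is reachable):

import requests

r = requests.get('https://httpbin.org/get', timeout=5)
r.raise_for_status()         # raises requests.HTTPError on a 4xx/5xx status
print(r.status_code)         # 200
print(r.encoding)            # encoding derived from the response headers
print(len(r.text))           # body decoded to a string
print(len(r.content))        # body as raw bytes
print(r.json()['url'])       # body parsed as JSON -> dict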