Python Web Crawler (Spider), Part 1
Notes from Liu Ying of Beijing Turing Academy (北京图灵学院).
References:
- Python网络数据采集 (Web Scraping with Python), Turing Press
- 精通Python爬虫框架Scrapy (Learning Scrapy), Posts & Telecom Press (人民邮电出版社)
- Python 3 web crawler series: http://blog.csdn.net/c406495762/article/details/72858983
- Official Scrapy tutorial (Chinese translation): http://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
Prerequisites:
- URLs
- the HTTP protocol
- web front end: HTML, CSS, JS
- AJAX
- re, XPath
- XML
1. Introduction to crawlers:
Definition: a web crawler (also known as a web spider or web robot, and in the FOAF community more often called a web wanderer) is a program or script that automatically fetches information from the World Wide Web according to certain rules.
Other, less common names include ant, automatic indexer, emulator, and worm.
Two defining traits:
- it downloads data or content as its author specifies
- it moves across the web on its own
Three basic steps (a minimal sketch follows this list):
- download a page
- extract the target information
- follow links to other pages according to some rule and repeat the two steps above
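A minimal sketch of these three steps, for illustration only (the start URL is arbitrary, link extraction uses a crude regex rather than a proper parser, and a depth limit stands in for "some rule"):

import re
from urllib import request

def crawl(url, depth=2):
    # stop when the allotted depth is used up
    if depth == 0:
        return
    # step 1: download the page
    try:
        html = request.urlopen(url, timeout=5).read().decode('utf-8', errors='ignore')
    except Exception as e:
        print(url, '->', e)
        return
    # step 2: extract the information we want (here just the <title>)
    m = re.search(r'<title>(.*?)</title>', html, re.S)
    print(url, '->', m.group(1).strip() if m else '(no title)')
    # step 3: jump to other pages by some rule (first 3 absolute links) and repeat
    for link in re.findall(r'href=["\'](https?://[^"\']+)', html)[:3]:
        crawl(link, depth - 1)

crawl('http://sogo.com')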
Crawler types:
- general-purpose crawlers
- special-purpose crawlers (focused crawlers)
Python networking packages at a glance (a short requests example follows this list):
- Python 2: urllib, urllib2, urllib3, httplib, httplib2, requests
- Python 3: urllib, urllib3, httplib2, requests
- In Python 2, urllib and urllib2 were used together, or requests was used instead
- In Python 3, use urllib or requests
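requests appears in the lists above but is not used in the examples that follow; for comparison, a minimal GET with requests (a third-party package, pip install requests) might look like this:

import requests

rsp = requests.get('http://sogo.com')
print(rsp.status_code)                 # e.g. 200
rsp.encoding = rsp.apparent_encoding   # let requests guess the charset
print(rsp.text[:200])                  # first 200 characters of the page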
2. urllib
Modules in the package:
- urllib.request: opens and reads URLs
- urllib.error: the exceptions raised by urllib.request; catch them with try/except
- urllib.parse: functions for parsing and building URLs (see Example 4)
- urllib.robotparser: parses robots.txt files (a short sketch follows this list)
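urllib.robotparser gets no example below, so here is a minimal sketch (example.com is a placeholder site):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://www.example.com/robots.txt')
rp.read()  # download and parse robots.txt
# may the given user agent fetch the given URL?
print(rp.can_fetch('*', 'https://www.example.com/some/page'))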
Example 1:
from urllib import request

url = 'http://sogo.com'
rsp = request.urlopen(url)
print(type(rsp))   # <class 'http.client.HTTPResponse'>

html = rsp.read()
print(type(html))  # <class 'bytes'>

html = html.decode()
print(type(html))  # <class 'str'>
print(html)
Example 2: use chardet to detect the character encoding automatically
from urllib import request
import chardet

url = 'http://sogo.com'
rsp = request.urlopen(url)
html = rsp.read()

charset = chardet.detect(html)
print(type(charset))  # <class 'dict'>
print(charset)
print(charset.get('encoding'))

# advantage of dict.get(): no KeyError if the key is missing; fall back to utf-8
html = html.decode(charset.get('encoding', 'utf-8'))
print(html)
Example 3:
from urllib import request

url = 'http://sogo.com'
rsp = request.urlopen(url)
print(rsp.geturl())   # https://www.sogo.com/
print(rsp.getcode())  # 200
print(rsp.info())     # output shown in the block comment below
'''
Server: nginx
Date: Sun, 01 Dec 2019 14:53:13 GMT
Content-Type: text/html; charset=utf-8
Transfer-Encoding: chunked
Connection: close
Vary: Accept-Encoding
Set-Cookie: ABTEST=3|1575211993|v17; expires=Tue, 31-Dec-19 14:53:13 GMT; path=/
P3P: CP="CURa ADMa DEVa PSAo PSDo OUR BUS UNI PUR INT DEM STA PRE COM NAV OTC NOI DSP COR"
Set-Cookie: IPLOC=CN3201; expires=Mon, 30-Nov-20 14:53:13 GMT; domain=.sogo.com; path=/
P3P: CP="CURa ADMa DEVa PSAo PSDo OUR BUS UNI PUR INT DEM STA PRE COM NAV OTC NOI DSP COR"
Set-Cookie: SUID=B2E70270541C940A000000005DE3D3D9; expires=Sat, 26-Nov-2039 14:53:13 GMT; domain=.sogo.com; path=/
P3P: CP="CURa ADMa DEVa PSAo PSDo OUR BUS UNI PUR INT DEM STA PRE COM NAV OTC NOI DSP COR"
x-log-ext: nodejs=1
Set-Cookie: black_passportid=; path=/; expires=Thu, 01 Jan 1970 00:00:00 GMT; domain=.sogo.com
Pragma: No-cache
Cache-Control: max-age=0
Expires: Sun, 01 Dec 2019 14:53:13 GMT
'''
Example 4: building URL query strings with urllib.parse (the URLs printed by the two examples below open fine in a browser, but fetching them from code yields no useful result, possibly because the sites use anti-crawler measures)
from urllib import request, parse

url = 'https://www.sogou.com/web?'
wd = input('请输入要搜索的关键字:')  # e.g. enter: 黄山
# define the query parameters as a dict
data = {
    'query': wd
}
data = parse.urlencode(data)
print(type(data))  # <class 'str'>
print(data)        # query=%E9%BB%84%E5%B1%B1
url += data
print(url)         # https://www.sogou.com/web?query=%E9%BB%84%E5%B1%B1
rsp = request.urlopen(url)
html = rsp.read().decode()
print(html)
from urllib import request, parse

url = 'https://www.baidu.com/s?'
wd = input('请输入要搜索的关键字:')  # e.g. enter: 长城
# define the query parameters as a dict
data = {
    'wd': wd
}
data = parse.urlencode(data)
print(type(data))  # <class 'str'>
print(data)        # wd=%E9%95%BF%E5%9F%8E
url += data
print(url)         # https://www.baidu.com/s?wd=%E9%95%BF%E5%9F%8E
rsp = request.urlopen(url)
html = rsp.read()
html = html.decode()
print(html)
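urlencode is only one of the helpers in urllib.parse; here is a short sketch of quote/unquote and urlparse, reusing the values from the sogou example above:

from urllib import parse

print(parse.quote('黄山'))                  # %E9%BB%84%E5%B1%B1
print(parse.unquote('%E9%BB%84%E5%B1%B1'))  # 黄山

parts = parse.urlparse('https://www.sogou.com/web?query=%E9%BB%84%E5%B1%B1')
print(parts.scheme, parts.netloc, parts.path)  # https www.sogou.com /web
print(parse.parse_qs(parts.query))             # {'query': ['黄山']}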
Example 5: POST requests. The parameters travel in the request body rather than in the URL, so they do not show up in the address bar (note that this is not encryption; only HTTPS encrypts them), which makes POST somewhat safer than GET.
from urllib import request, parse
import json

baseUrl = 'http://fanyi.baidu.com/sug'
data = {'kw': 'girl'}
data = parse.urlencode(data)
print(type(data))  # <class 'str'>
data = data.encode('utf-8')
print(type(data))  # <class 'bytes'>
rsp = request.urlopen(baseUrl, data=data)
html = rsp.read().decode()
print(type(html))  # <class 'str'>
html = json.loads(html)
print(type(html))  # <class 'dict'>
print(html)  # {'errmsg': '参数错误', 'errno': 1001} — not sure why
The same request, this time built with an explicit Request object and a Content-Length header:

from urllib import request, parse
import json

baseUrl = 'http://fanyi.baidu.com/sug'
data = {'kw': 'girl'}
data = parse.urlencode(data)
print(type(data))  # <class 'str'>
data = data.encode('utf-8')
print(type(data))  # <class 'bytes'>
headers = {'Content-Length': len(data)}
req = request.Request(baseUrl, data=data, headers=headers)
rsp = request.urlopen(req)
html = rsp.read().decode()
print(type(html))  # <class 'str'>
html = json.loads(html)
print(type(html))  # <class 'dict'>
print(html)  # {'errmsg': '参数错误', 'errno': 1001}
The error module: HTTPError and URLError
from urllib import request, error

# url = 'http://sogo.com'
# url = 'http://sogooooooooooooooooo.com'  # triggers URLError
url = 'http://www.sipo.gov.cn/www'  # triggers HTTPError
try:
    req = request.Request(url)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)
except error.HTTPError as e:
    print('HTTPError:{}'.format(e.reason))  # HTTPError:Not Found
    print('HTTPError:{}'.format(e))         # HTTPError:HTTP Error 404: Not Found
except error.URLError as e:
    print('URLError:{}'.format(e.reason))  # URLError:[Errno 11001] getaddrinfo failed
    print('URLError:{}'.format(e))         # URLError:<urlopen error [Errno 11001] getaddrinfo failed>
except Exception as e:
    print(e)
User-Agent
from urllib import request, error

url = 'http://sogo.com'
try:
    # Option 1: pass a headers dict to Request
    '''
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'
    req = request.Request(url, headers=headers)
    '''

    # Option 2: call add_header on the Request
    req = request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0')

    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)
except error.HTTPError as e:
    print(e)
except error.URLError as e:
    print(e)
except Exception as e:
    print(e)
print('Done!')
Proxies
from urllib import request, error

url = 'http://sogo.com'

# Free public proxy IPs can be found at e.g. Xici (西刺) free proxies: www.xicidaili.com, www.goubanjia.com

# Steps for using a proxy:
# 1. set the proxy address
proxy = {'http': '223.199.31.116:9999'}
# 2. create a ProxyHandler
proxyHandler = request.ProxyHandler(proxy)
# 3. build an opener
opener = request.build_opener(proxyHandler)
# 4. install the opener
request.install_opener(opener)

try:
    rsp = request.urlopen(url)
    html = rsp.read().decode()
    print(html)
except error.HTTPError as e:
    print(e)
except error.URLError as e:
    print(e)
except BaseException as e:
    print(e)
Cookies and sessions
HTTP is stateless; cookies and sessions are a complementary mechanism adopted to make up for that.
A cookie is a piece of information handed to the user (i.e., the browser); a session is the other half of the information, kept on the server, used to record user state.
The difference between them is where they live: cookies are stored on the client, sessions on the server. Cookies are therefore less secure.
Fetching a login-protected page without sending a cookie:

from urllib import request, error

url = 'http://www.renren.com/973090887/profile'

# Option 1:
# req = request.Request(url)
# rsp = request.urlopen(req)

# Option 2:
rsp = request.urlopen(url)

html = rsp.read().decode('utf-8')
with open('rsp.html', 'w') as f:
    f.write(html)
Copy the cookie straight from the browser. Note: headers={'Cookie':''} also works with a lowercase key, e.g. headers={'cookie':''}.
from urllib import request, error

url = 'http://www.renren.com/973090887/profile'
headers = {'Cookie': 'anonymid=k46zbise-53dqfd; depovince=GW; jebecookies=b42c5959-ff61-401a-9163-06036762bca2|||||; _r01_=1; ick_login=cc725645-63f4-402a-86e5-6ec548cdf7db; t=83b415286308687c31207afe721529db7; societyguester=83b415286308687c31207afe721529db7; id=973090887; xnsid=f66a859a; JSESSIONID=abc9MR0JPSynRciKxLj8w; ver=7.0; loginfrom=null; jebe_key=8757b4d5-eed1-4a7a-b767-a881d17faa92%7Ca98b442c97be77cf055d58a8be6ee9ca%7C1576413790880%7C1%7C1576413789040; jebe_key=8757b4d5-eed1-4a7a-b767-a881d17faa92%7Ca98b442c97be77cf055d58a8be6ee9ca%7C1576413790880%7C1%7C1576413789050; wp_fold=0'}
req = request.Request(url, headers=headers)
rsp = request.urlopen(req)
html = rsp.read().decode()
with open('rsp.html', 'w') as f:
    f.write(html)
The http package's cookiejar module provides classes that handle cookies automatically:
- CookieJar
  - manages and stores cookies, and attaches them to outgoing HTTP requests
  - cookies live in memory; they vanish once the CookieJar instance is garbage-collected
- FileCookieJar(filename, delayload=None, policy=None)
  - manages cookies backed by a file
  - filename is the file the cookies are saved to
- MozillaCookieJar(filename, delayload=None, policy=None)
  - creates a FileCookieJar instance compatible with Mozilla's cookies.txt format
- LWPCookieJar
  - creates a FileCookieJar instance compatible with the libwww-perl Set-Cookie3 format
Class hierarchy: CookieJar -> FileCookieJar -> MozillaCookieJar & LWPCookieJar
Logging in through an opener that carries a CookieJar:

from urllib import request, parse
from http import cookiejar

# create a CookieJar instance
cookie = cookiejar.CookieJar()
# build the cookie handler
cookie_handler = request.HTTPCookieProcessor(cookie)
# http handler
http_handler = request.HTTPHandler()
# https handler
https_handler = request.HTTPSHandler()
# build the opener
opener = request.build_opener(http_handler, https_handler, cookie_handler)

def login():
    url = 'http://www.renren.com/PLogin.do'
    data = {'email': '13119144223', 'password': '123456'}
    data = parse.urlencode(data)
    print('dataAfterUrlEncode=', data)
    req = request.Request(url, data=data.encode())
    rsp = opener.open(req)
    print('cookie===>>>', cookie)
    print('cookie dir--->>>', dir(cookie))
    for i in cookie:
        print(i)

def getHomePage():
    url = 'http://www.renren.com/965187997/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)

def main():
    login()
    getHomePage()

if __name__ == '__main__':
    main()
Saving cookies to a file:
from urllib import request, parse
from http import cookiejar

# create a MozillaCookieJar backed by cookie.txt
cookie = cookiejar.MozillaCookieJar('cookie.txt')
# build the cookie handler
cookie_handler = request.HTTPCookieProcessor(cookie)
# http handler
http_handler = request.HTTPHandler()
# https handler
https_handler = request.HTTPSHandler()
# build the opener
opener = request.build_opener(http_handler, https_handler, cookie_handler)

def login():
    url = 'http://www.renren.com/PLogin.do'
    data = {'email': '13119144223', 'password': '123456'}
    data = parse.urlencode(data)
    print('dataAfterUrlEncode=', data)
    req = request.Request(url, data=data.encode())
    rsp = opener.open(req)
    print('cookie===>>>', cookie)
    print('cookie dir--->>>', dir(cookie))
    for i in cookie:
        print(i)

    # save the cookies: ignore_discard keeps cookies even if they are marked
    # to be discarded; ignore_expires keeps them even if they have expired
    cookie.save(ignore_discard=True, ignore_expires=True)

def getHomePage():
    url = 'http://www.renren.com/965187997/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)

def main():
    login()
    getHomePage()

if __name__ == '__main__':
    main()
Contents of cookie.txt:
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

.renren.com TRUE / FALSE 1607525840 _de 420A8DC764CD1624FC7C8526DA9A3A25
.renren.com TRUE / FALSE 1734101840 anonymid k474v1fq-iqokun
.renren.com TRUE / FALSE first_login_flag 1
.renren.com TRUE / FALSE id 965187997
.renren.com TRUE / FALSE 1579013840 ln_hurl http://head.xiaonei.com/photos/0/0/men_main.gif
.renren.com TRUE / FALSE 1579013840 ln_uact 13119144223
.renren.com TRUE / FALSE loginfrom null
.renren.com TRUE / FALSE p f089872e2b5af59fd90305191668f7c27
.renren.com TRUE / FALSE societyguester 93d42a597819c5e50b77a188d04bf43d7
.renren.com TRUE / FALSE t 93d42a597819c5e50b77a188d04bf43d7
.renren.com TRUE / FALSE ver 7.0
.renren.com TRUE / FALSE xnsid f5efa6dd
.renren.com TRUE /xtalk/ FALSE t 20f31c075aa6b30a902235ff55699182
www.renren.com FALSE / FALSE JSESSIONID abcvLdUTFcjFkh53fek8w
Loading cookies back with cookie.load():
from urllib import request, parse
from http import cookiejar

# create the cookiejar instance and load the saved cookies
cookie = cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
# build the cookie handler
cookie_handler = request.HTTPCookieProcessor(cookie)
# http handler
http_handler = request.HTTPHandler()
# https handler
https_handler = request.HTTPSHandler()
# build the opener
opener = request.build_opener(http_handler, https_handler, cookie_handler)

def getHomePage():
    url = 'http://www.renren.com/965187997/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open('rsp.html', 'w', encoding='utf-8') as f:
        f.write(html)

def main():
    getHomePage()

if __name__ == '__main__':
    main()
A plain request first:
from urllib import request

url = 'https://www.12306.cn/mormhweb'
rsp = request.urlopen(url)
html = rsp.read().decode()
print(html)
If this fails with an SSL certificate verification error, use the following:
from urllib import request
import ssl

# replace the default verified HTTPS context with an unverified one
ssl._create_default_https_context = ssl._create_unverified_context

url = 'https://www.12306.cn/mormhweb'
rsp = request.urlopen(url)
html = rsp.read().decode()
print(html)
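An alternative sketch that avoids patching the module-wide default: build an explicitly unverified SSL context and pass it to urlopen (urlopen accepts a context argument in Python 3):

from urllib import request
import ssl

# an explicitly unverified context, scoped to this one call
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'https://www.12306.cn/mormhweb'
rsp = request.urlopen(url, context=ctx)
print(rsp.read().decode()[:200])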
Copy the request details from the browser's F12 developer tools and replay the Youdao (youdao) POST request:
from urllib import request, parse

def youdao():
    url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'

    data = {
        'i': 'girl',
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '15764998714040',
        'sign': '6d154aa4d240a327c1b00a7265ee0c42',
        'ts': '1576499871404',
        'bv': 'e2a78ed30c66e16a857c5b6486a1d326',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'
    }

    data = parse.urlencode(data).encode()

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Content-Length': '237',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; OUTFOX_SEARCH_USER_ID=878677551@112.20.83.126; JSESSIONID=abccfVgNRwMree-0iOo8w; OUTFOX_SEARCH_USER_ID_NCOO=629431719.4616601; _ntes_nnid=cdfe6631d454b95eb3fe744ddcd37a9d,1576498433752; ___rl__test__cookies=1576499871397',
        'Host': 'fanyi.youdao.com',
        'Origin': 'http://fanyi.youdao.com',
        'Referer': 'http://fanyi.youdao.com/?keyfrom=dict2.top',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
        'X-Requested-With': 'XMLHttpRequest'
    }

    req = request.Request(url, data=data, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)

youdao()
Now modify the code above so that salt and sign are computed instead of copied:
- Grab the JS code from the site.
- Paste it into an online formatter such as https://tool.oschina.net/codeformat/js, or use a browser that shows the source already formatted and copyable (e.g. 智慧联想浏览器).
- Locate where salt and sign are generated.
- Step through the JS in the browser console to confirm what it does.
from urllib import request, parse

def getSalt():
    '''
    JS source:
        salt: i,
        i = r + parseInt(10 * Math.random(), 10);
        r = "" + (new Date).getTime(),

    i.e. salt = (new Date).getTime() + parseInt(10 * Math.random(), 10)
    '''
    import time, random
    return int(time.time() * 1000) + random.randint(0, 10)

def getMD5(v):
    import hashlib
    md5 = hashlib.md5()
    md5.update(v.encode())
    sign = md5.hexdigest()
    return sign

def getSign(key, salt):
    '''
    JS source:
        sign: n.md5("fanyideskweb" + e + i + "n%A-rKaT5fb[Gy?;N5@Tj")
        i = r + parseInt(10 * Math.random(), 10);
        r = "" + (new Date).getTime(),

    i.e. sign = md5("fanyideskweb" + key + salt + "n%A-rKaT5fb[Gy?;N5@Tj")
    '''
    return getMD5(''.join(("fanyideskweb", key, str(salt), "n%A-rKaT5fb[Gy?;N5@Tj")))

def youdao(key):
    url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'

    salt = getSalt()
    data = {
        'i': key,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': str(salt),
        'sign': getSign(key, salt),
        'ts': '1576499871404',
        'bv': 'e2a78ed30c66e16a857c5b6486a1d326',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_REALTlME'
    }

    data = parse.urlencode(data).encode()

    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Content-Length': len(data),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'YOUDAO_MOBILE_ACCESS_TYPE=1; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; OUTFOX_SEARCH_USER_ID=878677551@112.20.83.126; JSESSIONID=abccfVgNRwMree-0iOo8w; OUTFOX_SEARCH_USER_ID_NCOO=629431719.4616601; _ntes_nnid=cdfe6631d454b95eb3fe744ddcd37a9d,1576498433752; ___rl__test__cookies=1576499871397',
        'Host': 'fanyi.youdao.com',
        'Origin': 'http://fanyi.youdao.com',
        'Referer': 'http://fanyi.youdao.com/?keyfrom=dict2.top',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
        'X-Requested-With': 'XMLHttpRequest'
    }

    req = request.Request(url, data=data, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)

youdao('boy')
……