Python Web Scraper (Spider), Part 2
Based on the course by Liu Ying, Beijing Turing Academy
AJAX
Note: without a browser User-Agent this request fails with urllib.error.HTTPError: HTTP Error 418
```python
from urllib import request
import json

# Douban movie chart AJAX endpoint
url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20'
# Douban answers urllib's default User-Agent with HTTP 418, so send a browser-like one
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
req = request.Request(url, headers=headers)
rsp = request.urlopen(req)
data = rsp.read().decode()
data = json.loads(data)
print(data)
```
The requests library:
Requests - made for humans
"HTTP for Humans": more concise and friendlier than urllib
Keeps all of urllib's features
Built on top of urllib3
Source code: https://github.com/requests/requests
Chinese documentation: http://docs.python-requests.org/zh_CN/latest/index.html
```python
import requests

url = 'http://sogo.com'

# Method 1
rsp = requests.get(url)
print(rsp.text)

# Method 2
rsp = requests.request('get', url)
print(rsp.text)
```
GET requests with parameters:
```python
import requests

url = 'http://sogo.com/web?'
kw = {'query': '天龙八部'}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
rsp = requests.get(url, params=kw, headers=headers)

print(rsp.text)         # decoded response body
print(rsp.content)      # raw response bytes
print(rsp.url)          # final URL including the encoded query string
print(rsp.encoding)     # detected encoding
print(rsp.status_code)  # HTTP status code
```
POST requests with requests:
```python
import requests

url = 'https://fanyi.baidu.com/sug'
data = {'kw': 'girl'}
headers = {
    # requests computes Content-Length from the body itself, so it should not be set by hand
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'
}
rsp = requests.post(url, data=data, headers=headers)
print(rsp.text)
print(rsp.json())  # parse the JSON response body
```
Proxies and authentication in requests:

```python
import requests

# Open proxies: map each scheme to a proxy address
proxies = {'http': 'address of proxy', 'https': 'address of proxy'}
rsp = requests.request('get', 'http://xxx', proxies=proxies)

# Private proxy with credentials: scheme://user:password@host:port
proxy = {'http': 'http://China:123456@192.168.1.2:1234'}
rsp = requests.get('http://baidu.com', proxies=proxy)

# HTTP basic auth for the target site: pass a (user, password) tuple
auth = ('username', 'password')
rsp = requests.get('http://sogo.com', auth=auth)
```
HTTPS requests verify the SSL certificate by default. The verify parameter controls this check: it defaults to True, and setting it to False skips certificate verification:

```python
rsp = requests.get('https://sogo.com', verify=False)
```
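With verify=False, each request prints an InsecureRequestWarning from urllib3. A minimal sketch of silencing it, assuming urllib3 is importable (it ships as a dependency of requests):

```python
import urllib3
import requests

# Skipping certificate verification triggers InsecureRequestWarning; silence it explicitly
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

rsp = requests.get('https://sogo.com', verify=False)
print(rsp.status_code)
```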
Regular expressions:
Usage of match, pattern, group, etc.:
```python
# re.match usage example
import re

s = r'([a-z]+) ([a-z])+'
pattern = re.compile(s, re.I)  # re.I: ignore case
m = pattern.match('Hello world wide web')

print(type(m))  # <class '_sre.SRE_Match'>
print(m)        # <_sre.SRE_Match object; span=(0, 11), match='Hello world'>

print(m.group())  # group(0): the whole matched substring -> Hello world
print(m.span())   # span(0): (0, 11), the span of the whole match

print(m.group(1))  # Hello
print(m.span(1))   # (0, 5)

# Group 2 is ([a-z]) repeated by +, so it keeps only the last single-letter repetition: 'd'
print(m.group(2))  # d
print(m.span(2))   # (10, 11)

print(m.groups())  # ('Hello', 'd')

# print(m.group(3))  # IndexError: no such group
# print(m.span(3))

print('############## same as above #############')

# re.match tries to apply the pattern at the start of the string, returning
# a match object, or None if no match was found (as if the pattern were anchored with ^)
result = re.match(r'([a-z]+) ([a-z])+', 'Hello world wide web', re.I)

print(type(result))     # <class '_sre.SRE_Match'>
print(result)           # <_sre.SRE_Match object; span=(0, 11), match='Hello world'>
print(result.group())   # Hello world
print(result.span())    # (0, 11)
print(result.group(1))  # Hello
print(result.span(1))   # (0, 5)
print(result.group(2))  # d
print(result.span(2))   # (10, 11)
print(result.groups())  # ('Hello', 'd')
# print(result.group(3))  # IndexError: no such group
# print(result.span(3))
```
search usage:
```python
# re.search usage example
import re

s = r'\d+'
pattern = re.compile(s)
m = pattern.search('one12two34three56')
print(m)           # <_sre.SRE_Match object; span=(3, 5), match='12'>
print(m.group())   # 12
print(m.groups())  # ()
print(m.group(0))  # 12
# print(m.group(1))  # IndexError: no such group
print(m.span())    # (3, 5)

# Restrict the search to the slice [10:40] of the string
m = pattern.search('one12two34three56', 10, 40)
print(m.group())   # 56

print('############################')

# Same as above, using the module-level function
result = re.search(r'\d+', 'one12two34three56')
print(result)           # <_sre.SRE_Match object; span=(3, 5), match='12'>
print(result.group())   # 12
print(result.groups())  # ()
print(result.group(0))  # 12
print(result.span())    # (3, 5)
```
findall
```python
# findall usage example
import re

pattern = re.compile(r'\d+')
s = pattern.findall('i am 18 years old and 170 height')
print(s)  # ['18', '170']

# Same as above, using the module-level function
print(re.findall(r'\d+', 'i am 18 years old and 170 height'))  # ['18', '170']
```
finditer
```python
# finditer usage example
import re

pattern = re.compile(r'\d+')
s = pattern.finditer('i am 19 years old and 170 height')
print(s)  # <callable_iterator object at 0x0000021B4904FA90>
for i in s:
    print(i.group())
# 19
# 170

print('########## same as above ##########')

s = re.finditer(r'\d+', 'i am 19 years old and 170 height')
print(s)  # <callable_iterator object at 0x0000021B48FC6C18>
for i in s:
    print(i.group())
# 19
# 170
```
Matching Chinese characters:

```python
# Matching Chinese characters by their Unicode range
import re

hello = '世界，你好！'
pattern = re.compile(r'[\u4e00-\u9fa5]+')
m = pattern.findall(hello)
print(m)  # ['世界', '你好']

print(re.findall(r'[\u4e00-\u9fa5]+', '世界，你好！'))  # ['世界', '你好']
```
Greedy vs. non-greedy matching
- Greedy mode: match as much text as possible while still letting the whole expression succeed
- Non-greedy mode: match as little text as possible while still letting the whole expression succeed
- Python quantifiers are greedy by default
Example: searching the text abbbc
Pattern: ab*
Greedy: matches abbb
Non-greedy: matches a (see the sketch after the code below)
```python
# Default quantifier behaviour is greedy
import re

result = re.match('ab*', 'abbb')
print(result.group())  # abbb
```
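For completeness, the non-greedy case from the example above, using the *? quantifier:

```python
# Non-greedy: appending ? to the quantifier makes it match as little as possible
import re

result = re.match('ab*?', 'abbb')
print(result.group())  # a
```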
XPath positioning: /bookstore/book[1], /bookstore/book[last()], /bookstore/book[last()-1]
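A minimal sketch of those three expressions using lxml (assuming lxml is installed via pip install lxml; the bookstore markup is made up for illustration):

```python
from lxml import etree

xml = '''
<bookstore>
  <book><title>A</title></book>
  <book><title>B</title></book>
  <book><title>C</title></book>
</bookstore>
'''
root = etree.fromstring(xml)

print(root.xpath('/bookstore/book[1]/title/text()'))         # ['A']  first book
print(root.xpath('/bookstore/book[last()]/title/text()'))    # ['C']  last book
print(root.xpath('/bookstore/book[last()-1]/title/text()'))  # ['B']  second to last
```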
BeautifulSoup4
http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
Problem when running: bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
```python
from urllib import request
from bs4 import BeautifulSoup

url = 'http://www.baidu.com'
rsp = request.urlopen(url)
content = rsp.read()
# Passing 'lxml' raises the FeatureNotFound error above when the lxml package is missing;
# either pip install lxml or fall back to the built-in 'html.parser'
soup = BeautifulSoup(content, 'html.parser')

# bs4 decodes the bytes automatically
content = soup.prettify()
print(content)
```
The four object types:
- Tag
  - Corresponds to an HTML tag
  - Accessed as soup.tag_name
  - Two important attributes: name and attrs
- NavigableString: the text content of a tag
- BeautifulSoup
  - Represents the document as a whole and can mostly be treated like a Tag object
  - Conventionally bound to the name soup
- Comment
  - A special kind of NavigableString
  - When printed, the output does not include the comment markers (a short example follows the code block below)
```python
from urllib import request
from bs4 import BeautifulSoup

url = 'http://www.baidu.com'
rsp = request.urlopen(url)
content = rsp.read()
soup = BeautifulSoup(content, 'html.parser')

# bs4 decodes the bytes automatically
content = soup.prettify()
print(content)
print('-' * 50)
print(soup.head)
print('-' * 50)
print(soup.meta)
print('-' * 50)
# Tag: name and attrs
print(soup.link)
print(soup.link.name)
print(soup.link.attrs)
print(soup.link.attrs['type'])
soup.link.attrs['type'] = 'hello world'
print(soup.link)
print('-' * 50)
# NavigableString: the tag's text content via .string
print(soup.title)
print(soup.title.name)
print(soup.title.attrs)
print(soup.title.string)
print('-' * 50)
# The BeautifulSoup object itself
print(soup.name)
print(soup.attrs)
```
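The Comment type from the list above is not demonstrated in that block, so here is a minimal sketch with made-up markup:

```python
from bs4 import BeautifulSoup

# Hypothetical snippet containing only an HTML comment inside <p>
soup = BeautifulSoup('<p><!-- a hidden note --></p>', 'html.parser')
node = soup.p.string
print(type(node))  # <class 'bs4.element.Comment'>
print(node)        # prints " a hidden note " without the <!-- --> markers
```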
find_all:
```python
from urllib import request
from bs4 import BeautifulSoup
import re

url = 'http://www.baidu.com'
rsp = request.urlopen(url)
content = rsp.read()
soup = BeautifulSoup(content, 'html.parser')

print(soup.name)
print('=' * 50)
# Walk the direct children of <head>
for node in soup.head.contents:
    if node.name == 'meta':
        print(node)
    if node.name == 'title':
        print(node)  # printed nothing here - strange!
print('=' * 50)
# find_all by exact tag name
tags = soup.find_all(name='meta')
print(tags)
print('=' * 50)
# find_all with a regular expression on the tag name
tags = soup.find_all(re.compile('^me'))
for tag in tags:
    print(tag)
print('=' * 50)
# Combine the name regex with an attribute filter
tags = soup.find_all(re.compile('^me'), content='always')
for tag in tags:
    print(tag)
```
CSS selectors:
- Use soup.select, which returns a list
- By tag name: soup.select('title')
- By class name: soup.select('.content')
- By id: soup.select('#name_id')
- Combined lookup: soup.select('title .content #name_id')
- Attribute lookup: soup.select("img[class='photo']")
- Get a tag's text: tag.get_text() (demonstrated after the code block below)
```python
from urllib import request
from bs4 import BeautifulSoup

url = 'http://www.baidu.com'
rsp = request.urlopen(url)
content = rsp.read()
soup = BeautifulSoup(content, 'html.parser')

print(soup.prettify())
print('*' * 50)
titles = soup.select('title')
print(titles[0])

print('*' * 20)
metas = soup.select('meta[content="always"]')
print(metas[0])
```
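A small self-contained sketch of get_text() from the list above, on made-up markup:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<html><head><title>hello</title></head></html>', 'html.parser')
titles = soup.select('title')
print(titles[0])             # <title>hello</title>
print(titles[0].get_text())  # hello - markup stripped, text only
```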
DHTML
Selenium: pip install selenium==2.48.0
Official docs: http://selenium-python.readthedocs.io/index.html
Headless browser: PhantomJS (a minimal example appears after the code blocks below)
```python
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('http://www.baidu.com')
print(driver.title)
```
```python
from selenium.webdriver.common.keys import Keys

# Select all, cut, then type a new query into Baidu's search box (id='kw')
driver.find_element_by_id('kw').send_keys(Keys.CONTROL, 'a')
driver.find_element_by_id('kw').send_keys(Keys.CONTROL, 'x')
driver.find_element_by_id('kw').send_keys(u'黄山')
driver.save_screenshot('huangshang.png')
# Press Enter on the search button (id='su')
driver.find_element_by_id('su').send_keys(Keys.RETURN)
```
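The same flow works with the headless PhantomJS browser mentioned above; a minimal sketch, assuming the phantomjs executable is installed and on PATH:

```python
from selenium import webdriver

# PhantomJS renders pages without opening a browser window
driver = webdriver.PhantomJS()
driver.get('http://www.baidu.com')
print(driver.title)
driver.quit()
```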
Reading an image verification code (captcha). Problem when running: pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your path
```python
import pytesseract
from PIL import Image

# The TesseractNotFoundError above means the Tesseract OCR engine itself is missing:
# it must be installed separately from the pytesseract package, and on Windows you may
# need to point pytesseract at the binary, e.g.
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

image = Image.open('a.png')
text = pytesseract.image_to_string(image)
print(text)
```
……