Python Spider (Web Crawler), Part 2

From Liu Ying of Beijing Turing Academy

Ajax

Running this raises: urllib.error.HTTPError: HTTP Error 418

from urllib import request
import json

# Ajax endpoint behind Douban's movie top-list page
url='https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20'
rsp=request.urlopen(url)   # raises HTTP Error 418 here - Douban's anti-crawling response
data=rsp.read().decode()
data=json.loads(data)      # the response body is JSON
print(data)
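
HTTP 418 is most likely Douban rejecting the default urllib User-Agent. A minimal sketch of the usual workaround, assuming only that a browser-like User-Agent header is enough to get through (the header string is just an example):

from urllib import request
import json

url = 'https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20'
# Wrap the URL in a Request object so a browser-like User-Agent can be attached
req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'})
rsp = request.urlopen(req)
data = json.loads(rsp.read().decode())
print(data)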

 


The requests library:

Requests: built for humans

"HTTP for Humans": cleaner and friendlier than urllib

Inherits all of urllib's features

Built on top of urllib3

Source: https://github.com/requests/requests

Chinese docs: http://docs.python-requests.org/zh_CN/latest/index.html

import requests

url='http://sogo.com'

# Method 1:
rsp=requests.get(url)
print(rsp.text)

# Method 2:
rsp=requests.request('get',url)
print(rsp.text)

GET with parameters in requests

import requests

url='http://sogo.com/web?'
kw={'query':'天龙八部'}
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
rsp=requests.get(url,params=kw,headers=headers)
print(rsp.text)          # response body decoded to text
print(rsp.content)       # raw response bytes
print(rsp.url)           # final URL, with params encoded into the query string
print(rsp.encoding)      # the encoding requests guessed for .text
print(rsp.status_code)   # HTTP status code

POST requests with requests:

import requests

url = 'https://fanyi.baidu.com/sug'
data = {'kw': 'girl'}
headers = {
            # no need to set Content-Length by hand - requests computes it from the encoded body
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'
           }
rsp = requests.post(url, data=data, headers=headers)
print(rsp.text)    # raw JSON text
print(rsp.json())  # parsed into a Python dict

Proxies with requests

proxies={'http':'address of proxy','https':'address of proxy'}

rsp=requests.request('get','http://xxx',proxies=proxies)

 

# authenticated proxy: scheme://user:password@host:port
proxy={'http':'http://China:123456@192.168.1.2:1234'}

rsp=requests.get('http://baidu.com',proxies=proxy)

 

# HTTP basic auth takes a (username, password) tuple
auth=('username','password')

rsp=requests.get('http://sogo.com',auth=auth)


HTTPS requests verify the SSL certificate. The verify parameter controls whether the certificate is checked; it defaults to True. Set it to False to skip verification:

rsp=requests.get('https://sogo.com',verify=False)
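
With verify=False, urllib3 (which requests uses underneath) emits an InsecureRequestWarning on every call; a small sketch of the common way to silence it:

import requests

requests.packages.urllib3.disable_warnings()   # suppress the InsecureRequestWarning triggered by verify=False
rsp = requests.get('https://sogo.com', verify=False)
print(rsp.status_code)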


Regular expressions:

Usage of match, compiled patterns, group, etc.:

# Regex match example
import re

s = r'([a-z]+) ([a-z])+'
pattern = re.compile(s, re.I)  # re.I makes the match case-insensitive
m = pattern.match('Hello world wide web')

print(type(m))  # <class '_sre.SRE_Match'>
print(m)  # <_sre.SRE_Match object; span=(0, 11), match='Hello world'>

print(m.group())  # same as group(0): the whole matched substring. Result: Hello world
print(m.span())  # same as span(0): (0, 11), the span of the whole match

print(m.group(1))  # Hello
print(m.span(1))  # (0, 5)

print(m.group(2))  # d - group 2 is ([a-z]) repeated by +, so only the last repetition ('d' of 'world') is kept
print(m.span(2))  # (10, 11)

print(m.groups())  # ('Hello', 'd')

# print(m.group(3)) # IndexError: no such group
# print(m.span(3))

print('############## The following is equivalent #############')
# re.match: "Try to apply the pattern at the start of the string, returning
#            a match object, or None if no match was found."
result = re.match(r'([a-z]+) ([a-z])+', 'Hello world wide web', re.I)  # matches from the start, as if anchored with ^
print(type(result))      # <class '_sre.SRE_Match'>
print(result)            # <_sre.SRE_Match object; span=(0, 11), match='Hello world'>
print(result.group())    # Hello world
print(result.span())     # (0, 11)
print(result.group(1))   # Hello
print(result.span(1))    # (0, 5)
print(result.group(2))   # d
print(result.span(2))    # (10, 11)
print(result.groups())   # ('Hello', 'd')
# print(result.group(3)) # IndexError: no such group
# print(result.span(3))

Usage of search:

# Regex search example
import re

s = r'\d+'
pattern = re.compile(s)
m = pattern.search('one12two34three56')
print(m)  # <_sre.SRE_Match object; span=(3, 5), match='12'>
print(m.group())  # 12
print(m.groups())  # ()
print(m.group(0))  # 12
# print(m.group(1))# IndexError: no such group
print(m.span())  # (3, 5)

m = pattern.search('one12two34three56', 10, 40)  # search only within positions 10-40
print(m.group())  # 56

print('############################')

result = re.search(r'\d+', 'one12two34three56')
print(result)  # <_sre.SRE_Match object; span=(3, 5), match='12'>
print(result.group())  # 12
print(result.groups())  # ()
print(result.group(0))  # 12
print(result.span())  # (3, 5)

 findall

# findall example
import re

pattern = re.compile(r'\d+')
s = pattern.findall('i am 18 years old and 170 height')
print(s)  # ['18', '170']

# The following is equivalent

print(re.findall(r'\d+', 'i am 18 years old and 170 height'))  # ['18', '170']

finditer

# finditer example
import re

pattern = re.compile(r'\d+')
s = pattern.finditer('i am 19 years old and 170 height')
print(s)  # <callable_iterator object at 0x0000021B4904FA90>
for i in s:
    print(i.group())
# prints:
# 19
# 170

print('########## The following is equivalent ##########')

s = re.finditer(r'\d+', 'i am 19 years old and 170 height')
print(s)  # <callable_iterator object at 0x0000021B48FC6C18>
for i in s:
    print(i.group())
# prints:
# 19
# 170

Matching Chinese characters:

# Matching Chinese characters by unicode range
import re

hello = '世界,你好!'
pattern = re.compile(r'[\u4e00-\u9fa5]+')
m = pattern.findall(hello)
print(m)  # ['世界', '你好']

print(re.findall(r'[\u4e00-\u9fa5]+', '世界,你好!'))  # ['世界', '你好']

Greedy vs. non-greedy matching

  • Greedy mode: match as many characters as possible while still letting the overall expression succeed
  • Non-greedy mode: match as few characters as possible while still letting the overall expression succeed
  • Python quantifiers are greedy by default

Example: search the text abbbc

  Regular expression: ab*

  Greedy: matches abbb

  Non-greedy: matches a

# Greedy matching is the default
import re

result = re.match('ab*', 'abbb')
print(result.group())  # abbb
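
For comparison, a quick sketch of the non-greedy form, which adds ? after the quantifier:

# Non-greedy: ab*? matches as few 'b' characters as possible
import re

result = re.match('ab*?', 'abbb')
print(result.group())  # a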

 

XPath positioning: /bookstore/book[1], /bookstore/book[last()], /bookstore/book[last()-1]
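
Since only the expressions are listed here, a minimal sketch (assuming the lxml package, which also appears below as a BeautifulSoup parser) of what each one selects from a tiny hand-written bookstore document:

from lxml import etree

# Hand-written sample document, only for illustrating the expressions above
xml = '''<bookstore>
  <book><title>Book A</title></book>
  <book><title>Book B</title></book>
  <book><title>Book C</title></book>
</bookstore>'''
root = etree.fromstring(xml)

print(root.xpath('/bookstore/book[1]/title/text()'))         # ['Book A'] - the first book
print(root.xpath('/bookstore/book[last()]/title/text()'))    # ['Book C'] - the last book
print(root.xpath('/bookstore/book[last()-1]/title/text()'))  # ['Book B'] - the second to last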


BeautifulSoup4

http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

Running this raises: bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

from urllib import request
from bs4 import BeautifulSoup

url='http://www.baidu.com'
rsp=request.urlopen(url)
content=rsp.read()
soup=BeautifulSoup(content,'lxml')   # changing 'lxml' to 'html.parser' fixes the error (or pip install lxml)

# bs handles the character encoding automatically
content=soup.prettify()
print(content)

The four main object types:

  • Tag
  1. Corresponds to an HTML tag
  2. Accessed as soup.tag_name
  3. Two important attributes: name and attrs
  • NavigableString: the text content of a tag
  • BeautifulSoup
  1. Represents the document as a whole; for most purposes it can be treated like a Tag
  2. Conventionally named soup
  • Comment
  1. A special kind of NavigableString (see the sketch after this list)
  2. When printed, the comment markers are not included
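
A tiny sketch of the NavigableString/Comment distinction on a hand-written snippet, since the Baidu example below only exercises Tag and BeautifulSoup:

from bs4 import BeautifulSoup
from bs4.element import Comment

html = '<b><!-- this is a comment --></b>'
soup = BeautifulSoup(html, 'html.parser')

text = soup.b.string        # the tag's only child: a Comment (a subclass of NavigableString)
print(type(text))           # <class 'bs4.element.Comment'>
print(text)                 # printed without the <!-- --> markers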
from urllib import request
from bs4 import BeautifulSoup

url='http://www.baidu.com'
rsp=request.urlopen(url)
content=rsp.read()
soup=BeautifulSoup(content,'html.parser')

# bs handles the character encoding automatically
content=soup.prettify()
print(content)
print('-'*50)
print(soup.head)
print('-'*50)
print(soup.meta)
print('-'*50)
print(soup.link)
print(soup.link.name)                   # tag name: 'link'
print(soup.link.attrs)                  # the tag's attribute dict
print(soup.link.attrs['type'])
soup.link.attrs['type']='hello world'   # attributes can be modified in place
print(soup.link)
print('-'*50)
print(soup.title)
print(soup.title.name)
print(soup.title.attrs)
print(soup.title.string)                # NavigableString: the tag's text content
print('-'*50)
print(soup.name)                        # the BeautifulSoup object is named '[document]'
print(soup.attrs)

 find_all:

from urllib import request
from bs4 import BeautifulSoup

url='http://www.baidu.com'
rsp=request.urlopen(url)
content=rsp.read()
soup=BeautifulSoup(content,'html.parser')

print(soup.name)
print('='*50)
for node in soup.head.contents:
    if node.name=='meta':
        print(node)
    if node.name=='title':
        print(node)   # prints nothing here, oddly
print('='*50)
tags=soup.find_all(name='meta')            # find by exact tag name
print(tags)
print('='*50)
import re
tags=soup.find_all(re.compile('^me'))      # find by regex on the tag name
for tag in tags:
    print(tag)
print('='*50)
tags=soup.find_all(re.compile('^me'),content='always')   # additionally filter by attribute value
for tag in tags:
    print(tag)

CSS selectors:

  - Use soup.select, which returns a list (the forms below are shown in the sketch after this list)

  - By tag name: soup.select('title')

  - By class name: soup.select('.content')

  - By id: soup.select('#name_id')

  - Combined: soup.select('title .content #name_id')

  - By attribute: soup.select("img[class='photo']")

  - Get a tag's text: tag.get_text()
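
The Baidu example below only exercises tag-name and attribute selectors, so here is a small sketch of the class, id, combined, and get_text() forms against a hand-written snippet:

from bs4 import BeautifulSoup

# Hand-written snippet, only for demonstrating the selector forms above
html = '''
<div class="content">
  <p id="name_id">hello</p>
  <img class="photo" src="a.jpg"/>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

print(soup.select('.content'))                 # by class name
print(soup.select('#name_id'))                 # by id
print(soup.select('div .photo'))               # combined: .photo descendants inside a div
print(soup.select("img[class='photo']"))       # by attribute
print(soup.select('#name_id')[0].get_text())   # text content: hello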

from urllib import request
from bs4 import BeautifulSoup

url='http://www.baidu.com'
rsp=request.urlopen(url)
content=rsp.read()
soup=BeautifulSoup(content,'html.parser')

print(soup.prettify())
print('*'*50)
titles=soup.select('title')                   # select by tag name
print(titles[0])

print('*'*20)
metas=soup.select('meta[content="always"]')   # select by attribute value
print(metas[0])

 


 

DHTML

Selenium: pip install selenium==2.48.0

官网:http://selenium-python.readthedocs.io/index.html

Headless browser: PhantomJS

from selenium import webdriver

driver=webdriver.Firefox()            # launch a Firefox window driven by Selenium
driver.get('http://www.baidu.com')    # navigate to the page
print(driver.title)                   # the page <title>

from selenium.webdriver.common.keys import Keys

driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'a')   # select everything in the search box

driver.find_element_by_id('kw').send_keys(Keys.CONTROL,'x')   # cut it

driver.find_element_by_id('kw').send_keys(u'黄山')   # type the query

driver.save_screenshot('huangshang.png')   # take a screenshot

driver.find_element_by_id('su').send_keys(Keys.RETURN)   # press Enter on the search button
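
Putting these pieces together, a minimal end-to-end sketch (the element ids 'kw' and 'su' are Baidu's search box and button, as above; the fixed sleep is just a crude wait for the results to render):

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.get('http://www.baidu.com')

driver.find_element_by_id('kw').send_keys(u'黄山')       # type the query into the search box
driver.find_element_by_id('su').send_keys(Keys.RETURN)   # press Enter on the search button

time.sleep(2)                        # crude wait for the results page to load
print(driver.title)                  # the title now reflects the search
driver.save_screenshot('result.png')
driver.quit()                        # close the browser when done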


 

Reading an image CAPTCHA with OCR. Running this raises: pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your path

import pytesseract
from PIL import Image

image=Image.open('a.png')                  # open the CAPTCHA image
text=pytesseract.image_to_string(image)    # run Tesseract OCR on it
print(text)
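
The error means the Tesseract-OCR engine itself (a separate program, not the pytesseract Python wrapper) is missing or not on PATH. Besides installing it and adding it to PATH, pytesseract can be pointed at the executable directly; the path below is only an example Windows install location:

import pytesseract
from PIL import Image

# Point the wrapper at the Tesseract executable (example path - adjust to your install)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

image = Image.open('a.png')
text = pytesseract.image_to_string(image)
print(text)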

 

……

posted @ 2019-12-16 22:10  xiongjiawei