# Scrape Douban Books using requests + regular expressions
2
3 import requests
4 import re
5
def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    A desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
11
12
# Captures, for each <li> entry: the link href, the title attribute, the
# contents of the "author" span and the contents of the "year" span.
# re.S lets "." match newlines so a single entry may span several lines.
_BOOK_PATTERN = re.compile(
    r'<li.*?cover.*?href="(.*?)".*?title="(.*?)"'
    r'.*?more-meta.*?author">(.*?)</span>'
    r'.*?year">(.*?)</span>.*?</li>',
    re.S,
)


def _parse_books(html):
    """Return ``(link, title, author, year)`` tuples extracted from *html*."""
    books = []
    for link, title, author, year in _BOOK_PATTERN.findall(html):
        # re.sub removes every whitespace character (newlines included)
        # from the title, not only the leading/trailing ones.
        title = re.sub(r'\s', '', title)
        # strip() trims only the surrounding whitespace/newlines.
        books.append((link, title, author.strip(), year.strip()))
    return books


def get_books(url):
    """Download the page at *url* and print one line per book found.

    Each printed line holds the book link, the title with all whitespace
    removed, and the author and year text with surrounding whitespace
    stripped, separated by single spaces.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. Results are written to standard output.
    """
    for link, title, author, year in _parse_books(get_html(url)):
        print(link, title, author, year)
23
24
if __name__ == '__main__':
    # Script entry point: scrape the Douban Books home page.
    get_books('https://book.douban.com/')