# Crawl a Baidu Tieba thread with requests + xpath + map
# Target content: reply username, reply content, reply time
# Breakdown:
#   requests fetches the pages
#   xpath extracts the content
#   map implements the multithreaded crawler
import requests
from requests.exceptions import RequestException
from lxml import etree
import json
from multiprocessing.dummy import Pool as ThreadPool

def get_html(url):
    """Fetch a page and return its HTML text, or None on any failure."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
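
# Note: Baidu may throttle or vary responses for the default requests
# User-Agent. If pages come back empty, sending a browser-like header is a
# common workaround (hypothetical header value, adjust as needed):
#   requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)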

def parse_html(html):
    """Yield author, content, and date for every floor (reply) on one page."""
    selector = etree.HTML(html)
    # Each floor sits in a div with this class list; the trailing space
    # matches the page's class attribute, so keep it.
    data = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in data:
        # The div's data-field attribute carries the floor's metadata as JSON.
        rs = json.loads(each.xpath('@data-field')[0])
        author = rs.get('author').get('user_name')
        post_id = rs.get('content').get('post_id')
        content = each.xpath(
            'div/div/cc/div[@id="post_content_%s"]/text()' % post_id)[0].strip()
        date = rs.get('content').get('date')
        yield {
            'author': author,
            'content': content,
            'date': date
        }
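
# For reference, the data-field JSON read above is shaped roughly like this
# (a sketch of only the keys used; the real payload carries more fields):
#   {"author": {"user_name": "..."}, "content": {"post_id": 123, "date": "..."}}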

def save_to_txt(result):
    """Append one reply to tieba.txt."""
    print('saving:', result)
    with open('tieba.txt', 'a', encoding='utf-8') as f:
        f.write('reply author: ' + result['author'] + '\n')
        f.write('reply content: ' + result['content'] + '\n')
        f.write('reply time: ' + result['date'] + '\n')
        f.write('\n')
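
# Note: main() runs on 4 threads at once (see the pool below) and every call
# appends to the same tieba.txt, so records from different threads can
# interleave. A module-level lock (hypothetical name) serializes the writes:
#   import threading
#   write_lock = threading.Lock()
#   # ...then wrap the `with open(...)` block above in `with write_lock:`.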


def main(url):
    """Crawl one page: fetch it, parse it, persist every reply."""
    html = get_html(url)
    if html:
        for result in parse_html(html):
            save_to_txt(result)
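
# Pages whose fetch failed are skipped silently here: get_html() already
# returned None for them, so the if-guard simply falls through.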

if __name__ == '__main__':
    # 20 pages of one thread: .../p/3522395718?pn=1 .. ?pn=20
    base_url = 'http://tieba.baidu.com/p/3522395718?pn='
    urls = [base_url + str(page_num) for page_num in range(1, 21)]

    # map() hands the page URLs to 4 worker threads
    pool = ThreadPool(4)
    pool.map(main, urls)
    pool.close()
    pool.join()
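
# Equivalent form using the pool as a context manager (Python 3), which
# terminates the workers on exit once map() has returned:
#   with ThreadPool(4) as pool:
#       pool.map(main, urls)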