1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 # author:Momo time:2018/6/30
4
5 """
6 目标网站:http://tieba.baidu.com/p/3522395718
7 目标内容:跟帖用户名,跟帖内容,跟帖时间
8 涉及知识:
9 Requests 获取网页
10 XPath 提取内容
11 map 实现多线程爬虫
12 掌握以下知识:使用xpath进行网页提取
13 使用map实现多线程爬虫
14 """
15
16 from lxml import etree
17 from multiprocessing.dummy import Pool as ThreadPool
18 import urllib.request
19 import json
20 # from imp import reload
21
# # Python 2-only hack ("save the copied Tieba source as utf-8").
# # Left disabled: sys.setdefaultencoding does not exist on Python 3,
# # where str is already Unicode and this workaround is unnecessary.
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')
26
def towrite(contentdict, file=None):
    """Append one reply record (time, body, author) to the output file.

    Args:
        contentdict: dict with keys 'topic_reply_time',
            'topic_reply_content' and 'user_name'.
        file: optional writable text file; defaults to the module-level
            handle ``f`` opened in the ``__main__`` block (backward
            compatible with the original single-argument call).
    """
    out = f if file is None else file
    # One write() with a single formatted string: the original used three
    # writelines() calls, but writelines expects an iterable of lines, not
    # a single string — write() is the correct API here.
    out.write(
        u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n'
        + u'回帖内容:' + contentdict['topic_reply_content'] + '\n'
        + u'回帖人:' + str(contentdict['user_name']) + '\n\n'
    )
31
def spider(url):
    """Fetch one page of the thread and record every reply it contains.

    Args:
        url: URL of a single paginated view of the Tieba thread.

    Side effects:
        Prints each reply and appends it to the shared output file via
        towrite(). Skips posts that have no extractable text body.
    """
    # Context manager closes the HTTP response deterministically instead of
    # leaking the connection.
    with urllib.request.urlopen(url) as html_page:
        html_code = html_page.read().decode('utf-8')
    selector = etree.HTML(html_code)
    # One container div per reply. The trailing space inside the class
    # attribute is deliberate — it matches the site's markup exactly.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in content_field:
        # BUG FIX: the original did .replace('"', '') on the attribute before
        # json.loads, which strips every quote and makes the JSON unparseable.
        # lxml already unescapes &quot; to '"', so the value is valid JSON.
        reply_info = json.loads(each.xpath('@data-field')[0])
        author = reply_info['author']['user_name']
        content_nodes = each.xpath(
            'div[@class="d_post_content_main"]/div/cc/'
            'div[@class="d_post_content j_d_post_content clearfix"]/text()'
        )
        # Guard the [0] index: posts with no text body (e.g. image-only
        # replies) would otherwise raise IndexError and kill the worker.
        if not content_nodes:
            continue
        content = content_nodes[0]
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        # Fresh dict per reply — the original reused a single dict across
        # iterations, which is fragile if records are ever kept around.
        item = {
            'user_name': author,
            'topic_reply_content': content,
            'topic_reply_time': reply_time,
        }
        towrite(item)
50
if __name__ == '__main__':
    # Build the 20 page URLs (?pn=1 .. ?pn=20) for the target thread.
    page = ['http://tieba.baidu.com/p/3522395718?pn=' + str(i)
            for i in range(1, 21)]
    pool = ThreadPool(4)
    # 'f' must stay a module-level name because towrite() writes through it.
    # The with-block guarantees the file is closed even if a worker raises;
    # the original open()/close() pair leaked the handle on any exception.
    with open('content.txt', 'a', encoding='utf-8') as f:
        # map() blocks until every page is scraped, so the file is still
        # open for the duration of all worker threads.
        pool.map(spider, page)
        pool.close()
        pool.join()