Scraping Zhihu Comments with Multiprocessing
While browsing Zhihu, I thought it would be fun to write a scraper and grab the comments, so I went ahead and did it.
First I inspected the page structure and, sure enough, found a JSON-looking file under the XHR tab. I opened it and all the data I wanted was hiding inside.
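For reference, here is the abridged shape of one page of that JSON, reconstructed from the fields the scraper reads below (a real response carries many more keys, so treat this as a sketch, not a verbatim capture):

# Abridged response shape, inferred from the fields accessed in the code below;
# not a verbatim capture of a real Zhihu payload.
page = {
    'data': [
        {
            'content': '<the comment text>',
            'author': {'member': {'name': '<the commenter nickname>'}},
        },
        # ... up to 20 comments per page ('limit': '20')
    ]
}

No more chatter, straight to the code.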
__author__ = 'xyz'

import requests
from urllib.parse import urlencode
from multiprocessing.dummy import Pool as ThreadPool


def get_info(url):
    dic = {}  # a dict to hold one comment's data

    headers = {
        'authorization': 'Bearer 2|1:0|10:1517047037|4:z_c0|80:MS4xU0IxRkJ3QUFBQUFtQUFBQVlBSlZUZjJhV1Z1U2FhTURoSVdhb0V1dV9abTJFNkE4aGpBVnlBPT0=|7200c19ff1e6c1a18390b9af7392f8dcca584bcb3c31cd3d8161d661c18eaa76',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
        'cookie': '_zap=fa9bac27-b3bb-408f-8ea8-581d77a0cc7e;z_c0="2|1:0|10:1517047037|4:z_c0|80:MS4xU0IxRkJ3QUFBQUFtQUFBQVlBSlZUZjJhV1Z1U2FhTURoSVdhb0V1dV9abTJFNkE4aGpBVnlBPT0=|7200c19ff1e6c1a18390b9af7392f8dcca584bcb3c31cd3d8161d661c18eaa76";__utma=51854390.1871736191.1517047484.1517047484.1517047484.1;__utmz=51854390.1517047484.1.1.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/wa-jue-ji-xiao-wang-zi-45/activities;__utmv=51854390.100--|2=registration_date=20180111=1^3=entry_date=20180111=1;q_c1=cf2ffc48a71847e8804976b27342d96a|1519734001000|1517047027000;__DAYU_PP=FEmB3nFIebeie76Qvafi283c612c853b;_xsrf=45a278fa-78a9-48a5-997e-f8ef03822600;d_c0="APDslTIFUQ2PTnO_uZxNy2IraDAP4g1nffs=|1521555479"'
    }
    # The request headers are the key to beating the anti-scraping check;
    # the essential part is 'authorization'.
    html = requests.get(url, headers=headers)
    data = html.json()['data']  # parse the JSON once instead of on every pass
    for i in range(len(data)):
        dic['content'] = data[i]['content']
        dic['author'] = data[i]['author']['member']['name']
        print('Downloaded {:.0f}%'.format((i + 1) / len(data) * 100))
        # show download progress, friendliness +1
        write(dic)
    # loop over the page and save every comment


def write(dic):
    # Write the info to a local file; watch the encoding.
    with open('zhihu.text', 'a', encoding='utf-8') as f:
        f.write('Nickname: ' + dic['author'] + '\n')
        f.write('Content: ' + dic['content'] + '\n\n')


if __name__ == '__main__':
    pool = ThreadPool(4)  # a pool of 4 workers
    page = []  # collect the page URLs in a list, heh heh heh
    base_url = 'https://www.zhihu.com/api/v4/answers/346473778/comments?'
    for i in range(0, 240, 20):  # 20 comments per page, offsets 0 through 220
        data = {
            'include': 'data[*].author,collapsed,reply_to_author,disliked,content,voting,vote_count,is_parent_author,is_author',
            'order': 'normal',
            'limit': '20',
            'offset': i,
            'status': 'open'}
        queries = urlencode(data)  # encode the params into a query string
        url = base_url + queries
        page.append(url)
    results = pool.map(get_info, page)
    pool.close()
    pool.join()  # wait for all workers to finish before exiting
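One thing worth knowing: despite the module name, multiprocessing.dummy wraps the threading module, so ThreadPool(4) really spins up 4 threads, not 4 processes. That also means the four workers append to zhihu.text concurrently, and lines from different pages can interleave. A minimal sketch of serializing the writes with a lock (the lock is my addition, not part of the original code):

import threading

write_lock = threading.Lock()  # shared by all worker threads

def write(dic):
    # Hypothetical locked variant of the write() above.
    with write_lock:  # only one thread appends at a time
        with open('zhihu.text', 'a', encoding='utf-8') as f:
            f.write('Nickname: ' + dic['author'] + '\n')
            f.write('Content: ' + dic['content'] + '\n\n')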
Quick Notes
Zhihu did dig a little pit here: my first request came back with a 401. On the outside I was steady as an old dog; on the inside I was panicking hard. I took a sip of water, calmed down, and figured there had to be some hidden anti-scraping mechanism at work, so I studied the response body.
{'error': {'message': '请求头或参数封装错误', 'code': 100, 'name': 'AuthenticationInvalidRequest'}}
The message says the request headers or parameters were packaged wrong. I smacked my forehead: clever little me, all I had to do was add an 'authorization' field to the headers. Tested it, and sure enough, sweet success.
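If you want to reproduce the trap, the sketch below (the URL is one of the pages built in the main block above) fires a request without the authorization header and prints the error payload:

# Minimal sketch: trigger the 401 on purpose by omitting 'authorization'.
import requests

url = 'https://www.zhihu.com/api/v4/answers/346473778/comments?limit=20&offset=0'
resp = requests.get(url)  # no authorization header
print(resp.status_code)   # expect 401
print(resp.json())        # the AuthenticationInvalidRequest payload shown above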
