# -*- coding:utf-8 -*-
import os
import time
import random
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup
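
# Overall flow: decode the post URL's base62 short code into the numeric
# comment id, page through Weibo's AJAX comment endpoint, parse each page's
# HTML fragment, and append the extracted comments to a local .txt file.
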
class CommentCrawl(object):
    '''
    Crawls comment data from a Sina Weibo post.
    '''
    # Fill in a valid User-Agent and logged-in Cookie before running.
    headers = {
        'User-Agent': '',
        'Cookie': ''}
    ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Comments collected so far; shared by all worker threads.
    all_comment = []

    def __init__(self, url, file_name):
        self.url = url
        self.file_name = file_name

    def base62_decode(self, string, alphabet=ALPHABET):
        '''Decode a base62-encoded string into an integer.'''
        base = len(alphabet)
        num = 0
        for idx, char in enumerate(string):
            power = len(string) - (idx + 1)
            num += alphabet.index(char) * (base ** power)
        return num
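
    # Worked example: base62_decode("EF") = index('E') * 62 + index('F')
    #                                     = 40 * 62 + 41 = 2521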

    def parser_url(self):
        '''Build the paged comment API URL from the post URL's short code.'''
        code = self.url.split('?')[0].split('/')[-1]
        # Decode the short code in segments of 1, 4 and 4 characters and
        # concatenate the resulting numbers into the comment id.
        id1 = self.base62_decode(code[0])
        id2 = self.base62_decode(code[1:5])
        id3 = self.base62_decode(code[5:])
        plus = ''.join(map(str, [id1, id2, id3]))
        comment_url = ('http://weibo.com/aj/v6/comment/big?ajwvr=6&id=' + plus +
                       '&root_comment_max_id_type=0&page={}')
        return comment_url
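
    # For the sample post code "EFdPHe50Z" the segments decode to 40, 9824625
    # and 3355873, giving id 4098246253355873. Caveat (an assumption about
    # Weibo's mid scheme): decoded segments are normally zero-padded to 7
    # digits; this script skips the padding, which is harmless for this code
    # but may produce wrong ids for codes whose segments decode shorter.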

    def get_url_page(self):
        '''Fetch page 1 to learn the total number of comment pages.'''
        r = requests.get(self.parser_url().format(1), headers=self.headers)
        data = r.json()
        total_page = data['data']['page']['totalpage']
        return total_page

    def all_urls(self):
        '''Build the list of per-page comment API URLs.'''
        return [self.parser_url().format(i + 1) for i in range(self.get_url_page())]

    def comment_parser(self, html):
        '''Extract comment bodies from an HTML fragment returned by the API.'''
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.select('.WB_text')
        # Each '.WB_text' node reads "nickname:comment"; keep the part after
        # the colon.
        comment = [i.text.split(':')[-1] for i in data]
        return comment

    def final_text(self, url):
        '''Fetch one comment page and return its comments as a single string.'''
        r = requests.get(url, headers=self.headers)
        # Random delay so requests do not hammer the endpoint.
        time.sleep(random.randint(1, 5))
        html = r.json()['data']['html']
        page_comments = self.comment_parser(html)
        self.all_comment += page_comments
        print(len(self.all_comment))
        # Return only this page's comments so each page is written exactly once.
        return ''.join(page_comments)

    def save_file(self, url):
        '''Append one page's comment text to <file_name>.txt in the CWD.'''
        filename = os.path.join(os.getcwd(), self.file_name + '.txt')
        with open(filename, 'a+', encoding='utf-8') as f:
            f.write(self.final_text(url))

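# The pool below maps save_file over all page URLs, so up to four threads
# append to the same file concurrently; pages may land in the file out of
# page order.
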
if __name__ == "__main__":
    crawler = CommentCrawl('http://weibo.com/2202387347/EFdPHe50Z?from=page_1006062202387347_profile&wvr=6&mod=weibotime',
                           '小米6发布会')
    all_link = crawler.all_urls()
    pool = ThreadPool(4)
    pool.map(crawler.save_file, all_link)
    pool.close()
    pool.join()
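
# With valid User-Agent/Cookie values filled in above, running this script
# appends the crawled comments to 小米6发布会.txt in the current directory.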