# -*- coding:utf-8 -*-
import os
import time
import random
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup
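
# Overall flow: decode the post URL's base62 short code into the numeric
# comment id, page through Weibo's AJAX comment endpoint, parse each page's
# HTML fragment, and append the extracted comments to a local .txt file.
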
class CommentCrawl(object):
    '''
    Crawls comment data from a Sina Weibo post.
    '''
    # Fill in a valid User-Agent and logged-in Cookie before running.
    headers = {
        'User-Agent': '',
        'Cookie': ''}
    ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Comments collected so far; shared by all worker threads.
    all_comment = []

    def __init__(self, url, file_name):
        self.url = url
        self.file_name = file_name

    def base62_decode(self, string, alphabet=ALPHABET):
        '''Decode a base62-encoded string into an integer.'''
        base = len(alphabet)
        num = 0
        for idx, char in enumerate(string):
            power = len(string) - (idx + 1)
            num += alphabet.index(char) * (base ** power)
        return num
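
    # Worked example: base62_decode("EF") = index('E') * 62 + index('F')
    #                                     = 40 * 62 + 41 = 2521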

    def parser_url(self):
        '''Build the paged comment API URL from the post URL's short code.'''
        code = self.url.split('?')[0].split('/')[-1]
        # Decode the short code in segments of 1, 4 and 4 characters and
        # concatenate the resulting numbers into the comment id.
        id1 = self.base62_decode(code[0])
        id2 = self.base62_decode(code[1:5])
        id3 = self.base62_decode(code[5:])
        plus = ''.join(map(str, [id1, id2, id3]))
        comment_url = ('http://weibo.com/aj/v6/comment/big?ajwvr=6&id=' + plus +
                       '&root_comment_max_id_type=0&page={}')
        return comment_url
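
    # For the sample post code "EFdPHe50Z" the segments decode to 40, 9824625
    # and 3355873, giving id 4098246253355873. Caveat (an assumption about
    # Weibo's mid scheme): decoded segments are normally zero-padded to 7
    # digits; this script skips the padding, which is harmless for this code
    # but may produce wrong ids for codes whose segments decode shorter.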

    def get_url_page(self):
        '''Fetch page 1 to learn the total number of comment pages.'''
        r = requests.get(self.parser_url().format(1), headers=self.headers)
        data = r.json()
        total_page = data['data']['page']['totalpage']
        return total_page

    def all_urls(self):
        '''Build the list of per-page comment API URLs.'''
        return [self.parser_url().format(i + 1) for i in range(self.get_url_page())]

    def comment_parser(self, html):
        '''Extract comment bodies from an HTML fragment returned by the API.'''
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.select('.WB_text')
        # Each '.WB_text' node reads "nickname:comment"; keep the part after
        # the colon.
        comment = [i.text.split(':')[-1] for i in data]
        return comment

    def final_text(self, url):
        '''Fetch one comment page and return its comments as a single string.'''
        r = requests.get(url, headers=self.headers)
        # Random delay so requests do not hammer the endpoint.
        time.sleep(random.randint(1, 5))
        html = r.json()['data']['html']
        page_comments = self.comment_parser(html)
        self.all_comment += page_comments
        print(len(self.all_comment))
        # Return only this page's comments so each page is written exactly once.
        return ''.join(page_comments)

    def save_file(self, url):
        '''Append one page's comment text to <file_name>.txt in the CWD.'''
        filename = os.path.join(os.getcwd(), self.file_name + '.txt')
        with open(filename, 'a+', encoding='utf-8') as f:
            f.write(self.final_text(url))

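# The pool below maps save_file over all page URLs, so up to four threads
# append to the same file concurrently; pages may land in the file out of
# page order.
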
if __name__ == "__main__":
    crawler = CommentCrawl('http://weibo.com/2202387347/EFdPHe50Z?from=page_1006062202387347_profile&wvr=6&mod=weibotime',
                           '小米6发布会')
    all_link = crawler.all_urls()
    pool = ThreadPool(4)
    pool.map(crawler.save_file, all_link)
    pool.close()
    pool.join()
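
# With valid User-Agent/Cookie values filled in above, running this script
# appends the crawled comments to 小米6发布会.txt in the current directory.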