腾讯微博用户关注与听众的爬取
按广度优先的方式爬取用户的关注和听众。腾讯微博已经停运,网上找到的登录代码都已过时,自己又分析不出登录流程,于是直接从浏览器里把 cookie 复制下来,这样就能获取需要登录才能看到的内容了。
由于停运,只能获取40页的内容,文件格式为[source,target] 表示source 关注 target。由于从source爬取的话可以从关注里找到target,而从target爬取的话会从听众找到source,所以就需要写个去重了。
一小时大概能获取2万条记录。代码是单线程顺序执行的,没有使用多线程。
辣鸡代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@author: Starry
@file: Tencentweibo.py
@time:  2018/7/15 9:50
'''
import requests
from bs4  import BeautifulSoup
from queue import Queue
import time
import datetime
import json
import csv
import os
# Session cookies sent with every request. Left empty here; the crawler only
# sees login-protected pages when real browser cookies are filled in.
cookies = dict()

# HTTP headers sent with every relation request, kept in a fixed order.
headers = dict([
    ("Accept", "*/*"),
    ("Accept-Encoding", "gzip, deflate"),
    ("Accept-Language", "zh-CN,zh;q=0.9"),
    ("Cache-Control", "no-cache"),
    ("Connection", "keep-alive"),
    ("Host", "api.t.qq.com"),
    ("Pragma", "no-cache"),
    ("Referer", "http://api.t.qq.com/proxy.html"),
    ("rf", "http://t.qq.com/anjianbin1979/following?t=1#u=anjianbin1979&t=1&st=1&p=2"),
    ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"),
])
class TencentWeibo:
    """Breadth-first crawler for Tencent Weibo follow relationships.

    Starting from one seed account, the crawler walks the "following" and
    "listener" (fan) lists of every discovered account and records the result
    in two CSV files in the current working directory:

    * ``information.csv`` -- one ``id,user,name`` row per discovered account.
    * ``data.csv`` -- one ``Source,Target`` row per edge, meaning the account
      with id ``Source`` follows the account with id ``Target``.

    Both files are appended to, so an interrupted crawl can be resumed by
    simply constructing the class again in the same directory.

    Attributes:
        que: FIFO queue of account names that still have to be crawled.
        nameToId: maps an account name to its numeric id.
        current_num: the highest id handed out so far.
        visName: set of account names that have already been crawled.
        unique: maps a source id to the set of target ids already written to
            ``data.csv``; used to avoid writing the same edge twice.
    """

    # Unused by the active code; kept so existing references keep working.
    COUNT = 0
    # Number of list pages requested per account and relation type.
    MAX_PAGES = 40
    RELATION_URL = (
        "http://api.t.qq.com/relations/follow_apollo.php?u={name}&t={relation}&st=1&p={page}"
        "&apiType=14&apiHost=http://api.t.qq.com&_r={stamp}&g_tk=325301840"
    )

    def __init__(self, start_name, start_title):
        """Set up the crawl state and open (or resume from) the CSV files.

        Args:
            start_name: account name of the seed user.
            start_title: display name of the seed user.
        """
        self.start_name = start_name
        # The misspelt attribute is the historical name; keep it and offer the
        # correctly spelt one as well.
        self.start_titile = start_title
        self.start_title = start_title
        self.que = Queue()
        self.nameToId = {}
        self.current_num = 1
        self.visName = set()
        self.unique = {}
        self._information_file = None
        self._data_file = None
        self.init_exe()

    def init_exe(self):
        """Create the CSV files or reload the state stored in existing ones."""
        info_path = 'information.csv'
        info_has_content = os.path.exists(info_path) and os.path.getsize(info_path) > 0
        if info_has_content:
            self._load_information(info_path)
        self._information_file = open(info_path, 'a', newline='', encoding='utf-8')
        self.csv_information = csv.writer(self._information_file, dialect='excel')
        if not info_has_content:
            self.csv_information.writerow(['id', 'user', 'name'])
        if not self.nameToId:
            # Nothing to resume from: seed the crawl with the start account.
            self.csv_information.writerow([self.current_num, self.start_name, self.start_titile])
            self.que.put(self.start_name)
            self.nameToId[self.start_name] = self.current_num
            self.unique[self.current_num] = set()

        data_path = 'data.csv'
        data_has_content = os.path.exists(data_path) and os.path.getsize(data_path) > 0
        resume_id = 0
        if data_has_content:
            resume_id = self._load_edges(data_path)
        self._data_file = open(data_path, 'a', newline='', encoding='utf-8')
        self.csv_data = csv.writer(self._data_file, dialect='excel')
        if not data_has_content:
            self.csv_data.writerow(['Source', 'Target'])
        if resume_id:
            self._skip_finished_users(resume_id)
        print('开始爬取啦!!!')

    def _load_information(self, path):
        """Re-queue every account stored in ``information.csv``."""
        with open(path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f, dialect='excel')
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) < 2:
                    continue  # blank or truncated line
                user_id, user = int(row[0]), row[1]
                self.que.put(user)
                self.nameToId[user] = user_id
                # Every known id needs an edge set, otherwise recording an edge
                # for an account without rows in data.csv would fail.
                self.unique.setdefault(user_id, set())
                self.current_num = max(self.current_num, user_id)

    def _load_edges(self, path):
        """Reload the edges in ``data.csv`` and return the id to resume from.

        Every row involves the account that was being crawled when the row was
        written, and ids are handed out in queue order, so ``min(source,
        target)`` of any row never exceeds the id of the account that was being
        crawled last.  The largest such value is therefore a safe resume point.
        Returns 0 when the file holds no edges.
        """
        resume_id = 0
        with open(path, 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f, dialect='excel')
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) < 2:
                    continue  # blank or truncated line
                source, target = int(row[0]), int(row[1])
                self.unique.setdefault(source, set()).add(target)
                self.unique.setdefault(target, set())
                resume_id = max(resume_id, min(source, target))
                # Never hand out an id that data.csv already refers to.
                self.current_num = max(self.current_num, source, target)
        return resume_id

    def _skip_finished_users(self, resume_id):
        """Mark accounts with an id below ``resume_id`` as already crawled.

        The account with id ``resume_id`` stays queued: its crawl may have been
        interrupted half-way, and edge de-duplication makes repeating it safe.
        """
        pending = []
        while not self.que.empty():
            pending.append(self.que.get())
        for user in pending:
            if self.nameToId[user] < resume_id:
                self.visName.add(user)
            else:
                self.que.put(user)

    def _register_user(self, account, title):
        """Return the id of ``account``, assigning and storing one if new."""
        user_id = self.nameToId.get(account)
        if user_id is None:
            self.current_num += 1
            user_id = self.current_num
            self.nameToId[account] = user_id
            self.que.put(account)
            self.csv_information.writerow([user_id, account, title])
        self.unique.setdefault(user_id, set())
        return user_id

    def _record_edge(self, source_id, target_id):
        """Write the edge ``source_id -> target_id`` unless already stored."""
        targets = self.unique.setdefault(source_id, set())
        if target_id not in targets:
            targets.add(target_id)
            self.csv_data.writerow([source_id, target_id])

    def _flush(self):
        """Push buffered rows to disk, account rows first.

        Flushing ``information.csv`` before ``data.csv`` keeps the edge file
        from referring to ids whose account row has not been written yet.
        """
        for handle in (self._information_file, self._data_file):
            if handle is not None and not handle.closed:
                handle.flush()

    def close(self):
        """Flush and close both CSV files; safe to call more than once."""
        for handle in (self._information_file, self._data_file):
            if handle is not None and not handle.closed:
                handle.close()

    def DealHtml(self, html, Flag, name):
        """Extract the accounts in one list page and record their edges.

        Args:
            html: HTML fragment containing ``div.userName`` entries.
            Flag: 1 if the page lists accounts that ``name`` follows,
                2 if it lists accounts that follow ``name``.
            name: account name whose list page this is.
        """
        soup = BeautifulSoup(html, 'html.parser')
        for entry in soup.find_all('div', attrs={"class": "userName"}):
            # Best-effort per entry: one malformed entry or failed write must
            # not abort the rest of the page.
            try:
                link = entry.find('a')
                href = link.get('href') if link is not None else None
                account = href[1:] if href else ''  # drop the leading "/"
                if not account:
                    continue
                account_id = self._register_user(account, link.string)
                owner_id = self.nameToId[name]
                if Flag == 1:
                    self._record_edge(owner_id, account_id)
                elif Flag == 2:
                    self._record_edge(account_id, owner_id)
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)
        # Keep the files close to the in-memory state so a killed process can
        # be resumed from what is on disk.
        self._flush()

    def _crawl_relations(self, name, relation):
        """Fetch up to ``MAX_PAGES`` list pages of ``name``.

        ``relation`` is the value of the ``t`` query parameter and doubles as
        the ``Flag`` passed to ``DealHtml`` (1 = following, 2 = fans).  Paging
        stops at the first response that carries no ``info`` field; a failed
        page is logged and the next one is tried.
        """
        for page in range(1, self.MAX_PAGES + 1):
            try:
                stamp = str(int(time.time() * 1000))
                url = self.RELATION_URL.format(
                    name=name, relation=relation, page=page, stamp=stamp)
                ret = requests.get(url=url, headers=headers, cookies=cookies, timeout=10)
                ret_json = json.loads(ret.text)
                if not (isinstance(ret_json, dict) and "info" in ret_json):
                    break
                self.DealHtml(ret_json['info'], relation, name)
            except Exception as e:
                print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), e)

    def getFans(self, name):
        """Crawl the accounts that follow ``name``."""
        self._crawl_relations(name, 2)

    def getIdol(self, name):
        """Crawl the accounts that ``name`` follows."""
        self._crawl_relations(name, 1)

    def start(self):
        """Crawl queued accounts breadth-first until the queue is empty."""
        try:
            while not self.que.empty():
                visiter = self.que.get()
                if visiter in self.visName:
                    continue
                self.visName.add(visiter)
                self.getIdol(visiter)
                self.getFans(visiter)
        finally:
            # Also reached on Ctrl-C, so the rows gathered so far survive.
            self._flush()
class TencentWeiboArticles:
    """Placeholder for an article crawler; it only sets up empty state."""

    def __init__(self):
        """Create an empty id lookup table and an empty work queue."""
        self.IdToInformation = dict()
        self.que = Queue()

    def start(self):
        """Do nothing; the crawl itself is not implemented."""
        return None
# Build the crawler for the seed account and run it until the queue is empty.
weibo = TencentWeibo(start_name='xie_na', start_title='谢娜')
weibo.start()

 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号