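"""
Weibo search crawler (微博搜索).

author: 张鑫
date: 2021/12/3 10:01

Example URLs handled by this script (long-text API and post detail page):
https://weibo.com/ajax/statuses/longtext?id=KDJGenW1X
https://weibo.com/1281382091/KDJGenW1X?refer_flag=1001030103_
"""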
import random
import re
import time
from urllib.parse import quote
import pymongo
import requests
from lxml import etree
# Matches an opening, closing or self-closing markup tag such as ``<br />``,
# ``<a href="...">`` or ``</span>``.  A "<" that is not followed by a tag name
# (e.g. in "1 < 2") is deliberately left alone.
_TAG_RE = re.compile(r'</?[A-Za-z][^<>]*>')


def remove_label(content):
    """Return *content* with markup tags and all space characters removed.

    The previous implementation collected only the text found between a
    ``>`` and a following ``<``.  That silently dropped any text before the
    first tag or after the last tag, dropped text containing a newline, and
    returned an empty string for plain text that merely contained a ``<``.
    Tags are now deleted in place so every piece of surrounding text is kept.

    Args:
        content: The text to clean; may or may not contain markup tags.

    Returns:
        The text with tags and ``' '`` characters removed.  Newlines and
        other whitespace are preserved, as before.

    Raises:
        TypeError: If *content* is not a string.
    """
    # Strip tags before spaces so that tags carrying attributes
    # (``<a href="...">``) are still recognised as tags.
    return _TAG_RE.sub('', content).replace(' ', '')
# MongoDB target: database "weibo", collection "search_list" on localhost.
database = pymongo.MongoClient('localhost', 27017)
client = database['weibo']
search_list = client['search_list']

# URL-encoded search keyword.
q = quote('新婚姻法')

# NOTE(review): this is a logged-in session cookie committed to source. It is
# a credential and will expire; consider loading it from configuration.
headers = {
    'cookie': 'SINAGLOBAL=209674443713.62775.1637812588940; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFrm3zGJwUUhLB_Zq91EPT85JpX5KMhUgL.Fo-4ehn7SoeXehe2dJLoI05LxK-L12zLBKBLxK-LBK-L12zLxKML1-2L1hBLxK-L12zL1hMLxKqLBo5L1KB4e0Mt; UOR=,,login.sina.com.cn; ALF=1670033602; SSOLoginState=1638497603; SCF=AvfZc65wQjQdiV7RbqiIW2ty9XKEfdXFF4Sj9KtoCva0Pqi5xTUK1Jc5QCmWvvSik408olEIiaU8s4J6hmSiJj4.; SUB=_2A25MrQ0TDeRhGeNH61oR9i3Iyz-IHXVv23nbrDV8PUNbmtAKLWvDkW9NSvWJkV4-FW9DdWOqkOlW-djeqAeQHm3n; _s_tentry=login.sina.com.cn; Apache=2332814448343.1055.1638497606244; ULV=1638497606602:20:1:5:2332814448343.1055.1638497606244:1638173526637'
}

# Connection-level retry count and per-request timeout (seconds).
MAX_RETRIES = 5
REQUEST_TIMEOUT = 15


def _fetch(url):
    """GET *url* with the crawl headers and return the response.

    A fresh session is used per request (as the original script did), but it
    is now closed when the request finishes, so connections are not leaked.
    Retries are configured by mounting an adapter: assigning to
    ``requests.adapters.DEFAULT_RETRIES`` after import does not change the
    retry count of newly created sessions.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        # NOTE(review): verify=False disables TLS certificate checking while
        # a login cookie is being sent; kept as-is, confirm it is required.
        return session.get(url=url, headers=headers, verify=False,
                           timeout=REQUEST_TIMEOUT)


def _clean_text(raw):
    """Strip tags, newlines, spaces and zero-width spaces from post text."""
    return remove_label(raw).replace('\n', '').replace(' ', '').replace('\u200b', '')


for page in range(19, 101):
    print(f'*************第{page}页***************')
    time.sleep(random.randint(3, 5))
    url = f'https://s.weibo.com/weibo?q={q}&Refer=realtime_weibo&page={page}'
    print(url)
    try:
        html = _fetch(url).content.decode()
    except requests.RequestException as exc:
        # One unreachable result page should not abort the whole crawl.
        print(f'page {page} request failed, skipped: {exc!r}')
        continue
    tree = etree.HTML(html)

    # Result cards are addressed by position; indices without a detail link
    # are skipped.
    for i in range(1, 23):
        # Detail-page link of the i-th card.
        detail_url = tree.xpath(f'//div[{i}]/div/div[1]/div[2]/p[1]/a[1]//@href')
        if not detail_url:
            continue

        # Throttle only when a request is actually about to be made.
        time.sleep(random.randint(3, 5))
        try:
            detail_path = ''.join(detail_url)
            # Last path segment is the post id; drop any query string
            # (e.g. "?refer_flag=1001030103_").
            post_id = detail_path.split('/')[-1].partition('?')[0]

            second_url = 'https://weibo.com/ajax/statuses/show?id=' + post_id
            html2 = _fetch(second_url).json()

            search = {
                '详情页连接': 'https:' + detail_path,
                '用户名': html2['user']['screen_name'],
                '发布时间': html2['created_at'],
                '来源': html2['source'],
                '分享': html2['reposts_count'],
                '评论': html2['comments_count'],
                '点赞': html2['attitudes_count'],
            }

            data_url = 'https://weibo.com/ajax/statuses/longtext?id=' + post_id
            html3 = _fetch(data_url).json()
            try:
                search['文章内容'] = _clean_text(html3['data']['longTextContent'])
            except (KeyError, TypeError):
                # No usable long-text payload: fall back to the raw text
                # from the status response.
                search['文章内容'] = _clean_text(html2['text_raw'])

            # De-duplicate on the post's detail link. Matching on the user
            # name kept only one post per author and discarded the rest.
            count = search_list.count_documents({'详情页连接': search['详情页连接']})
            if count == 0:
                search_list.insert_one(search)
                print('******************************')
                print(search)
                print('入库成功')
                print('******************************')
                print('\n')
            else:
                print(search)
                print('数据已存在')
        except Exception as exc:
            # Best-effort crawl: report the failing post and move on, without
            # swallowing KeyboardInterrupt/SystemExit as a bare except would.
            print(f'post {detail_url} skipped: {exc!r}')
            continue