微博搜索

"""
author:张鑫
date:2021/12/3 10:01
https://weibo.com/ajax/statuses/longtext?id=KDJGenW1X
https://weibo.com/1281382091/KDJGenW1X?refer_flag=1001030103_
"""
import random
import re
import time
from urllib.parse import quote

import pymongo
import requests
from lxml import etree


def remove_label(content):
    """Strip HTML tags and non-breaking-space entities from *content*.

    Args:
        content: Text that may contain HTML markup.

    Returns:
        The text with every ``<...>`` tag and every ``&nbsp;`` / ``&nbsp``
        entity removed. All other characters are kept unchanged.
    """
    # Delete the tags themselves instead of collecting whatever sits between
    # a ">" and the next "<": that approach loses text before the first tag,
    # after the last tag, and any run that contains a newline.
    content = re.sub(r'<[^>]+>', '', content)
    # Consume the optional trailing semicolon so "&nbsp;" leaves no stray ";".
    return re.sub(r'&nbsp;?', '', content)


# Scrape Weibo search results for one keyword (pages 19-100) and store one
# document per post in the MongoDB collection weibo.search_list.
database = pymongo.MongoClient('localhost', 27017)
client = database['weibo']
search_list = client['search_list']

# URL-encoded search keyword.
q = quote('新婚姻法')

# NOTE(review): hard-coded logged-in session cookie. It is a credential and
# will expire; consider loading it from the environment or a config file.
headers = {
    'cookie': 'SINAGLOBAL=209674443713.62775.1637812588940; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFrm3zGJwUUhLB_Zq91EPT85JpX5KMhUgL.Fo-4ehn7SoeXehe2dJLoI05LxK-L12zLBKBLxK-LBK-L12zLxKML1-2L1hBLxK-L12zL1hMLxKqLBo5L1KB4e0Mt; UOR=,,login.sina.com.cn; ALF=1670033602; SSOLoginState=1638497603; SCF=AvfZc65wQjQdiV7RbqiIW2ty9XKEfdXFF4Sj9KtoCva0Pqi5xTUK1Jc5QCmWvvSik408olEIiaU8s4J6hmSiJj4.; SUB=_2A25MrQ0TDeRhGeNH61oR9i3Iyz-IHXVv23nbrDV8PUNbmtAKLWvDkW9NSvWJkV4-FW9DdWOqkOlW-djeqAeQHm3n; _s_tentry=login.sina.com.cn; Apache=2332814448343.1055.1638497606244; ULV=1638497606602:20:1:5:2332814448343.1055.1638497606244:1638173526637'
}

# Seconds to wait for a response, so a stalled connection cannot hang the run.
REQUEST_TIMEOUT = 30

# One session for the whole run: connections are pooled and are released when
# the ``with`` block exits.
with requests.session() as s:
    # Retries must be configured on the transport adapter. Assigning
    # ``requests.adapters.DEFAULT_RETRIES`` after import has no effect.
    s.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))

    for page in range(19, 101):
        print(f'*************第{page}页***************')
        time.sleep(random.randint(3, 5))
        url = f'https://s.weibo.com/weibo?q={q}&Refer=realtime_weibo&page={page}'
        print(url)
        # NOTE(review): verify=False disables TLS certificate verification.
        # A failure here is not caught and stops the script.
        html = s.get(url=url, headers=headers, verify=False,
                     timeout=REQUEST_TIMEOUT).content.decode()
        tree = etree.HTML(html)

        # Probe positions 1..22 of the result list for a post link.
        for i in range(1, 23):
            # Link to the post's detail page.
            detail_url = tree.xpath(f'//div[{i}]/div/div[1]/div[2]/p[1]/a[1]//@href')
            if not detail_url:
                continue

            # Throttle only when a request is actually going to be made.
            time.sleep(random.randint(3, 5))
            try:
                detail_href = ''.join(detail_url)
                # Last path segment of the link, minus the tracking query.
                weibo_id = detail_href.split('/')[-1].replace(
                    '?refer_flag=1001030103_', '')

                second_url = 'https://weibo.com/ajax/statuses/show?id=' + weibo_id
                html2 = s.get(url=second_url, headers=headers, verify=False,
                              timeout=REQUEST_TIMEOUT).json()

                search = {}
                search['详情页连接'] = 'https:' + detail_href
                search['用户名'] = html2['user']['screen_name']
                search['发布时间'] = html2['created_at']
                search['来源'] = html2['source']

                search['分享'] = html2['reposts_count']
                search['评论'] = html2['comments_count']
                search['点赞'] = html2['attitudes_count']

                data_url = 'https://weibo.com/ajax/statuses/longtext?id=' + weibo_id
                html3 = s.get(url=data_url, headers=headers, verify=False,
                              timeout=REQUEST_TIMEOUT).json()

                try:
                    text = remove_label(html3['data']['longTextContent'])
                except (KeyError, TypeError):
                    # No usable long-text payload: fall back to the raw text
                    # from the status response.
                    text = remove_label(html2['text_raw'])
                # Drop newlines, spaces and zero-width spaces.
                search['文章内容'] = text.replace('\n', '').replace(
                    ' ', '').replace('\u200b', '')

                # NOTE(review): de-duplication is keyed on the user name, so
                # only the first post seen per user is stored. Confirm this is
                # intended rather than keying on the detail-page link.
                count = search_list.count_documents({'用户名': search['用户名']})
                if count == 0:
                    search_list.insert_one(search)
                    print('******************************')
                    print(search)
                    print('入库成功')
                    print('******************************')

                    print('\n')
                else:
                    print(search)
                    print('数据已存在')
            except Exception as exc:
                # Best effort: skip this post but report why. Unlike a bare
                # ``except:``, this lets Ctrl-C (KeyboardInterrupt) stop the run.
                print(f'抓取失败,已跳过: {exc!r}')
                continue
posted @ 2021-12-03 16:11 布都御魂阅读(770) 评论(0) 收藏举报
刷新页面返回顶部
布都御魂

微博搜索

公告