B站评论重复性检测:水军检测器
功能:检测评论水军
号码转换(代码来自知乎)
# 58-character alphabet; a character's index in this string is its digit value.
table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
# Reverse lookup: alphabet character -> digit value (0..57).
tr = {ch: pos for pos, ch in enumerate(table)}
# Positions inside the 12-character BV string that hold the six base-58
# digits, listed from least significant to most significant.
s = [11, 10, 3, 8, 4, 6]
# Mask and offset applied by dec()/enc() around the base-58 conversion.
xor = 177451812
add = 8728348608
def dec(x):
    """Convert a BV id string to its numeric AV id.

    The six characters of ``x`` found at the positions listed in ``s`` are
    read as base-58 digits (least significant first) via ``tr``; ``add`` is
    then subtracted and the result XOR-ed with ``xor``.
    """
    value = sum(tr[x[pos]] * 58 ** power for power, pos in enumerate(s))
    return (value - add) ^ xor
def enc(x):
    """Convert a numeric AV id to its BV id string.

    ``x`` is XOR-ed with ``xor`` and offset by ``add``; the six base-58
    digits of the result are then written into the fixed template at the
    positions listed in ``s``.
    """
    shifted = (x ^ xor) + add
    # Template with the constant characters in place; blanks get overwritten.
    chars = list('BV1  4 1 7  ')
    for power, pos in enumerate(s):
        chars[pos] = table[shifted // 58 ** power % 58]
    return ''.join(chars)
dec() 将 BV 号转换为 AV 号,enc() 将 AV 号转换为 BV 号
核心模块
# Endpoint queried by reply() for each page of comments.
url = 'https://api.bilibili.com/x/v2/reply/main?callback'
# Request headers sent with every call; only a browser user-agent is set.
head = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.88 Safari/537.36'
    ),
}
def reply(oid, num):
    """Crawl the comment pages for ``oid`` into ``./{num}_reply.csv``.

    Pages are requested one after another from ``url`` with the parameters
    produced by ``get_param(page, oid=oid)``. Every comment is written as
    one line of the form ``user ,,, message ,,, level``. The output file is
    emptied before crawling starts.

    Crawling stops when a page has no replies, or when a page's content is
    identical to the previous page's content.

    Args:
        oid: Forwarded to ``get_param`` as ``oid`` (presumably the video's
            AV number -- confirm against ``get_param``).
        num: Value used to build the output file name.

    Raises:
        KeyError: If the response JSON lacks ``data``/``replies`` or a reply
            lacks one of the fields read below.
    """
    next_page = 1
    # 'w' creates the file or empties an existing one, which is what the
    # previous 'a+' followed by truncate(0) achieved.
    with open('./{}_reply.csv'.format(num), 'w', encoding='utf-8') as f:
        previous_digest = None
        while True:
            resp = requests.get(url, params=get_param(next_page, oid=oid), headers=head)
            payload = resp.json()
            replies = payload['data']['replies']
            if not replies:
                print('评论全部爬取完毕')
                break
            lines = []
            for item in replies:
                # Flatten the message to a single line and swap ASCII double
                # quotes for a typographic one before writing it out.
                rep = item['content']['message'].replace('\n', ' ').replace('\r', ' ').replace('"', '”').strip()
                uname = item['member']['uname'].strip()
                level = item['member']['level_info']['current_level']
                lines.append('{} ,,, {} ,,, {}\n'.format(uname, rep, level))
            page_text = ''.join(lines)
            # Author's note: MD5 is used to detect a repeated page (in testing
            # it also seemed to work without it); this approach exists because
            # is_end never came back true during the author's own testing.
            digest = hashlib.md5(page_text.encode('utf-8')).hexdigest()
            if digest == previous_digest:
                # Same content as the previous page: stop here instead of
                # writing duplicates and requesting further pages forever.
                print('评论全部爬取完毕')
                break
            previous_digest = digest
            f.write(page_text)
            print('第{}页评论爬取完毕'.format(next_page))
            next_page += 1
            # Fixed delay between requests to keep the request rate low.
            time.sleep(0.5)
注意:由于我没有稳定的代理来源,为了避免被封 IP,我设置了 0.5s 的延迟,并且没有使用异步、多线程等手段,导致效率偏低;有稳定代理的读者可以自行实现多线程。
数据分析模块
import numpy as np
import xlwt
def user(data, _name):
    """Write per-user comment counts to ``./{_name}_com_user.xls``.

    Each row of ``data`` is read as ``row[0]`` = user name and ``row[2]`` =
    user level. The level stored for a user is the one from that user's
    first row. Users are written in descending order of comment count.
    """
    # user name -> [comment count, level taken from the first row seen]
    stats = {}
    for row in data:
        name = row[0]
        entry = stats.get(name)
        if entry is None:
            stats[name] = [1, row[2]]
        else:
            entry[0] += 1

    work = xlwt.Workbook()
    sheet = work.add_sheet('用户发言次数统计')
    for col, title in enumerate(('用户id', '用户等级', '用户发言次数')):
        sheet.write(0, col, title)

    # Most active users first; data rows start right below the header.
    ranked = sorted(stats.items(), key=lambda pair: pair[1][0], reverse=True)
    for row_idx, (name, (count, level)) in enumerate(ranked, start=1):
        sheet.write(row_idx, 0, name)
        sheet.write(row_idx, 1, int(level))
        sheet.write(row_idx, 2, int(count))

    file_dir = './{}_com_user.xls'.format(_name)
    work.save(file_dir)
def comment(data, _name):
    """Write per-comment occurrence counts to ``./{_name}_comment.xls``.

    Each row of ``data`` is read as ``row[1]`` = comment text. Identical
    texts are counted and written in descending order of occurrence.

    Args:
        data: Sequence of rows; only index 1 of each row is used.
        _name: Value used to build the output file name.
    """
    # Local import so this function does not depend on the file's import block.
    from collections import Counter

    counts = Counter(row[1] for row in data)

    work = xlwt.Workbook()
    sheet = work.add_sheet('评论统计')
    sheet.write(0, 0, '评论详情')
    sheet.write(0, 1, '出现次数')
    # most_common() yields (text, count) pairs, highest count first; ties
    # keep first-seen order. Data rows start right below the header.
    for row_idx, (text, count) in enumerate(counts.most_common(), start=1):
        sheet.write(row_idx, 0, text)
        sheet.write(row_idx, 1, int(count))
    work.save('./{}_comment.xls'.format(_name))


# Author's note: user() and comment() count, respectively, how many times each
# user ID posted (also keeping that user's level) and how many times each
# identical comment appears in the comment section; both sort the results in
# descending order and write them to .xls files.
以下为效果:


具体数据放不出
写在最后:
别问我为什么文字解释这么少:现在 CSDN 已经不允许发布爬取 B 站评论的文章了,链接也不能带,我删了一堆相关的文字才发出来。
本文来自博客园,作者:Hello418,转载请注明原文链接:https://www.cnblogs.com/janitor/p/16390831.html

 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号