b站评论 检测重复性 水军检测器

功能:检测评论水军

号码转换(代码来自知乎)

# Base-58 alphabet used by BV identifiers.
table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
# Reverse lookup: alphabet character -> its base-58 digit value.
tr = {symbol: digit for digit, symbol in enumerate(table)}
# Positions inside the BV string that hold the six encoded digits,
# listed from least significant to most significant.
s = [11, 10, 3, 8, 4, 6]
# Constants applied (XOR, then offset) when converting between AV and BV.
xor = 177451812
add = 8728348608


def dec(x):
    """Convert a BV identifier string into its numeric AV id.

    The six characters of ``x`` found at the positions listed in ``s`` are
    read as base-58 digits (least significant first) via the ``tr`` lookup;
    the offset ``add`` is then subtracted and the result XOR-ed with ``xor``.
    """
    packed = sum(tr[x[s[k]]] * 58 ** k for k in range(6))
    return (packed - add) ^ xor


def enc(x):
    """Convert a numeric AV id into its BV identifier string.

    ``x`` is XOR-ed with ``xor`` and shifted by ``add``; the six lowest
    base-58 digits of that value are then written into the fixed template
    at the positions listed in ``s``.
    """
    remaining = (x ^ xor) + add
    chars = list('BV1  4 1 7  ')
    for k in range(6):
        # Peel off one base-58 digit per step, least significant first.
        remaining, digit = divmod(remaining, 58)
        chars[s[k]] = table[digit]
    return ''.join(chars)

dec():将 BV 号转换为 AV 号;enc():将 AV 号转换为 BV 号

核心模块

# Endpoint queried for each page of comments.
url = 'https://api.bilibili.com/x/v2/reply/main?callback'
# Request headers; only a desktop Chrome user-agent string is sent.
head = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.88 Safari/537.36'
    ),
}

def reply(oid, num):
    """Crawl the comment pages for ``oid`` and save them to a text file.

    Pages are requested one after another from ``url``. Crawling stops when
    the response contains no replies, or when a page's content is identical
    to the previous page's content (compared by MD5 digest). The repeated
    page is not written again.

    Every comment becomes one line ``user ,,, message ,,, level`` in
    ``./{num}_reply.csv``; an existing file of that name is overwritten.

    Args:
        oid: Forwarded to ``get_param`` as ``oid`` (presumably the video's
            AV id -- confirm against ``get_param``).
        num: Label used to build the output file name.
    """
    page = 1
    previous_digest = None
    # Mode 'w' empties the file on open, which is what the former
    # 'a+' followed by truncate(0) achieved.
    with open('./{}_reply.csv'.format(num), 'w', encoding='utf-8') as f:
        while True:
            resp = requests.get(url, params=get_param(page, oid=oid), headers=head)
            payload = resp.json()
            replies = payload['data']['replies']

            # Both None and an empty list mean there is nothing left to fetch.
            if not replies:
                print('评论全部爬取完毕')
                break

            lines = []
            for item in replies:
                # Flatten line breaks and swap double quotes so that each
                # comment stays on a single output line.
                message = (
                    item['content']['message']
                    .replace('\n', ' ')
                    .replace('\r', ' ')
                    .replace('"', '”')
                    .strip()
                )
                uname = item['member']['uname'].strip()
                level = item['member']['level_info']['current_level']
                lines.append('{} ,,, {} ,,, {}\n'.format(uname, message, level))
            page_text = ''.join(lines)

            # A page identical to the previous one is treated as the end of
            # the data: stop instead of looping forever and re-writing it.
            digest = hashlib.md5(page_text.encode('utf-8')).hexdigest()
            if digest == previous_digest:
                print('评论全部爬取完毕')
                break
            previous_digest = digest

            f.write(page_text)
            print('第{}页评论爬取完毕'.format(page))
            page += 1

            # Throttle requests to reduce the risk of an IP ban.
            time.sleep(0.5)

使用 MD5 比较相邻两页评论内容是否重复,以此判断评论是否已经爬完(经测试,不加这一步似乎也行)。之所以采用这种方法,是因为个人测试时接口返回的 is_end 字段始终不为 true。

注意:由于我没有稳定的代理来源,所以为了避免被封ip,我设置了0.5s的延迟,并且没有使用异步,多线程等,导致效率偏低,有稳定代理的可以自己实现多线程

数据分析模块

import numpy as np
import xlwt


def user(data, _name):
    """Count comments per user and export the ranking to an .xls workbook.

    Args:
        data: Sequence of rows; column 0 is read as the user id/name and
            column 2 as the user level.
        _name: Label used to build the output path
            ``./{_name}_com_user.xls``.
    """
    # user -> [number of comments, level taken from the first row seen]
    stats = {}
    for row in data:
        who = row[0]
        if who in stats:
            stats[who][0] += 1
        else:
            stats[who] = [1, row[2]]

    book = xlwt.Workbook()
    sheet = book.add_sheet('用户发言次数统计')
    for col, title in enumerate(('用户id', '用户等级', '用户发言次数')):
        sheet.write(0, col, title)

    # Most active users first.
    ranking = sorted(stats.items(), key=lambda entry: entry[1][0], reverse=True)
    for row_idx, (who, (count, level)) in enumerate(ranking, start=1):
        sheet.write(row_idx, 0, who)
        sheet.write(row_idx, 1, int(level))
        sheet.write(row_idx, 2, int(count))

    book.save('./{}_com_user.xls'.format(_name))



def comment(data, _name):
    """Count how often each comment text occurs and export it to an .xls workbook.

    Args:
        data: Sequence of rows; column 1 is read as the comment text.
        _name: Label used to build the output path
            ``./{_name}_comment.xls``.
    """
    # comment text -> number of occurrences
    occurrences = {}
    for row in data:
        text = row[1]
        occurrences[text] = occurrences.get(text, 0) + 1

    book = xlwt.Workbook()
    sheet = book.add_sheet('评论统计')
    for col, title in enumerate(('评论详情', '出现次数')):
        sheet.write(0, col, title)

    # Most frequent comments first.
    ranking = sorted(occurrences.items(), key=lambda entry: entry[1], reverse=True)
    for row_idx, (text, count) in enumerate(ranking, start=1):
        sheet.write(row_idx, 0, text)
        sheet.write(row_idx, 1, int(count))

    book.save('./{}_comment.xls'.format(_name))

两个函数分别统计相同 ID 的发言次数(顺便保存了等级)和相同评论在评论区的出现次数,降序排序后输出到 xls 文件中

以下为效果:

具体数据放不出

写在最后:

别问我为什么文字解释这么少,现在csdn已经不准发pa比站 评论的文章了,链接也不能带,我删了一堆有关的文字才发出来

posted @ 2022-06-19 17:04  Hello418  阅读(79)  评论(0)  编辑  收藏  举报