b站评论 检测重复性 水军检测器
功能:检测评论水军
号码转换(代码来自知乎)
# Base-58 alphabet for BV ids: a character's index in this string is its
# digit value.
table = 'fZodR9XQDSUm21yCkr6zBqiveYah8bt4xsWpHnJE7jL5VG3guMTKNPAwcF'
# Reverse lookup (character -> digit value), derived from `table` itself so
# the two can never disagree about the alphabet size.
tr = {ch: digit for digit, ch in enumerate(table)}
# Positions inside the BV string that carry the six base-58 digits, listed
# from least significant to most significant.
s = [11, 10, 3, 8, 4, 6]
# Scrambling constants: enc() XORs the av id with `xor` and then adds `add`;
# dec() subtracts `add` and XORs with `xor` to undo it.
xor = 177451812
add = 8728348608
def dec(x):
    """Convert a BV id string into its numeric av id.

    The characters of ``x`` found at the positions listed in ``s`` are
    mapped through ``tr`` and combined as base-58 digits, least significant
    first. ``add`` is then subtracted and the result XORed with ``xor``.
    """
    total = sum(tr[x[pos]] * 58 ** power for power, pos in enumerate(s))
    return (total - add) ^ xor
def enc(x):
    """Convert a numeric av id into its BV id string.

    ``x`` is XORed with ``xor`` and offset by ``add``; the result is split
    into six base-58 digits (least significant first) which are written
    into the template at the positions listed in ``s``.
    """
    x = (x ^ xor) + add
    # The template must be 12 characters long because `s` writes to indices
    # 10 and 11. The previous 10-character literal raised IndexError on the
    # first digit. It is assembled from pieces so that whitespace collapsing
    # cannot silently shorten it again.
    r = list('BV1' + ' ' * 2 + '4' + ' ' + '1' + ' ' + '7' + ' ' * 2)
    for power, pos in enumerate(s):
        r[pos] = table[x // 58 ** power % 58]
    return ''.join(r)
dec() 将 BV 号转换为 av 号,enc() 将 av 号转换为 BV 号。
核心模块
# Endpoint requested by reply(); the per-page query parameters are passed
# separately on each request.
url = 'https://api.bilibili.com/x/v2/reply/main?callback'
# Headers sent with every request; only a desktop-Chrome user-agent is set.
head = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
}
def reply(oid, num):
    """Download every comment page for ``oid`` into ``./{num}_reply.csv``.

    Pages are requested one after another, starting at page 1. Crawling
    stops when the API returns no replies, or when a page's content is
    identical to the previous page's. Each comment becomes one line of the
    form ``user ,,, message ,,, level``.

    Args:
        oid: Identifier forwarded to ``get_param`` as ``oid``.
        num: Value used to build the output file name.
    """
    next_page = 1
    prev_digest = None
    # 'w' truncates on open, which is what 'a+' followed by truncate(0) did.
    with open('./{}_reply.csv'.format(num), 'w', encoding='utf-8') as f:
        while True:
            resp = requests.get(url, params=get_param(next_page, oid=oid), headers=head)
            payload = resp.json()
            replies = payload['data']['replies']
            # None and an empty list both mean there is nothing left.
            if not replies:
                print('评论全部爬取完毕')
                break

            lines = []
            for item in replies:
                # Flatten line breaks and swap straight quotes so that each
                # comment stays on a single output line.
                message = (
                    item['content']['message']
                    .replace('\n', ' ')
                    .replace('\r', ' ')
                    .replace('"', '”')
                    .strip()
                )
                user = item['member']['uname'].strip()
                level = item['member']['level_info']['current_level']
                lines.append('{} ,,, {} ,,, {}\n'.format(user, message, level))
            page_text = ''.join(lines)

            # A page identical to the previous one is treated as the end of
            # the comment list. The original printed the "finished" message
            # here but kept looping; now it actually stops.
            digest = hashlib.md5(page_text.encode('utf-8')).hexdigest()
            if digest == prev_digest:
                print('评论全部爬取完毕')
                break
            prev_digest = digest

            f.write(page_text)
            print('第{}页评论爬取完毕'.format(next_page))
            next_page += 1
            # Throttle requests to reduce the risk of an IP ban.
            time.sleep(0.5)
使用 md5 校验相邻两页的内容是否重复(经测试不加好像也行)。之所以采用这种方法,是因为个人测试时接口返回的 is_end 不会变为 true。
注意:由于我没有稳定的代理来源,所以为了避免被封ip,我设置了0.5s的延迟,并且没有使用异步,多线程等,导致效率偏低,有稳定代理的可以自己实现多线程
数据分析模块
from collections import Counter

import numpy as np
import xlwt
def user(data, _name):
    """Tally comments per user and export the ranking to an .xls file.

    Column 0 of each row is used as the user id and column 2 as the user
    level; the level kept for a user is the one from the first row in which
    that user appears. The sheet is sorted by comment count, descending,
    and saved to ``./{_name}_com_user.xls``.
    """
    stats = {}
    for row in data:
        # entry is [comment count, level]; created on first sight of the user.
        entry = stats.setdefault(row[0], [0, row[2]])
        entry[0] += 1

    ranked = sorted(stats.items(), key=lambda item: item[1][0], reverse=True)

    book = xlwt.Workbook()
    sheet = book.add_sheet('用户发言次数统计')
    for col, title in enumerate(('用户id', '用户等级', '用户发言次数')):
        sheet.write(0, col, title)
    for row_no, (name, (count, level)) in enumerate(ranked, start=1):
        sheet.write(row_no, 0, name)
        sheet.write(row_no, 1, int(level))
        sheet.write(row_no, 2, int(count))

    book.save('./{}_com_user.xls'.format(_name))
def comment(data, _name):
    """Count identical comment texts and export the ranking to an .xls file.

    Column 1 of each row is used as the comment text. The sheet lists each
    distinct text with its number of occurrences, most frequent first, and
    is saved to ``./{_name}_comment.xls``.
    """
    # most_common() sorts by descending count and keeps first-seen order
    # among ties, the same ordering the manual dict + sorted() produced.
    ranking = Counter(row[1] for row in data).most_common()

    work = xlwt.Workbook()
    sheet = work.add_sheet('评论统计')
    sheet.write(0, 0, '评论详情')
    sheet.write(0, 1, '出现次数')
    for row_no, (text, count) in enumerate(ranking, start=1):
        sheet.write(row_no, 0, text)
        sheet.write(row_no, 1, count)
    work.save('./{}_comment.xls'.format(_name))
两个函数分别统计相同ID的发言次数(顺便保存了等级)和相同评论在评论区的出现次数,降序排序后输出到xls文件中
以下为效果:
具体数据放不出
写在最后:
别问我为什么文字解释这么少,现在csdn已经不准发pa比站 评论的文章了,链接也不能带,我删了一堆有关的文字才发出来
本文来自博客园,作者:Hello418,转载请注明原文链接:https://www.cnblogs.com/janitor/p/16390831.html