Bilibili Comment Crawler in Practice: From Data Collection to Sentiment Analysis
1. Bilibili Comment Data Structure
1.1 API Endpoint Analysis
Bilibili's comment system exposes a RESTful API. The primary endpoint:
https://api.bilibili.com/x/v2/reply?type=1&oid={video_aid}&pn={page}
Core parameters:
- type: comment area type (1 = video, 11 = article, 17 = dynamic post)
- oid: object ID (the video's aid for videos, dynamic_id for dynamic posts)
- pn: page number (starting from 1)
- ps: page size (default 20, maximum 49)
- sort: sort order (0 = by time, 2 = by popularity)
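As a quick smoke test, the endpoint can be queried directly with requests; the aid below is only an illustrative placeholder, and production requests additionally need the headers and Wbi signing covered in Part 2:

import requests

params = {'type': 1, 'oid': 170001, 'pn': 1, 'ps': 20, 'sort': 2}
resp = requests.get('https://api.bilibili.com/x/v2/reply', params=params,
                    headers={'User-Agent': 'Mozilla/5.0'})
data = resp.json()
print(data['code'])  # 0 on success
if data['code'] == 0:
    print(len(data['data']['replies'] or []))  # comments on this page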
1.2 Response Data Structure
{
  "code": 0,
  "data": {
    "page": {
      "num": 1,
      "size": 20,
      "count": 2500,
      "acount": 2500
    },
    "replies": [
      {
        "rpid": 123456789,      // comment ID
        "oid": 987654321,       // video aid
        "type": 1,
        "mid": 111222333,       // user ID
        "root": 0,              // root comment ID (0 = top-level comment)
        "parent": 0,            // parent comment ID
        "dialog": 0,            // dialog ID
        "count": 15,            // reply count
        "rcount": 15,           // actual (visible) reply count
        "like": 520,            // like count
        "ctime": 1634567890,    // publish timestamp
        "content": {
          "message": "comment text",
          "emote": {            // emote data
            "[doge]": {
              "id": 26,
              "url": "https://..."
            }
          }
        },
        "member": {
          "uname": "username",
          "avatar": "avatar URL",
          "level_info": {
            "current_level": 6  // user level
          },
          "vip": {
            "vipStatus": 1,
            "vipType": 2
          }
        },
        "replies": [...]        // second-level comments (at most 3 shown inline)
      }
    ]
  }
}
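Since each reply object inlines its first few second-level comments under the "replies" key, a small walker is enough to flatten a page into plain comment dicts. A minimal sketch, using the field names from the JSON above:

def iter_replies(replies):
    """Yield every reply dict on a page, including inlined second-level replies."""
    for reply in replies or []:  # "replies" can be null on empty pages
        yield reply
        for sub in reply.get('replies') or []:
            yield sub

# Usage: texts = [r['content']['message'] for r in iter_replies(data['data']['replies'])]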
1.3 Fetching Second-Level Comments
Second-level (nested) comments require a separate request:
https://api.bilibili.com/x/v2/reply/reply?type=1&oid={video_aid}&root={root_rpid}&pn={page}
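A minimal paging sketch for this endpoint, assuming the headers dict from section 2.1 and an imported requests; oid and root_rpid come from the parent listing:

def fetch_replies(oid, root_rpid, pn=1, ps=20):
    """Fetch one page of second-level comments under a root comment."""
    url = 'https://api.bilibili.com/x/v2/reply/reply'
    params = {'type': 1, 'oid': oid, 'root': root_rpid, 'pn': pn, 'ps': ps}
    data = requests.get(url, params=params, headers=headers).json()
    return (data['data']['replies'] or []) if data['code'] == 0 else []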
2. Technical Challenges and Solutions
2.1 Anti-Crawling Mechanisms
Challenge 1: Wbi signature verification
Since 2023, Bilibili has enforced a Wbi signature on some endpoints. Signing requires:
- Fetching img_key and sub_key (from the user-info/nav endpoint)
- Shuffling the concatenated keys with a fixed permutation table
- Computing an MD5 signature over the sorted, url-encoded parameters
import hashlib
import time
import urllib.parse
from functools import reduce

import requests

def get_wbi_keys():
    """Fetch the latest img_key and sub_key."""
    headers = {'User-Agent': 'Mozilla/5.0 ...'}
    url = 'https://api.bilibili.com/x/web-interface/nav'
    resp = requests.get(url, headers=headers).json()
    img_url = resp['data']['wbi_img']['img_url']
    sub_url = resp['data']['wbi_img']['sub_url']
    img_key = img_url.split('/')[-1].split('.')[0]
    sub_key = sub_url.split('/')[-1].split('.')[0]
    return img_key, sub_key
def gen_wbi_sign(params, img_key, sub_key):
    """Generate the Wbi signature and return the signed query string."""
    mixin_key_enc_tab = [
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49,
        33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40,
        61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11,
        36, 20, 34, 44, 52
    ]
    # Shuffle the concatenated keys and keep the first 32 characters
    orig = img_key + sub_key
    mixin_key = reduce(lambda s, i: s + orig[i], mixin_key_enc_tab, '')[:32]
    # Add the timestamp, sort params by key, and url-encode
    params['wts'] = int(time.time())
    sorted_params = sorted(params.items())
    query = urllib.parse.urlencode(sorted_params)
    sign = hashlib.md5((query + mixin_key).encode()).hexdigest()
    return query + '&w_rid=' + sign
Challenge 2: Cookie and User-Agent checks
Required request headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.bilibili.com',
    'Origin': 'https://www.bilibili.com',
    'Cookie': 'buvid3=...; SESSDATA=...'  # obtained after logging in
}
2.2 Rate Limiting and Concurrency Control
Problem: a burst of requests quickly triggers HTTP 429 errors or an IP ban.
Solution:
import asyncio
import aiohttp
from asyncio import Semaphore

class BilibiliCrawler:
    def __init__(self, max_concurrent=5, delay=1.0):
        self.semaphore = Semaphore(max_concurrent)  # cap on in-flight requests
        self.delay = delay  # fixed pause before each request, in seconds

    async def fetch_with_limit(self, session, url):
        # Acquire a semaphore slot, wait out the delay, then fire the request;
        # headers is the module-level dict defined above
        async with self.semaphore:
            await asyncio.sleep(self.delay)
            async with session.get(url, headers=headers) as resp:
                return await resp.json()

    async def crawl_comments(self, oid, max_pages=10):
        async with aiohttp.ClientSession() as session:
            tasks = []
            for pn in range(1, max_pages + 1):
                url = f'https://api.bilibili.com/x/v2/reply?type=1&oid={oid}&pn={pn}'
                tasks.append(self.fetch_with_limit(session, url))
            # return_exceptions=True keeps one failed page from killing the batch
            results = await asyncio.gather(*tasks, return_exceptions=True)
            return results
2.3 Ensuring Data Completeness
Challenge: the reported comment total rarely matches the number of comments that can actually be fetched.
The Bilibili API imposes these limits:
- At most roughly the first 5000 top-level comments per video are returned
- Second-level comments are only partially inlined and need extra requests
- Some comments are folded or deleted
Strategy (a merge sketch follows this list):
- Crawl twice, once sorted by time and once by popularity
- Deduplicate when merging the two passes
- Record crawl progress so interrupted runs can resume
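A minimal merge-and-dedup sketch, assuming each pass yields a list of raw reply dicts as in section 1.2:

def merge_passes(time_sorted, hot_sorted):
    """Merge two crawl passes, deduplicating by rpid."""
    seen, merged = set(), []
    for comment in time_sorted + hot_sorted:
        if comment['rpid'] not in seen:
            seen.add(comment['rpid'])
            merged.append(comment)
    return merged

The UNIQUE(rpid) constraint in the storage layer below enforces the same guarantee at the database level.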
import sqlite3

class CommentStorage:
    def __init__(self, db_path='bilibili.db'):
        self.conn = sqlite3.connect(db_path)
        self.create_table()

    def create_table(self):
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS comments (
                rpid INTEGER PRIMARY KEY,
                oid INTEGER,
                content TEXT,
                like_count INTEGER,
                reply_count INTEGER,
                ctime INTEGER,
                mid INTEGER,
                uname TEXT,
                user_level INTEGER,
                is_vip INTEGER,
                UNIQUE(rpid)
            )
        ''')
        self.conn.commit()

    def save_comment(self, comment_data):
        try:
            self.conn.execute('''
                INSERT OR IGNORE INTO comments VALUES (?,?,?,?,?,?,?,?,?,?)
            ''', comment_data)
            self.conn.commit()
        except Exception as e:
            print(f"Save failed: {e}")
2.4 Cursor-Based Comment Loading
Problem: comments on some popular videos are loaded dynamically and paged with a cursor instead of page numbers.
The newer endpoint uses cursor mode:
def crawl_with_cursor(oid, cursor=None):
    # img_key and sub_key come from get_wbi_keys() above
    url = 'https://api.bilibili.com/x/v2/reply/main'
    params = {
        'type': 1,
        'oid': oid,
        'mode': 3,  # sort by popularity
        'next': cursor if cursor else 0
    }
    # Attach the Wbi signature
    signed_params = gen_wbi_sign(params, img_key, sub_key)
    response = requests.get(url + '?' + signed_params, headers=headers)
    return response.json()
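To walk every page, feed the cursor returned by each response into the next request. A sketch, assuming the response exposes data.cursor.next and data.cursor.is_end (verify the field names against a live response):

def crawl_all_with_cursor(oid, max_pages=100):
    """Page through the cursor-mode endpoint until is_end or the page cap."""
    all_replies, cursor = [], None
    for _ in range(max_pages):
        data = crawl_with_cursor(oid, cursor)
        if data.get('code') != 0:
            break
        payload = data['data']
        all_replies.extend(payload.get('replies') or [])
        cursor_info = payload.get('cursor', {})
        if cursor_info.get('is_end'):
            break
        cursor = cursor_info.get('next')
        time.sleep(1)  # stay polite between pages
    return all_replies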
3. Sentiment Analysis in Practice
3.1 Data Preprocessing
import re
import jieba
class CommentPreprocessor:
    def __init__(self):
        # Load stopwords
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            self.stopwords = set(f.read().splitlines())
        # Pattern for Bilibili's bracketed emotes, e.g. [doge]
        self.emote_pattern = re.compile(r'\[.*?\]')

    def clean_text(self, text):
        # Strip emotes
        text = self.emote_pattern.sub('', text)
        # Strip URLs
        text = re.sub(r'http[s]?://\S+', '', text)
        # Strip @user mentions
        text = re.sub(r'@\S+', '', text)
        # Strip remaining punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)
        return text.strip()

    def tokenize(self, text):
        cleaned = self.clean_text(text)
        words = jieba.cut(cleaned)
        return [w for w in words if w not in self.stopwords and len(w) > 1]
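For example (assuming a stopwords.txt is in place), a raw comment is reduced to content words before scoring:

pre = CommentPreprocessor()
raw = '[doge]这个视频真的非常棒 @某用户 https://b23.tv/xxxx'
print(pre.clean_text(raw))  # the emote, the mention, and the URL are stripped
print(pre.tokenize(raw))    # e.g. ['视频', '非常'], depending on the stopword list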
3.2 Sentiment Analysis Models
Approach 1: lexicon-based sentiment analysis
class SentimentAnalyzer:
    def __init__(self):
        self.load_lexicon()

    def load_lexicon(self):
        """Load the sentiment lexicons."""
        with open('positive.txt', encoding='utf-8') as f:
            self.positive_words = set(line.strip() for line in f)
        with open('negative.txt', encoding='utf-8') as f:
            self.negative_words = set(line.strip() for line in f)
        self.degree_words = {
            '非常': 2.0, '特别': 2.0, '十分': 1.8,
            '很': 1.5, '比较': 1.2, '稍微': 0.8
        }
        self.negation_words = {'不', '没', '无', '非', '莫'}

    def analyze(self, words):
        score = 0
        i = 0
        while i < len(words):
            word = words[i]
            degree = 1.0
            negation = 1
            # Check for a degree adverb right before this word
            if i > 0 and words[i-1] in self.degree_words:
                degree = self.degree_words[words[i-1]]
            # Check for a negation word right before this word
            if i > 0 and words[i-1] in self.negation_words:
                negation = -1
            # Accumulate the sentiment score
            if word in self.positive_words:
                score += degree * negation
            elif word in self.negative_words:
                score -= degree * negation
            i += 1
        return 1 if score > 0 else (-1 if score < 0 else 0)
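On tokenized input the scorer behaves as expected for simple degree and negation patterns (assuming the lexicon files contain the obvious entries, e.g. 喜欢 as a positive word):

analyzer = SentimentAnalyzer()
print(analyzer.analyze(['非常', '喜欢']))  # 1: the degree adverb boosts a positive word
print(analyzer.analyze(['不', '喜欢']))    # -1: the negation flips a positive word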
Approach 2: pretrained-model-based sentiment analysis
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
class BertSentiment:
    def __init__(self, model_name='uer/roberta-base-finetuned-chinanews-chinese'):
        # Note: this default checkpoint is a news classifier; substitute a
        # three-class Chinese sentiment checkpoint and verify that its label
        # order matches sentiment_map in predict()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def predict(self, text):
        inputs = self.tokenizer(text, return_tensors='pt',
                                truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            label = torch.argmax(probs, dim=1).item()
        # 0=negative, 1=neutral, 2=positive
        sentiment_map = {0: -1, 1: 0, 2: 1}
        return sentiment_map[label], probs[0][label].item()
3.3 Multi-Dimensional Sentiment Analysis
import numpy as np
from collections import Counter
class AdvancedAnalyzer:
    def __init__(self):
        self.sentiment_model = BertSentiment()
        self.preprocessor = CommentPreprocessor()

    def analyze_video_comments(self, comments):
        """Aggregate sentiment and word statistics over a video's comments."""
        results = {
            'total': len(comments),
            'sentiments': {'positive': 0, 'neutral': 0, 'negative': 0},
            'avg_sentiment_score': 0,
            'hot_words': [],
            'time_trend': [],       # left as placeholders for further analysis
            'user_engagement': {}
        }
        all_words = []
        sentiment_scores = []
        for comment in comments:
            # Per-comment sentiment
            text = comment['content']['message']
            sentiment, score = self.sentiment_model.predict(text)
            if sentiment == 1:
                results['sentiments']['positive'] += 1
            elif sentiment == -1:
                results['sentiments']['negative'] += 1
            else:
                results['sentiments']['neutral'] += 1
            sentiment_scores.append(sentiment * score)
            # Word-frequency statistics
            words = self.preprocessor.tokenize(text)
            all_words.extend(words)
        # Mean signed sentiment score
        if sentiment_scores:
            results['avg_sentiment_score'] = float(np.mean(sentiment_scores))
        # Top frequent words
        word_freq = Counter(all_words)
        results['hot_words'] = word_freq.most_common(20)
        return results
4. Practical Application Scenarios
4.1 Opinion Monitoring
class VideoMonitor:
    def __init__(self, video_ids):
        self.video_ids = video_ids
        self.analyzer = AdvancedAnalyzer()

    def monitor(self):
        """Monitor comment sentiment across a set of videos."""
        for vid in self.video_ids:
            # crawl_latest_comments is assumed to wrap the crawler from Part 2
            comments = self.crawl_latest_comments(vid)
            analysis = self.analyzer.analyze_video_comments(comments)
            # Alert when negative comments dominate
            if analysis['total'] and \
               analysis['sentiments']['negative'] / analysis['total'] > 0.4:
                self.send_alert(vid, analysis)

    def send_alert(self, vid, data):
        print(f"⚠️ Video {vid} has an unusually high share of negative comments: {data}")
4.2 Hot Topic Mining
from collections import Counter

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class TopicExtractor:
    def extract_topics(self, comments, n_topics=5):
        """Extract hot topics from a comment set."""
        preprocessor = CommentPreprocessor()
        texts = [' '.join(preprocessor.tokenize(c['content']['message']))
                 for c in comments]
        # TF-IDF feature extraction
        vectorizer = TfidfVectorizer(max_features=200)
        features = vectorizer.fit_transform(texts)
        # KMeans clustering
        kmeans = KMeans(n_clusters=n_topics, random_state=42)
        labels = kmeans.fit_predict(features)
        # Keywords per cluster
        topics = []
        for i in range(n_topics):
            indices = np.where(labels == i)[0]
            topic_words = Counter()
            for idx in indices:
                topic_words.update(texts[idx].split())
            topics.append({
                'topic_id': i,
                'keywords': topic_words.most_common(10),
                'comment_count': len(indices)
            })
        return topics
4.3 User Profiling
class UserProfiler:
    def __init__(self):
        self.sentiment_model = BertSentiment()  # used for per-user sentiment below

    def analyze_active_users(self, comments):
        """Profile the most active commenters."""
        user_stats = {}
        for comment in comments:
            mid = comment['mid']
            if mid not in user_stats:
                user_stats[mid] = {
                    'username': comment['member']['uname'],
                    'level': comment['member']['level_info']['current_level'],
                    'is_vip': comment['member']['vip']['vipStatus'],
                    'comment_count': 0,
                    'total_likes': 0,
                    'avg_sentiment': []
                }
            user_stats[mid]['comment_count'] += 1
            user_stats[mid]['total_likes'] += comment['like']
            # Per-user sentiment tendency
            sentiment, _ = self.sentiment_model.predict(
                comment['content']['message']
            )
            user_stats[mid]['avg_sentiment'].append(sentiment)
        # Top 10 most active users
        top_users = sorted(user_stats.items(),
                           key=lambda x: x[1]['comment_count'],
                           reverse=True)[:10]
        return top_users
4.4 Data Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

class CommentVisualizer:
    def plot_sentiment_distribution(self, analysis_result):
        """Plot the sentiment distribution as a pie chart."""
        sentiments = analysis_result['sentiments']
        labels = ['Positive', 'Neutral', 'Negative']
        sizes = [sentiments['positive'], sentiments['neutral'], sentiments['negative']]
        colors = ['#66c2a5', '#8da0cb', '#fc8d62']
        plt.figure(figsize=(8, 6))
        plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
        plt.title('Comment Sentiment Distribution')
        plt.savefig('sentiment_distribution.png', dpi=300, bbox_inches='tight')

    def generate_wordcloud(self, hot_words):
        """Render the hot words as a word cloud."""
        word_freq = dict(hot_words)
        wc = WordCloud(
            font_path='simhei.ttf',  # a CJK font is required for Chinese words
            width=1200,
            height=600,
            background_color='white',
            max_words=100
        ).generate_from_frequencies(word_freq)
        plt.figure(figsize=(12, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
5. Complete Example
import asyncio

async def main():
    # Initialize the crawler, storage, and analyzer
    crawler = BilibiliCrawler(max_concurrent=5, delay=1.5)
    storage = CommentStorage()
    analyzer = AdvancedAnalyzer()
    # Crawl comments
    video_id = 'BV1xx411c7mD'  # sample video
    oid = bv_to_aid(video_id)  # the BV id must be converted to an aid first
    print(f"Crawling comments for video {video_id} ...")
    results = await crawler.crawl_comments(oid, max_pages=50)
    # Persist the data
    all_comments = []
    for result in results:
        if isinstance(result, dict) and result.get('code') == 0:
            comments = result['data']['replies'] or []  # replies can be null
            for comment in comments:
                storage.save_comment(extract_comment_data(comment))
                all_comments.append(comment)
    print(f"Fetched {len(all_comments)} comments in total")
    if not all_comments:
        print("No comments fetched; check the cookies and Wbi signing.")
        return
    # Sentiment analysis
    print("Running sentiment analysis...")
    analysis = analyzer.analyze_video_comments(all_comments)
    print("\n=== Analysis Results ===")
    print(f"Total comments: {analysis['total']}")
    print(f"Positive: {analysis['sentiments']['positive']} "
          f"({analysis['sentiments']['positive']/analysis['total']*100:.1f}%)")
    print(f"Neutral: {analysis['sentiments']['neutral']} "
          f"({analysis['sentiments']['neutral']/analysis['total']*100:.1f}%)")
    print(f"Negative: {analysis['sentiments']['negative']} "
          f"({analysis['sentiments']['negative']/analysis['total']*100:.1f}%)")
    print(f"\nAverage sentiment score: {analysis['avg_sentiment_score']:.3f}")
    print("\nTop 10 frequent words:")
    for word, count in analysis['hot_words'][:10]:
        print(f"  {word}: {count}")

if __name__ == '__main__':
    asyncio.run(main())
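main() relies on two helpers the article never defines: bv_to_aid and extract_comment_data. A minimal sketch of both, assuming the public video-info endpoint (which resolves a bvid to its aid) and the comments table schema from section 2.3:

def bv_to_aid(bvid):
    """Resolve a BV id to its numeric aid via the video-info endpoint."""
    resp = requests.get('https://api.bilibili.com/x/web-interface/view',
                        params={'bvid': bvid}, headers=headers)
    return resp.json()['data']['aid']

def extract_comment_data(comment):
    """Flatten one reply dict into the 10-column tuple CommentStorage expects."""
    member = comment['member']
    return (comment['rpid'], comment['oid'], comment['content']['message'],
            comment['like'], comment['rcount'], comment['ctime'],
            comment['mid'], member['uname'],
            member['level_info']['current_level'], member['vip']['vipStatus'])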
6. Caveats and Best Practices
Compliance requirements
- Respect Bilibili's robots.txt
- Throttle the crawl to avoid putting pressure on the servers
- Do not use the data commercially or in ways that infringe user privacy
Performance optimization
- Use asynchronous requests for throughput
- Crawl incrementally to avoid re-fetching known comments
- Set sensible request timeouts and a retry policy (a sketch follows below)
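A minimal timeout-and-retry wrapper around the async fetcher from section 2.2; the backoff schedule and retry count are arbitrary starting points:

async def fetch_with_retry(session, url, retries=3, timeout=10):
    """GET a URL with a per-attempt timeout and exponential backoff."""
    for attempt in range(retries):
        try:
            async with session.get(url, headers=headers,
                                   timeout=aiohttp.ClientTimeout(total=timeout)) as resp:
                if resp.status == 429:
                    resp.raise_for_status()  # rate limited: treat as retryable
                return await resp.json()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            if attempt == retries - 1:
                raise
            await asyncio.sleep(2 ** attempt)  # back off: 1s, 2s, 4s, ...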
Data quality
- Filter out spam and bot comments
- Normalize emotes and special characters
- Deduplicate repeated comments
Long-term maintenance
- The API endpoints may change; review them periodically
- Anti-crawling measures may be upgraded; keep monitoring
- Refresh the sentiment lexicons and models regularly
Summary
A Bilibili comment crawler spans API analysis, anti-crawling countermeasures, data processing, and sentiment analysis. With a sound architecture and sensible algorithm choices, it can collect data efficiently and analyze it in depth, supporting content operations, opinion monitoring, and similar scenarios.
In practice, balance the engineering against compliance requirements, and keep the crawler stable and maintainable.