解析抖音评论采集器|爬虫|c#
一、系统概述
本系统为视频评论区采集的网络版架构,包含客户端与服务端两部分。
单机版与网络版的核心采集逻辑一致,但本文仅讨论网络版的设计与实现。
网络版的核心目标是:
支持多用户任务提交
服务端自动化执行采集流程
稳定、可控、可监控的评论抓取能力
支持关键词、博主、单视频三类任务模型
二、系统结构
1.客户端(BS架构)
客户端主要提供可视化操作界面,供用户提交采集任务并查看采集结果。
核心作用包括:
任务创建
任务管理
评论数据查询(命中/未命中、按地区筛选)
任务状态监控
界面包含关键词任务、博主任务、单视频任务的创建与查询模块。


2.服务端
服务端部署于云服务器,负责执行所有采集任务,包括:
- 接收客户端提交的任务
- 自动化执行采集流程
- 监控任务状态
- 存储采集结果
- 处理异常与重试机制
服务端是整个系统的核心执行层,承担所有与目标平台交互的逻辑。
三、客户端功能说明
客户端提供三类任务的创建与管理能力:
- 任务创建
关键词搜索任务
博主主页采集任务
单视频评论采集任务
- 任务管理
关键词任务管理
博主任务管理
单视频任务管理
- 评论数据查询
每类任务均支持:
- 命中评论查询(可按地区筛选)
- 未命中评论查询(可按地区筛选)
四、服务端采集流程(以关键词任务为例)
关键词任务是系统中最复杂、最具代表性的流程,因此以此作为说明。
1.通用技术组件
服务端采集流程依赖以下核心技术:
(1)字符串解析函数
用于从 HTML 文本中提取节点、属性、文本内容等。
(2)正则表达式
用于从复杂结构中提取视频 ID、评论节点、标签内容等关键字段。
(3)JavaScript 自动化脚本
用于驱动页面完成以下自动化操作:
- 下拉页面
- 点击筛选按钮
- 展开评论
- 刷新视频列表
- 触发动态加载
这是整个系统中最关键的自动化能力。
五、关键词搜索采集流程(核心)
1.构造搜索 URL
根据任务关键词拼接目标平台的搜索 URL,通过 GET 或内置浏览器打开页面。
2.时间范围筛选
若任务要求按时间范围筛选(如一天内、一周内、半年内、不限),则执行以下步骤:
- 通过 JavaScript 定位筛选节点
- 自动触发点击事件
- 等待页面刷新完成
筛选逻辑完全自动化,无需人工干预。
3.页面下拉与刷新
为了确保获取到所有视频,需要执行循环下拉:
- 执行 JavaScript 将页面滚动到底部
- 检测页面是否出现“暂时没有更多了”
- 若未出现,则继续下拉
- 若出现,则停止下拉
此步骤确保视频列表完整加载。
4.视频 ID 提取
页面加载完成后,开始解析视频列表:
- 先定位视频主节点(每个视频的 HTML 块)
- 使用字符串函数切割节点
- 使用正则表达式提取视频 ID
此阶段只提取视频 ID,不采集视频标题或作者信息,这些信息在评论采集阶段获取。
5.评论采集
对每个视频 ID 执行评论抓取流程:
- 拼接评论接口或评论页面 URL
- 自动化点击评论区域
- 循环下拉评论列表
- 检测评论加载完成标识
- 使用字符串函数与正则表达式解析评论节点
- 提取评论内容、作者、时间、地区等字段
评论采集是整个任务的最终目标,也是数据价值所在。
六、代码
(这里只给出自动操作代码和节点获取代码)
6.1:搜索视频筛选视频条件代码
// Time-range filtering is available; the filter is operated via mouse hover.
// shaixuan_xuanting is declared outside this snippet; it is set to 1 here.
shaixuan_xuanting = 1;

// JavaScript that dispatches a bubbling 'mouseover' event on the '.jjU9T0dQ' element.
// NOTE(review): this snippet only builds the script text; nothing visible here
// executes it — confirm it is run elsewhere.
string script = string.Concat(
    "var element = document.querySelector('.jjU9T0dQ');",
    "var event = new MouseEvent('mouseover', { bubbles: true });",
    "element.dispatchEvent(event);");

// Click the filter option selected in t_shaixuan, then wait 3 seconds for the
// result list to refresh. Any other text leaves the filter untouched.
switch (t_shaixuan.Text.Trim())
{
    case "一天内":
        shaixuan_dianji_yitiannei();
        Thread.Sleep(3000);
        break;
    case "一周内":
        shaixuan_dianji_yizhou();
        Thread.Sleep(3000);
        break;
    case "半年内":
        shaixuan_dianji_bannian();
        Thread.Sleep(3000);
        break;
}
/// <summary>Clicks the "一天内" (within one day) option of the search time filter.</summary>
private void shaixuan_dianji_yitiannei()
{
    shaixuan_dianji_by_text("一天内");
}

/// <summary>Clicks the "一周内" (within one week) option of the search time filter.</summary>
private void shaixuan_dianji_yizhou()
{
    shaixuan_dianji_by_text("一周内");
}

/// <summary>Clicks the "半年内" (within half a year) option of the search time filter.</summary>
private void shaixuan_dianji_bannian()
{
    shaixuan_dianji_by_text("半年内");
}

/// <summary>
/// Runs a script in the page that looks for the first '.eXMmo3JR' element whose
/// trimmed text equals <paramref name="label"/> and clicks it; when no such
/// element exists the script only logs 'Element not found' to the browser console.
/// </summary>
/// <param name="label">
/// Visible text of the filter option. It is spliced into a single-quoted
/// JavaScript string literal, so it must not contain a single quote.
/// </param>
private void shaixuan_dianji_by_text(string label)
{
    string script1 = @"
var latestRelease = Array.from(document.querySelectorAll('.eXMmo3JR')).find(el => el.textContent.trim() === '" + label + @"');
if (latestRelease) {
latestRelease.click();
} else {
console.log('Element not found');
}
";
    // Fire-and-forget: the returned task is not awaited, matching the callers,
    // which simply sleep after triggering the click.
    chromeBrowser.EvaluateScriptAsync(script1);
}
6.2:视频下拉刷新
// Script that scrolls the window to the bottom of the document; intended to be run repeatedly in a loop (pull-down refresh).
string script = "window.scrollTo(0, document.body.scrollHeight);";
6.3:循环获取搜索到的视频ID
// Take the current page source of the search-result page.
// NOTE(review): .Result blocks the calling thread until the source is available.
string htmlContent = chromeBrowser.GetSourceAsync().Result;
// jieshu is declared outside this snippet; it receives a copy of the page source.
jieshu = htmlContent;

// One <li> per search-result card. The class names are generated by the site and
// change over time; previously used values:
//   "HN50D2ec Z3LKqldT"
//   "MgWTwktU B9KMVC9A"
//   "MgWTwktU search-result-card B9KMVC9A"
// NOTE(review): without RegexOptions.Singleline, '.' does not match a newline, so a
// card whose markup spans several lines is not matched — confirm the source is single-line.
string pattern = "<li class=\"SwZLHMKk SEbmeLLH\">(.*?)</li>";
// The numeric video id is taken from the first "/video/<digits>" link inside the card.
string pattern1 = @"/video/(\d+)";

MatchCollection matches = Regex.Matches(htmlContent, pattern);
foreach (Match match in matches)
{
    Match match1 = Regex.Match(match.Groups[1].Value, pattern1);
    if (match1.Success)
    {
        string id = match1.Groups[1].Value;
        // Custom handling goes here, e.g. store the id in a database.
    }
}
6.4:评论提取中的视频信息提取
/// <summary>
/// Extracts the video title from a page HTML fragment (Douyin-specific markup).
/// </summary>
/// <param name="html">HTML text to search; null or empty yields an empty result.</param>
/// <returns>
/// The content of the first matching title span with every '/' character and all
/// remaining markup removed, or an empty string when nothing matches or an error occurs.
/// </returns>
public string title_ceng(string html)
{
    string title_b = "";
    if (string.IsNullOrEmpty(html))
    {
        // Nothing to search; avoids using an exception for this expected case.
        return title_b;
    }

    try
    {
        // biaoqian.title_dat supplies the class attribute of the title span
        // (presumably including its surrounding quotes — verify against the configuration).
        // Class values used previously: "arnSiSbK hT34TYMB ONzzdL2F", "cx3p4vL2 NhsqQqNv BWrTO7Je".
        Regex regex = new Regex(@"<span class=" + biaoqian.title_dat + ">(.*?)</span>", RegexOptions.IgnoreCase);
        Match match = regex.Match(html);
        if (match.Success)
        {
            // Group 1 is the content between the opening span and the first </span>.
            title_b = match.Groups[1].Value;
            title_b = title_b.Replace("<span>", "");
            // Note: this removes every '/', including any that belong to the title text.
            title_b = title_b.Replace("/", "");
            // Drop whatever tags are still left.
            title_b = Regex.Replace(title_b, @"<[^>]*>", string.Empty);
        }
    }
    catch (Exception ex)
    {
        // Extraction stays best-effort, but the failure is no longer silent.
        System.Diagnostics.Debug.WriteLine("title_ceng failed: " + ex);
    }
    return title_b;
}
/// <summary>
/// Tries to extract the video publish time from page HTML and normalise it to a date string.
/// Relative expressions (days / hours / minutes / weeks ago) are converted against the current clock.
/// </summary>
/// <param name="html">HTML text to search.</param>
/// <returns>
/// Whatever value shipin_dates holds when the method ends: any exception is swallowed by the
/// catch block, so a partially processed value (or an empty string) can be returned.
/// </returns>
public string shipin_shijian_ceng(string html)
{
string shipin_dates = "";
string htmlContent = html;
try
{
// string title = ""; // get the title value (video name)
// Video date
// NOTE(review): "(.?)" captures at most one character; this looks like a truncated pattern
// (the opening tag and "(.*?)" were probably lost) — TODO confirm the intended regex.
Regex regex = new Regex(@"(.?)", RegexOptions.IgnoreCase);
// Regex regex = new Regex(@"<span class=""time"">(.?)", RegexOptions.IgnoreCase);
Match match = regex.Match(htmlContent);
if (match.Success)
{
// Take the first capture group.
shipin_dates = match.Groups[1].Value.Trim();
// Publish time:
// NOTE(review): String.Replace throws ArgumentException when the search string is empty,
// so as written control jumps to the catch block here — the original search text was
// probably lost; confirm.
shipin_dates = shipin_dates.Replace("", "");
shipin_dates = shipin_dates.Replace("/", "");
shipin_dates = shipin_dates.Replace("·", "");
// Turn "yyyy年M月d日" style text into "yyyy-M-d".
shipin_dates = shipin_dates.Replace("日", "");
shipin_dates = shipin_dates.Replace("年", "-");
shipin_dates = shipin_dates.Replace("月", "-");
string day = "";
// try
// {
Regex yearRegex = new Regex(@"\b\d{4}\b");
Regex dateRegex = new Regex(@"\b\d{1,2}-\d{1,2}\b");
// Check whether the string contains a four-digit year.
if (yearRegex.IsMatch(shipin_dates.Trim()))
{
// Console.WriteLine("input string contains year information");
}
else if (dateRegex.IsMatch(shipin_dates.Trim()))
{
// Console.WriteLine("input string has no year but contains a month-day date");
// NOTE(review): the year is hard-coded to 2024 — confirm whether the current year is intended.
shipin_dates = "2024-" + shipin_dates.Trim();
}
else
{
Console.WriteLine("输入字符串既没有年份信息,也不符合日期格式");
#region
//DateTime shipin_dates_y = Convert.ToDateTime(shipin_dates);
//if (shipin_dates_y.Year != 1)
//{
// shipin_dates = "2004-" + shipin_dates.Trim ();
// Console.WriteLine("this date variable contains a year.");
//}
//else
//{
// shipin_dates = "2004-" + shipin_dates.Trim ();
// //Console.WriteLine("this date variable does not contain a year.");
//}
#endregion
// }
// catch
// {
// Keep only the text before the '·' separator.
// NOTE(review): '·' was already removed above, so index1 looks like it is always -1 here — confirm.
char delimiter = '·';
int index1 = shipin_dates.IndexOf(delimiter);
if (index1 != -1)
{
string textBeforeDelimiter = shipin_dates.Substring(0, index1);
shipin_dates = textBeforeDelimiter;
Console.WriteLine("Text before delimiter: " + textBeforeDelimiter);
}
// The checks below are independent ifs: each one tests the value left by the previous one.
// "N天…" (N days): today minus N days.
if (shipin_dates.Contains("天"))
{
// pinglun_riqi_yuanshi = extraInfo;
int index = shipin_dates.IndexOf("天");
day = shipin_dates.Substring(0, index);
DateTime dt = DateTime.Now.Date.AddDays(-Convert.ToInt32(Convert.ToInt32(day)));
shipin_dates = dt.ToShortDateString();
}
// "N月…" (N months): today minus N months.
// NOTE(review): "月" and "年" in the extracted text were replaced with "-" above, so this
// branch and the "年" branch below look unreachable for the original text — confirm.
if (shipin_dates.Contains("月"))
{
//pinglun_riqi_yuanshi = extraInfo;
int index = shipin_dates.IndexOf("月");
day = shipin_dates.Substring(0, index);
DateTime dt = DateTime.Now.Date.AddMonths(-Convert.ToInt32(Convert.ToInt32(day)));
shipin_dates = dt.ToShortDateString();
}
// "N小时…" (N hours): subtracts N hours from today's midnight (DateTime.Now.Date), not from the current time.
if (shipin_dates.Contains("小时"))
{
// pinglun_riqi_yuanshi = extraInfo;
int index = shipin_dates.IndexOf("小时");
day = shipin_dates.Substring(0, index);
DateTime dt = DateTime.Now.Date.AddHours(-Convert.ToInt32(Convert.ToInt32(day)));
shipin_dates = dt.ToString();
}
// "N分钟…" (N minutes): subtracts N minutes from today's midnight (DateTime.Now.Date).
if (shipin_dates.Contains("分钟"))
{
//pinglun_riqi_yuanshi = extraInfo;
int index = shipin_dates.IndexOf("分钟");
day = shipin_dates.Substring(0, index);
DateTime dt = DateTime.Now.Date.AddMinutes(-Convert.ToInt32(Convert.ToInt32(day)));
shipin_dates = dt.ToString();
}
// "N周…" (N weeks): today minus N * 7 days.
if (shipin_dates.Contains("周"))
{
// pinglun_riqi_yuanshi = extraInfo;
int index = shipin_dates.IndexOf("周");
day = shipin_dates.Substring(0, index);
int week = (Convert.ToInt32(day) * 7);
DateTime dt = DateTime.Now.Date.AddDays(-Convert.ToInt32(week));
shipin_dates = dt.ToShortDateString();
}
// "N年…" (N years): now minus N years.
if (shipin_dates.Contains("年"))
{
// pinglun_riqi_yuanshi = extraInfo;
int index = shipin_dates.IndexOf("年");
day = shipin_dates.Substring(0, index);
DateTime dt = DateTime.Now.AddYears(-Convert.ToInt32(Convert.ToInt32(day)));
shipin_dates = dt.ToShortDateString();
}
// Compare the video time with the current time.
DateTime a = DateTime.Now; // current time
// NOTE(review): ParseExact requires exactly "yyyy-MM-dd"; the values assigned above come from
// ToShortDateString()/ToString(), whose format depends on the current culture, so this may
// throw and fall into the catch block — confirm.
DateTime b = DateTime.ParseExact(shipin_dates, "yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);// video time, e.g. 2022-05-27
TimeSpan interval = a - b; // elapsed time between the two
if (Math.Abs(interval.TotalDays) <= 730) // within two years (730 days); per the original note the threshold is meant to come from a field
{
Console.WriteLine("视频时间和当前时间在两年内");
}
else
{
Console.WriteLine("视频时间和当前时间不在两年内");
}
}
}
}
catch
{
// All exceptions are swallowed; the method returns the value reached so far.
//MessageBox.Show("608");
}
return shipin_dates;
}// video publish time
/// <summary>
/// Extracts the video author name from a page HTML fragment.
/// </summary>
/// <param name="html">HTML text to search; null or empty yields an empty result.</param>
/// <returns>
/// The content of the first matching author span with "&lt;span&gt;" and every '/' character
/// removed, or an empty string when nothing matches or an error occurs. Other markup
/// inside the span is not stripped here.
/// </returns>
public string zuozhe_ceng(string html)
{
    string zuozhe = "";
    if (string.IsNullOrEmpty(html))
    {
        // Nothing to search; avoids using an exception for this expected case.
        return zuozhe;
    }

    try
    {
        // biaoqian.zuozhe_dat supplies the class attribute of the author span
        // (presumably including its surrounding quotes — verify against the configuration).
        // Markup used previously: <div class="account-name">, <span class="cx3p4vL2 JO0TrMZd BWrTO7Je">.
        Regex regex = new Regex(@"<span class=" + biaoqian.zuozhe_dat.Trim() + ">(.*?)</span>", RegexOptions.IgnoreCase);
        Match match = regex.Match(html);
        if (match.Success)
        {
            // Group 1 is the content between the opening span and the first </span>.
            zuozhe = match.Groups[1].Value;
            zuozhe = zuozhe.Replace("<span>", "");
            // Note: this removes every '/', including any that belong to the name itself.
            zuozhe = zuozhe.Replace("/", "");
        }
    }
    catch (Exception ex)
    {
        // Extraction stays best-effort, but the failure is no longer silent.
        System.Diagnostics.Debug.WriteLine("zuozhe_ceng failed: " + ex);
    }
    return zuozhe;
}// video author
/// <summary>
/// Extracts the author's profile link from a page HTML fragment.
/// </summary>
/// <param name="html">HTML text to search; null or empty yields an empty result.</param>
/// <returns>
/// The value of the first href attribute found after the configured author div,
/// or an empty string when there is no match.
/// </returns>
public string zuozhe_url_ceng(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        // Return "" like the sibling extractors instead of letting Regex throw on null.
        return "";
    }

    // biaoqian.zuozhe_url_dat supplies the class attribute of the wrapping div
    // (presumably including its surrounding quotes — verify against the configuration).
    // Class values used previously: "B0JKdzQ8 KsoclCZj sVGJfEdt", "z726S1Si NxuHjHJZ vog0e6l7".
    string pattern = "<div class=" + biaoqian.zuozhe_url_dat.Trim() + ">.*?href=\"([^\"]+)\"";
    Match match = Regex.Match(html, pattern);
    return match.Success ? match.Groups[1].Value : "";
}
6.5:评论点击代码(需先点击评论按钮才能查询到评论;也可以直接使用详情页而无需点击,但点击的方式更稳定)
// Build the click script first, then hand it to the second browser instance.
// The CSS selector comes from biaoqian.dianji_ceng2_dat and is spliced into a
// single-quoted JavaScript string; the script clicks the element if it exists.
string dianji_jiaoben = @"
var selector = '" + biaoqian.dianji_ceng2_dat.Trim() + @"';
var divElement = document.querySelector(selector);
if (divElement) {
divElement.click();
}
";
chromeBrowser2.ExecuteScriptAsync(dianji_jiaoben);
6.6:评论刷新代码
// Scroll the comment list container to its bottom so the page loads the next batch of comments.
// Only scrollTop is assigned: 'scrollBottom' is not a DOM property, so assigning it had no effect.
chromeBrowser2.ExecuteScriptAsync(@"
var commentList = document.querySelector('div[data-e2e=""comment-list""]');
if (commentList) {
commentList.scrollTop = commentList.scrollHeight;
}
");
6.7:视频评论提取
MatchCollection matches = Regex.Matches(htmlContent, "<div class="ax6MlHvK">.?<a\s+href="(.?)">.?<span class="j5WZzJdp">(.?).?<div class="GOkWHE6S">(.?)", RegexOptions.Singleline);
// MatchCollection matches = Regex.Matches(htmlContent, "<div class=\"ax6MlHvK\">.*?<a\\s+href=\"(.*?)\">.*?<span class=\"j5WZzJdp\">(.*?)</span>.*?<div class=\"GOkWHE6S\"><span>(.*?)</span></div>", RegexOptions.Singleline);
// "<div class=\"ax6MlHvK\">.*?<a\\s+href=\"(.*?)\">.*?<span class=\"j5WZzJdp\">(.*?)</span>", RegexOptions.Singleline);
int i1 = 0;
// for (int i = 0; i < matches.Count; i++)
yichang = 1;
label2.Text = DateTime.Now.ToString();
foreach (Match match in matches)
{
}
string nicheng = match.Value;
string nicheng1 = "";
int startDivIndex = nicheng.IndexOf("<span class=\"j5WZzJdp E7y2ZDk0\">");
if (startDivIndex != -1)
{
// 从<div class="ax6MlHvK">之后开始找</div>的位置
int endDivIndex = nicheng.IndexOf("</div>", startDivIndex);
if (endDivIndex != -1)
{
// 获取<div class="ax6MlHvK">和</div>之间的内容
string divContent = nicheng.Substring(startDivIndex, endDivIndex - startDivIndex + 6); // 6 是 "</div>" 的长度
// 提取<a>标签中的href和文本内容
int startHrefIndex = divContent.IndexOf("href=\"") + 6;
int endHrefIndex = divContent.IndexOf("\"", startHrefIndex);
string href = divContent.Substring(startHrefIndex, endHrefIndex - startHrefIndex);
int startTextIndex = divContent.IndexOf('>', endHrefIndex) + 1;
int endTextIndex = divContent.IndexOf("</a>", startTextIndex);
string text = divContent.Substring(startTextIndex, endTextIndex - startTextIndex);
text = text.Replace("<span>", "");
text = text.Replace("</span>", "");
nicheng1 = text;
if (nicheng1 == "")
{
string nn = "123";
}
string extraInfo = match.Groups[3].Value;//日期
string day = "";
if (extraInfo.Contains("·"))
{
try
{
string[] parts = extraInfo.Split('·');//获得地区
diqu = parts[1].Trim();
}
catch
{
diqu = "未知";
}
}
//计算天数
//先去掉后点后面得字符
char delimiter = '·';
int index1 = extraInfo.IndexOf(delimiter);
if (index1 != -1)
{
string textBeforeDelimiter = extraInfo.Substring(0, index1);
extraInfo = textBeforeDelimiter;
Console.WriteLine("Text before delimiter: " + textBeforeDelimiter);
}
if (extraInfo.Contains("天"))
{
pinglun_riqi_yuanshi = extraInfo;
int index = extraInfo.IndexOf("天");
day = extraInfo.Substring(0, index);
DateTime dt = DateTime.Now.AddDays(-Convert.ToInt32(Convert.ToInt32(day)));
pinglun_riqi = dt.ToShortDateString();
}
if (extraInfo.Contains("月"))
{
pinglun_riqi_yuanshi = extraInfo;
int index = extraInfo.IndexOf("月");
day = extraInfo.Substring(0, index);
DateTime dt = DateTime.Now.AddMonths(-Convert.ToInt32(Convert.ToInt32(day)));
pinglun_riqi = dt.ToShortDateString();
}
if (extraInfo.Contains("小时"))
{
pinglun_riqi_yuanshi = extraInfo;
int index = extraInfo.IndexOf("小时");
day = extraInfo.Substring(0, index);
DateTime dt = DateTime.Now.AddHours(-Convert.ToInt32(Convert.ToInt32(day)));
pinglun_riqi = dt.ToString();
}
if (extraInfo.Contains("分钟"))
{
pinglun_riqi_yuanshi = extraInfo;
int index = extraInfo.IndexOf("分钟");
day = extraInfo.Substring(0, index);
DateTime dt = DateTime.Now.AddMinutes(-Convert.ToInt32(Convert.ToInt32(day)));
pinglun_riqi = dt.ToString();
}
if (extraInfo.Contains("周"))
{
pinglun_riqi_yuanshi = extraInfo;
int index = extraInfo.IndexOf("周");
day = extraInfo.Substring(0, index);
int week = (Convert.ToInt32(day) * 7);
DateTime dt = DateTime.Now.AddDays(-Convert.ToInt32(week));
pinglun_riqi = dt.ToShortDateString();
}
if (extraInfo.Contains("年"))
{
pinglun_riqi_yuanshi = extraInfo;
int index = extraInfo.IndexOf("年");
day = extraInfo.Substring(0, index);
DateTime dt = DateTime.Now.AddYears(-Convert.ToInt32(Convert.ToInt32(day)));
pinglun_riqi = dt.ToShortDateString();
}
Console.WriteLine("Href: " + href);
Console.WriteLine("Text: " + text);
}
else
{
Console.WriteLine("未找到 </div> 结束标签");
}
}
string href1 = match.Groups[1].Value;
//string comment = match.Groups[0].Value;
string comment = match.Groups[2].Value;
//nicheng = RemoveHtmlTags(nicheng);
//comment = RemoveHtmlTags(comment);
//href1 = RemoveHtmlTags(href1);
href1 = href1.Replace("'", "");
href1 = href1.Replace("class=\"hY8lWHgA\" target=\"_blank\" rel=\"noopener noreferrer", "");
nicheng = nicheng.Replace("'", "");
comment = comment.Replace("'", "");
nicheng = RemoveHtmlTags(nicheng);
comment = RemoveHtmlTags(comment);
href1 = RemoveHtmlTags(href1);
href1 = href1.Replace("'", "");
href1 = href1.Replace("class=\"hY8lWHgA\" target=\"_blank\" rel=\"noopener noreferrer", "");
nicheng = nicheng.Replace("'", "");
comment = comment.Replace("'", "");
// string newString = originalString.Substring(0, originalString.IndexOf("..."));
//nicheng = nicheng.Substring(0, nicheng.IndexOf("..."));
//nicheng = nicheng.Replace("...", "");
//item1.Text = nicheng;
//item1.SubItems.Add(comment);
//item1.SubItems.Add(href1);
//listView1.Items.Add(item1);
//添加数据库
if (comment.Trim() == "速度更新哦!")
{
string bbb = "123";
}
string mingzhong_int = "未命中";
string mingzhongci = "";
int list_box2 = 0;
string list_box2_mess = "";
try
{
while (list_box2 < listBox2.Items.Count)
{
list_box2_mess = listBox2.Items[list_box2].ToString().Trim();
if (comment.Contains(list_box2_mess))
{
mingzhongci = list_box2_mess.Trim();
mingzhong_int = "命中";
list_box2 = listBox2.Items.Count;
}
else
{
mingzhong_int = "未命中";
list_box2 = list_box2 + 1;
}
}
}
catch
{
mingzhong_int = "未命中";
}
int xuanhuan_count_add = 0;
if (xuanhuan_count_add == 0)
{
try
{
int index = zuozhe_name.IndexOf('<');
string result = zuozhe_name.Substring(0, index);
zuozhe_name = result.Trim();
}
catch
{ }
// 使用正则表达式替换尖括号及其中的内容为空字符串
nicheng1 = Regex.Replace(nicheng1, "<.*?>", "");
//try
//{
// int index = nicheng1.IndexOf('<');
// string result = nicheng1.Substring(0, index);
// nicheng1 = result.Trim();
//}
//catch { }
int panduan_count = 0;
try
{
nicheng1 = nicheng1.Replace("'", "");
comment = comment.Replace("'", "");
href1 = href1.Replace("'", "");
//title = href1.Replace("'", "");
zuozhe_name = zuozhe_name.Replace("'", "");
zuozhe_url = zuozhe_url.Replace("'", "");
mp4_url = mp4_url.Replace("'", "");
if (mingzhong_int.Trim() == "命中")

浙公网安备 33010602011771号