/// <summary>
/// Extracts post URLs, post titles and post times from a forum thread-list page.
/// The page is downloaded with GetDataFromUrl, stripped of style/select/script
/// blocks and HTML comments, and then narrowed down to the non-nested tables
/// that contain at least one "hh:mm"-looking time. Within the rows of those
/// tables, every text node matching "dd:dd" is appended to
/// <paramref name="listTime"/> and every anchor is appended to
/// <paramref name="listUrl"/> / <paramref name="listTitle"/>.
/// </summary>
/// <remarks>
/// Times are collected per matching text node and links per anchor, in two
/// independent passes; this method does not pair a time with a link, so the
/// time list is not guaranteed to have the same length as the URL/title lists.
/// The URL and title lists always grow together.
/// </remarks>
/// <param name="url">Address of the thread-list page; also used as the base for resolving relative links.</param>
/// <param name="encode">Forwarded by reference to GetDataFromUrl (per the original notes, used to obtain the page's character encoding).</param>
/// <param name="listUrl">Receives one absolute URL per anchor found; created if null.</param>
/// <param name="listTitle">Receives the anchor text for each entry of <paramref name="listUrl"/>, after ModifyRawText; created if null.</param>
/// <param name="listTime">Receives every time-like text fragment found in the rows, after ModifyRawText; created if null.</param>
public static void ItemRetrival_1(string url, ref Encoding encode, ref List<string> listUrl, ref List<string> listTitle,
ref List<string> listTime)
{
    // The lists are ref parameters, so a caller that passed null still gets usable results back.
    if (listUrl == null)
    {
        listUrl = new List<string>();
    }
    if (listTitle == null)
    {
        listTitle = new List<string>();
    }
    if (listTime == null)
    {
        listTime = new List<string>();
    }

    // Fetch the page source.
    string rawtext = GetDataFromUrl(url, ref encode);
    if (string.IsNullOrEmpty(rawtext))
    {
        // Nothing to parse; an empty page would yield no nodes anyway and null would throw below.
        return;
    }

    // Drop style/select/script blocks and HTML comments, which are irrelevant to the listing.
    string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";
    rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");

    // Collect every table element in the cleaned source.
    Parser tableParser = new Parser(new Lexer(rawtext));
    NodeFilter tableFilter = new TagNameFilter("table");
    NodeList htmlNodes = tableParser.Parse(tableFilter);

    // Keep only tables that (a) contain no other table and (b) contain a time.
    // The opening-tag pattern ignores case and tolerates attributes that span
    // several lines, so <TABLE ...> wrappers are recognised as well.
    Regex tableOpenTag = new Regex(@"<table\b[^>]*>", RegexOptions.IgnoreCase);
    Regex clockTime = new Regex(@"\d\d:\d\d");
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        string tableHtml = htmlNodes[i].ToHtml();
        bool wrapsAnotherTable = tableOpenTag.Matches(tableHtml).Count > 1;
        if (wrapsAnotherTable || !clockTime.IsMatch(tableHtml))
        {
            htmlNodes.Remove(i);
        }
    }

    // Re-parse the surviving tables into rows.
    Parser rowParser = new Parser(new Lexer(htmlNodes.ToHtml()));
    NodeFilter rowFilter = new TagNameFilter("tr");
    NodeList finalNodes = rowParser.Parse(rowFilter);

    // Post times: every node in a row that the RegexFilter accepts as time-like.
    RegexFilter rf = new RegexFilter(@"\d\d:\d\d");
    for (int i = 0; i < finalNodes.Count; i++)
    {
        Parser cellParser = new Parser(new Lexer(finalNodes[i].ToHtml()));
        NodeList timeNodes = cellParser.Parse(rf);
        for (int j = 0; j < timeNodes.Count; j++)
        {
            string timeText = timeNodes[j].ToHtml();
            ModifyRawText(ref timeText);
            listTime.Add(timeText);
        }
    }

    // Post URLs and titles: every anchor inside the rows.
    Parser anchorParser = new Parser(new Lexer(finalNodes.ToHtml()));
    NodeFilter anchorFilter = new TagNameFilter("a");
    NodeList atagNodes = anchorParser.Parse(anchorFilter);

    // Work out "scheme://host[:port]" of the listing page. System.Uri handles
    // https and URLs without a trailing path; the original pattern is kept only
    // as a fallback for inputs System.Uri rejects.
    System.Uri pageUri;
    string origin;
    if (System.Uri.TryCreate(url, System.UriKind.Absolute, out pageUri)
        && (pageUri.Scheme == System.Uri.UriSchemeHttp || pageUri.Scheme == System.Uri.UriSchemeHttps))
    {
        origin = pageUri.GetLeftPart(System.UriPartial.Authority);
    }
    else
    {
        pageUri = null;
        origin = new Regex(@"http://.*?(?=/)").Match(url).Value;
    }

    for (int i = 0; i < atagNodes.Count; i++)
    {
        ATag link = atagNodes.ElementAt(i) as ATag;
        if (link == null)
        {
            continue;
        }

        string href = link.GetAttribute("href");
        if (href == null)
        {
            // Anchors without an href (e.g. named anchors) are not links to posts.
            continue;
        }

        string title = link.StringText;
        ModifyRawText(ref title);

        listUrl.Add(ResolveListHref(pageUri, origin, href));
        listTitle.Add(title);
    }
}

/// <summary>
/// Turns an href taken from the listing page into an absolute URL.
/// </summary>
/// <param name="pageUri">Parsed address of the listing page, or null when it could not be parsed.</param>
/// <param name="origin">"scheme://host[:port]" prefix of the listing page (may be empty).</param>
/// <param name="href">Non-null href attribute value.</param>
/// <returns>The href unchanged when it already is an http/https URL, otherwise the href resolved against the page.</returns>
private static string ResolveListHref(System.Uri pageUri, string origin, string href)
{
    // Only a leading scheme makes the link absolute; "http" elsewhere in the
    // string (for instance in a query parameter) must not count.
    if (href.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
        || href.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
    {
        return href;
    }

    // Root-relative link: plain concatenation, exactly as before.
    bool rootRelative = href.StartsWith("/", System.StringComparison.Ordinal)
        && !href.StartsWith("//", System.StringComparison.Ordinal);
    if (rootRelative)
    {
        return origin + href;
    }

    // Document-relative, protocol-relative or other forms: let System.Uri resolve them.
    System.Uri resolved;
    if (pageUri != null && System.Uri.TryCreate(pageUri, href, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // Last resort: previous behaviour.
    return origin + href;
}