// Extracts the plain text of a web page.
// strHtml: the HTML source of the page.
// Returns the source after comments, script blocks, style blocks and finally
// all remaining tags have been removed (in that order), with leading and
// trailing whitespace trimmed.
public string ExtractText(string strHtml)
{
    string withoutComments = RemoveComment(strHtml);
    string withoutScripts = RemoveScript(withoutComments);
    string withoutStyles = RemoveStyle(withoutScripts);
    string plainText = RemoveTags(withoutStyles);
    return plainText.Trim();
}
#region
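// Markup-stripping helpers: each one takes the page source as its argument and removes the matching constructs with a regular expression. (Author's note: this code is worth memorizing.)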
// Removes HTML comments (<!-- ... -->) from the given markup.
// input: the HTML source to clean.
// Returns the input with every comment replaced by the empty string.
private string RemoveComment(string input)
{
    // A lazy ".*?" under Singleline stops at the first "-->" and also spans
    // line breaks. The previous "[^-]*" body could not match any comment
    // containing a hyphen (e.g. "<!-- a-b -->"), so such comments were kept.
    return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
}
// Removes every <style ...>...</style> block from the given markup.
// input: the HTML source to clean.
// Returns the input with each style block replaced by the empty string.
private string RemoveStyle(string input)
{
    // IgnoreCase: tag names match in any letter case.
    // Singleline: '.' also matches newlines, so multi-line blocks are covered.
    const RegexOptions styleOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
    return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, styleOptions);
}
// Removes every <script ...>...</script> block (JavaScript) from the given markup.
// input: the HTML source to clean.
// Returns the input with each script block replaced by the empty string.
private string RemoveScript(string input)
{
    // Case-insensitive tag match; Singleline lets '.' cross line breaks so
    // multi-line scripts are removed as a whole.
    Regex scriptBlock = new Regex(
        @"<script[^>]*?>.*?</script>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    return scriptBlock.Replace(input, string.Empty);
}
// Removes all remaining HTML tags from the given markup and decodes the
// basic character entities.
// input: the HTML source to clean.
// Returns the text with <br> variants turned into CRLF line breaks, every
// other tag removed, and &nbsp; / &lt; / &gt; / &amp; decoded.
private string RemoveTags(string input)
{
    // Turn line-break tags into real line breaks first; accept <br>, <BR>,
    // <br/> and <br /> rather than only the exact lowercase "<br>".
    string result = Regex.Replace(input, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);

    // Strip every remaining tag ([\s\S] lets a tag span several lines).
    result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

    // Decode entities only after the tags are gone, so that a decoded '<' or
    // '>' is kept as text instead of being mistaken for markup. The earlier
    // replacements had identical search and replacement strings and were
    // therefore no-ops.
    result = result.Replace("&nbsp;", " ");
    result = result.Replace("&lt;", "<");
    result = result.Replace("&gt;", ">");
    // "&amp;" goes last so that "&amp;lt;" becomes "&lt;" and is not decoded twice.
    result = result.Replace("&amp;", "&");
    return result;
}
#endregion
// Batch-extracts e-mail addresses: clears textBox2, finds every match of the
// e-mail pattern in respHtml and writes the matches into textBox2, one per line.
private void 转换工具ZToolStripMenuItem_Click(object sender, EventArgs e)
{
    textBox2.Clear();

    // Regex.Matches returns the collection of all successful matches of the
    // pattern in the input string.
    MatchCollection found = Regex.Matches(respHtml, @"[a-zA-Z0-9_\-\.]+@\w+(\.\w+)+");

    // Accumulate the matched addresses in a mutable string buffer.
    StringBuilder lines = new StringBuilder();
    for (int i = 0; i < found.Count; i++)
    {
        lines.AppendLine(found[i].Value);
    }

    textBox2.Text = lines.ToString();
}