ASP.NET 抓取网页 Email 并存入记事本

//从返回的网页源代码中过滤出Email并存入记事本 

/// <summary>
/// Fetches a range of numbered pages from www.yellowurl.cn, extracts the text that sits
/// between the "电子邮件" label and the following "Q Q" label on each page, strips the
/// markup from it and appends the result to the log file.
/// </summary>
/// <remarks>
/// The last page id (inclusive) is read from <c>txtEnd</c>, the step between page ids from
/// <c>txtDiZengId</c>, and the first page id is parsed from the URL in <c>txtUrl</c>
/// (presumably the digits between "cn/" and ".html" — depends on TruncBeginStr/TruncStr,
/// which are defined elsewhere; TODO confirm).
/// Pages that come back empty or do not contain the expected markers are skipped.
/// </remarks>
/// <exception cref="FormatException">One of the text boxes does not hold a valid integer.</exception>
/// <exception cref="ArgumentOutOfRangeException">The id increment is zero or negative.</exception>
private void GetEmail()
    {
        int endId = Convert.ToInt32(txtEnd.Text.Trim());
        int step = Convert.ToInt32(txtDiZengId.Text.Trim());
        if (step <= 0)
        {
            // A non-positive step would never reach endId.
            throw new ArgumentOutOfRangeException("txtDiZengId", step, "The id increment must be a positive integer.");
        }

        string Url = this.txtUrl.Text.Trim();
        int startId = Convert.ToInt32(TruncStr(TruncBeginStr(Url, "cn/", 3), ".html"));

        // The loop variable IS the page id: it advances by the step on every iteration
        // (whether or not the page had content) and never goes past endId.
        // long avoids wrap-around when endId is close to int.MaxValue.
        for (long pageId = startId; pageId <= endId; pageId += step)
        {
            string pageUrl = string.Format("http://www.yellowurl.cn/{0}.html", pageId);
            string html = GetWebContent(pageUrl);
            if (string.IsNullOrEmpty(html))
            {
                continue;
            }

            string emailText;
            if (TryExtractEmailText(html, out emailText))
            {
                StreamWriterMetod(emailText);
            }
        }
    }

    /// <summary>
    /// Locates the e-mail cell in a page and returns its text with all markup removed.
    /// </summary>
    /// <param name="html">Full page source.</param>
    /// <param name="emailText">Receives the extracted text, or null when extraction fails.</param>
    /// <returns>true when every marker was found in the expected order; otherwise false.</returns>
    private static bool TryExtractEmailText(string html, out string emailText)
    {
        const string EmailLabel = "电子邮件";
        const string EndLabel = "Q Q";
        // Offset from the start of the label to the start of the value: the 4-character
        // label plus one more character (presumably a colon — TODO confirm against the page).
        const int LabelSkip = 5;
        // Characters dropped immediately before the end label (presumably markup — TODO confirm).
        const int TailTrim = 3;

        emailText = null;

        int bodyStart = html.IndexOf("<body", StringComparison.Ordinal);
        if (bodyStart < 0)
        {
            return false;
        }

        int tableStart = html.IndexOf("<table", bodyStart, StringComparison.Ordinal);
        if (tableStart < 0)
        {
            return false;
        }

        int labelStart = html.IndexOf(EmailLabel, tableStart, StringComparison.Ordinal);
        if (labelStart < 0)
        {
            return false;
        }

        int endStart = html.IndexOf(EndLabel, labelStart, StringComparison.Ordinal);
        if (endStart < 0)
        {
            return false;
        }

        int valueStart = labelStart + LabelSkip;
        int valueLength = (endStart - TailTrim) - valueStart;
        if (valueLength < 0)
        {
            // The two labels are too close together to hold a value.
            return false;
        }

        emailText = StripAllTags(html.Substring(valueStart, valueLength));
        return true;
    }

//取得网页源代码

 private string GetWebContent(string Url)
    {
        string strResult = "";

        //测试用网址
        string urlTest = "http://www.yellowurl.cn/1581812.html";
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

            request.Headers.Set("Pragma", "no-cache");
            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            Stream streamReceive = response.GetResponseStream();
            Encoding encoding = Encoding.GetEncoding("GB2312");
            StreamReader streamReader = new StreamReader(streamReceive, encoding);
            strResult = streamReader.ReadToEnd();
        return strResult;
    }

//将取出的Email写入记事本 

/// <summary>
/// Appends <paramref name="strEmail"/> as one line to the log file <c>e:\log.txt</c>.
/// </summary>
/// <param name="strEmail">Text to append.</param>
private void StreamWriterMetod(string strEmail)
    {
        // using closes (and thereby flushes) the writer even if WriteLine throws,
        // so the file handle is never left open.
        using (StreamWriter swWriter = File.AppendText(@"e:\log.txt"))
        {
            swWriter.WriteLine(strEmail);
        }
    }

//去除HTML标记

 public static string StripAllTags(string stringToStrip)
    {
        stringToStrip = Regex.Replace(stringToStrip, "</p(?:\\s*)>(?:\\s*)<p(?:\\s*)>", "\n\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        stringToStrip = Regex.Replace(stringToStrip, "<br(?:\\s*)/>", "\n", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        stringToStrip = Regex.Replace(stringToStrip, "\"", "''", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        stringToStrip = Regex.Replace(stringToStrip, "<[^>]+>", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);
        stringToStrip = Regex.Replace(stringToStrip, "&[^;]+;", "", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        return stringToStrip;
    }

 

================== 优化:改用正则表达式直接匹配,以提高效率 ==================

/// <summary>
/// Template showing how to pull values out of page markup with a regular expression
/// instead of manual index arithmetic.
/// </summary>
public void test()
{
    // Sample page markup to search.
    string html = "<table><tr><td>123@qq.com</td></tr></table>";

    // Fill in a regular expression that suits your own needs.
    string regexPattern = "";

    // For a single result, Regex.Match(html, regexPattern) can be used instead and the
    // wanted value read from Groups[1].Value of the returned match.

    // Multiple results: collect every match and walk through them.
    MatchCollection hits = Regex.Matches(html, regexPattern);
    foreach (Match hit in hits)
    {
        // Read the content of each match here.
    }
}
 

posted on 2010-09-29 10:46  blair0807  阅读(282)  评论(0)    收藏  举报