抓取网页并用正则表达式匹配邮箱地址

 

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;

namespace _07正则_匹配邮箱
{
    /// <summary>
    /// Console demo that downloads a fixed list of web pages and prints every
    /// distinct e-mail address found in them.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Matches an address written as <c>user@host</c>, <c>user#host</c> or
        /// <c>user(at)host</c>. The <c>user</c> group is one or more of
        /// <c>[a-zA-Z0-9_]</c>; the <c>host</c> group is two or more dot-separated
        /// alphanumeric labels. Built once and reused for every page.
        /// </summary>
        private static readonly Regex MailRegex = new Regex(
            @"(?<user>[a-zA-Z0-9_]+)(?:@|#|\(at\))(?<host>[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)+)",
            RegexOptions.Compiled);

        /// <summary>
        /// Entry point: scans each hard-coded URL, prints the collected addresses,
        /// then waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<Uri> listUrl = new List<Uri>() {
                new Uri("http://gb.corp.163.com/gb/contactus.html"),
                new Uri("https://passport.csdn.net/help/faq"),
                new Uri("http://www.kuaipan.cn/"),
                new Uri("http://www.ksyun.com/home/joinUs/campus"),
                new Uri("http://www.cnblogs.com/about/ad.aspx"),
                new Uri("http://www.cnblogs.com/about/contactus.aspx"),
                new Uri("http://www.csdn.net/company/statement.html"),
                new Uri("http://hb.qq.com/job/dczp/index.htm")
            };
            List<string> listMail = new List<string>();
            foreach (Uri ur in listUrl)
            {
                GetMails(ur, listMail);
            }

            cw(listMail);

            Console.ReadKey();
        }

        /// <summary>
        /// Downloads the page at <paramref name="uri"/> and appends every address
        /// found in it to <paramref name="list"/>, normalised to the
        /// <c>user@host</c> form and skipping addresses already in the list.
        /// </summary>
        /// <param name="uri">Page to download.</param>
        /// <param name="list">Accumulator shared across calls; mutated in place.</param>
        /// <remarks>
        /// Best effort: any exception is reported on the console by its message
        /// and the method returns normally so the remaining URLs are still tried.
        /// </remarks>
        private static void GetMails(Uri uri, List<string> list)
        {
            try
            {
                string input;
                // Dispose the client, the response stream and the reader even when
                // the download fails part-way, so connections are not leaked.
                using (WebClient wc = new WebClient())
                {
                    Console.WriteLine("创建WebClient - [{0}]", uri.ToString());
                    using (Stream stream = wc.OpenRead(uri))
                    // NOTE(review): Encoding.Default is platform dependent, so pages
                    // served in another charset may decode incorrectly — confirm
                    // whether the response charset should be honoured instead.
                    using (StreamReader reader = new StreamReader(stream, Encoding.Default))
                    {
                        input = reader.ReadToEnd();
                    }
                }

                MatchCollection matches = MailRegex.Matches(input);
                // Prints True/False for "page contains at least one address",
                // derived from the single scan above rather than a second one.
                Console.WriteLine(matches.Count > 0);

                foreach (Match match in matches)
                {
                    // Rebuild the address with a real '@' whichever separator
                    // ('@', '#' or "(at)") the page used.
                    string mail = match.Groups["user"].Value + "@" + match.Groups["host"].Value;
                    if (!list.Contains(mail))
                    {
                        list.Add(mail);
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e.Message);
            }
        }

        /// <summary>
        /// Writes the number of entries, then each entry with a 1-based index,
        /// followed by a separator line.
        /// </summary>
        /// <param name="list">Strings to print.</param>
        static void cw(List<string> list)
        {
            Console.WriteLine("长度为{0}", list.Count);
            int i = 0;
            foreach (string str in list)
            {
                i++;
                Console.WriteLine("{0} - [{1}]", i, str);
            }
            Console.WriteLine("______________________");
        }
    }
}

  

posted @ 2013-07-02 22:04  liqipeng  阅读(531)  评论(0)  编辑  收藏  举报