C# RSS：新闻抓取正文并转TXT

如果你喜欢手机阅读
如果宿舍手机几乎没CMNET信号
如果你不想浪费手机流量
如果你只想睡前静静浏览今天的新闻
以下程序抓取了 cnblogs,cnbeta,网易深度,南方周末的首页正文,可添加其它网站
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.Collections;
using System.Threading;
using System.IO;
using System.Configuration;
namespace RSS
{
    /// <summary>
    /// Console entry point. Configures one <c>GetItem</c> scraper per news site and starts it.
    /// Only the cnblogs configuration is active; the other site configurations are kept
    /// below as commented-out templates.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the cnblogs scraper, prints the output file name and starts the scraper
        /// through <c>threadStart</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Location prepended to every generated file name.
            string file = "i://";

            // Timestamp that makes the output file name unique per run.
            string stamp = String.Format("{0:yyMMdd_HH-mm}", DateTime.Now);

            GetItem cnblogs = new GetItem
            {
                pageUrl = "http://news.cnblogs.com/n/page/",
                prefix = "http://news.cnblogs.com",
                pageUrlsRegex = "\"(?<url>/n/[\\d]+?)\"",
                titleRegex = "<div id=\"news_title\"><a.*?>(?<title>.*?)</a>",
                timeRegex = "<span class=\"time\">(?<time>.*?)</span>",
                bodyRegex = "<div id=\"news_body\">(?<body>.*?)</div>",
                hostName = "CnBlogs",
                encoding = "utf-8",
                pageWantToGet = 20
            };
            cnblogs.fileSave = string.Format("{2}{0}_{1}.txt", cnblogs.hostName, stamp, file);
            Console.WriteLine(cnblogs.fileSave);
            cnblogs.threadStart();

            // Templates for the other sites (cnbeta, 163 focus, infzm). Uncomment a block to enable it.

            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.prefix = "http://www.cnbeta.com/";
            //    gi2.pageUrlsRegex = "\"(?<url>/articles/[\\d]+.htm?)\"";
            //    gi2.titleRegex = "id=\"news_title\">(?<title>.*?)</h3>";
            //    gi2.timeRegex = "id=\"news_author\"><span>(?<time>.*?)[|]";
            //    gi2.bodyRegex = "<div id=\"news_content\">(?<body>.*?)<!-- end newsBox news -->";
            //    gi2.hostName = "CnBeta";
            //    gi2.encoding = "gb2312";
            //    gi2.fileSave = string.Format("{2}{0}_{1}.txt", gi2.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();

            //}

            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.pageUrlsRegex = "\"(?<url>http://focus.news.163.com.[^>< ]*.html?)\"";
            //    gi2.prefix = "http://focus.news.163.com/";
            //    gi2.hasPrefix = false;//default:true
            //    gi2.hasManyPage = true;//default:false
            //    gi2.manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>";
            //    gi2.titleRegex = "id=\"h1title\">(?<title>.*?)</h1>";
            //    gi2.timeRegex = "<span class=\"info\">(?<time>.*?)来源";
            //    gi2.bodyRegex = "class=\"summary\"(?<body>.*?)<!-- 分页 -->";
            //    gi2.hostName = "163";
            //    gi2.encoding = "GBK";
            //    gi2.fileSave = string.Format("{2}{0}_{1}.txt", gi2.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();

            //}
            //{
            //    GetItem gi2 = new GetItem();
            //    gi2.pageUrlsRegex = "\"(?<url>http://www.infzm.com/content/[\\d]+?)\"";
            //    gi2.prefix = "http://www.infzm.com/";
            //    gi2.hasPrefix = false;//default:true
            //    gi2.hasManyPage = false;//default:false
            //    //gi2.manyPageRegex = "<span class=\"s1 s3\">上一页</span>(?<np>.*?)下一页</a>";
            //    gi2.titleRegex = "<div id=\"detailContent\">[\\s]*<h1>[\\s]*(?<title>.*?)[\\s]*</h1>";
            //    gi2.timeRegex = "<span class=\"pubTime\">(?<time>.*?)</span>";
            //    gi2.bodyRegex = "<div id=\"content-context\">(?<body>.*?)<!--end #text-->";
            //    gi2.hostName = "infzm";
            //    gi2.encoding = "utf-8";
            //    gi2.fileSave = string.Format("{2}{0}_{1}.txt", gi2.hostName, String.Format("{0:yyMMdd_HH-mm}", DateTime.Now), file);
            //    Console.WriteLine(gi2.fileSave);
            //    gi2.homeOnly = true;
            //    gi2.threadStart();

            //}
            //Console.Read();
        }
    }
    /// <summary>
    /// Scrapes the articles of one news site and appends them, as plain text, to a file.
    /// Configure the public fields, then call <see cref="threadStart"/>: listing pages are
    /// downloaded, article links are collected with <see cref="pageUrlsRegex"/>, and every
    /// article's title, time and body are extracted with the corresponding regular
    /// expressions, stripped of markup and appended to <see cref="fileSave"/>.
    /// </summary>
    class GetItem
    {
        /// <summary>Listing-page URL; the page number (1..pageWantToGet) is appended to it. Not used when homeOnly is true.</summary>
        public string pageUrl;
        /// <summary>When true, links are collected from the single page at <see cref="prefix"/> instead of numbered listing pages.</summary>
        public bool homeOnly = false;
        /// <summary>When true, collected links are treated as relative and <see cref="prefix"/> is prepended before downloading.</summary>
        public bool hasPrefix = true;
        /// <summary>Number of numbered listing pages to scan when homeOnly is false.</summary>
        public int pageWantToGet = 1;
        /// <summary>When true, articles may span several pages and <see cref="manyPageRegex"/> is used to find the pager.</summary>
        public bool hasManyPage = false;
        /// <summary>Pattern whose "np" group captures the pager markup that contains the links to an article's further pages.</summary>
        public string manyPageRegex;
        /// <summary>Site root; a trailing "/" is appended by <see cref="threadStart"/> when missing.</summary>
        public string prefix;
        /// <summary>Pattern whose "url" group captures one article link.</summary>
        public string pageUrlsRegex;
        /// <summary>Pattern whose "title" group captures the article title.</summary>
        public string titleRegex;
        /// <summary>Pattern whose "time" group captures the publication time.</summary>
        public string timeRegex;
        /// <summary>Pattern whose "body" group captures the article body markup.</summary>
        public string bodyRegex;
        /// <summary>Path of the text file the articles are appended to.</summary>
        public string fileSave;
        /// <summary>Display name of the site, used in console messages.</summary>
        public string hostName;
        /// <summary>Name of the text encoding used to decode the site's pages (e.g. "utf-8", "gb2312").</summary>
        public string encoding;

        // Article links in discovery order.
        private List<string> pageUrls;
        // Every link already queued or fetched (articles and continuation pages alike), as captured, before any prefix is applied.
        private HashSet<string> seenUrls;

        // script/style elements are dropped together with their content; it is code, not article text.
        private static readonly Regex scriptOrStyleBlock = new Regex(
            "<\\s*(script|style)\\b[^>]*>.*?</\\s*\\1\\s*>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        private static readonly Regex paragraphOpen = new Regex("<\\s*p[^>]*>", RegexOptions.IgnoreCase);
        private static readonly Regex paragraphClose = new Regex("</\\s*p\\s*>", RegexOptions.IgnoreCase);
        private static readonly Regex lineBreak = new Regex("</?\\s*br[^>]*>", RegexOptions.IgnoreCase);
        // Tag names are matched by prefix, so e.g. "b" also removes blockquote/body tags and "a" removes abbr tags.
        private static readonly Regex inlineTags = new Regex(
            "</?\\s*(?:a|img|strong|div|b|span|script|li|style|i|h3|h2|font|q)[^>]*>", RegexOptions.IgnoreCase);

        /// <summary>Creates a scraper with empty link collections; configure the public fields before starting it.</summary>
        public GetItem()
        {
            pageUrls = new List<string>(50);
            seenUrls = new HashSet<string>();
        }

        /// <summary>
        /// Normalizes <see cref="prefix"/> to end with "/" and runs the scrape on a new thread.
        /// A null prefix is left untouched instead of raising a NullReferenceException.
        /// </summary>
        public void threadStart()
        {
            if (prefix != null && !prefix.EndsWith("/", StringComparison.Ordinal))
            {
                prefix += "/";
            }

            Thread worker = new Thread(new ThreadStart(start));
            worker.Start();
        }

        /// <summary>Thread body: collects the article links, then downloads every article.</summary>
        private void start()
        {
            if (homeOnly)
            {
                // -1 tells getPageUrls to scan the page at prefix instead of a numbered listing page.
                getPageUrls(-1);
            }
            else
            {
                for (int page = 1; page <= pageWantToGet; page++)
                {
                    getPageUrls(page);
                }
            }

            startGetAll();
        }

        /// <summary>Appends <paramref name="str"/> plus a line terminator to <see cref="fileSave"/>, encoded as gb2312.</summary>
        /// <param name="str">Text to append.</param>
        private void WriteFile(string str)
        {
            // using blocks close the file even when encoding lookup or writing throws.
            using (FileStream fs = new FileStream(fileSave, FileMode.Append))
            using (StreamWriter writer = new StreamWriter(fs, Encoding.GetEncoding("gb2312")))
            {
                writer.WriteLine(str);
            }
        }

        /// <summary>
        /// Converts an HTML fragment to plain text in place: script/style elements are removed
        /// with their content, closing p tags and every form of br become line breaks, the other
        /// known tags are dropped, and a fixed set of HTML entities is replaced. Tag matching is
        /// case-insensitive.
        /// </summary>
        /// <param name="str">HTML fragment on entry, plain text on return.</param>
        private void deleteTag(ref string str)
        {
            str = scriptOrStyleBlock.Replace(str, "");
            str = paragraphOpen.Replace(str, "");
            str = paragraphClose.Replace(str, "\r\n");
            // Must run before inlineTags, whose "b" alternative would otherwise swallow br tags.
            str = lineBreak.Replace(str, "\r\n");
            str = inlineTags.Replace(str, "");

            str = str.Replace("&rdquo;", "\"");
            str = str.Replace("&ldquo;", "\"");
            str = str.Replace("&lsquo;", "'");
            str = str.Replace("&rsquo;", "'");
            str = str.Replace("&nbsp;", " ");
            str = str.Replace("&hellip;", "…");
            str = str.Replace("&ndash;", "-");
            str = str.Replace("&mdash;", "—");
        }

        /// <summary>
        /// Downloads <paramref name="url"/> and returns its text decoded with <see cref="encoding"/>.
        /// The lines of the page are concatenated WITHOUT any separator; this flattening is kept
        /// on purpose because the site patterns are matched against the flattened text.
        /// </summary>
        /// <param name="url">Absolute http/https URL.</param>
        /// <returns>The page text with all line terminators removed.</returns>
        /// <exception cref="Exception">Any failure (bad URL, network error, unknown encoding) propagates to the caller.</exception>
        private string downloadPage(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";
            req.ContentType = "text/html;charset=utf-8";

            StringBuilder html = new StringBuilder();
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream raw = resp.GetResponseStream())
            using (StreamReader reader = new StreamReader(raw, Encoding.GetEncoding(this.encoding)))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    html.Append(line);
                }
            }

            return html.ToString();
        }

        /// <summary>Turns a captured link into a downloadable URL: prepends <see cref="prefix"/> when <see cref="hasPrefix"/> is set.</summary>
        /// <param name="link">Link exactly as captured by <see cref="pageUrlsRegex"/>.</param>
        /// <returns>The URL to download.</returns>
        private string resolveUrl(string link)
        {
            if (!hasPrefix)
            {
                return link;
            }

            // prefix already ends with "/", so drop a leading one to avoid "//".
            if (link.StartsWith("/", StringComparison.Ordinal))
            {
                link = link.Substring(1);
            }

            return string.Format("{0}{1}", prefix, link);
        }

        /// <summary>Downloads a continuation page of a multi-page article and returns its body as plain text.</summary>
        /// <param name="url">URL of the continuation page.</param>
        /// <returns>The plain-text body, or an empty string when the download fails or the body pattern does not match.</returns>
        private string getNextPageContent(string url)
        {
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);
                Match m = new Regex(this.bodyRegex, RegexOptions.Singleline).Match(html);
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine("已获取下一页正文");
                    return body;
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failed continuation page must not abort the article.
                Console.WriteLine("异常:{0}", ex.Message);
            }

            return "";
        }

        /// <summary>
        /// Downloads one article, extracts title, time and body (plus the bodies of its
        /// continuation pages when <see cref="hasManyPage"/> is set) and appends the result
        /// to the output file. Failures are reported on the console and the article is skipped.
        /// </summary>
        /// <param name="url">URL of the article.</param>
        /// <param name="index">Position of the article, written into the "(index/total)" header line.</param>
        /// <param name="total">Total number of articles, written into the same header line.</param>
        private void getContent(string url, int index, int total)
        {
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);

                Regex titler = new Regex(this.titleRegex, RegexOptions.Singleline);
                Regex timer = new Regex(this.timeRegex, RegexOptions.Singleline);
                Regex bodyr = new Regex(this.bodyRegex, RegexOptions.Singleline);
                StringBuilder cont = new StringBuilder();

                Match m = titler.Match(html);
                if (m.Success)
                {
                    Console.WriteLine("title:{0}", m.Groups["title"].Value);
                    cont.AppendLine(m.Groups["title"].Value);
                }

                cont.AppendLine(string.Format("({0}/{1}){2}", index, total, url));

                m = timer.Match(html);
                if (m.Success)
                {
                    Console.WriteLine("time:{0}", m.Groups["time"].Value);
                    cont.AppendLine(m.Groups["time"].Value);
                }

                m = bodyr.Match(html);
                if (m.Success)
                {
                    string body = m.Groups["body"].Value;
                    deleteTag(ref body);
                    Console.WriteLine("获取正文");
                    cont.AppendLine(body);
                }

                if (hasManyPage)
                {
                    Match pager = new Regex(this.manyPageRegex, RegexOptions.Singleline).Match(html);
                    if (pager.Success)
                    {
                        Console.WriteLine("存在多页..");
                        Regex linkPattern = new Regex(this.pageUrlsRegex, RegexOptions.Singleline);
                        foreach (Match link in linkPattern.Matches(pager.Groups["np"].Value))
                        {
                            string next = link.Groups["url"].Value;

                            // Continuation pages are only marked as seen. They are NOT queued as
                            // articles, otherwise each one would be written a second time.
                            if (seenUrls.Add(next))
                            {
                                cont.AppendLine(getNextPageContent(resolveUrl(next)));
                            }
                        }
                    }
                }

                cont.AppendLine("--------------------------------------------------------------");
                WriteFile(cont.ToString());
            }
            catch (Exception ex)
            {
                Console.WriteLine("异常:{0},{1}", ex.Source, ex.Message);
            }
        }

        /// <summary>Downloads every collected article in discovery order, numbering them from 1.</summary>
        private void startGetAll()
        {
            int total = pageUrls.Count;
            for (int i = 0; i < total; i++)
            {
                getContent(resolveUrl(pageUrls[i]), i + 1, total);
            }
        }

        /// <summary>
        /// Downloads one listing page and queues every article link on it that has not been seen yet.
        /// Failures are reported on the console; links collected so far are kept.
        /// </summary>
        /// <param name="pageIndex">Listing page number appended to <see cref="pageUrl"/>, or -1 to scan the page at <see cref="prefix"/>.</param>
        private void getPageUrls(int pageIndex)
        {
            string url = pageIndex == -1 ? prefix : string.Format("{0}{1}", this.pageUrl, pageIndex);
            Console.WriteLine(url);

            try
            {
                string html = downloadPage(url);
                Regex linkPattern = new Regex(this.pageUrlsRegex, RegexOptions.Singleline);
                foreach (Match link in linkPattern.Matches(html))
                {
                    string found = link.Groups["url"].Value;
                    if (seenUrls.Add(found))
                    {
                        pageUrls.Add(found);
                    }
                }

                Console.WriteLine("{0}:{1} articles.", this.hostName, pageUrls.Count);
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            Console.WriteLine("{0} end!", this.hostName);
        }
    }
}
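注：需要在项目中添加一个 app.config 配置文件（Project -> Add New Item -> 新建 XML 文件，命名为 app.config）。其中的 useUnsafeHeaderParsing 设置让 HttpWebRequest 放宽对 HTTP 响应头的校验，否则遇到响应头不规范的网站时，请求可能因"协议冲突"(protocol violation)异常而失败。
文件内容如下：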
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<system.net>
    <settings>
      <httpWebRequest useUnsafeHeaderParsing= "true" />
    </settings>
</system.net>
</configuration>
posted on 2011-05-15 17:37 yangyh 阅读(1872) 评论(2) 编辑收藏举报