C# 拉取网页

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.IO;


namespace xsharp
{
    /// <summary>
    /// Console crawler that walks the paginated chapter index of a web novel,
    /// printing every chapter link it finds, and that can extract the chapter
    /// text from a downloaded chapter page and save it to disk.
    /// </summary>
    class Program
    {
        // Marker that precedes the quoted URL of an anchor tag.
        const string sHrefKey = "a href=";
        // Text and closing tag of the "next page" pager link.
        const string sNextKey = "下一页</a>";
        // Tail of the tag that wraps the pager text when there is no next page.
        const string sDisabledKey = "=\"disabled\">";

        // Directory that writeContent saves the extracted chapter files into.
        static readonly string sDir = "G:\\notex\\";
        // Single client shared by every download.
        static readonly WebClient MyWebClient = new WebClient();
        // Base URL that index page names are appended to; assigned in Main.
        static string sMain = "";

        /// <summary>
        /// Extracts the title and the chapter text from a chapter page, echoes both
        /// to the console and writes the text to "&lt;sDir&gt;&lt;idx&gt;.html".
        /// </summary>
        /// <param name="sHtml">Complete HTML of the chapter page.</param>
        /// <param name="idx">Number used as the output file name.</param>
        /// <returns>0 on success, -1 when any expected marker is missing.</returns>
        static int writeContent(string sHtml, int idx)
        {
            if (string.IsNullOrEmpty(sHtml))
                return -1;

            // The title is the value of the keywords meta tag.
            const string sTitleKey = "<meta name=\"keywords\" content=\"";
            int iBgnIdx = sHtml.IndexOf(sTitleKey, StringComparison.Ordinal);
            if (iBgnIdx < 0)
                return -1;
            int iTitleBgn = iBgnIdx + sTitleKey.Length;
            int iEndIdx = sHtml.IndexOf('"', iTitleBgn);
            if (iEndIdx < 0)            // unterminated attribute: nothing sensible to cut out
                return -1;
            string sTitle = sHtml.Substring(iTitleBgn, iEndIdx - iTitleBgn);
            Console.WriteLine(sTitle);

            // Isolate the chapter container up to its first closing </div>.
            iBgnIdx = sHtml.IndexOf("<div id=\"chaptercontent", StringComparison.Ordinal);
            if (iBgnIdx < 0)
                return -1;
            iEndIdx = sHtml.IndexOf("</div>", iBgnIdx + 1, StringComparison.Ordinal);
            if (iEndIdx < 0)
                return -1;

            string sDivSub = sHtml.Substring(iBgnIdx, iEndIdx - iBgnIdx);

            // The text lies between the first </p> and the following "<p style".
            iBgnIdx = sDivSub.IndexOf("</p>", StringComparison.Ordinal);
            if (iBgnIdx < 0)
                return -1;
            // Skips the 4 characters of "</p>" plus one more character, exactly as before
            // (presumably a line break after the tag - TODO confirm against a real page).
            int iTextBgn = iBgnIdx + 5;
            if (iTextBgn > sDivSub.Length)
                return -1;
            iEndIdx = sDivSub.IndexOf("<p style", iTextBgn, StringComparison.Ordinal);
            if (iEndIdx < 0)
                return -1;

            string sContentSub = sDivSub.Substring(iTextBgn, iEndIdx - iTextBgn);
            // NOTE(review): the previous code replaced " " with " ", which does nothing when both
            // are plain spaces. That looks like an "&nbsp;" (or U+00A0) replacement whose
            // entity was decoded when the code was published, so both forms are normalised here.
            sContentSub = sContentSub.Replace("&nbsp;", " ").Replace("\u00A0", " ");
            sContentSub = sContentSub.Replace("<br />", Environment.NewLine);

            Console.WriteLine(sContentSub);
            string sHtmlPath = sDir + idx.ToString() + ".html";
            write2File(ref sContentSub, ref sHtmlPath);
            return 0;
        }

        /// <summary>
        /// Downloads a page with the shared client and decodes the bytes as UTF-8.
        /// </summary>
        /// <param name="sUrl">Absolute URL of the page.</param>
        /// <param name="pageHtml">Receives the decoded page text.</param>
        /// <returns>Always 0; download failures surface as exceptions from the client.</returns>
        static int downPage(string sUrl, ref string pageHtml)
        {
            byte[] pageData = MyWebClient.DownloadData(sUrl);
            // Pages served as UTF-8 are decoded here. The original author's note suggests
            // Encoding.Default.GetString(pageData) for sites that use GB2312 instead.
            pageHtml = Encoding.UTF8.GetString(pageData);
            return 0;
        }

        /// <summary>
        /// Reads the target of the "next page" pager link from one line of index HTML.
        /// </summary>
        /// <param name="se">A line of HTML that contains the pager link.</param>
        /// <returns>
        /// The last path segment of the link target including its leading '/', or the
        /// whole target when it contains no '/'. An empty string is returned when the
        /// line has no pager link, the link is disabled (last page), or the anchor is
        /// malformed.
        /// </returns>
        static string getNextPageUrl(string se)
        {
            if (string.IsNullOrEmpty(se))
                return "";

            int iNextIdx = se.IndexOf(sNextKey, StringComparison.Ordinal);
            if (iNextIdx < 0)
                return "";

            // Last page: the pager text is wrapped in a tag ending with ="disabled">.
            if (iNextIdx >= sDisabledKey.Length &&
                string.CompareOrdinal(se, iNextIdx - sDisabledKey.Length,
                                      sDisabledKey, 0, sDisabledKey.Length) == 0)
                return "";

            // Nearest anchor that opens before the pager text.
            int iHrefBgn = se.LastIndexOf(sHrefKey, iNextIdx, StringComparison.Ordinal);
            if (iHrefBgn < 0)
                return "";

            int iValueBgn = iHrefBgn + sHrefKey.Length + 1;     // +1 skips the opening quote
            if (iValueBgn > se.Length)
                return "";
            int iValueEnd = se.IndexOf('"', iValueBgn);
            if (iValueEnd <= iValueBgn)                         // unterminated or empty href
                return "";

            // Look for the last '/' inside the href value only; searching the whole line
            // would pick up a '/' from an earlier tag when the href itself has none.
            int iSlash = se.LastIndexOf('/', iValueEnd - 1, iValueEnd - iValueBgn);
            if (iSlash < 0)
                return se.Substring(iValueBgn, iValueEnd - iValueBgn);

            return se.Substring(iSlash, iValueEnd - iSlash);
        }

        /// <summary>
        /// Extracts the target URL and the link text of the first anchor in a line of HTML.
        /// </summary>
        /// <param name="sKeys">A line of HTML containing an anchor.</param>
        /// <param name="sHref">Receives the href value once it has been parsed.</param>
        /// <param name="sTitle">Receives the link text; only set when 0 is returned.</param>
        /// <returns>
        /// 0 on success, -1 when there is no anchor, -2 when the href value is not
        /// terminated, -3 when the tag end or the closing &lt;/a&gt; is missing.
        /// </returns>
        static int getContentUrl(string sKeys, ref string sHref, ref string sTitle)
        {
            if (string.IsNullOrEmpty(sKeys))
                return -1;

            int iHrefBgn = sKeys.IndexOf(sHrefKey, StringComparison.Ordinal);
            if (iHrefBgn < 0)
                return -1;

            int iValueBgn = iHrefBgn + sHrefKey.Length + 1;     // +1 skips the opening quote
            if (iValueBgn > sKeys.Length)
                return -2;
            int iValueEnd = sKeys.IndexOf('"', iValueBgn);
            if (iValueEnd < 0)
                return -2;

            sHref = sKeys.Substring(iValueBgn, iValueEnd - iValueBgn);

            // The link text starts after the '>' that closes the opening tag, so further
            // attributes after the href do not end up in the title.
            int iTagEnd = sKeys.IndexOf('>', iValueEnd);
            if (iTagEnd < 0)
                return -3;
            int iTitleBgn = iTagEnd + 1;
            // Search from the text start so an earlier </a> on the same line is ignored.
            int iTitleEnd = sKeys.IndexOf("</a>", iTitleBgn, StringComparison.Ordinal);
            if (iTitleEnd < 0)
                return -3;

            sTitle = sKeys.Substring(iTitleBgn, iTitleEnd - iTitleBgn);
            return 0;
        }

        /// <summary>
        /// Joins a base URL and a relative name with exactly one '/' between them.
        /// </summary>
        static string joinUrl(string sBase, string sRelative)
        {
            if (string.IsNullOrEmpty(sBase))
                return sRelative;
            if (string.IsNullOrEmpty(sRelative))
                return sBase;
            return sBase.TrimEnd('/') + "/" + sRelative.TrimStart('/');
        }

        /// <summary>
        /// Downloads one index page, prints every chapter link in its link list and
        /// recurses into the next index page when the pager offers one.
        /// </summary>
        /// <param name="sIndexUrl">Index page name relative to sMain; empty for the first page.</param>
        /// <returns>
        /// 0 when the page was processed, -1 when the first "recommend" div is missing,
        /// -2 when the second one is missing, -3 when the closing "note" paragraph is missing.
        /// </returns>
        static int dealIndexPage(string sIndexUrl)
        {
            string pageHtml = "";
            downPage(joinUrl(sMain, sIndexUrl), ref pageHtml);

            // The link list is cut from the second "recommend" div up to the "note" paragraph.
            string sRecommendKey = "<div class=\"recommend\">";
            int iBgnIdx = pageHtml.IndexOf(sRecommendKey, StringComparison.Ordinal);
            if (iBgnIdx < 0)
                return -1;
            iBgnIdx = pageHtml.IndexOf(sRecommendKey, iBgnIdx + 10, StringComparison.Ordinal);
            if (iBgnIdx < 0)
                return -2;

            string sNoteKey = "<p class=\"note\">";
            int iEndIdx = pageHtml.IndexOf(sNoteKey, iBgnIdx + 10, StringComparison.Ordinal);
            if (iEndIdx < 0)
                return -3;

            string sHrefArray = pageHtml.Substring(iBgnIdx, iEndIdx - iBgnIdx);
            string[] sTmpArray = sHrefArray.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string se in sTmpArray)
            {
                if (se.IndexOf(sHrefKey, StringComparison.Ordinal) < 0)
                    continue;                                   // line without a link

                Console.Write(se + "\r\n");
                if (se.IndexOf(sNextKey, StringComparison.Ordinal) >= 0)
                {
                    // Pager line: follow it, or report the end of the index.
                    string sNext = getNextPageUrl(se);
                    if (sNext != "")
                    {
                        Console.Write("nextpage  " + sNext + "  \r\n");
                        dealIndexPage(sNext);
                    }
                    else
                        Console.Write("Finish..........\r\n");
                }
                else
                {
                    // Ordinary chapter link.
                    string sUrl = "";
                    string sTitle = "";
                    if (0 == getContentUrl(se, ref sUrl, ref sTitle))
                        Console.Write("   " + sUrl + "  " + sTitle + "\r\n");
                    else
                        Console.Write("   deal...error \r\n");
                }
            }
            Console.Write("aaaaaaaaaaaaa\r\n");
            return 0;
        }

        /// <summary>
        /// Writes text to a file, replacing any existing file at that path.
        /// </summary>
        /// <param name="pageContext">Text to write.</param>
        /// <param name="sPath">Path of the target file.</param>
        static void write2File(ref string pageContext, ref string sPath)
        {
            using (StreamWriter sw = new StreamWriter(sPath))
            {
                sw.Write(pageContext);
            }
        }

        /// <summary>
        /// Entry point: crawls the chapter index starting at its first page and then
        /// waits for Enter so the console window stays open.
        /// </summary>
        static void Main(string[] args)
        {
            try
            {
                // Credentials the client uses to authenticate its requests.
                MyWebClient.Credentials = CredentialCache.DefaultCredentials;
                // To try a single chapter instead, download it with downPage and pass
                // the HTML to writeContent.
                sMain = "http://wap.xxbiquge.com/59_59865/";
                dealIndexPage("");
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message);
            }

            Console.ReadLine(); // keep the console window open until Enter is pressed
        }
    }
}




/// <summary>
/// Serializes an object into a byte array with <c>BinaryFormatter</c>.
/// </summary>
/// <param name="data">The object to serialize.</param>
/// <returns>A byte array holding exactly the serialized bytes.</returns>
public byte[] Serialize(object data)
{
    // NOTE(review): BinaryFormatter is obsolete and unsafe for untrusted data; consider
    // migrating to a safer serializer. It is kept here so existing payloads stay readable.
    BinaryFormatter formatter = new BinaryFormatter();
    using (MemoryStream rems = new MemoryStream())
    {
        formatter.Serialize(rems, data);
        // ToArray copies only the bytes that were written. GetBuffer would return the
        // stream's whole internal buffer, including its unused capacity.
        return rems.ToArray();
    }
}

/// <summary>
/// Rebuilds an object from a byte array with <c>BinaryFormatter</c>.
/// </summary>
/// <param name="data">Buffer holding the serialized object.</param>
/// <returns>
/// The deserialized object. When deserialization fails the error is written to the
/// console and a plain <see cref="object"/> instance is returned instead.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public object Deserialize(byte[] data)
{
    // NOTE(review): BinaryFormatter must not be fed untrusted bytes - a crafted payload
    // can run arbitrary code during deserialization. Only use this on data from a
    // trusted source, or migrate to a safer serializer.
    BinaryFormatter formatter = new BinaryFormatter();
    // Placeholder returned when deserialization fails.
    object obj = new object();
    using (MemoryStream rems = new MemoryStream(data))
    {
        try
        {
            obj = formatter.Deserialize(rems);
        }
        catch (Exception ex)
        {
            // Best effort: report the failure and fall through to the placeholder.
            Console.Write("BaseAction序列化bug:" + ex.ToString());
        }
    }
    return obj;
}

  

posted @ 2017-07-13 18:34  掉头发的666  阅读(481)  评论(0)  编辑  收藏  举报