自己写的抓取Google赞助商链接的页面源码

        private void GetInfo()
        {
            WebClient InfoWclient = new WebClient();

            string StrGatherURL = TextBoxGatherURL.Text + HttpUtility.UrlEncode(TextBoxAdWord.Text);

            string StrGatherBaseURL = StrGatherURL.Substring(0, StrGatherURL.Replace("//", "").IndexOf("/") + 2);

            Stream InfoStream = InfoWclient.OpenRead(StrGatherURL);
            StreamReader InfoStreamReader = new StreamReader(InfoStream, Encoding.GetEncoding("GB2312"));
            string StrFullInfo = InfoStreamReader.ReadToEnd();

            Regex AdListTypeRegex = new Regex("更多赞助商链接");
            Regex AdListInfoRegex;

            if (AdListTypeRegex.IsMatch(StrFullInfo))
            {
                AdListInfoRegex = new Regex("<table cellspacing=0 cellpadding=0 width=25% align=right bgcolor=#ffffff border=0>.*更多赞助商链接 &raquo;</a></font></td></tr></table>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

            }
            else
            {
                AdListInfoRegex = new Regex("<table cellspacing=0 cellpadding=0 width=25% align=right bgcolor=#ffffff border=0>.*<font size=-1></font></td></tr></table>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
            }

            string StrAdList = AdListInfoRegex.Match(StrFullInfo).ToString();

            Regex AdLinkTitleRegex = new Regex("<font size=.?0.*</font></a>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
            Regex AdLinkSrcRegex = new Regex("/url.*target=nw", RegexOptions.IgnoreCase | RegexOptions.Compiled);

            MatchCollection AdLinkSrcMatchCollection = AdLinkSrcRegex.Matches(StrAdList.Replace("target=nw>", "target=nw\n"));

            MatchCollection AdLinkTitleMatchCollection = AdLinkTitleRegex.Matches(StrAdList.ToLower().Replace("<br>", "\n"));

            DataTable AdListDt = new DataTable();
            AdListDt.Columns.Add("Place");
            AdListDt.Columns.Add("Title");
            AdListDt.Columns.Add("Value");


            for (int i = 0; i < AdLinkSrcMatchCollection.Count; i++)
            {
                string StrAdLinkSrc = AdLinkSrcMatchCollection[i].ToString();
                string StrAdListItemValue = StrGatherBaseURL + StrAdLinkSrc.Replace("   target=nw", "");

                DataRow AdAddRow = AdListDt.NewRow();
                AdAddRow["Place"] = (i+1).ToString();
                AdAddRow["Title"] = AdLinkTitleMatchCollection[i].ToString().Replace("<font size=+0>", "").Replace("</font></a>", "");
                AdAddRow["Value"] = StrAdListItemValue;
                AdListDt.Rows.Add(AdAddRow);
            }
            GridViewAdList.DataSource = AdListDt;
            GridViewAdList.Columns[0].HeaderText = "位置";
            GridViewAdList.Columns[0].Width = 40;
            GridViewAdList.Columns[1].HeaderText = "标题";
            GridViewAdList.Columns[1].Width = 200;
            GridViewAdList.Columns[2].HeaderText = "链接";
            GridViewAdList.Columns[2].Visible = false;

        }

posted @ 2007-02-02 18:56 我爱家和生活18e3.com 阅读(287) 评论(1) 编辑 收藏