西轩

导航

用asp.net提取网页中的URL

(在运行时需要连接网络)

网页的设计取一个Default.aspx

代码为:

<%@ Page Language="C#" AutoEventWireup="true"  CodeFile="Default.aspx.cs" Inherits="_Default" %>
<%--
    URL extractor page: the user types an address into t1, the server downloads
    that page on Button1 click, shows the raw HTML in t2 and lists every URL
    found in it in ListBox1. The handler lives in Default.aspx.cs.
--%>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
    <div>
      <%-- Address input; the validator only accepts http:// or https:// URLs. --%>
      请输网址<asp:TextBox ID="t1" runat="server"></asp:TextBox><asp:RegularExpressionValidator
          ID="RegularExpressionValidator1" runat="server" ErrorMessage="请输入正确的URL!"
            ControlToValidate="t1"
            ValidationExpression="http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&amp;=]*)?"></asp:RegularExpressionValidator>
    </div>
    <asp:Button ID="Button1" runat="server" onclick="Button1_Click" Text="Button" />
    <p></p>
    <%-- Read-only view of the downloaded HTML source. --%>
    <asp:TextBox ID="t2" runat="server" TextMode="MultiLine" MaxLength="101"
        Rows="12" Height="210px" ReadOnly="True" Width="847px"></asp:TextBox>
        <p></p>
    <%-- Distinct URLs extracted from the downloaded HTML. --%>
    <asp:ListBox ID="ListBox1" runat="server" Height="98px" Width="309px"></asp:ListBox>

    </form>

</body>
</html>

其页面为:

 

本网页的后台代码为:

using System;

using System.Collections;

using System.Collections.Generic;

using System.IO;

using System.Net;

using System.Text.RegularExpressions;

using System.Web;

using System.Web.UI;

using System.Web.UI.WebControls;

using System.Xml;

 

 

/// <summary>
/// Code-behind for Default.aspx: downloads the page whose address the user typed
/// into <c>t1</c>, shows its HTML source in <c>t2</c> and lists every distinct
/// URL found in that source in <c>ListBox1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Absolute http/https URL: scheme, dotted host name, optional path/query.
    // The scheme part accepts https as well so that extraction agrees with the
    // RegularExpressionValidator on the page, which allows "http(s)?://".
    // Built once because the pattern never changes between requests.
    private static readonly Regex UrlRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Click handler for Button1: fetches the address in <c>t1</c>, displays the
    /// downloaded HTML and fills the list box with the URLs it contains.
    /// Any failure is reported by writing the (HTML-encoded) message to the response.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string address = t1.Text.Trim();
        if (address.Length == 0)
        {
            Response.Write("<script>alert('请入网址')</script>");
            return;
        }

        // ViewState restores the previous result on every postback; without this
        // reset each click would append to (and duplicate) the earlier list.
        ListBox1.Items.Clear();
        t2.Text = string.Empty;

        try
        {
            string html = DownloadHtml(address);
            t2.Text = html;

            foreach (object link in GetHyperLinks(html))
            {
                ListBox1.Items.Add(link.ToString());
            }
        }
        catch (Exception ee)
        {
            // Exception messages can echo the user-supplied address, so encode
            // before writing them into the page to avoid injecting markup.
            Response.Write(HttpUtility.HtmlEncode(ee.Message));
        }
    }

    /// <summary>
    /// Downloads the document at <paramref name="address"/> and returns it as text.
    /// </summary>
    /// <param name="address">Absolute http:// or https:// address.</param>
    /// <returns>The response body decoded to a string.</returns>
    /// <exception cref="ArgumentException">
    /// The address is not an absolute http/https URL.
    /// </exception>
    private static string DownloadHtml(string address)
    {
        // Reject file://, ftp:// and relative input up front; WebRequest.Create
        // would otherwise hand back a non-HTTP request for those schemes.
        // NOTE(review): the server still fetches any http(s) host the user names,
        // including internal ones - restrict target hosts if this is ever exposed.
        Uri target;
        if (!Uri.TryCreate(address, UriKind.Absolute, out target)
            || (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
        {
            throw new ArgumentException("请输入以 http:// 或 https:// 开头的完整网址");
        }

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(target);

        // Dispose response, stream and reader so the connection is released
        // even when reading fails part-way.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        using (Stream body = res.GetResponseStream())
        using (StreamReader sr = new StreamReader(body, ResolveEncoding(res.ContentType)))
        {
            return sr.ReadToEnd();
        }
    }

    /// <summary>
    /// Picks the text encoding named by a Content-Type header's <c>charset=</c>
    /// parameter, falling back to the system default encoding when the header
    /// carries no charset or names one this runtime does not know.
    /// </summary>
    /// <param name="contentType">Raw Content-Type header value; may be null or empty.</param>
    private static System.Text.Encoding ResolveEncoding(string contentType)
    {
        const string marker = "charset=";

        if (!string.IsNullOrEmpty(contentType))
        {
            int start = contentType.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
            if (start >= 0)
            {
                string name = contentType.Substring(start + marker.Length);
                int end = name.IndexOf(';');
                if (end >= 0)
                {
                    name = name.Substring(0, end);
                }

                try
                {
                    return System.Text.Encoding.GetEncoding(name.Trim().Trim('"', '\''));
                }
                catch (ArgumentException)
                {
                    // Unknown or empty charset name: use the default below.
                }
            }
        }

        return System.Text.Encoding.Default;
    }

    /// <summary>
    /// Extracts the distinct http/https URLs that appear in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
    /// <returns>An <see cref="ArrayList"/> of unique URL strings in sorted order.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        // The set gives constant-time duplicate detection instead of rescanning
        // the whole result list for every match.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match m in UrlRegex.Matches(htmlCode))
        {
            if (seen.Add(m.Value))
            {
                al.Add(m.Value);
            }
        }

        al.Sort();
        return al;
    }
}

posted on 2012-04-27 11:16  AQinb  阅读(566)  评论(1)    收藏  举报