用asp.net提取网页中的URL

（在运行时需要连接网络）

网页设计：新建一个名为 Default.aspx 的页面

代码为：

<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<%-- Page that takes a URL (t1), shows the fetched HTML (t2) and lists the URLs found in it (ListBox1). --%>
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title>提取网页中的URL</title>
</head>
<body>
    <%-- ASP.NET server controls must be placed inside a form with runat="server". --%>
    <form id="form1" runat="server">
    <div>
        请输网址<asp:TextBox ID="t1" runat="server"></asp:TextBox><asp:RegularExpressionValidator
            ID="RegularExpressionValidator1" runat="server" ErrorMessage="请输入正确的URL！"
            ControlToValidate="t1"
            ValidationExpression="http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"></asp:RegularExpressionValidator>
    </div>
    <asp:Button ID="Button1" runat="server" onclick="Button1_Click" Text="Button" />
    <p></p>
    <%-- Read-only view of the raw HTML that was downloaded. --%>
    <asp:TextBox ID="t2" runat="server" TextMode="MultiLine" MaxLength="101"
        Rows="12" Height="210px" ReadOnly="True" Width="847px"></asp:TextBox>
    <p></p>
    <%-- Filled by Button1_Click with the de-duplicated, sorted URLs. --%>
    <asp:ListBox ID="ListBox1" runat="server" Height="98px" Width="309px"></asp:ListBox>
    </form>
</body>
</html>

其页面为：

本网页的后台代码为：

using System;

using System.Collections;

using System.Collections.Generic;

using System.IO;

using System.Net;

using System.Text.RegularExpressions;

using System.Web;

using System.Web.UI;

using System.Web.UI.WebControls;

using System.Xml;

/// <summary>
/// Code-behind for Default.aspx. Downloads the page whose address is typed into
/// <c>t1</c>, shows its raw HTML in <c>t2</c> and lists every distinct URL found
/// in that HTML in <c>ListBox1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Absolute http/https URLs. The scheme part accepts https so that it agrees with
    // the RegularExpressionValidator on the page, which also accepts "http(s)".
    private static readonly Regex UrlRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Click handler: fetches the requested page and fills <c>t2</c> and <c>ListBox1</c>.
    /// Any failure while fetching is reported by writing the (HTML-encoded) exception
    /// message to the response.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string url = t1.Text.Trim();
        if (url.Length == 0)
        {
            Response.Write("<script>alert('请入网址')</script>");
            return;
        }

        // Client-side validation can be bypassed; honour the validator on the server too.
        if (!Page.IsValid)
        {
            return;
        }

        // Items survive postbacks through ViewState, so clear the previous result
        // instead of appending to it.
        ListBox1.Items.Clear();

        try
        {
            // WebRequest.Create returns a non-HTTP request type for schemes such as
            // file:// or ftp://, in which case the "as" cast yields null.
            HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
            if (req == null)
            {
                Response.Write(HttpUtility.HtmlEncode("仅支持 http/https 网址"));
                return;
            }

            string html;
            // Dispose the response and reader so the underlying connection is released.
            using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
            using (StreamReader sr = new StreamReader(res.GetResponseStream(), System.Text.Encoding.Default))
            {
                html = sr.ReadToEnd();
            }

            t2.Text = html;

            foreach (object link in GetHyperLinks(html))
            {
                ListBox1.Items.Add(link.ToString());
            }
        }
        catch (Exception ee)
        {
            // The message may contain text taken from the request; encode it before
            // echoing it into the page.
            Response.Write(HttpUtility.HtmlEncode(ee.Message));
        }
    }

    /// <summary>
    /// Extracts the URLs contained in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
    /// <returns>
    /// An <see cref="ArrayList"/> of distinct URL strings, sorted with the default comparer.
    /// </returns>
    private static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        // The set gives O(1) duplicate detection; the ArrayList keeps the return type
        // callers expect.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in UrlRegex.Matches(htmlCode))
        {
            if (seen.Add(match.Value))
            {
                al.Add(match.Value);
            }
        }

        al.Sort();
        return al;
    }
}

posted on 2012-04-27 11:16 AQinb 阅读(566) 评论(1) 收藏举报

刷新页面返回顶部

西轩

导航

公告

用asp.net提取网页中的URL