用asp.net提取网页中的URL
(在运行时需要连接网络)
网页的设计:新建一个页面 Default.aspx
代码为:
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<title></title>
</head>
<body>
<form id="form1" runat="server">
<div>
<%-- URL input box (t1); the validator below only accepts http:// or https:// addresses. --%>
请输网址<asp:TextBox ID="t1" runat="server"></asp:TextBox><asp:RegularExpressionValidator
ID="RegularExpressionValidator1" runat="server" ErrorMessage="请输入正确的URL!"
ControlToValidate="t1"
ValidationExpression="http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"></asp:RegularExpressionValidator>
</div>
<%-- Posts back to Button1_Click in the code-behind. --%>
<asp:Button ID="Button1" runat="server" onclick="Button1_Click" Text="Button" />
<p></p>
<%-- Read-only multi-line box (t2) that the code-behind fills with the downloaded HTML. --%>
<asp:TextBox ID="t2" runat="server" TextMode="MultiLine" MaxLength="101"
Rows="12" Height="210px" ReadOnly="True" Width="847px"></asp:TextBox>
<p></p>
<%-- List (ListBox1) that the code-behind fills with the URLs extracted from the HTML. --%>
<asp:ListBox ID="ListBox1" runat="server" Height="98px" Width="309px"></asp:ListBox>
</form>
</body>
</html>
其页面为:
本网页的后台代码为:
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Xml;
/// <summary>
/// Code-behind for Default.aspx. Downloads the page at the URL typed into <c>t1</c>,
/// shows its raw HTML in <c>t2</c>, and lists the distinct URLs found in that HTML
/// in <c>ListBox1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    // Absolute http:// or https:// URL: dotted host name plus an optional path/query.
    // Built once and reused instead of being re-created on every call.
    private static readonly Regex UrlRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Click handler: fetches the page named in <c>t1</c> and displays its HTML and links.
    /// Any failure while fetching or parsing is reported by writing the (HTML-encoded)
    /// exception message to the response.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        if (t1.Text == "")
        {
            Response.Write("<script>alert('请入网址')</script>");
            return;
        }

        // The RegularExpressionValidator also runs on the server; honour its result so
        // the check cannot be bypassed by disabling client-side script.
        if (!Page.IsValid)
        {
            return;
        }

        try
        {
            // Fetch the page source.
            string html = DownloadHtml(t1.Text);
            t2.Text = html;

            // Extract the hyperlinks. Clear first: ListBox items survive postbacks via
            // ViewState, so without this the results of earlier clicks would pile up.
            ArrayList links = GetHyperLinks(html);
            ListBox1.Items.Clear();
            foreach (object link in links)
            {
                ListBox1.Items.Add(link.ToString());
            }
        }
        catch (Exception ex)
        {
            // The message can echo user-supplied text (e.g. the host name), so encode it
            // before writing it into the page.
            Response.Write(HttpUtility.HtmlEncode(ex.Message));
        }
    }

    /// <summary>
    /// Downloads the document at <paramref name="url"/> and returns its body as text.
    /// </summary>
    /// <param name="url">Absolute HTTP or HTTPS address to fetch.</param>
    /// <returns>The response body decoded with <c>Encoding.Default</c>.</returns>
    /// <exception cref="NotSupportedException">The URL does not use an HTTP(S) scheme.</exception>
    private static string DownloadHtml(string url)
    {
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returns other request types for ftp://, file://, ...
            throw new NotSupportedException("Only http:// and https:// URLs can be fetched.");
        }

        // Dispose the response and reader so the connection and stream are released.
        // NOTE(review): Encoding.Default is the server's system code page; pages served
        // in another charset (e.g. UTF-8) may be decoded incorrectly.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the URLs contained in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
    /// <returns>
    /// The distinct http:// and https:// URLs found (duplicates compared case-sensitively),
    /// sorted with <see cref="ArrayList.Sort()"/>.
    /// </returns>
    private static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList links = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return links;
        }

        // Filter out repeated URLs with a set instead of rescanning the list per match.
        HashSet<string> seen = new HashSet<string>();
        foreach (Match match in UrlRegex.Matches(htmlCode))
        {
            if (seen.Add(match.Value))
            {
                links.Add(match.Value);
            }
        }

        links.Sort();
        return links;
    }
}
浙公网安备 33010602011771号