using System; 
using System.Xml; 
using System.Text; 
using System.Net; 
using System.IO; 
using System.Collections; 
using System.Text.RegularExpressions; 
/// <summary>
/// Console tool that asks for a web page address, downloads the page,
/// extracts the absolute http/https links found in its HTML and writes them
/// to "HyperLinks.xml" in the current working directory.
/// </summary>
public class App
{
    // Absolute http/https URL: scheme, dotted host name, optional path/query.
    // The path character class is kept exactly as the original author wrote it.
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    // Captures a known suffix (group 1) only when it terminates the host part,
    // i.e. it is followed by the first "/" of the URL or by the end of the string.
    private static readonly Regex DomainSuffixRegex = new Regex(
        @"^(?:\w+://)?[^/]*\.(com|net|cn|org|gov)(?:/|$)",
        RegexOptions.IgnoreCase);

    // A hyphen that is directly followed by another hyphen.
    private static readonly Regex AdjacentHyphenRegex = new Regex("-(?=-)");

    /// <summary>
    /// Program entry point: prompts for a URL, then runs the
    /// download / extract / write pipeline, reporting progress on the console.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string input = Console.ReadLine();

        // ReadLine returns null at end of input; an empty or blank line has no
        // usable address either. Stop here instead of failing further down.
        if (input == null || input.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址,程序退出。");
            return;
        }

        string strURL = NormalizeUrl(input);

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Trims the user input and prefixes "http://" unless it already starts
    /// with "http://" or "https://" (compared case-insensitively).
    /// </summary>
    /// <param name="input">Non-null text typed by the user.</param>
    /// <returns>The address with a scheme in front.</returns>
    private static string NormalizeUrl(string input)
    {
        string url = input.Trim();

        // StartsWith is safe for inputs shorter than the prefix, unlike
        // Substring(0, 7), which throws for them.
        if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        return "http://" + url;
    }

    /// <summary>
    /// Downloads the document at <paramref name="URL"/> and returns its text.
    /// </summary>
    /// <param name="URL">Absolute http or https address of the page.</param>
    /// <returns>The response body decoded as GB2312.</returns>
    /// <remarks>
    /// The body is always decoded as GB2312; the charset declared by the
    /// server is not consulted.
    /// NOTE(review): on runtimes where GB2312 is not a built-in encoding,
    /// GetEncoding may fail - confirm for the target framework.
    /// </remarks>
    private static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be set before GetResponse() sends the request.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose response, stream and reader so the connection is released
        // even when reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream body = hwRes.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs contained in
    /// <paramref name="htmlCode"/>.
    /// </summary>
    /// <param name="htmlCode">Text to scan; null or empty yields an empty list.</param>
    /// <returns>
    /// An <see cref="ArrayList"/> of strings without duplicates, sorted with
    /// the default comparer.
    /// </returns>
    private static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList links = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return links;
        }

        // Hash-based membership test replaces a nested scan of the result list.
        Hashtable seen = new Hashtable();
        foreach (Match match in LinkRegex.Matches(htmlCode))
        {
            string url = match.Value;
            if (!seen.ContainsKey(url))
            {
                seen.Add(url, null);
                links.Add(url);
            }
        }

        links.Sort();
        return links;
    }

    /// <summary>
    /// Writes the links to "HyperLinks.xml". Each link becomes an element
    /// named after its domain suffix (see <see cref="GetDomain"/>) whose text
    /// is the URL itself.
    /// </summary>
    /// <param name="strURL">Address of the scanned page; recorded in an XML comment.</param>
    /// <param name="alHyperLinks">Strings to write; null is treated as no links.</param>
    private static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // "--" is not allowed inside an XML comment and XmlTextWriter rejects
        // it, so put a space between adjacent hyphens of the page address.
        string commentUrl = AdjacentHyphenRegex.Replace(strURL ?? "", "- ");

        // The using block closes (and thereby flushes) the file on every path.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + commentUrl + "的超链接");

            // Outer and inner element share the name "HyperLinks"; the inner
            // one carries the timestamp attribute and the link elements.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            if (alHyperLinks != null)
            {
                foreach (string link in alHyperLinks)
                {
                    writer.WriteElementString(GetDomain(link), null, link);
                }
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL's host.
    /// </summary>
    /// <param name="strURL">URL to classify.</param>
    /// <returns>
    /// "com", "net", "cn", "org" or "gov" (in the casing used by the URL) when
    /// the host ends with that suffix; otherwise "other".
    /// </returns>
    private static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        // Only the host is inspected, and a URL without a trailing slash
        // ("http://example.com") is recognised as well.
        Match m = DomainSuffixRegex.Match(strURL);
        return m.Success ? m.Groups[1].Value : "other";
    }
}
// Originally published on the CSDN blog; when reposting, please cite the source: http://blog.csdn.net/21aspnet/archive/2007/03/24/1540012.aspx
 
                     
                    
                 
                    
                 
 
                
            
         
// Web page footer (site registration number): 浙公网安备 33010602011771号