Get All URLs on a Page

链接：http://www.csharphelp.com/archives4/archive693.html

Usage:

GetUrls urls = new GetUrls();
urls.RetrieveUrls("http://www.microsoft.com");

The class is listed below. Have fun!

//required namespaces
6

using System;
7

using System.Collections.Generic;
8

using System.Text;
9

using System.Net;
10

using System.IO;
11

using System.Text.RegularExpressions;
12

namespace FindAllUrls
{
    // Downloads a web page and reports every href target found in its markup,
    // writing each match to the console and appending it to a log file.
    class GetUrls
    {
        // Log file that receives one entry per match. Written as a verbatim
        // string: the original literal "C:\matchlog.txt" contained the invalid
        // escape sequence \m and did not compile.
        private const string LogFile = @"C:\matchlog.txt";

        // NOTE(review): only the tail of this pattern -- .*?)(?:[\s>""']) --
        // survived in the published listing. The leading part (the "href ="
        // prefix, optional opening quote, and the negative lookahead that skips
        // anchors, mailto, script and stylesheet links) is a reconstruction and
        // must be confirmed against the original article.
        // Group 1 captures the link target up to the first whitespace, '>' or quote.
        private const string HrefPattern =
            @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location\.|javascript|.*css|.*this\.)(.*?)(?:[\s>""'])";

        // Built once and reused; the pattern never changes between calls.
        private static readonly Regex HrefRegex = new Regex(HrefPattern, RegexOptions.IgnoreCase);

        // Public entry point: fetches webPage and reports every link found in it.
        // Any exception raised while downloading or logging propagates to the caller.
        public void RetrieveUrls(string webPage)
        {
            GetAllUrls(RetrieveContent(webPage));
        }

        // Returns the body of the page at webPage as a single string.
        // The using blocks release the response and the reader on every path.
        // The original finally block called Close() on variables that were still
        // null when the request itself failed, which replaced the real error with
        // a NullReferenceException, and its "throw ex;" discarded the stack trace.
        private string RetrieveContent(string webPage)
        {
            // Create a request object for the url passed in.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = 10000; // HttpWebRequest.Timeout is in milliseconds

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader respStream = new StreamReader(response.GetResponseStream()))
            {
                return respStream.ReadToEnd();
            }
        }

        // Scans content with HrefRegex and, for every match, reports both the
        // whole matched text (group 0) and the captured target (group 1) on the
        // console and in the log file.
        private void GetAllUrls(string content)
        {
            for (Match match = HrefRegex.Match(content); match.Success; match = match.NextMatch())
            {
                string wholeMatch = match.Groups[0].Value;
                string target = match.Groups[1].Value;

                Console.WriteLine("href match: " + wholeMatch);
                WriteToLog(LogFile, "href match: " + wholeMatch + "\r\n");

                Console.WriteLine("Url match: " + target);
                WriteToLog(LogFile, "Url | Location | mailto match: " + target + "\r\n");
            }
        }

        // Appends message to file, prefixed with the current local time.
        // The using block closes the writer, so no explicit Close() is needed.
        private void WriteToLog(string file, string message)
        {
            using (StreamWriter w = File.AppendText(file))
            {
                w.WriteLine(DateTime.Now.ToString() + ": " + message);
            }
        }
    }
}
97

posted @ 2008-05-19 14:15 大宋提刑官阅读(332) 评论(1) 收藏举报

刷新页面返回顶部

一品梅

Get All URLs on a Page

公告