Get All URLs on a Page
链接:http://www.csharphelp.com/archives4/archive693.html
1
// Example usage: create the helper, then let it fetch the page and report each link it finds.
GetUrls urls = new GetUrls();
urls.RetrieveUrls("http://www.microsoft.com");
2
3
4
The class is listed below. Have fun!
5
//required namespaces
6
using System;
7
using System.Collections.Generic;
8
using System.Text;
9
using System.Net;
10
using System.IO;
11
using System.Text.RegularExpressions;
12
13
14
namespace FindAllUrls
15
{
16
class GetUrls
17
{
18
19
/// <summary>
/// Public entry point: downloads <paramref name="webPage"/> and passes the
/// downloaded text to the link scanner.
/// </summary>
/// <param name="webPage">URL of the page whose links should be listed.</param>
public void RetrieveUrls( string webPage )
{
    // Step 1: fetch the page text.
    string pageContent = RetrieveContent(webPage);

    // Step 2: scan the text for href targets.
    GetAllUrls(pageContent);
}
24
25
/// <summary>
/// Downloads the page at <paramref name="webPage"/> and returns its body as a string.
/// </summary>
/// <param name="webPage">URL of the page to request.</param>
/// <returns>Everything read from the response stream.</returns>
/// <remarks>
/// Nothing is caught here: a failure while creating the request, getting the
/// response, or reading it propagates to the caller with its original stack trace.
/// </remarks>
private string RetrieveContent(string webPage)
{
    //create a request object using the url passed in
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
    request.Timeout = 10000; // HttpWebRequest.Timeout is expressed in milliseconds

    // The using blocks release the response and the reader on every path.
    // They are only entered once each object exists, so a failed request can
    // no longer trigger a NullReferenceException during cleanup that would
    // hide the real error.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (StreamReader respStream = new StreamReader(response.GetResponseStream()))
    {
        //get the contents of the page as a string and return it
        return respStream.ReadToEnd();
    }
}
56
57
/// <summary>
/// Finds every <c>href</c> attribute in <paramref name="content"/> and reports
/// both the whole match and the captured target.
/// </summary>
/// <param name="content">Page text to scan.</param>
/// <remarks>
/// Each match is written to the console and appended to <c>C:\matchlog.txt</c>.
/// The negative lookahead rejects targets that start with <c>#</c>,
/// <c>mailto</c>, <c>location</c> plus one character, or <c>javascript</c>,
/// and also any target followed later on the same line by <c>css</c> or <c>this.</c>.
/// </remarks>
private void GetAllUrls( string content )
{
    // Log path as a verbatim string: in a regular literal "\m" is an
    // unrecognised escape sequence and does not compile.
    const string logFile = @"C:\matchlog.txt";

    // The pattern must be a single-line literal. The capture group was split
    // across two lines after "(?", which is not a valid grouping construct.
    // NOTE(review): the group name looks like it was stripped from the listing
    // as if it were an HTML tag; "url" is a reconstruction. It is the only
    // capturing group, so it is also group number 1.
    string pattern = @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])";

    //Set up regex object
    Regex RegExpr = new Regex(pattern, RegexOptions.IgnoreCase);

    //walk the matches from first to last
    for (Match match = RegExpr.Match(content); match.Success; match = match.NextMatch())
    {
        // Group 0 is the whole matched text, from "href" through the closing delimiter.
        string wholeMatch = match.Groups[0].Value;
        Console.WriteLine("href match: " + wholeMatch);
        WriteToLog(logFile, "href match: " + wholeMatch + "\r\n");

        // The named group holds just the link target.
        string target = match.Groups["url"].Value;
        Console.WriteLine("Url match: " + target);
        WriteToLog(logFile, "Url | Location | mailto match: " + target + "\r\n");
    }
}
86
87
/// <summary>
/// Appends one timestamped line to a log file.
/// </summary>
/// <param name="file">Path of the log file to append to.</param>
/// <param name="message">Text written after the timestamp and ": ".</param>
private void WriteToLog(string file, string message)
{
    // Build the full line first, ending with the platform line terminator.
    string entry = DateTime.Now.ToString() + ": " + message + Environment.NewLine;

    // Opens the file, appends the text and closes the file in one call.
    File.AppendAllText(file, entry);
}
95
}
96
}
97
98
// Example usage: create the helper, then let it fetch the page and report each link it finds.
GetUrls urls = new GetUrls();
urls.RetrieveUrls("http://www.microsoft.com");

3

4
The class is listed below. Have fun!
//required namespaces
using System; 7
using System.Collections.Generic; 8
using System.Text; 9
using System.Net; 10
using System.IO; 11
using System.Text.RegularExpressions; 12

13

14
namespace FindAllUrls
{
    /// <summary>
    /// Downloads a web page and lists every <c>href</c> target found in it,
    /// writing each one to the console and to <c>C:\matchlog.txt</c>.
    /// </summary>
    class GetUrls
    {
        /// <summary>
        /// Public entry point: downloads <paramref name="webPage"/> and scans
        /// the downloaded text for links.
        /// </summary>
        /// <param name="webPage">URL of the page whose links should be listed.</param>
        public void RetrieveUrls( string webPage )
        {
            GetAllUrls(RetrieveContent(webPage));
        }

        /// <summary>
        /// Downloads the page at <paramref name="webPage"/> and returns its body as a string.
        /// </summary>
        /// <param name="webPage">URL of the page to request.</param>
        /// <returns>Everything read from the response stream.</returns>
        /// <remarks>
        /// Nothing is caught here: failures propagate to the caller with their
        /// original stack trace.
        /// </remarks>
        private string RetrieveContent(string webPage)
        {
            //create a request object using the url passed in
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
            request.Timeout = 10000; // HttpWebRequest.Timeout is expressed in milliseconds

            // The using blocks release the response and the reader on every
            // path and are only entered once each object exists, so cleanup
            // can no longer throw NullReferenceException after a failed request.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader respStream = new StreamReader(response.GetResponseStream()))
            {
                //get the contents of the page as a string and return it
                return respStream.ReadToEnd();
            }
        }

        /// <summary>
        /// Finds every <c>href</c> attribute in <paramref name="content"/> and
        /// reports both the whole match and the captured target.
        /// </summary>
        /// <param name="content">Page text to scan.</param>
        /// <remarks>
        /// The negative lookahead rejects targets that start with <c>#</c>,
        /// <c>mailto</c>, <c>location</c> plus one character, or
        /// <c>javascript</c>, and also any target followed later on the same
        /// line by <c>css</c> or <c>this.</c>.
        /// </remarks>
        private void GetAllUrls( string content )
        {
            // Verbatim string: in a regular literal "\m" is an unrecognised
            // escape sequence and does not compile.
            const string logFile = @"C:\matchlog.txt";

            // The pattern must be a single-line literal. The capture group
            // was split across two lines after "(?", which is not a valid
            // grouping construct.
            // NOTE(review): the group name looks like it was stripped from
            // the listing as if it were an HTML tag; "url" is a
            // reconstruction. It is the only capturing group, so it is also
            // group number 1.
            string pattern = @"(?:href\s*=)(?:[\s""']*)(?!#|mailto|location.|javascript|.*css|.*this\.)(?<url>.*?)(?:[\s>""'])";

            //Set up regex object
            Regex RegExpr = new Regex(pattern, RegexOptions.IgnoreCase);

            //walk the matches from first to last
            for (Match match = RegExpr.Match(content); match.Success; match = match.NextMatch())
            {
                // Group 0 is the whole matched text.
                string wholeMatch = match.Groups[0].Value;
                Console.WriteLine("href match: " + wholeMatch);
                WriteToLog(logFile, "href match: " + wholeMatch + "\r\n");

                // The named group holds just the link target.
                string target = match.Groups["url"].Value;
                Console.WriteLine("Url match: " + target);
                WriteToLog(logFile, "Url | Location | mailto match: " + target + "\r\n");
            }
        }

        /// <summary>
        /// Appends one timestamped line to a log file.
        /// </summary>
        /// <param name="file">Path of the log file to append to.</param>
        /// <param name="message">Text written after the timestamp and ": ".</param>
        private void WriteToLog(string file, string message)
        {
            // Disposing the writer at the end of the using block flushes and
            // closes it, so no explicit Close call is needed.
            using (StreamWriter w = File.AppendText(file))
            {
                w.WriteLine(DateTime.Now.ToString() + ": " + message);
            }
        }
    }
}

98




浙公网安备 33010602011771号