C#的一个URL加载器,能处理编码、相对地址解析、GET/POST、HTML的include、页面重定向
要让程序模拟浏览器上网,首先要解决的是HTTP请求与响应的处理。下面这个URL加载器功能比较完整:它处理了响应的字符编码、URL相对地址的解析(参见RFC 3986),支持GET和POST两种方式提交数据,能在客户端展开HTML中的<!--include-->指令,还能跟随<head>中声明的页面重定向。
// Matches HTML comments that look like include directives,
// e.g. <!--#include file="part.html"-->. Built once and shared across calls.
private static readonly Regex IncludeDirectiveRegex = new Regex(@"<!--\W*include.*?-->", RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.Singleline);

// Extracts the FIRST double-quoted value inside an include directive.
// [^"]* (instead of a greedy .*) keeps the match from running on to the
// closing quote of a later attribute.
private static readonly Regex QuotedValueRegex = new Regex("(?<=\")[^\"]*(?=\")", RegexOptions.Compiled);

/// <summary>
/// Core URL loader. Per the original author's note, the other <c>LoadUrl</c>
/// overloads delegate to this one.
/// Issues an HTTP GET (when <paramref name="postdata"/> is null or empty) or an
/// HTTP POST, decodes the response body, optionally expands include directives
/// found in the page, and follows page-level redirections reported by
/// <c>GetRedirection</c>.
/// </summary>
/// <param name="uo">
/// Describes the URL to load. On success its <c>Url</c> is overwritten with the
/// URI that actually answered the request; when a page-level redirection is
/// followed, the reference is replaced with <c>uo.Forward(redirection)</c>.
/// </param>
/// <param name="encoding">
/// Name of the character encoding used to decode the response body. Falls back
/// to UTF-8 when the name cannot be resolved.
/// </param>
/// <param name="postdata">
/// Form data to POST (sent as ASCII, <c>application/x-www-form-urlencoded</c>).
/// Null or empty selects HTTP GET.
/// </param>
/// <param name="include">Whether include directives are expanded on the client side.</param>
/// <param name="redirectioncounter">
/// Number of page-level redirections already followed; no further redirection
/// is followed once this reaches 5.
/// </param>
/// <returns>
/// The decoded (and possibly expanded) page text, or an empty string when the
/// request cannot be created, sent, or read.
/// </returns>
public static string LoadUrl(ref UrlOperation uo, string encoding, string postdata, bool include, int redirectioncounter)
{
    const int MaxRedirections = 5;

    string url = uo.Url;
    if (postdata == null)
    {
        postdata = "";
    }

    // Resolve the decoding once so GET and POST treat the response identically.
    Encoding source;
    try
    {
        source = Encoding.GetEncoding(encoding);
    }
    catch
    {
        // Unknown or null encoding name: fall back to UTF-8.
        source = Encoding.UTF8;
    }

    string str;
    string finalUrl;
    try
    {
        HttpWebRequest request = (HttpWebRequest)System.Net.HttpWebRequest.Create(url);

        if (postdata.Length == 0)
        {
            // HTTP GET
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98)";
        }
        else
        {
            // HTTP POST
            byte[] bytes = new ASCIIEncoding().GetBytes(postdata);

            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            // Content-Length is the number of bytes written, not the number of characters.
            request.ContentLength = bytes.Length;

            using (Stream poststream = request.GetRequestStream())
            {
                poststream.Write(bytes, 0, bytes.Length);
            }
        }

        // Timeouts surface from GetResponse().
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(stream, source))
        {
            str = sr.ReadToEnd();
            // Capture the final URI while the response is still open: its
            // properties are not safe to read after Close()/Dispose().
            finalUrl = response.ResponseUri.ToString();
        }
    }
    catch
    {
        // Deliberate best-effort contract: any failure to create, send, or
        // read the request yields an empty page rather than an exception.
        return "";
    }

    uo.Url = finalUrl;

    // Expand include directives on the client side.
    if (include)
    {
        MatchCollection mc = IncludeDirectiveRegex.Matches(str);
        if (mc.Count > 0)
        {
            StringBuilder sb = new StringBuilder();
            int copiedUpTo = 0;

            foreach (Match directive in mc)
            {
                // Text between the previous directive and this one.
                sb.Append(str, copiedUpTo, directive.Index - copiedUpTo);
                copiedUpTo = directive.Index + directive.Length;

                string newurl = QuotedValueRegex.Match(directive.Value).Value;
                if (newurl.Length == 0)
                {
                    // Include-like comment without a quoted target: there is
                    // nothing to fetch, so leave the comment in place.
                    sb.Append(directive.Value);
                    continue;
                }

                UrlOperation newuo = uo.Forward(newurl);
                sb.Append(LoadUrl(ref newuo, encoding, "", true));
            }

            // Trailing text after the last directive.
            sb.Append(str, copiedUpTo, str.Length - copiedUpTo);
            str = sb.ToString();
        }
    }

    // Page-level redirection.
    string redirection = GetRedirection(str).Trim();
    if (redirection.Length > 0 && redirectioncounter < MaxRedirections)
    {
        uo = uo.Forward(redirection);
        return LoadUrl(ref uo, encoding, postdata, include, redirectioncounter + 1);
    }

    return str;
}



浙公网安备 33010602011771号