Asp.net 数据采集基类(远程抓取,分解,保存,匹配)
昨天因为接到做采集程序的任务,之前倒是用过不少采集程序,但从没有自己动手做过,
要做起来还真有点无从下手.但任务下来就得完成.马上开始google,baidu...
搜出来的最多的就是我要发的这个基类...好东西..但网上到处都是.都不知道原创是哪的了.
反正很谢谢写这个类的大哥.帮了我的大忙.呵呵....
现在把这个类发这里一是给自己备忘,二是让有需要的人尽可能的搜到..
1
using System;
2
using System.Data;
3
using System.Configuration;
4
using System.Web;
5
using System.Web.Security;
6
using System.Web.UI;
7
using System.Web.UI.WebControls;
8
using System.Web.UI.WebControls.WebParts;
9
using System.Web.UI.HtmlControls;
10
using MSXML2; //这个需要添加引用MSXML2
11
using System.Text.RegularExpressions;
12
namespace EC
{
    /// <summary>
    /// Helper for scraping remote content: downloads pages and files through
    /// MSXML2.XMLHTTP and offers small string/regex utilities for extracting
    /// links, image addresses and other fragments from the fetched HTML.
    /// </summary>
    /// <remarks>
    /// The class holds no unmanaged state of its own; <see cref="Dispose"/> exists
    /// so that existing callers (and <c>using</c> blocks) keep working.
    /// </remarks>
    public class GetRemoteObj : IDisposable
    {
        #region Shared state

        // One generator for all RemoteSave calls: a fresh time-seeded Random per call
        // can repeat its sequence when calls happen within the same clock tick.
        private static readonly Random SharedRandom = new Random();
        private static readonly object RandomLock = new object();

        // Any markup tag, including tags whose attributes span several lines.
        private static readonly Regex TagPattern =
            new Regex("<[^>]+>", RegexOptions.Compiled);

        // href attribute with a double-quoted, single-quoted or unquoted value.
        private static readonly Regex HrefPattern =
            new Regex(@"href\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s>'""]+))",
                      RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // A complete <img ...> tag, in any letter case.
        private static readonly Regex ImgTagPattern =
            new Regex(@"<img\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // src attribute (not data-src and similar) with a quoted or unquoted value.
        private static readonly Regex SrcPattern =
            new Regex(@"(?<![\w-])src\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s>'""]+))",
                      RegexOptions.IgnoreCase | RegexOptions.Compiled);

        #endregion

        #region Construction and disposal

        /// <summary>
        /// Creates a new scraper helper. No initialization is required.
        /// </summary>
        public GetRemoteObj()
        {
        }

        /// <summary>
        /// Releases the helper. The class owns no resources, so this only tells the
        /// GC that no finalization is needed (relevant for derived types).
        /// </summary>
        public void Dispose()
        {
            GC.SuppressFinalize(this);
        }

        #endregion

        #region Date-based random name

        /// <summary>
        /// Builds a name from the current local time (<c>yyyyMMddHHmmss</c>)
        /// followed by a three-digit random number (100-999).
        /// </summary>
        /// <param name="ra">Random number generator supplying the suffix.</param>
        /// <returns>A 17-character numeric string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="ra"/> is null.</exception>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string s = o.DateRndName(new Random());
        /// </example>
        public string DateRndName(Random ra)
        {
            if (ra == null)
            {
                throw new ArgumentNullException("ra");
            }

            // Invariant culture keeps the Gregorian calendar regardless of the thread culture.
            string stamp = DateTime.Now.ToString(
                "yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

            // The upper bound of Random.Next is exclusive, so 1000 is needed to allow 999.
            int suffix = ra.Next(100, 1000);
            return stamp + suffix.ToString(System.Globalization.CultureInfo.InvariantCulture);
        }

        #endregion

        #region File extension

        /// <summary>
        /// Returns the extension (without the dot) of a file name or URL.
        /// </summary>
        /// <param name="filename">File name or URL, e.g. <c>http://www.baidu.com/img/logo.gif</c>.</param>
        /// <returns>
        /// The text after the last dot of the last path segment, ignoring any query
        /// string or fragment; <c>null</c> when the input is null/empty or the last
        /// segment contains no dot.
        /// </returns>
        public string GetFileExtends(string filename)
        {
            if (string.IsNullOrEmpty(filename))
            {
                return null;
            }

            // Drop "?query" / "#fragment" so dots inside them are not mistaken for the extension.
            string path = filename;
            int cut = path.IndexOfAny(new char[] { '?', '#' });
            if (cut >= 0)
            {
                path = path.Substring(0, cut);
            }

            // Only the last segment may carry the extension; dots in the host or
            // directory names must not leak path separators into the result.
            int slash = path.LastIndexOfAny(new char[] { '/', '\\' });
            string segment = path.Substring(slash + 1);

            int dot = segment.LastIndexOf('.');
            if (dot < 0)
            {
                return null;
            }

            return segment.Substring(dot + 1);
        }

        #endregion

        #region Remote download

        /// <summary>
        /// Downloads a remote resource and returns its body decoded with the
        /// system default ANSI encoding.
        /// </summary>
        /// <param name="Url">Address of the remote resource.</param>
        /// <returns>The response text, or an empty string when no body was received.</returns>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string html = o.GetRemoteHtmlCode("http://www.baidu.com/");
        /// </example>
        public string GetRemoteHtmlCode(string Url)
        {
            string statusText;
            byte[] body = FetchBody(Url, out statusText);
            if (body == null || body.Length == 0)
            {
                return "";
            }

            // NOTE(review): Encoding.Default is kept from the original; pages served in
            // another charset (e.g. UTF-8) will be mis-decoded - confirm with callers.
            return System.Text.Encoding.Default.GetString(body);
        }

        /// <summary>
        /// Downloads a remote file and stores it in <paramref name="Path"/> under a
        /// generated name (<see cref="DateRndName"/> plus the extension taken from the URL).
        /// </summary>
        /// <param name="Url">Address of the remote file.</param>
        /// <param name="Path">Directory to save into.</param>
        /// <returns>The generated file name (without the directory).</returns>
        /// <exception cref="InvalidOperationException">The request did not complete.</exception>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string name = o.RemoteSave("http://www.baidu.com/img/logo.gif", Server.MapPath("Html/"));
        /// </example>
        public string RemoteSave(string Url, string Path)
        {
            string statusText;
            byte[] body = FetchBody(Url, out statusText);
            if (body == null)
            {
                throw new InvalidOperationException(statusText);
            }

            string extension = GetFileExtends(Url);
            string suffix = string.IsNullOrEmpty(extension) ? "" : "." + extension;
            string directory = Path ?? "";

            string fileName;
            string fullPath;
            do
            {
                lock (RandomLock)
                {
                    fileName = DateRndName(SharedRandom) + suffix;
                }

                // Path.Combine copes with a directory given with or without a trailing separator.
                fullPath = System.IO.Path.Combine(directory, fileName);
            }
            while (System.IO.File.Exists(fullPath)); // never overwrite an earlier download

            System.IO.File.WriteAllBytes(fullPath, body);
            return fileName;
        }

        /// <summary>
        /// Performs a synchronous GET through MSXML2.XMLHTTP and returns the raw body.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="statusText">Status text of the request when it did not complete; otherwise null.</param>
        /// <returns>The body bytes (possibly empty), or null when the request did not complete.</returns>
        private static byte[] FetchBody(string url, out string statusText)
        {
            MSXML2.XMLHTTP http = new MSXML2.XMLHTTPClass();
            try
            {
                http.open("GET", url, false, null, null);
                http.send("");

                if (http.readyState != 4)
                {
                    statusText = http.statusText;
                    return null;
                }

                statusText = null;
                byte[] body = http.responseBody as byte[];
                return body ?? new byte[0];
            }
            finally
            {
                // Release the COM object deterministically instead of waiting for the GC.
                System.Runtime.InteropServices.Marshal.ReleaseComObject(http);
            }
        }

        #endregion

        #region Strip line breaks and quotes

        /// <summary>
        /// Removes double quotes and all line-break characters (CR and LF) from HTML source.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>The cleaned text; an empty string for null or empty input.</returns>
        public string ReplaceEnter(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            // CR and LF are removed separately so Unix (LF-only) pages are flattened too.
            return HtmlCode.Replace("\"", "").Replace("\r", "").Replace("\n", "");
        }

        #endregion

        #region Regex value extraction

        /// <summary>
        /// Returns the first match of a regular expression in the given text.
        /// </summary>
        /// <param name="RegexString">Regular expression, e.g. <c>&lt;title&gt;.+?&lt;/title&gt;</c>.</param>
        /// <param name="RemoteStr">Text (typically HTML source) to search.</param>
        /// <returns>The matched text, or an empty string when there is no match or an argument is null.</returns>
        public string GetRegValue(string RegexString, string RemoteStr)
        {
            if (string.IsNullOrEmpty(RegexString) || RemoteStr == null)
            {
                return "";
            }

            Match found = Regex.Match(RemoteStr, RegexString);
            return found.Success ? found.Value : "";
        }

        #endregion

        #region Strip HTML tags

        /// <summary>
        /// Removes all markup tags from HTML source, leaving only the text between them.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>The text without tags; an empty string for null or empty input.</returns>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string text = o.RemoveHTML("&lt;p&gt;hello&lt;/p&gt;"); // "hello"
        /// </example>
        public string RemoveHTML(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            return TagPattern.Replace(HtmlCode, "");
        }

        #endregion

        #region Link extraction

        /// <summary>
        /// Collects the values of all <c>href</c> attributes found in HTML source.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>
        /// The link targets in document order, each followed by <c>||</c>; letter case
        /// is preserved and surrounding quotes are not included.
        /// </returns>
        public string GetHref(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            System.Text.StringBuilder links = new System.Text.StringBuilder();
            foreach (Match hit in HrefPattern.Matches(HtmlCode))
            {
                string target = FirstCapturedValue(hit);
                if (target.Length > 0)
                {
                    links.Append(target).Append("||");
                }
            }

            return links.ToString();
        }

        #endregion

        #region Image address extraction

        /// <summary>
        /// Collects the addresses of all images (<c>&lt;img&gt;</c> tags) in HTML source.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <param name="imgHttp">
        /// Prefix added to relative addresses, e.g. <c>http://www.baidu.com/</c> for
        /// <c>&lt;img src="bb/x.gif"&gt;</c>; may be null or empty.
        /// </param>
        /// <returns>The image addresses in document order, each followed by <c>||</c>.</returns>
        public string GetImgSrc(string HtmlCode, string imgHttp)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            System.Text.StringBuilder images = new System.Text.StringBuilder();
            foreach (Match tag in ImgTagPattern.Matches(HtmlCode))
            {
                string address = GetImg(tag.Value, imgHttp);
                if (address.Length > 0)
                {
                    images.Append(address).Append("||");
                }
            }

            return images.ToString();
        }

        /// <summary>
        /// Extracts the image address from a single <c>&lt;img src="..." /&gt;</c> tag.
        /// </summary>
        /// <param name="ImgString">The <c>&lt;img&gt;</c> tag text.</param>
        /// <param name="imgHttp">Prefix added when the address is relative; may be null or empty.</param>
        /// <returns>
        /// The address with its original letter case; an empty string when the tag has
        /// no usable <c>src</c> attribute. Addresses that already start with
        /// <c>http://</c>, <c>https://</c> or <c>//</c> are returned without the prefix.
        /// </returns>
        public string GetImg(string ImgString, string imgHttp)
        {
            if (string.IsNullOrEmpty(ImgString))
            {
                return "";
            }

            Match hit = SrcPattern.Match(ImgString);
            if (!hit.Success)
            {
                return "";
            }

            string address = FirstCapturedValue(hit);
            if (address.Length == 0)
            {
                return "";
            }

            if (string.IsNullOrEmpty(imgHttp) || IsAbsoluteUrl(address))
            {
                return address;
            }

            return imgHttp + address;
        }

        /// <summary>
        /// Returns the trimmed value of the first capture group that took part in the match.
        /// </summary>
        private static string FirstCapturedValue(Match match)
        {
            for (int i = 1; i < match.Groups.Count; i++)
            {
                if (match.Groups[i].Success)
                {
                    return match.Groups[i].Value.Trim();
                }
            }

            return "";
        }

        /// <summary>
        /// Tells whether an address already names its host (http, https or protocol-relative).
        /// </summary>
        private static bool IsAbsoluteUrl(string address)
        {
            return address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                || address.StartsWith("//", StringComparison.Ordinal);
        }

        #endregion

        #region Strip delimiter strings from a regex result

        /// <summary>
        /// Removes the delimiter strings that a regex match still carries, e.g. the
        /// <c>&lt;title&gt;</c> and <c>&lt;/title&gt;</c> around a matched title.
        /// Every occurrence of each delimiter is removed.
        /// </summary>
        /// <param name="RegValue">The matched text to clean.</param>
        /// <param name="regStart">Leading delimiter to remove; ignored when null or empty.</param>
        /// <param name="regEnd">Trailing delimiter to remove; ignored when null or empty.</param>
        /// <returns>The cleaned text; null or empty input is returned unchanged.</returns>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string s = o.RegReplace("&lt;title&gt;Hi&lt;/title&gt;", "&lt;title&gt;", "&lt;/title&gt;"); // "Hi"
        /// </example>
        public string RegReplace(string RegValue, string regStart, string regEnd)
        {
            if (string.IsNullOrEmpty(RegValue))
            {
                return RegValue;
            }

            string cleaned = RegValue;
            if (!string.IsNullOrEmpty(regStart))
            {
                cleaned = cleaned.Replace(regStart, "");
            }

            if (!string.IsNullOrEmpty(regEnd))
            {
                cleaned = cleaned.Replace(regEnd, "");
            }

            return cleaned;
        }

        #endregion
    }
}
400
using System;2
using System.Data;3
using System.Configuration;4
using System.Web;5
using System.Web.Security;6
using System.Web.UI;7
using System.Web.UI.WebControls;8
using System.Web.UI.WebControls.WebParts;9
using System.Web.UI.HtmlControls;10
using MSXML2; //这个需要添加引用MSXML211
using System.Text.RegularExpressions;12
namespace EC
{
    /// <summary>
    /// Helper for scraping remote content: downloads pages and files through
    /// MSXML2.XMLHTTP and offers small string/regex utilities for extracting
    /// links, image addresses and other fragments from the fetched HTML.
    /// </summary>
    /// <remarks>
    /// The class holds no unmanaged state of its own; <see cref="Dispose"/> exists
    /// so that existing callers (and <c>using</c> blocks) keep working.
    /// </remarks>
    public class GetRemoteObj : IDisposable
    {
        #region Shared state

        // One generator for all RemoteSave calls: a fresh time-seeded Random per call
        // can repeat its sequence when calls happen within the same clock tick.
        private static readonly Random SharedRandom = new Random();
        private static readonly object RandomLock = new object();

        // Any markup tag, including tags whose attributes span several lines.
        private static readonly Regex TagPattern =
            new Regex("<[^>]+>", RegexOptions.Compiled);

        // href attribute with a double-quoted, single-quoted or unquoted value.
        private static readonly Regex HrefPattern =
            new Regex(@"href\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s>'""]+))",
                      RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // A complete <img ...> tag, in any letter case.
        private static readonly Regex ImgTagPattern =
            new Regex(@"<img\b[^>]*>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // src attribute (not data-src and similar) with a quoted or unquoted value.
        private static readonly Regex SrcPattern =
            new Regex(@"(?<![\w-])src\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s>'""]+))",
                      RegexOptions.IgnoreCase | RegexOptions.Compiled);

        #endregion

        #region Construction and disposal

        /// <summary>
        /// Creates a new scraper helper. No initialization is required.
        /// </summary>
        public GetRemoteObj()
        {
        }

        /// <summary>
        /// Releases the helper. The class owns no resources, so this only tells the
        /// GC that no finalization is needed (relevant for derived types).
        /// </summary>
        public void Dispose()
        {
            GC.SuppressFinalize(this);
        }

        #endregion

        #region Date-based random name

        /// <summary>
        /// Builds a name from the current local time (<c>yyyyMMddHHmmss</c>)
        /// followed by a three-digit random number (100-999).
        /// </summary>
        /// <param name="ra">Random number generator supplying the suffix.</param>
        /// <returns>A 17-character numeric string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="ra"/> is null.</exception>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string s = o.DateRndName(new Random());
        /// </example>
        public string DateRndName(Random ra)
        {
            if (ra == null)
            {
                throw new ArgumentNullException("ra");
            }

            // Invariant culture keeps the Gregorian calendar regardless of the thread culture.
            string stamp = DateTime.Now.ToString(
                "yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

            // The upper bound of Random.Next is exclusive, so 1000 is needed to allow 999.
            int suffix = ra.Next(100, 1000);
            return stamp + suffix.ToString(System.Globalization.CultureInfo.InvariantCulture);
        }

        #endregion

        #region File extension

        /// <summary>
        /// Returns the extension (without the dot) of a file name or URL.
        /// </summary>
        /// <param name="filename">File name or URL, e.g. <c>http://www.baidu.com/img/logo.gif</c>.</param>
        /// <returns>
        /// The text after the last dot of the last path segment, ignoring any query
        /// string or fragment; <c>null</c> when the input is null/empty or the last
        /// segment contains no dot.
        /// </returns>
        public string GetFileExtends(string filename)
        {
            if (string.IsNullOrEmpty(filename))
            {
                return null;
            }

            // Drop "?query" / "#fragment" so dots inside them are not mistaken for the extension.
            string path = filename;
            int cut = path.IndexOfAny(new char[] { '?', '#' });
            if (cut >= 0)
            {
                path = path.Substring(0, cut);
            }

            // Only the last segment may carry the extension; dots in the host or
            // directory names must not leak path separators into the result.
            int slash = path.LastIndexOfAny(new char[] { '/', '\\' });
            string segment = path.Substring(slash + 1);

            int dot = segment.LastIndexOf('.');
            if (dot < 0)
            {
                return null;
            }

            return segment.Substring(dot + 1);
        }

        #endregion

        #region Remote download

        /// <summary>
        /// Downloads a remote resource and returns its body decoded with the
        /// system default ANSI encoding.
        /// </summary>
        /// <param name="Url">Address of the remote resource.</param>
        /// <returns>The response text, or an empty string when no body was received.</returns>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string html = o.GetRemoteHtmlCode("http://www.baidu.com/");
        /// </example>
        public string GetRemoteHtmlCode(string Url)
        {
            string statusText;
            byte[] body = FetchBody(Url, out statusText);
            if (body == null || body.Length == 0)
            {
                return "";
            }

            // NOTE(review): Encoding.Default is kept from the original; pages served in
            // another charset (e.g. UTF-8) will be mis-decoded - confirm with callers.
            return System.Text.Encoding.Default.GetString(body);
        }

        /// <summary>
        /// Downloads a remote file and stores it in <paramref name="Path"/> under a
        /// generated name (<see cref="DateRndName"/> plus the extension taken from the URL).
        /// </summary>
        /// <param name="Url">Address of the remote file.</param>
        /// <param name="Path">Directory to save into.</param>
        /// <returns>The generated file name (without the directory).</returns>
        /// <exception cref="InvalidOperationException">The request did not complete.</exception>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string name = o.RemoteSave("http://www.baidu.com/img/logo.gif", Server.MapPath("Html/"));
        /// </example>
        public string RemoteSave(string Url, string Path)
        {
            string statusText;
            byte[] body = FetchBody(Url, out statusText);
            if (body == null)
            {
                throw new InvalidOperationException(statusText);
            }

            string extension = GetFileExtends(Url);
            string suffix = string.IsNullOrEmpty(extension) ? "" : "." + extension;
            string directory = Path ?? "";

            string fileName;
            string fullPath;
            do
            {
                lock (RandomLock)
                {
                    fileName = DateRndName(SharedRandom) + suffix;
                }

                // Path.Combine copes with a directory given with or without a trailing separator.
                fullPath = System.IO.Path.Combine(directory, fileName);
            }
            while (System.IO.File.Exists(fullPath)); // never overwrite an earlier download

            System.IO.File.WriteAllBytes(fullPath, body);
            return fileName;
        }

        /// <summary>
        /// Performs a synchronous GET through MSXML2.XMLHTTP and returns the raw body.
        /// </summary>
        /// <param name="url">Address to request.</param>
        /// <param name="statusText">Status text of the request when it did not complete; otherwise null.</param>
        /// <returns>The body bytes (possibly empty), or null when the request did not complete.</returns>
        private static byte[] FetchBody(string url, out string statusText)
        {
            MSXML2.XMLHTTP http = new MSXML2.XMLHTTPClass();
            try
            {
                http.open("GET", url, false, null, null);
                http.send("");

                if (http.readyState != 4)
                {
                    statusText = http.statusText;
                    return null;
                }

                statusText = null;
                byte[] body = http.responseBody as byte[];
                return body ?? new byte[0];
            }
            finally
            {
                // Release the COM object deterministically instead of waiting for the GC.
                System.Runtime.InteropServices.Marshal.ReleaseComObject(http);
            }
        }

        #endregion

        #region Strip line breaks and quotes

        /// <summary>
        /// Removes double quotes and all line-break characters (CR and LF) from HTML source.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>The cleaned text; an empty string for null or empty input.</returns>
        public string ReplaceEnter(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            // CR and LF are removed separately so Unix (LF-only) pages are flattened too.
            return HtmlCode.Replace("\"", "").Replace("\r", "").Replace("\n", "");
        }

        #endregion

        #region Regex value extraction

        /// <summary>
        /// Returns the first match of a regular expression in the given text.
        /// </summary>
        /// <param name="RegexString">Regular expression, e.g. <c>&lt;title&gt;.+?&lt;/title&gt;</c>.</param>
        /// <param name="RemoteStr">Text (typically HTML source) to search.</param>
        /// <returns>The matched text, or an empty string when there is no match or an argument is null.</returns>
        public string GetRegValue(string RegexString, string RemoteStr)
        {
            if (string.IsNullOrEmpty(RegexString) || RemoteStr == null)
            {
                return "";
            }

            Match found = Regex.Match(RemoteStr, RegexString);
            return found.Success ? found.Value : "";
        }

        #endregion

        #region Strip HTML tags

        /// <summary>
        /// Removes all markup tags from HTML source, leaving only the text between them.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>The text without tags; an empty string for null or empty input.</returns>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string text = o.RemoveHTML("&lt;p&gt;hello&lt;/p&gt;"); // "hello"
        /// </example>
        public string RemoveHTML(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            return TagPattern.Replace(HtmlCode, "");
        }

        #endregion

        #region Link extraction

        /// <summary>
        /// Collects the values of all <c>href</c> attributes found in HTML source.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <returns>
        /// The link targets in document order, each followed by <c>||</c>; letter case
        /// is preserved and surrounding quotes are not included.
        /// </returns>
        public string GetHref(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            System.Text.StringBuilder links = new System.Text.StringBuilder();
            foreach (Match hit in HrefPattern.Matches(HtmlCode))
            {
                string target = FirstCapturedValue(hit);
                if (target.Length > 0)
                {
                    links.Append(target).Append("||");
                }
            }

            return links.ToString();
        }

        #endregion

        #region Image address extraction

        /// <summary>
        /// Collects the addresses of all images (<c>&lt;img&gt;</c> tags) in HTML source.
        /// </summary>
        /// <param name="HtmlCode">HTML source.</param>
        /// <param name="imgHttp">
        /// Prefix added to relative addresses, e.g. <c>http://www.baidu.com/</c> for
        /// <c>&lt;img src="bb/x.gif"&gt;</c>; may be null or empty.
        /// </param>
        /// <returns>The image addresses in document order, each followed by <c>||</c>.</returns>
        public string GetImgSrc(string HtmlCode, string imgHttp)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            System.Text.StringBuilder images = new System.Text.StringBuilder();
            foreach (Match tag in ImgTagPattern.Matches(HtmlCode))
            {
                string address = GetImg(tag.Value, imgHttp);
                if (address.Length > 0)
                {
                    images.Append(address).Append("||");
                }
            }

            return images.ToString();
        }

        /// <summary>
        /// Extracts the image address from a single <c>&lt;img src="..." /&gt;</c> tag.
        /// </summary>
        /// <param name="ImgString">The <c>&lt;img&gt;</c> tag text.</param>
        /// <param name="imgHttp">Prefix added when the address is relative; may be null or empty.</param>
        /// <returns>
        /// The address with its original letter case; an empty string when the tag has
        /// no usable <c>src</c> attribute. Addresses that already start with
        /// <c>http://</c>, <c>https://</c> or <c>//</c> are returned without the prefix.
        /// </returns>
        public string GetImg(string ImgString, string imgHttp)
        {
            if (string.IsNullOrEmpty(ImgString))
            {
                return "";
            }

            Match hit = SrcPattern.Match(ImgString);
            if (!hit.Success)
            {
                return "";
            }

            string address = FirstCapturedValue(hit);
            if (address.Length == 0)
            {
                return "";
            }

            if (string.IsNullOrEmpty(imgHttp) || IsAbsoluteUrl(address))
            {
                return address;
            }

            return imgHttp + address;
        }

        /// <summary>
        /// Returns the trimmed value of the first capture group that took part in the match.
        /// </summary>
        private static string FirstCapturedValue(Match match)
        {
            for (int i = 1; i < match.Groups.Count; i++)
            {
                if (match.Groups[i].Success)
                {
                    return match.Groups[i].Value.Trim();
                }
            }

            return "";
        }

        /// <summary>
        /// Tells whether an address already names its host (http, https or protocol-relative).
        /// </summary>
        private static bool IsAbsoluteUrl(string address)
        {
            return address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase)
                || address.StartsWith("//", StringComparison.Ordinal);
        }

        #endregion

        #region Strip delimiter strings from a regex result

        /// <summary>
        /// Removes the delimiter strings that a regex match still carries, e.g. the
        /// <c>&lt;title&gt;</c> and <c>&lt;/title&gt;</c> around a matched title.
        /// Every occurrence of each delimiter is removed.
        /// </summary>
        /// <param name="RegValue">The matched text to clean.</param>
        /// <param name="regStart">Leading delimiter to remove; ignored when null or empty.</param>
        /// <param name="regEnd">Trailing delimiter to remove; ignored when null or empty.</param>
        /// <returns>The cleaned text; null or empty input is returned unchanged.</returns>
        /// <example>
        /// GetRemoteObj o = new GetRemoteObj();
        /// string s = o.RegReplace("&lt;title&gt;Hi&lt;/title&gt;", "&lt;title&gt;", "&lt;/title&gt;"); // "Hi"
        /// </example>
        public string RegReplace(string RegValue, string regStart, string regEnd)
        {
            if (string.IsNullOrEmpty(RegValue))
            {
                return RegValue;
            }

            string cleaned = RegValue;
            if (!string.IsNullOrEmpty(regStart))
            {
                cleaned = cleaned.Replace(regStart, "");
            }

            if (!string.IsNullOrEmpty(regEnd))
            {
                cleaned = cleaned.Replace(regEnd, "");
            }

            return cleaned;
        }

        #endregion
    }
}

浙公网安备 33010602011771号