1 public class WebPage
2 {
3 #region Property
4 /// <summary>
5 /// Page address (URL).
6 /// </summary>
7 public string URL { get; set; }
8 /// <summary>
9 /// Domain name (host) of the page.
10 /// </summary>
11 public string Host { get; set; }
12 /// <summary>
13 /// Original (raw) HTML.
14 /// </summary>
15 public string OriginalHTML { get; set; }
16 /// <summary>
17 /// HTML after style blocks, script blocks and comments have been filtered out.
18 /// </summary>
19 public string HTML { get; set; }
20 /// <summary>
21 /// Title.
22 /// </summary>
23 public string Title { get; set; }
24 /// <summary>
25 /// Source.
26 /// </summary>
27 public string Source { get; set; }
28 /// <summary>
29 /// Publish date.
30 /// </summary>
31 public DateTime? PublishDate { get; set; }
32 /// <summary>
33 /// Author.
34 /// </summary>
35 public string Author { get; set; }
36 /// <summary>
37 /// Abstract (summary).
38 /// </summary>
39 public string Abstract { get; set; }
40 /// <summary>
41 /// Keywords.
42 /// </summary>
43 public string KeyWord { get; set; }
44 /// <summary>
45 /// Main body text.
46 /// </summary>
47 public string Content { get; set; }
48 #endregion
49
50 #region Constructor
/// <summary>
/// Creates an empty page. No property is assigned, so each keeps its default value.
/// </summary>
public WebPage()
{
}
/// <summary>
/// Creates a page from an HTML document: keeps the raw markup, stores a cleaned copy
/// (style/script/comment blocks removed) and extracts the title from the cleaned copy.
/// </summary>
/// <param name="html">HTML document. <c>null</c> is treated as an empty document.</param>
public WebPage(string html) : this()
{
    // Regex-based helpers reject null input, so normalise it once here.
    string document = html ?? string.Empty;

    // Keep the untouched markup; previously this property was never populated.
    this.OriginalHTML = document;
    // Filter style / script / comments.
    this.HTML = ClearHTML(document);
    // Title.
    this.Title = GetTitle(this.HTML);
}
/// <summary>
/// Creates a page from an HTML document and remembers the address it came from.
/// </summary>
/// <param name="html">HTML document, processed by the single-argument constructor.</param>
/// <param name="url">Page address; stored in <see cref="URL"/> and parsed to fill <see cref="Host"/>.</param>
public WebPage(string html, string url) : this(html)
{
    URL = url;

    // Parsing throws for a null or non-absolute address, exactly as before.
    Uri pageAddress = new Uri(url);
    Host = pageAddress.Host;
}
77 #endregion
78
79 #region public methods
80
/// <summary>
/// Downloads a page with HTTP GET. A missing scheme is completed with "http://".
/// </summary>
/// <param name="url">Page address, with or without a leading http:// or https://.</param>
/// <param name="cookie">Optional raw Cookie header value.</param>
/// <param name="throwError">When true, failures are thrown; when false, an empty string is returned instead.</param>
/// <param name="charset">Optional encoding name used to decode the body; empty means auto-detect.</param>
/// <returns>The decoded page text, or "" on failure when <paramref name="throwError"/> is false.</returns>
public static string GetHTML(string url, string cookie = "", bool throwError = false, string charset = "")
{
    // Address problems are reported through the same throwError contract as network
    // failures; previously a short, null or malformed url always threw.
    if (string.IsNullOrEmpty(url))
    {
        if (throwError)
        {
            throw new ArgumentException("A URL is required.", "url");
        }
        return "";
    }

    string address = url.Trim();
    // Compare against the full scheme prefix so hosts such as "httpbin.org" are not
    // mistaken for URLs that already carry a scheme.
    bool hasScheme = address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        address = "http://" + address;
    }

    Uri uri;
    if (!Uri.TryCreate(address, UriKind.Absolute, out uri))
    {
        if (throwError)
        {
            throw new UriFormatException("Invalid URL: " + url);
        }
        return "";
    }

    return GetHTML(uri, cookie, throwError, charset);
}
/// <summary>
/// Downloads a page with HTTP GET and decodes it to text.
/// </summary>
/// <param name="uri">Absolute http/https address of the page.</param>
/// <param name="cookie">Optional raw Cookie header value.</param>
/// <param name="throwError">When true, failures are rethrown; when false, an empty string is returned instead.</param>
/// <param name="charset">
/// Optional encoding name. When empty, the body is decoded as UTF-8 and, if a meta tag
/// declares a non-UTF charset, decoded again with that charset.
/// </param>
/// <returns>The decoded page text, or "" on failure when <paramref name="throwError"/> is false.</returns>
public static string GetHTML(Uri uri, string cookie = "", bool throwError = false, string charset = "")
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
        req.Method = "GET";
        req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*";
        req.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)";
        req.Timeout = 60 * 1000;
        if (!string.IsNullOrEmpty(cookie))
        {
            req.Headers.Add("Cookie", cookie);
        }

        // Dispose the response as well as its stream, and read in chunks instead of
        // one byte at a time.
        byte[] body;
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        using (Stream stream = res.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            body = buffer.ToArray();
        }

        if (!string.IsNullOrEmpty(charset))
        {
            return Encoding.GetEncoding(charset).GetString(body, 0, body.Length);
        }

        string html = Encoding.UTF8.GetString(body, 0, body.Length);
        string declared = Regex.Match(html, @"<meta.*?charset=""?([a-z0-9-]+)\b", RegexOptions.IgnoreCase).Groups[1].Value;
        // If a meta tag declares a charset that is not UTF, decode once more with it.
        if (declared != "" && declared.IndexOf("utf", StringComparison.OrdinalIgnoreCase) == -1)
        {
            try
            {
                html = Encoding.GetEncoding(declared).GetString(body, 0, body.Length);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 decoding (best effort).
            }
            catch (NotSupportedException)
            {
                // Charset not available on this platform: keep the UTF-8 decoding.
            }
        }
        return html;
    }
    catch (Exception)
    {
        if (throwError)
        {
            // Bare rethrow keeps the original stack trace ("throw ex;" resets it).
            throw;
        }
        return "";
    }
}
150
/// <summary>
/// Sends an HTTP POST to <paramref name="url"/>, using the URL's query string as the form body.
/// </summary>
/// <param name="url">Absolute address; everything after the first '?' is posted as the body.</param>
/// <param name="cookie">Optional raw Cookie header value.</param>
/// <param name="throwError">When true, failures are thrown; when false, an empty string is returned instead.</param>
/// <param name="chatset">Optional encoding name used to decode the response; empty means auto-detect. (Parameter name kept for compatibility.)</param>
/// <returns>The decoded response text, or "" on failure when <paramref name="throwError"/> is false.</returns>
public static string GetPostHTML(string url, string cookie = "", bool throwError = false, string chatset = "")
{
    // Address problems follow the throwError contract instead of always throwing.
    if (string.IsNullOrEmpty(url))
    {
        if (throwError)
        {
            throw new ArgumentException("A URL is required.", "url");
        }
        return "";
    }

    // Take everything after the first '?'; splitting on every '?' used to drop the
    // remainder of a body that itself contained a question mark.
    int queryStart = url.IndexOf('?');
    string postdata = queryStart >= 0 ? url.Substring(queryStart + 1) : "";

    Uri uri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
    {
        if (throwError)
        {
            throw new UriFormatException("Invalid URL: " + url);
        }
        return "";
    }

    return GetPostHTML(uri, postdata, cookie, throwError, chatset);
}
/// <summary>
/// Sends a form-encoded HTTP POST and decodes the response to text.
/// </summary>
/// <param name="uri">Absolute http/https address to post to.</param>
/// <param name="postdata">Request body; it is written using the gb2312 encoding.</param>
/// <param name="cookie">Optional raw Cookie header value.</param>
/// <param name="throwError">When true, failures are rethrown; when false, an empty string is returned instead.</param>
/// <param name="chatset">
/// Optional encoding name for the response. When empty, the body is decoded as UTF-8 and,
/// if a meta tag declares a non-UTF charset, decoded again with that charset.
/// (Parameter name kept for compatibility.)
/// </param>
/// <returns>The decoded response text, or "" on failure when <paramref name="throwError"/> is false.</returns>
public static string GetPostHTML(Uri uri, string postdata, string cookie = "", bool throwError = false, string chatset = "")
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
        req.Method = "POST";
        req.ContentType = "application/x-www-form-urlencoded";
        if (!string.IsNullOrEmpty(cookie))
        {
            req.Headers.Add("Cookie", cookie);
        }

        // Disposing the writer flushes it and closes the request stream.
        using (Stream requestStream = req.GetRequestStream())
        using (StreamWriter writer = new StreamWriter(requestStream, Encoding.GetEncoding("gb2312")))
        {
            writer.Write(postdata);
        }

        // Dispose the response as well as its stream, and read in chunks instead of
        // one byte at a time.
        byte[] body;
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            body = buffer.ToArray();
        }

        if (!string.IsNullOrEmpty(chatset))
        {
            return Encoding.GetEncoding(chatset).GetString(body, 0, body.Length);
        }

        string html = Encoding.UTF8.GetString(body, 0, body.Length);
        string declared = Regex.Match(html, @"<meta.*?charset=""?([a-z0-9-]+)\b", RegexOptions.IgnoreCase).Groups[1].Value;
        // If a meta tag declares a charset that is not UTF, decode once more with it.
        if (declared != "" && declared.IndexOf("utf", StringComparison.OrdinalIgnoreCase) == -1)
        {
            try
            {
                html = Encoding.GetEncoding(declared).GetString(body, 0, body.Length);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 decoding (best effort).
            }
            catch (NotSupportedException)
            {
                // Charset not available on this platform: keep the UTF-8 decoding.
            }
        }

        return html;
    }
    catch (Exception)
    {
        if (throwError)
        {
            // Bare rethrow keeps the original stack trace ("throw ex;" resets it).
            throw;
        }
        return "";
    }
}
224
/// <summary>
/// Returns a copy of the markup with style blocks, script blocks and HTML comments removed.
/// </summary>
/// <param name="html">Markup to clean.</param>
/// <returns>The cleaned markup.</returns>
public static string ClearHTML(string html)
{
    // The three filters run in a fixed order: style, then script, then comments.
    string withoutStyles = Regex.Replace(html, @"(?is)<style.*?>.*?</style>", "");
    string withoutScripts = Regex.Replace(withoutStyles, @"(?is)<script.*?>.*?</script>", "");
    return Regex.Replace(withoutScripts, @"(?is)<!--.*?-->", "");
}
/// <summary>
/// Lower-cases tag names: each run from '&lt;' up to the first whitespace or '&gt;' is
/// converted, so opening and closing tag names change while attributes and text do not.
/// </summary>
/// <param name="html">Markup to normalise.</param>
/// <returns>The markup with lower-case tag names.</returns>
public static string Html2Lower(string html)
{
    // Single pass with a match evaluator instead of one global string.Replace per match.
    // ToLowerInvariant avoids culture-specific casing (e.g. Turkish 'I' -> dotless 'ı'),
    // which would corrupt names such as <TITLE> or <IMG>.
    return Regex.Replace(html, @"<[^\s>]+[\s>]", tag => tag.Value.ToLowerInvariant());
}
/// <summary>
/// Gets the page title from the title element and the first h1 element. When the h1 text
/// is contained in the title text, the h1 text is preferred.
/// </summary>
/// <param name="html">Markup to inspect.</param>
/// <returns>The trimmed title text, or "" when no title element is found.</returns>
public static string GetTitle(string html)
{
    const string tagPattern = @"<.*?>";

    string title = "";
    string h1 = "";

    // Value of the title element; attributes on the tag are tolerated.
    Match match = Regex.Match(html, @"<title\b[^>]*>[\s\S]*?</title>", RegexOptions.IgnoreCase);
    if (match.Success)
    {
        title = Regex.Replace(match.Value, tagPattern, "", RegexOptions.Singleline).Trim();
    }

    // Value of the first h1 element. Singleline lets the heading (and any nested tag)
    // span several lines, which is how headings are commonly laid out.
    match = Regex.Match(html, @"<h1\b.*?>.*?</h1>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    if (match.Success)
    {
        h1 = Regex.Replace(match.Value, tagPattern, "", RegexOptions.Singleline).Replace("\t", "").Trim();
    }

    // When both are present and the title contains the h1 text, prefer the h1.
    if (h1.Length > 0 && title.Length > 0 && title.IndexOf(h1, StringComparison.Ordinal) != -1)
    {
        title = h1;
    }
    return title;
}
/// <summary>
/// Returns the raw text between the first attribute-less title tags (matched case-insensitively).
/// </summary>
/// <param name="html">Markup to inspect.</param>
/// <returns>The enclosed text exactly as written, or "" when no such element is found.</returns>
public static string GetHeadTitle(string html)
{
    Match found = Regex.Match(html, @"<title>([\s\S]*?)</title>", RegexOptions.IgnoreCase);
    return found.Success ? found.Groups[1].Value : "";
}
/// <summary>
/// Returns the href value of the first base tag that has one.
/// </summary>
/// <param name="html">Markup to inspect.</param>
/// <returns>The href value (double-quoted, single-quoted or unquoted), or "" when none is found.</returns>
public static string GetHeadBase(string html)
{
    // [^>]*? keeps the search inside the base tag itself, and each quoting style gets
    // its own alternative so the value is not limited to a fixed character whitelist.
    Match match = Regex.Match(
        html,
        @"<base\b[^>]*?\shref\s*=\s*(?:""([^""]*)""|'([^']*)'|([^\s""'>]+))",
        RegexOptions.IgnoreCase);
    if (!match.Success)
    {
        return "";
    }

    // Exactly one of the three capture groups takes part in a successful match.
    for (int i = 1; i < match.Groups.Count; i++)
    {
        if (match.Groups[i].Success)
        {
            return match.Groups[i].Value;
        }
    }
    return "";
}
295
/// <summary>
/// Extracts the first parseable date from the page text. Supported shapes are
/// yyyy-M-d / yy-M-d (with '-' or '/' separators) and yyyy年M月d日, each with an optional HH:mm.
/// </summary>
/// <param name="html">Markup to inspect; tags are stripped before searching.</param>
/// <returns>
/// The extracted date, or the sentinel 1900-01-01 when nothing could be extracted
/// (the return type is not nullable, so null is never returned).
/// </returns>
public static DateTime GetPublishDate(string html)
{
    DateTime notFound = new DateTime(1900, 1, 1);
    if (string.IsNullOrEmpty(html))
    {
        return notFound;
    }

    // Strip HTML tags so that markup does not interfere with date extraction.
    string text = Regex.Replace(html, "(?is)<.*?>", "");
    MatchCollection candidates = Regex.Matches(text, @"((\d{4}|\d{2})(\-|\/)\d{1,2}\3\d{1,2})(\s?\d{2}:\d{2})?|(\d{4}年\d{1,2}月\d{1,2}日)(\s?\d{2}:\d{2})?", RegexOptions.IgnoreCase);

    foreach (Match candidate in candidates)
    {
        // Chinese dates: turn 年/月 into '/' and 日 into a space so the parser accepts them.
        string dateStr = candidate.Value.Replace('年', '/').Replace('月', '/').Replace('日', ' ');

        // Parsed with the current culture, as before. TryParse replaces the
        // exception-and-console-output path; a digit run that is not a real date
        // (e.g. "99-99-99") no longer hides a later valid one.
        DateTime parsed;
        if (DateTime.TryParse(dateStr, out parsed))
        {
            return parsed;
        }
    }
    return notFound;
}
/// <summary>
/// Converts the href of an anchor into a complete address.
/// </summary>
/// <param name="url">Address of the page containing the link; "http://" is assumed when no scheme is given.</param>
/// <param name="href">Link target as written in the page.</param>
/// <param name="baseStr">Optional base address (e.g. from a base tag); when given, it replaces <paramref name="url"/> as the reference point.</param>
/// <returns>
/// The resolved absolute address. Targets that already carry a scheme (http:, https:,
/// mailto:, javascript:, ...) are returned unchanged, and <paramref name="href"/> is
/// returned unchanged when it cannot be resolved.
/// </returns>
public static string ReplaceAhref(string url, string href, string baseStr = "")
{
    bool hasBase = !string.IsNullOrEmpty(baseStr);

    if (string.IsNullOrEmpty(href))
    {
        return hasBase ? baseStr + href : href;
    }

    // Anything that starts with "scheme:" is already absolute; leave it alone.
    if (Regex.IsMatch(href, @"^[a-zA-Z][a-zA-Z0-9+.\-]*:"))
    {
        return href;
    }

    string context;
    if (hasBase)
    {
        context = baseStr;
    }
    else if (string.IsNullOrEmpty(url))
    {
        return href;
    }
    else if (Regex.IsMatch(url, @"^https?://", RegexOptions.IgnoreCase))
    {
        context = url;
    }
    else
    {
        context = "http://" + url;
    }

    // Let System.Uri resolve the reference. The previous hand-rolled joining replaced the
    // host for "http://host" + "x.html" and mishandled "../", "?query" and "#fragment".
    // AbsoluteUri is the escaped canonical form, so non-ASCII characters come back
    // percent-encoded.
    Uri contextUri;
    Uri resolved;
    if (Uri.TryCreate(context, UriKind.Absolute, out contextUri)
        && (contextUri.Scheme == Uri.UriSchemeHttp || contextUri.Scheme == Uri.UriSchemeHttps)
        && Uri.TryCreate(contextUri, href, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // A base that is not an absolute http(s) address is joined textually, as before.
    return hasBase ? baseStr + href : href;
}
/// <summary>
/// Resolves the src of an img tag to a complete address and converts the image it points
/// to into a base64 data URI.
/// </summary>
/// <param name="url">Address of the page containing the image; "http://" is assumed when no scheme is given.</param>
/// <param name="src">Image address as written in the img tag.</param>
/// <param name="baseStr">Optional base address (e.g. from a base tag); when given, it replaces <paramref name="url"/> as the reference point.</param>
/// <returns>
/// The result of <see cref="ImgToBase64String"/> for the resolved address.
/// <paramref name="src"/> is returned unchanged when it is empty, uses a non-http scheme
/// (for example an existing data: URI) or cannot be resolved.
/// </returns>
public static string ReplaceImgSrc(string url, string src, string baseStr = "")
{
    if (string.IsNullOrEmpty(src))
    {
        return src;
    }

    // Sources that already carry a scheme: download http(s), leave everything else
    // (data:, about:, ...) untouched instead of gluing it onto the page address.
    Match scheme = Regex.Match(src, @"^([a-zA-Z][a-zA-Z0-9+.\-]*):");
    if (scheme.Success)
    {
        string schemeName = scheme.Groups[1].Value;
        bool isHttp = schemeName.Equals("http", StringComparison.OrdinalIgnoreCase)
            || schemeName.Equals("https", StringComparison.OrdinalIgnoreCase);
        return isHttp ? WebPage.ImgToBase64String(src) : src;
    }

    bool hasBase = !string.IsNullOrEmpty(baseStr);
    string context;
    if (hasBase)
    {
        context = baseStr;
    }
    else if (string.IsNullOrEmpty(url))
    {
        return src;
    }
    else if (Regex.IsMatch(url, @"^https?://", RegexOptions.IgnoreCase))
    {
        context = url;
    }
    else
    {
        context = "http://" + url;
    }

    // Let System.Uri resolve the reference. The previous hand-rolled joining replaced the
    // host for "http://host" + "x.png" and mishandled "../" and query strings.
    string value;
    Uri contextUri;
    Uri resolved;
    if (Uri.TryCreate(context, UriKind.Absolute, out contextUri)
        && (contextUri.Scheme == Uri.UriSchemeHttp || contextUri.Scheme == Uri.UriSchemeHttps)
        && Uri.TryCreate(contextUri, src, out resolved))
    {
        value = resolved.AbsoluteUri;
    }
    else if (hasBase)
    {
        // A base that is not an absolute http(s) address is joined textually, as before.
        value = baseStr + src;
    }
    else
    {
        return src;
    }

    if (!value.StartsWith("http", StringComparison.OrdinalIgnoreCase))
    {
        value = "http://" + value;
    }
    return WebPage.ImgToBase64String(value);
}
462
/// <summary>
/// Tells whether the address begins with "http:" or "https:" (case-sensitive).
/// </summary>
/// <param name="url">Address to test; may be null or empty.</param>
/// <returns>True when the address starts with one of the two prefixes; otherwise false.</returns>
public static bool IsAbsolutePath(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return false;
    }

    return url.StartsWith("http:", StringComparison.Ordinal)
        || url.StartsWith("https:", StringComparison.Ordinal);
}
/// <summary>
/// Downloads an image and returns it as a base64 data URI.
/// </summary>
/// <param name="imageUrl">Absolute http/https address of the image.</param>
/// <returns>
/// "data:image/{format};base64,{payload}" on success. On any failure the original
/// <paramref name="imageUrl"/> is returned (deliberate best effort).
/// </returns>
public static string ImgToBase64String(string imageUrl)
{
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(imageUrl);
        req.Accept = "image/*";

        // The response, its stream, the decoded image and the buffer are all disposable;
        // the response, the stream and the image were previously never released.
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        using (Stream body = res.GetResponseStream())
        using (Image img = Image.FromStream(body))
        using (MemoryStream ms = new MemoryStream())
        {
            string imgFormat = GetImageFormat(img);
            // Re-encode in the image's own format, then base64 the encoded bytes.
            img.Save(ms, img.RawFormat);
            return "data:image/" + imgFormat + ";base64," + Convert.ToBase64String(ms.ToArray());
        }
    }
    catch (Exception)
    {
        return imageUrl;
    }
}
/// <summary>
/// Maps the raw format of an image to the subtype string used after "image/" in a data URI.
/// </summary>
/// <param name="img">Image whose <c>RawFormat</c> is inspected.</param>
/// <returns>The subtype for the recognised format; "png" when the format is not recognised.</returns>
private static string GetImageFormat(Image img)
{
    ImageFormat raw = img.RawFormat;

    // Bitmaps were previously reported as "jpeg", which mislabelled BMP data.
    if (raw.Equals(ImageFormat.Bmp)) { return "bmp"; }
    if (raw.Equals(ImageFormat.Emf)) { return "emf"; }
    if (raw.Equals(ImageFormat.Exif)) { return "exif"; }
    if (raw.Equals(ImageFormat.Gif)) { return "gif"; }
    if (raw.Equals(ImageFormat.Icon)) { return "icon"; }
    if (raw.Equals(ImageFormat.Jpeg)) { return "jpeg"; }
    if (raw.Equals(ImageFormat.MemoryBmp)) { return "memorybmp"; }
    if (raw.Equals(ImageFormat.Png)) { return "png"; }
    if (raw.Equals(ImageFormat.Tiff)) { return "tiff"; }
    if (raw.Equals(ImageFormat.Wmf)) { return "wmf"; }

    // Unrecognised formats fall back to PNG.
    return "png";
}
553 #endregion
554 }