获取html地址中的内容
这里有两种方法,可能是因为两类页面本身的编码方式不一样,所以解析方式也不一样。这里只是做一个笔记,不一定能用得着。
1、获取word文档解析的网页
/// <summary>
/// Downloads the page at <paramref name="url"/> over HTTP and returns its body decoded as UTF-8.
/// </summary>
/// <remarks>
/// Example addresses this was written against:
/// http://center.file.odxd.com/2016/12/19/10067/69f1ac86-d92e-448f-905e-2403642389a9.docx.html
/// http://center.file.odxd.com/2017/9/12/10155/eb279af3-03d6-4817-8804-fd563b573c44/00e681ab-0016-4883-a4d1-2b6d1f5d8c8a.txt.html
/// </remarks>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The page text with every line terminated by CRLF and every '?' character removed,
/// or an empty string if the request or the read fails for any reason.
/// </returns>
public string GetHtml(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
        request.Timeout = 19600; // milliseconds

        // Dispose the response, stream and reader on every path, including failures,
        // so a broken download does not leak the underlying connection.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            StringBuilder content = new StringBuilder();

            // Loop on ReadLine() rather than Peek(): on a network stream Peek() can
            // report -1 before the body has been fully received, truncating the page.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                content.Append(line).Append("\r\n");
            }

            // NOTE(review): this strips every '?' from the page, including legitimate ones
            // (e.g. in query strings). Presumably meant to drop undecodable characters — confirm.
            return content.ToString().Replace("?", "");
        }
    }
    catch
    {
        // Best-effort by design: callers receive an empty string on any failure.
        return "";
    }
}
2、获取txt解析的网页
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its text with CRLF line
/// breaks replaced by <c>&lt;br&gt;</c>.
/// </summary>
/// <remarks>
/// The bytes are decoded as UTF-8 first. If the result contains the Unicode replacement
/// character (see <see cref="IsMessyCode"/>), the same bytes are decoded again with
/// <see cref="Encoding.Default"/>.
/// NOTE(review): the fallback only helps where Encoding.Default is not UTF-8 — confirm the target runtime.
/// </remarks>
/// <param name="url">Address of the resource to fetch.</param>
/// <returns>The converted text, or an empty string if the download or decoding fails.</returns>
public string GetTxtHtml(string url)
{
    try
    {
        byte[] buffer;
        using (WebClient client = new WebClient())
        {
            buffer = client.DownloadData(url);
        }

        string text;
        using (StreamReader utf8Reader = new StreamReader(new MemoryStream(buffer), Encoding.UTF8))
        {
            text = utf8Reader.ReadToEnd();
        }

        if (IsMessyCode(text))
        {
            // UTF-8 decoding produced replacement characters, so retry with the default encoding.
            using (StreamReader fallbackReader = new StreamReader(new MemoryStream(buffer), Encoding.Default))
            {
                text = fallbackReader.ReadToEnd();
            }
        }

        return text.Replace("\r\n", "<br>");
    }
    catch
    {
        // Best-effort by design: callers receive an empty string on any failure.
        return "";
    }
}

/// <summary>
/// Determines whether the text looks garbled, i.e. contains the Unicode replacement
/// character U+FFFD (bytes EF BF BD in UTF-8).
/// </summary>
/// <param name="txt">Text to inspect.</param>
/// <returns><c>true</c> if U+FFFD occurs anywhere in the text, including at the very end; otherwise <c>false</c>.</returns>
private static bool IsMessyCode(string txt)
{
    if (string.IsNullOrEmpty(txt))
    {
        return false;
    }

    // Searching for the character itself covers the whole string; the earlier byte scan
    // stopped one position short and missed a replacement character at the end of the text.
    return txt.IndexOf('\uFFFD') >= 0;
}