获取html地址中的内容

这里有两种方法:可能是因为源文件本身的编码方式不一样,所以解析方式也不一样。这里只是做一个笔记,不一定能用得着。

1、获取word文档解析的网页

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its text decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute URL of the page to fetch.</param>
        /// <returns>
        /// The page text with every line terminated by "\r\n" and every '?' character removed,
        /// or an empty string if the request or the read fails for any reason.
        /// </returns>
        public string GetHtml(string url)
        {
            // Example URLs this was written for:
            //http://center.file.odxd.com/2016/12/19/10067/69f1ac86-d92e-448f-905e-2403642389a9.docx.html
            //http://center.file.odxd.com/2017/9/12/10155/eb279af3-03d6-4817-8804-fd563b573c44/00e681ab-0016-4883-a4d1-2b6d1f5d8c8a.txt.html
            const int timeoutMilliseconds = 19600;

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = timeoutMilliseconds;

                // using-blocks guarantee the connection is released even when reading throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    StringBuilder content = new StringBuilder();

                    // Read until ReadLine() reports end of stream. Peek() must not be used as the
                    // loop condition here: on a network stream it can return -1 while more data is
                    // still in flight, which silently truncates the page.
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        content.Append(line).Append("\r\n");
                    }

                    // NOTE(review): this strips every '?' in the page, presumably to drop
                    // placeholder characters left by a bad decode — confirm it is still wanted,
                    // since it also removes legitimate question marks (e.g. in query strings).
                    return content.ToString().Replace("?", "");
                }
            }
            catch
            {
                // Deliberate best-effort: callers receive an empty string on any failure.
                return "";
            }
        }

  2、获取txt解析的网页

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its text with each "\r\n"
/// replaced by an HTML line break. The bytes are decoded as UTF-8 first; if that result
/// looks garbled (see IsMessyCode), they are decoded again with Encoding.Default.
/// </summary>
/// <param name="url">Absolute URL of the text resource to fetch.</param>
/// <returns>The decoded text, or an empty string if the download or decoding fails.</returns>
public string GetTxtHtml(string url)
        {
            try
            {
                byte[] buffer;
                // WebClient is IDisposable; release it as soon as the download completes.
                using (WebClient client = new WebClient())
                {
                    buffer = client.DownloadData(url);
                }

                string text;
                using (StreamReader utf8Reader = new StreamReader(new MemoryStream(buffer), Encoding.UTF8))
                {
                    text = utf8Reader.ReadToEnd();
                }

                if (IsMessyCode(text))
                {
                    // NOTE(review): Encoding.Default is the system ANSI code page on .NET Framework
                    // but UTF-8 on .NET Core/.NET 5+, where this fallback would change nothing —
                    // confirm the target runtime.
                    using (StreamReader fallbackReader = new StreamReader(new MemoryStream(buffer), Encoding.Default))
                    {
                        text = fallbackReader.ReadToEnd();
                    }
                }

                return text.Replace("\r\n", "<br>");
            }
            catch
            {
                // Deliberate best-effort: callers receive an empty string on any failure.
                return "";
            }
        }

        /// <summary>
        /// Determines whether the text looks garbled, i.e. whether its UTF-8 encoding contains
        /// the byte sequence EF BF BD (the encoding of U+FFFD, the replacement character).
        /// </summary>
        /// <param name="txt">The text to inspect.</param>
        /// <returns>
        /// <c>true</c> if the sequence EF BF BD occurs anywhere in the UTF-8 bytes of
        /// <paramref name="txt"/>, including at the very end; otherwise <c>false</c>.
        /// Null or empty input yields <c>false</c>.
        /// </returns>
        private static bool IsMessyCode(string txt)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return false;
            }

            byte[] bytes = Encoding.UTF8.GetBytes(txt);

            // The bound "i + 2 < Length" lets the final three bytes be examined too; the previous
            // bound (i < Length - 3) skipped them and missed a marker at the end of the text.
            for (int i = 0; i + 2 < bytes.Length; i++)
            {
                if (bytes[i] == 0xef && bytes[i + 1] == 0xbf && bytes[i + 2] == 0xbd)
                {
                    return true;
                }
            }

            return false;
        }

  

 

posted on 2018-02-09 10:56  木头人段  阅读(252)  评论(0)    收藏  举报

导航