获取HTML源码(只取文字,判断编码,过滤标签)

/// <summary>
/// Click handler: reads the text of <c>textBox1</c>, and when it looks like a URL
/// downloads the page with <c>GetHtml</c> and strips its markup with <c>StripHTML</c>.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
        {
            // Surrounding whitespace is not part of the URL; trim it so the anchored
            // pattern below does not reject an otherwise valid entry.
            string url = (this.textBox1.Text ?? string.Empty).Trim();

            // "<scheme>://<non-whitespace>". The original character class was
            // [a-zA-z], whose A-z range also admits '[', '\', ']', '^', '_' and '`';
            // the pattern was also unanchored, so text merely containing a URL passed.
            Regex urlPattern = new Regex(@"^[a-zA-Z]+://\S+$");
            if (!urlPattern.IsMatch(url))
            {
                return;
            }

            try
            {
                string html = GetHtml(url);
                if (html == null)
                {
                    // GetHtml returns null when the request failed; nothing to strip.
                    return;
                }

                // NOTE(review): the plain text is computed but not consumed here,
                // exactly as in the original snippet — wire it to its consumer.
                string plainText = StripHTML(html);
            }
            catch (Exception ex)
            {
                // Best effort: keep the UI responsive, but leave a trace instead of
                // discarding the failure silently.
                System.Diagnostics.Debug.WriteLine(ex);
            }
        }

 

 

1.获取HTML

GetHtml(String Url)

View Code
        /// <summary>
        /// Downloads the resource at <paramref name="Url"/> and returns its body as text.
        /// </summary>
        /// <remarks>
        /// The body is fetched with a single request and buffered in memory. It is decoded
        /// as UTF-8 first; when <c>isLuan</c> reports that the UTF-8 text is garbled, the
        /// same bytes are decoded again as GB2312 and that text is returned instead.
        /// </remarks>
        /// <param name="Url">URL to request; passed unchanged to <c>WebRequest.Create</c>.</param>
        /// <returns>
        /// The decoded body, or <c>null</c> when obtaining the response failed (the error
        /// text is shown in a message box in that case).
        /// </returns>
        public String GetHtml(String Url)
        {
            WebRequest request = WebRequest.Create(Url);
            request.Timeout = 50000; // milliseconds

            WebResponse response;
            try
            {
                response = request.GetResponse();
            }
            catch (WebException e)
            {
                MessageBox.Show(e.Message);
                return null;
            }
            catch (Exception e)
            {
                MessageBox.Show(e.ToString());
                return null;
            }

            // Read the raw bytes once so both candidate encodings can be tried on the
            // same data; the original issued two identical requests for this and
            // never closed the second response.
            byte[] body;
            using (response)
            using (Stream responseStream = response.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                responseStream.CopyTo(buffer);
                body = buffer.ToArray();
            }

            string utf8Text = DecodeBody(body, Encoding.GetEncoding("UTF-8"));
            if (!isLuan(utf8Text))
            {
                return utf8Text;
            }

            return DecodeBody(body, Encoding.GetEncoding("GB2312"));
        }

        /// <summary>
        /// Decodes <paramref name="body"/> with <paramref name="encoding"/> through a
        /// <c>StreamReader</c>, matching how the text was read before this change.
        /// </summary>
        private static string DecodeBody(byte[] body, Encoding encoding)
        {
            using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
            {
                return reader.ReadToEnd();
            }
        }

  

2.去除HTML标记(正则表达式)

StripHTML(string strHtml)

View Code
 1         /// <summary>
 2         /// 去除HTML标记
 3         /// </summary>
 4         /// <param name="strHtml">包括HTML的源码 </param>
 5         /// <returns>已经去除后的文字</returns>
 6         public static string StripHTML(string strHtml)
 7         {
 8             //regex_str="<script type=\\s*[^>]*>[^<]*?</script>";//替换<script>内容</script>为空格
 9             string regex_str = "(?is)<script[^>]*>.*?</script>";//替换<script>内容</script>为空格
10             strHtml = Regex.Replace(strHtml, regex_str, "");
11 
12             //regex_str="<script type=\\s*[^>]*>[^<]*?</script>";//替换<style>内容</style>为空格
13             regex_str = "(?is)<style[^>]*>.*?</style>";//替换<style>内容</style>为空格
14             strHtml = Regex.Replace(strHtml, regex_str, "");
15 
16             //regex_str = "(&nbsp;)+";//替换&nbsp;为空格
17             regex_str = "(?i)&nbsp;";//替换&nbsp;为空格
18             strHtml = Regex.Replace(strHtml, regex_str, " ");
19 
20             //regex_str = "(\r\n)*";//替换\r\n为空
21             regex_str = @"[\r\n]*";//替换\r\n为空
22             strHtml = Regex.Replace(strHtml, regex_str, "", RegexOptions.IgnoreCase);
23 
24             //regex_str = "<[^<]*>";//替换Html标签为空
25             regex_str = "<[^<>]*>";//替换Html标签为空
26             strHtml = Regex.Replace(strHtml, regex_str, "");
27 
28             //regex_str = "\n*";//替换\n为空
29             regex_str = @"\n*";//替换\n为空
30             strHtml = Regex.Replace(strHtml, regex_str, "", RegexOptions.IgnoreCase);
31 
32             //可以这样
33             regex_str = "\t*";//替换\t为空
34             strHtml = Regex.Replace(strHtml, regex_str, "", RegexOptions.IgnoreCase);
35 
36             //可以
37             regex_str = "'";//替换'为’
38             strHtml = Regex.Replace(strHtml, regex_str, "", RegexOptions.IgnoreCase);
39 
40             //可以
41             regex_str = " +";//替换若干个空格为一个空格
42             strHtml = Regex.Replace(strHtml, regex_str, "  ", RegexOptions.IgnoreCase);
43 
44             Regex regex = new Regex("<.+?>", RegexOptions.IgnoreCase);
45 
46             string strOutput = regex.Replace(strHtml, "");//替换掉"<"和">"之间的内容
47             strOutput = strOutput.Replace("<", "");
48             strOutput = strOutput.Replace(">", "");
49             strOutput = strOutput.Replace("&nbsp;", "");
50 
51 
52             return strOutput;
53 
54         }

 

3.判断是否为乱码(编码):在GetHtml里调用

View Code
        /// <summary>
        /// Reports whether <paramref name="txt"/> contains U+FFFD, the Unicode replacement
        /// character (UTF-8 bytes 239 191 189), which is treated here as a sign that the
        /// text is garbled.
        /// </summary>
        /// <param name="txt">Decoded text to inspect; may be <c>null</c> or empty.</param>
        /// <returns><c>true</c> if a replacement character is present; otherwise <c>false</c>.</returns>
        bool isLuan(string txt)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return false;
            }

            // Searching for the character itself covers the whole string. The previous
            // byte scan stopped one position early (i < Length - 3) and therefore
            // missed a replacement character at the very end of the text.
            return txt.IndexOf('\uFFFD') >= 0;
        }

 

 

posted @ 2012-12-04 16:10  【唐】三三  阅读(3580)  评论(0)  编辑  收藏  举报