技术无极限

Nio的技术空间
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

ASP.net中去除HTML标记

Posted on 2006-09-14 17:37  Nio  阅读(2839)  评论(1)  编辑  收藏  举报
 

                  /// <summary>

                /// 去除HTML标记

                /// </summary>

                /// <param name="NoHTML">包括HTML的源码 </param>

                /// <returns>已经去除后的文字</returns>

                public static string NoHTML(string Htmlstring)

                {     

                        //删除脚本

                        Htmlstring = Regex.Replace(Htmlstring,@"<script[^>]*?>.*?</script>","",RegexOptions.IgnoreCase);

                        //删除HTML

                        Htmlstring = Regex.Replace(Htmlstring,@"<(.[^>]*)>","",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"([\r\n])[\s]+","",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"-->","",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"<!--.*","",RegexOptions.IgnoreCase);

                       

                        Htmlstring = Regex.Replace(Htmlstring,@"&(quot|#34);","\"",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(amp|#38);","&",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(lt|#60);","<",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(gt|#62);",">",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(nbsp|#160);"," ",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(iexcl|#161);","\xa1",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(cent|#162);","\xa2",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(pound|#163);","\xa3",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring,@"&(copy|#169);","\xa9",RegexOptions.IgnoreCase);

                        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);","",RegexOptions.IgnoreCase);

 

                        Htmlstring.Replace("<","");

                        Htmlstring.Replace(">","");

                        Htmlstring.Replace("\r\n","");

                        Htmlstring=HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();

 

                        return Htmlstring;

                }

 

 

 

 

///提取HTML代码中文字的C#函数

/// <summary>

/// 去除HTML标记

/// </summary>

/// <param name="strHtml">包括HTML的源码 </param>

/// <returns>已经去除后的文字</returns>

using System;

using System.Text.RegularExpressions;

/// <summary>
/// Console sample that extracts the plain text from an HTML fragment.
/// </summary>
public class StripHTMLTest{

 /// <summary>
 /// Runs <see cref="StripHTML"/> on a sample document and prints the result.
 /// </summary>
 public static void Main(){
    string s=StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
    Console.WriteLine(s);
 }

 /// <summary>
 /// Removes HTML markup from a string and decodes a fixed set of entities.
 /// </summary>
 /// <param name="strHtml">Source text that may contain HTML markup.</param>
 /// <returns>
 /// The text with script blocks, tags, comments, angle brackets and CR/LF pairs
 /// removed. Returns an empty string when the input is null or empty.
 /// </returns>
 public static string StripHTML(string strHtml){
    if (string.IsNullOrEmpty(strHtml)) {
      return string.Empty;
    }

    // Each rule is { pattern, replacement }, applied in order. Keeping the pair
    // together prevents the two halves from drifting out of sync.
    string[][] rules = {
      // (?s) lets '.' cross line breaks so multi-line script bodies are removed too.
      new string[] { @"(?s)<script[^>]*?>.*?</script>", "" },
      // Opening/closing tags, including quoted attribute values.
      // NOTE(review): the nested quantifiers here may backtrack heavily on
      // malformed tags - confirm before using on untrusted input.
      new string[] { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
      // A line break followed by further whitespace.
      new string[] { @"([\r\n])[\s]+", "" },
      new string[] { @"&(quot|#34);", "\"" },
      new string[] { @"&(lt|#60);", "<" },
      new string[] { @"&(gt|#62);", ">" },
      new string[] { @"&(nbsp|#160);", " " },
      new string[] { @"&(iexcl|#161);", "\xa1" },
      new string[] { @"&(cent|#162);", "\xa2" },
      new string[] { @"&(pound|#163);", "\xa3" },
      new string[] { @"&(copy|#169);", "\xa9" },
      // Drop every other numeric entity; "&#38;" is left for the ampersand rule.
      new string[] { @"&#(?!38;)\d+;", "" },
      // "&amp;" is decoded after the other entities so that "&amp;lt;" yields
      // the literal text "&lt;" instead of being decoded twice into "<".
      new string[] { @"&(amp|#38);", "&" },
      // Comment removal: end the line at "-->" and then delete from "<!--" up to
      // that line end.
      new string[] { @"-->", "\r\n" },
      new string[] { @"<!--.*\n", "" }
    };

    string strOutput = strHtml;
    foreach (string[] rule in rules) {
      strOutput = Regex.Replace(strOutput, rule[0], rule[1], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the original code discarded the
    // result, so this cleanup never took effect.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
 }

}

 

 

写一个静态方法

#region 移除HTML标签

                /// <summary>

                /// 移除HTML标签

                /// </summary>

                /// <param name="HTMLStr">HTMLStr</param>

                public static string ParseTags(string HTMLStr)

                {

                        return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", "");

                }

 

                #endregion

 

        #region 取出文本中的图片地址

        /// <summary>

        /// 取出文本中的图片地址

        /// </summary>

        /// <param name="HTMLStr">HTMLStr</param>

        public static string GetImgUrl(string HTMLStr)

        {

            string str = string.Empty;

            string sPattern = @"^<img\s+[^>]*>";

            Regex r = new Regex(@"<img\s+[^>]*\s*src\s*=\s*([']?)(?<url>\S+)'?[^>]*>",

                RegexOptions.Compiled);

            Match m = r.Match(HTMLStr.ToLower());

            if (m.Success)

                str = m.Result("${url}");

            return str;

        }

 

        #endregion