字符串中去除HTML标记

方法一:
  ///   <summary>  
  ///   去除HTML标记  
  ///   </summary>  
  ///   <param   name="strHtml">包括HTML的源码   </param>  
  ///   <returns>已经去除后的文字</returns>  
  using   System;  
  using   System.Text.RegularExpressions;  
  public   class   StripHTMLTest{  
      public   static   void   Main(){  
          string   s=StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");  
          Console.WriteLine(s);  
      }  
   
      public   static   string   StripHTML(string   strHtml){  
          string   []   aryReg   ={  
                      @"<script[^>]*?>.*?</script>",  
   
                      @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",  
                      @"([\r\n])[\s]+",  
                      @"&(quot|#34);",  
                      @"&(amp|#38);",  
                      @"&(lt|#60);",  
                      @"&(gt|#62);",    
                      @"&(nbsp|#160);",    
                      @"&(iexcl|#161);",  
                      @"&(cent|#162);",  
                      @"&(pound|#163);",  
                      @"&(copy|#169);",  
                      @"&#(\d+);",  
                      @"-->",  
                      @"<!--.*\n"  
                    };  
   
          string   []   aryRep   =   {  
                        "",  
                        "",  
                        "",  
                        "\"",  
                        "&",  
                        "<",  
                        ">",  
                        "   ",  
                        "\xa1",//chr(161),  
                        "\xa2",//chr(162),  
                        "\xa3",//chr(163),  
                        "\xa9",//chr(169),  
                        "",  
                        "\r\n",  
                        ""  
                      };  
   
          string   newReg   =aryReg[0];  
          string   strOutput=strHtml;  
          for(int   i   =   0;i<aryReg.Length;i++){  
              Regex   regex   =   new   Regex(aryReg[i],RegexOptions.IgnoreCase);  
              strOutput   =   regex.Replace(strOutput,aryRep[i]);  
          }  
          strOutput.Replace("<","");  
          strOutput.Replace(">","");  
          strOutput.Replace("\r\n","");  
          return   strOutput;  
      }  
  }
此方法可用,但会保留图片(<img> 标签)!
方法二:

 

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// String helpers: regex-based replace and remove, HTML tag stripping and
/// escaping, random string generation and a digits-only check.
/// </summary>
public class StringUtilily
{
    /// <summary>
    /// Characters that <see cref="GetRandomString"/> draws from.
    /// </summary>
    public const string RANDOM_STRING_SOURCE = "0123456789abcdefghijklmnopqrstuvwxyz";

    // A single shared generator: creating a new Random per call can repeat the
    // same sequence when calls happen within the same clock tick.
    private static readonly Random SharedRandom = new Random();
    private static readonly object SharedRandomLock = new object();

    private static readonly Regex DigitsOnly = new Regex(@"^\d+$");

    /// <summary>
    /// Kept so existing <c>new StringUtilily()</c> callers still compile; every
    /// member of this class is static.
    /// </summary>
    public StringUtilily()
    {
    }

    /// <summary>
    /// Replaces every match of a regular expression in a string.
    /// </summary>
    /// <param name="src">The string to modify.</param>
    /// <param name="pattern">The regular expression pattern to match.</param>
    /// <param name="replacement">The replacement string.</param>
    /// <returns>The modified string.</returns>
    public static string Replace(string src, string pattern, string replacement)
    {
        return Replace(src, pattern, replacement, RegexOptions.None);
    }

    /// <summary>
    /// Replaces every match of a regular expression in a string, ignoring case.
    /// </summary>
    /// <param name="src">The string to modify.</param>
    /// <param name="pattern">The regular expression pattern to match.</param>
    /// <param name="replacement">The replacement string.</param>
    /// <returns>The modified string.</returns>
    public static string ReplaceIgnoreCase(string src, string pattern, string replacement)
    {
        return Replace(src, pattern, replacement, RegexOptions.IgnoreCase);
    }

    /// <summary>
    /// Replaces every match of a regular expression in a string.
    /// </summary>
    /// <param name="src">The string to modify.</param>
    /// <param name="pattern">The regular expression pattern to match.</param>
    /// <param name="replacement">The replacement string.</param>
    /// <param name="options">Regex matching options.</param>
    /// <returns>The modified string.</returns>
    public static string Replace(string src, string pattern, string replacement, RegexOptions options)
    {
        // The static overload reuses cached parsed patterns; constructing a new
        // RegexOptions.Compiled instance on every call paid the compilation cost
        // each time without ever reusing the result.
        return Regex.Replace(src, pattern, replacement, options);
    }

    /// <summary>
    /// Removes every match of a regular expression from a string.
    /// </summary>
    /// <param name="src">The string to modify.</param>
    /// <param name="pattern">The regular expression pattern to remove.</param>
    /// <returns>The string with all matches removed.</returns>
    public static string Drop(string src, string pattern)
    {
        return Replace(src, pattern, "");
    }

    /// <summary>
    /// Removes every match of a regular expression from a string, ignoring case.
    /// </summary>
    /// <param name="src">The string to modify.</param>
    /// <param name="pattern">The regular expression pattern to remove.</param>
    /// <returns>The string with all matches removed.</returns>
    public static string DropIgnoreCase(string src, string pattern)
    {
        return ReplaceIgnoreCase(src, pattern, "");
    }

    /// <summary>
    /// Doubles single quotes so the text can be embedded in a SQL string literal.
    /// </summary>
    /// <param name="src">The text destined for a SQL statement.</param>
    /// <returns>The text with each <c>'</c> doubled, or null when <paramref name="src"/> is null.</returns>
    public static string ToSQL(string src)
    {
        if (src == null)
        {
            return null;
        }

        // NOTE(review): quote doubling is not a substitute for parameterized
        // queries; prefer command parameters wherever the caller can use them.
        return src.Replace("'", "''");
    }

    /// <summary>
    /// Removes the opening and closing tags of one element from HTML content,
    /// keeping the text between them.
    /// </summary>
    /// <param name="content">The HTML content.</param>
    /// <param name="tagName">The tag name to remove (inserted into the pattern as a regex fragment).</param>
    /// <returns>The content without the specified tags.</returns>
    public static string DropHtmlTag(string content, string tagName)
    {
        // The lookahead requires the tag name to end right there; without it,
        // asking for "b" would also remove <body> and <br>.
        return DropIgnoreCase(content, "<[/]{0,1}(?:" + tagName + ")(?=[\\s/>])[^\\>]*\\>");
    }

    /// <summary>
    /// Removes all tags from HTML content.
    /// </summary>
    /// <param name="content">The HTML content.</param>
    /// <returns>The content without any <c>&lt;...&gt;</c> tags.</returns>
    public static string DropHtmlTag(string content)
    {
        return Drop(content, "<[^\\>]*>");
    }

    /// <summary>
    /// Generates a random string drawn from <see cref="RANDOM_STRING_SOURCE"/>.
    /// </summary>
    /// <param name="num">The number of characters to generate.</param>
    /// <returns>A random string of <paramref name="num"/> characters; empty when <paramref name="num"/> is not positive.</returns>
    public static string GetRandomString(int num)
    {
        if (num <= 0)
        {
            return "";
        }

        StringBuilder builder = new StringBuilder(num);
        lock (SharedRandomLock)
        {
            for (int i = 0; i < num; i++)
            {
                // Next(n) returns 0..n-1, so the index is always valid. Rounding
                // NextDouble() * 36 could produce 36 (out of range) and made the
                // first and last characters half as likely as the others.
                builder.Append(RANDOM_STRING_SOURCE[SharedRandom.Next(RANDOM_STRING_SOURCE.Length)]);
            }
        }

        return builder.ToString();
    }

    /// <summary>
    /// Checks whether a string consists only of digits.
    /// </summary>
    /// <param name="inputData">The string to check.</param>
    /// <returns>True when the string matches <c>^\d+$</c>; false otherwise, including for null.</returns>
    public static bool IsNumeric(string inputData)
    {
        if (inputData == null)
        {
            return false;
        }

        return DigitsOnly.IsMatch(inputData);
    }

    /// <summary>
    /// Escapes angle brackets so tags are shown as text on a web page.
    /// </summary>
    /// <param name="src">The text to escape.</param>
    /// <returns>The text with <c>&lt;</c> and <c>&gt;</c> replaced by entities, or null when <paramref name="src"/> is null.</returns>
    public static string EscapeHtml(string src)
    {
        if (src == null)
        {
            return null;
        }

        return src.Replace(">", "&gt;").Replace("<", "&lt;");
    }

    /// <summary>
    /// Formats plain text as HTML.
    /// </summary>
    /// <param name="str">The text to format.</param>
    /// <returns>The text with markup characters escaped, line breaks turned into <c>&lt;br&gt;</c>, and tabs and spaces turned into <c>&amp;nbsp;</c>.</returns>
    public static String ToHtml(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        StringBuilder sb = new StringBuilder(str);
        // "&" must be escaped first, otherwise the entities produced below would
        // be escaped a second time.
        sb.Replace("&", "&amp;");
        sb.Replace("<", "&lt;");
        sb.Replace(">", "&gt;");
        sb.Replace("\r\n", "<br>");
        sb.Replace("\n", "<br>");
        sb.Replace("\t", " ");
        sb.Replace(" ", "&nbsp;");
        return sb.ToString();
    }

    /// <summary>
    /// Converts the HTML produced by <see cref="ToHtml"/> back into plain text.
    /// </summary>
    /// <param name="str">The HTML text to convert.</param>
    /// <returns>The text with <c>&amp;nbsp;</c>, <c>&lt;br&gt;</c> and the basic entities decoded.</returns>
    public static String ToTxt(String str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return str;
        }

        StringBuilder sb = new StringBuilder(str);
        sb.Replace("&nbsp;", " ");
        sb.Replace("<br>", "\r\n");
        sb.Replace("&lt;", "<");
        sb.Replace("&gt;", ">");
        // "&amp;" is decoded last so that "&amp;lt;" yields "&lt;" rather than "<".
        sb.Replace("&amp;", "&");
        return sb.ToString();
    }

    /// <summary>
    /// Reduces an HTML document to the text of its body with all whitespace removed.
    /// </summary>
    /// <remarks>
    /// These steps used to sit after the <c>return</c> in <see cref="ToTxt"/>, where
    /// they could never run, and read a hard-coded file. They now operate on a
    /// string supplied by the caller.
    /// </remarks>
    /// <param name="html">The HTML document text.</param>
    /// <returns>The body text without scripts, styles, tags or whitespace; the input itself when it is null or empty.</returns>
    public static string ExtractBodyText(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        // Fall back to the whole input when there is no <body> tag instead of
        // passing -1 to Substring.
        int bodyStart = html.IndexOf("<body", StringComparison.OrdinalIgnoreCase);
        string text = bodyStart >= 0 ? html.Substring(bodyStart) : html;

        text = Regex.Replace(text, @"<script[^>]*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase); // script blocks
        text = Regex.Replace(text, @"<style[^>]*>[\s\S]*?</style>", "", RegexOptions.IgnoreCase);   // style blocks
        text = Regex.Replace(text, "<.+?>", "");                                                    // remaining tags
        return Regex.Replace(text, @"\s", "");
    }
}
但是如果输入的HTML标记有误(例如标签没有正确闭合),这种方法就会出大问题。

方法三:

using System.Text.RegularExpressions;

/// <summary>
/// Removes HTML markup from a string and returns the remaining text HTML-encoded.
/// </summary>
/// <param name="Htmlstring">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with script blocks, tags and comments removed, a handful of entities
/// decoded, leftover angle brackets and CR/LF pairs dropped, and the result
/// HTML-encoded and trimmed. Returns an empty string for null or empty input.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
    string text = Htmlstring;

    // Remove script blocks. Singleline lets "." cross line breaks; without it a
    // multi-line script keeps its body and only loses its tags in the next step.
    text = Regex.Replace(text, @"<script[^>]*?>.*?</script>", "", ignoreCase | RegexOptions.Singleline);

    // Remove tags, line-leading whitespace and comment remnants.
    text = Regex.Replace(text, @"<(.[^>]*)>", "", ignoreCase);
    text = Regex.Replace(text, @"([\r\n])[\s]+", "", ignoreCase);
    text = Regex.Replace(text, @"-->", "", ignoreCase);
    text = Regex.Replace(text, @"<!--.*", "", ignoreCase);

    // Decode the entities this routine knows about; any other numeric entity is dropped.
    text = Regex.Replace(text, @"&(quot|#34);", "\"", ignoreCase);
    text = Regex.Replace(text, @"&(amp|#38);", "&", ignoreCase);
    text = Regex.Replace(text, @"&(lt|#60);", "<", ignoreCase);
    text = Regex.Replace(text, @"&(gt|#62);", ">", ignoreCase);
    text = Regex.Replace(text, @"&(nbsp|#160);", " ", ignoreCase);
    text = Regex.Replace(text, @"&(iexcl|#161);", "\u00A1", ignoreCase);
    text = Regex.Replace(text, @"&(cent|#162);", "\u00A2", ignoreCase);
    text = Regex.Replace(text, @"&(pound|#163);", "\u00A3", ignoreCase);
    text = Regex.Replace(text, @"&(copy|#169);", "\u00A9", ignoreCase);
    text = Regex.Replace(text, @"&#(\d+);", "", ignoreCase);

    // Strings are immutable: string.Replace returns a new string, so the result
    // has to be assigned back or the call has no effect.
    text = text.Replace("<", "");
    text = text.Replace(">", "");
    text = text.Replace("\r\n", "");

    // WebUtility.HtmlEncode needs no current HTTP request, unlike
    // HttpContext.Current.Server, which is null outside a request.
    return System.Net.WebUtility.HtmlEncode(text).Trim();
}

  

posted @ 2009-05-25 17:13  peipei_t  阅读(583)  评论(0)    收藏  举报