利用正则表达式去掉 HTML 代码

 

using System.Text.RegularExpressions;//需要引用

  
// 利用正则表达式去掉"<"和">"之间的内容
  private string StripHT(string strHtml)
  
{
   Regex regex
=new Regex("<.+?>",RegexOptions.IgnoreCase);
   
string strOutput=regex.Replace(strHtml,"");
   
return strOutput;
  }



//方法二(不知为什么此方法占用CPU100%)

public static string DropHTML(string strHtml)
  
{
   
string [] aryReg ={
          
@"<script[^>]*?>.*?</script>",
          
@"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""''])(\\[""''tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
          
@"([\r])[\s]+",
          
@"&(quot|#34);",
          
@"&(amp|#38);",
          
@"&(lt|#60);",
          
@"&(gt|#62);"
          
@"&(nbsp|#160);"
          
@"&(iexcl|#161);",
          
@"&(cent|#162);",
          
@"&(pound|#163);",
          
@"&(copy|#169);",
          
@"&#(\d+);",
          
@"-->",
          
@"<!--.*"         
         }
;

   
string [] aryRep = {
           
"",
           
"",
           
"",
           
"\"",
           "&",
           
"<",
           
">",
           
" ",
           
"\xa1",//chr(161),
           "\xa2",//chr(162),
           "\xa3",//chr(163),
           "\xa9",//chr(169),
           "",
           
"\r",
           
""    
          }
;

   
string newReg =aryReg[0];
   
string strOutput=strHtml;
   
for(int i = 0;i<aryReg.Length;i++)
   
{
    Regex regex 
= new Regex(aryReg[i],RegexOptions.IgnoreCase );
    strOutput 
= regex.Replace(strOutput,aryRep[i]);
   }


   strOutput.Replace(
"<","");
   strOutput.Replace(
">","");
   strOutput.Replace(
"\r","");
   
return strOutput;
      
  }
 

posted on 2006-09-16 08:59  感動常在  阅读(6391)  评论(2)  编辑  收藏  举报