木子苏

导航

HTML 转化为 TXT

        private string filter(string strHtml)
        {
            string[] aryRegex ={ @"<%=[\w\W]*?%>", @"<script[\w\W]*?</script>", @"<style[\w\W]*?</style>", @"<[/]?[\w\W]*?>", @"([\r\n])[\s]+", "&(nbsp|#160);", "&(iexcl|#161);", "&(cent|#162);", "&(pound|#163);", "&(copy|#169);", @"&#(\d+);", "-->", @"<!--.*\n", "&ldquo;", "&rdquo;" };
            string[] aryReplacment = { "", "", "", "", "", " ", "\xa1", "\xa2", "\xa3", "\xa9", "", "\r\n", "", "", "" };
            string strStripped = strHtml;
            for (int i = 0; i < aryRegex.Length; i++)
            {
                Regex regex = new Regex(aryRegex[i], RegexOptions.IgnoreCase);
                strStripped = regex.Replace(strStripped, aryReplacment[i]);
            }
            strStripped.Replace("\r\n", "");
            return strStripped;
        }


// Usage: decode HTML entities first, then strip the markup.
// filter() returns the stripped text; it does not modify its argument.
string strHtml = System.Web.HttpUtility.HtmlDecode(STR);
string strText = filter(strHtml);

  public   static   string     ParseTags(string   HTMLStr)  
  {  
  return   System.Text.RegularExpressions.Regex.Replace(HTMLStr,   "<[^>]*>",   "");    
  }   
    
    #region   取出文本中的地址  
                  ///   <summary>  
                  ///   取出文本中的图片地址  
                  ///   </summary>  
                  ///   <param   name="HTMLStr">HTMLStr</param>  
                  public   static   string   GetImgUrl(string   HTMLStr)  
                  {  
                          string   str   =   string.Empty;  
                          string   sPattern   =   @"^<img\s+[^>]*>";  
                          Regex   r   =   new   Regex(@"<img\s+[^>]*\s*src\s*=\s*([']?)(?<url>\S+)'?[^>]*>",  
                                  RegexOptions.Compiled);  
                          Match   m   =   r.Match(HTMLStr.ToLower());  
                          if   (m.Success)  
                                  str   =   m.Result("${url}");  
                          return   str;  
                  }  
   
                  #endregion

/// <summary>
  /// Removes HTML markup from a string and returns HTML-encoded plain text.
  /// </summary>
  /// <param name="Htmlstring">Source text that contains HTML.</param>
  /// <returns>
  /// The text with script blocks, tags and comment markers removed, a fixed set of
  /// entities decoded, any remaining angle brackets and CR/LF pairs deleted, then
  /// HTML-encoded and trimmed. Returns an empty string for null or empty input.
  /// </returns>
  public static string NoHTML(string Htmlstring)
  {
      if (string.IsNullOrEmpty(Htmlstring))
      {
          return string.Empty;
      }

      // Remove script blocks. Singleline lets '.' match line breaks so that
      // multi-line scripts are removed together with their contents.
      Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
      // Remove HTML tags, whitespace following a line break, and comment markers.
      Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

      // Decode a fixed set of character entities.
      Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
      Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
      // Any other numeric entity is dropped.
      Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

      // Strings are immutable: String.Replace returns a new string, so each result
      // must be assigned (the previous code discarded all three).
      Htmlstring = Htmlstring.Replace("<", "");
      Htmlstring = Htmlstring.Replace(">", "");
      Htmlstring = Htmlstring.Replace("\r\n", "");

      // The static HttpUtility.HtmlEncode does not require an active HTTP request,
      // unlike HttpContext.Current.Server.HtmlEncode (HttpContext.Current is null
      // outside a request).
      Htmlstring = System.Web.HttpUtility.HtmlEncode(Htmlstring).Trim();

      return Htmlstring;
  }

posted on 2008-03-19 11:06  skind@126.com  阅读(1350)  评论(0)    收藏  举报