C#正则删除HTML标签

using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;

/// <summary>
/// Demo page for regex-based text rewriting: it prefixes every line of a sample
/// string and writes a sample HTML document with its markup stripped.
/// </summary>
public partial class Ceshi : System.Web.UI.Page
{
    // Options shared by every clean-up rule (same flags the rules have always used).
    private const RegexOptions CleanupOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // Ordered clean-up rules. CleanupPatterns[i] is replaced by CleanupReplacements[i].
    // Built once and reused: Regex instances are immutable and safe to share between calls.
    private static readonly Regex[] CleanupPatterns =
    {
        // Whole <script> element. [\s\S] (instead of '.') lets the body span several lines.
        new Regex(@"<script[^>]*?>[\s\S]*?</script>", CleanupOptions),
        // Any opening, closing or self-closing tag, with optional attributes.
        new Regex(@"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", CleanupOptions),
        // A line break followed by further whitespace.
        new Regex(@"([\r\n])[\s]+", CleanupOptions),
        // Common character entities, decoded below.
        new Regex(@"&(quot|#34);", CleanupOptions),
        new Regex(@"&(amp|#38);", CleanupOptions),
        new Regex(@"&(lt|#60);", CleanupOptions),
        new Regex(@"&(gt|#62);", CleanupOptions),
        new Regex(@"&(nbsp|#160);", CleanupOptions),
        new Regex(@"&(iexcl|#161);", CleanupOptions),
        new Regex(@"&(cent|#162);", CleanupOptions),
        new Regex(@"&(pound|#163);", CleanupOptions),
        new Regex(@"&(copy|#169);", CleanupOptions),
        // Any remaining numeric entity is dropped.
        new Regex(@"&#(\d+);", CleanupOptions),
        // Comment terminator becomes a line break so the next rule can consume the comment.
        new Regex(@"-->", CleanupOptions),
        // Comment opener up to the end of its line.
        new Regex(@"<!--.*\n", CleanupOptions)
    };

    private static readonly string[] CleanupReplacements =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\xa1", // chr(161)
        "\xa2", // chr(162)
        "\xa3", // chr(163)
        "\xa9", // chr(169)
        "",
        "\r\n",
        ""
    };

    /// <summary>
    /// On the initial (non-postback) request, writes two demo outputs to the response:
    /// a multi-line string with a prefix added to each line, and a sample HTML
    /// document after <see cref="ClearHTMLTags"/> has been applied to it.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (Page.IsPostBack)
        {
            return;
        }

        // Multiline mode: '^' matches at the start of every line, so each line gets the prefix.
        string str = Regex.Replace("AAA\nBBB\nCCC<br>", "^", "开始=>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Response.Write(str);

        string s = @"<html><title>title\\标题</title><head><script>alert('JS脚本');</script>head头部</head><body><table><tr><td><!--注释的东西-->TD的内容1</td><td>TD的内容2</td></table><div style='width:100px;'>DIV的内容</div><span>span内容1</spaN><spAN>span内容2</SPAN></body></html>";
        Response.Write(ClearHTMLTags(s));
    }

    /// <summary>
    /// Strips HTML markup from a string: script elements, tags and comments are
    /// removed, a handful of common character entities are decoded, and any
    /// remaining '&lt;', '&gt;' characters and CR/LF pairs are deleted.
    /// </summary>
    /// <param name="HTML">The HTML text to clean. May be null or empty.</param>
    /// <returns>
    /// The text with markup removed; an empty string when <paramref name="HTML"/>
    /// is null or empty.
    /// </returns>
    public static string ClearHTMLTags(string HTML)
    {
        if (string.IsNullOrEmpty(HTML))
        {
            return string.Empty;
        }

        string s = HTML;
        for (int i = 0; i < CleanupPatterns.Length; i++)
        {
            s = CleanupPatterns[i].Replace(s, CleanupReplacements[i]);
        }

        // string.Replace returns a new string; the result must be assigned back,
        // otherwise these final clean-up steps have no effect.
        s = s.Replace("<", "");
        s = s.Replace(">", "");
        s = s.Replace("\r\n", "");
        return s;
    }
}

posted on 2012-05-23 20:16  HOT SUMMER  阅读(3152)  评论(0)  编辑  收藏  举报

导航