C# 删除html标签,只保留文字

        public static string noHtml(string str)
        {
            if (str != null)
            {
                str = Regex.Replace(str, @"<script[^>]*>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);//删除脚本
                str = Regex.Replace(str, @"(<style)+[^<>]*>[^\0]*(<\/style>)+", "", RegexOptions.IgnoreCase);//删除样式
                str = Regex.Replace(str, @"<object.*?/object>", "", RegexOptions.IgnoreCase);//删除object
                str = Regex.Replace(str, @"<!--.*", "", RegexOptions.IgnoreCase);//删除开始注释
                str = Regex.Replace(str, @"-->", "", RegexOptions.IgnoreCase);//删除结尾注释
                //str = Regex.Replace(str, @"<\/*[^<>]*>", "", RegexOptions.IgnoreCase);//删除全部html
                //str = Regex.Replace(str, @"<(\/){0,1}div[^<>]*>", "", RegexOptions.IgnoreCase);//删除div
                //str = Regex.Replace(str, @"<(\/){0,1}a[^<>]*>", "", RegexOptions.IgnoreCase);//删除超链接
                //str = Regex.Replace(str, @"<(\/){0,1}font[^<>]*>", "", RegexOptions.IgnoreCase);//删除文字样式
                //str = Regex.Replace(str, @"(class=){1,}(""|\'){0,1}\S+(""|\'|>|\s){0,1}", "", RegexOptions.IgnoreCase);//删除class
                //str = Regex.Replace(str, @"(<iframe){1,}[^<>]*>[^\0]*(<\/iframe>){1,}", "", RegexOptions.IgnoreCase);//删除框架
                //str = Regex.Replace(str, @"(<script){1,}[^<>]*>[^\0]*(<\/script>){1,}", "", RegexOptions.IgnoreCase);//删除脚本
                str = Regex.Replace(str, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);//删除全部html
                str = Regex.Replace(str, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);//删除换行
                str = Regex.Replace(str, @"&nbsp;&nbsp;", " ", RegexOptions.IgnoreCase);//替换空格
            }
            return str;
        }

posted @ 2012-10-06 09:09  八星瓢虫  阅读(646)  评论(0)    收藏  举报