字符串中去除HTML标记
方法一:
/// <summary>
/// 去除HTML标记
/// </summary>
/// <param name="strHtml">包括HTML的源码 </param>
/// <returns>已经去除后的文字</returns>
using System;
using System.Text.RegularExpressions;
/// <summary>
/// Console demo for <see cref="StripHTML"/>, a regex-based HTML-to-plain-text helper.
/// </summary>
public class StripHTMLTest
{
    // Patterns are applied in order; every match of Patterns[i] is replaced
    // with Replacements[i]. The two arrays must stay the same length.
    private static readonly string[] Patterns =
    {
        // Whole script elements. [\s\S] is used instead of '.' so that scripts
        // spanning several lines are removed together with their body.
        @"<script[^>]*?>[\s\S]*?</script>",
        // Complete comments, including multi-line ones.
        @"<!--[\s\S]*?-->",
        // Opening/closing tags with optional attributes.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        // A line break followed by further whitespace.
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        // Any remaining numeric entity is dropped.
        @"&#(\d+);",
        // Leftover comment delimiters (unbalanced comments).
        @"-->",
        @"<!--.*\n"
    };

    private static readonly string[] Replacements =
    {
        "",
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\u00A1", // inverted exclamation mark (161)
        "\u00A2", // cent sign (162)
        "\u00A3", // pound sign (163)
        "\u00A9", // copyright sign (169)
        "",
        "\r\n",
        ""
    };

    /// <summary>
    /// Runs <see cref="StripHTML"/> on a small sample document and prints the result.
    /// </summary>
    public static void Main()
    {
        string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
        Console.WriteLine(s);
    }

    /// <summary>
    /// Removes HTML markup from a string and decodes a small set of entities.
    /// </summary>
    /// <param name="strHtml">Text that may contain HTML markup.</param>
    /// <returns>
    /// The text with script elements, comments and tags removed, the listed
    /// entities decoded, and any remaining angle brackets and CR/LF pairs
    /// deleted. A null or empty input is returned unchanged.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return strHtml;
        }

        string strOutput = strHtml;
        for (int i = 0; i < Patterns.Length; i++)
        {
            strOutput = Regex.Replace(strOutput, Patterns[i], Replacements[i], RegexOptions.IgnoreCase);
        }

        // Strings are immutable: String.Replace returns a new string, so the
        // result has to be kept. Note that this also removes '<' and '>'
        // characters produced by decoding &lt; / &gt; above.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");
        return strOutput;
    }
}
可用,但会保留图片!
方法二:
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text;
using System.Text.RegularExpressions;
/// <summary>
/// String helpers: regex replace/remove wrappers, HTML tag stripping and
/// escaping, single-quote doubling for SQL literals, random strings and a
/// digits-only check.
/// </summary>
public class StringUtilily
{
    /// <summary>
    /// Alphabet that <see cref="GetRandomString"/> draws its characters from.
    /// </summary>
    public const string RANDOM_STRING_SOURCE = "0123456789abcdefghijklmnopqrstuvwxyz";

    // A single shared generator: creating a new time-seeded Random per call can
    // yield identical sequences for calls made in quick succession. Random is
    // not thread-safe, so access goes through RandomGate.
    private static readonly Random SharedRandom = new Random();
    private static readonly object RandomGate = new object();

    /// <summary>
    /// Creates an instance. All members are static; the constructor is kept so
    /// existing code that instantiates the class continues to compile.
    /// </summary>
    public StringUtilily()
    {
    }

    /// <summary>
    /// Replaces every match of a regular expression (case-sensitive).
    /// </summary>
    /// <param name="src">String to modify.</param>
    /// <param name="pattern">Regular expression pattern to match.</param>
    /// <param name="replacement">Replacement string (regex substitution syntax applies).</param>
    /// <returns>The modified string.</returns>
    public static string Replace(string src, string pattern, string replacement)
    {
        return Replace(src, pattern, replacement, RegexOptions.None);
    }

    /// <summary>
    /// Replaces every match of a regular expression, ignoring case.
    /// </summary>
    /// <param name="src">String to modify.</param>
    /// <param name="pattern">Regular expression pattern to match.</param>
    /// <param name="replacement">Replacement string (regex substitution syntax applies).</param>
    /// <returns>The modified string.</returns>
    public static string ReplaceIgnoreCase(string src, string pattern, string replacement)
    {
        return Replace(src, pattern, replacement, RegexOptions.IgnoreCase);
    }

    /// <summary>
    /// Replaces every match of a regular expression using the given options.
    /// </summary>
    /// <param name="src">String to modify.</param>
    /// <param name="pattern">Regular expression pattern to match.</param>
    /// <param name="replacement">Replacement string (regex substitution syntax applies).</param>
    /// <param name="options">Regex matching options.</param>
    /// <returns>The modified string.</returns>
    public static string Replace(string src, string pattern, string replacement, RegexOptions options)
    {
        // The static Regex.Replace goes through the framework's regex cache.
        // Constructing a RegexOptions.Compiled instance on every call, as was
        // done before, pays the compilation cost each time without reuse.
        return Regex.Replace(src, pattern, replacement, options);
    }

    /// <summary>
    /// Removes every match of a regular expression (case-sensitive).
    /// </summary>
    /// <param name="src">String to modify.</param>
    /// <param name="pattern">Regular expression pattern to remove.</param>
    /// <returns>The string with all matches removed.</returns>
    public static string Drop(string src, string pattern)
    {
        return Replace(src, pattern, "");
    }

    /// <summary>
    /// Removes every match of a regular expression, ignoring case.
    /// </summary>
    /// <param name="src">String to modify.</param>
    /// <param name="pattern">Regular expression pattern to remove.</param>
    /// <returns>The string with all matches removed.</returns>
    public static string DropIgnoreCase(string src, string pattern)
    {
        return ReplaceIgnoreCase(src, pattern, "");
    }

    /// <summary>
    /// Doubles single quotes so the text can be embedded in a SQL string literal.
    /// </summary>
    /// <param name="src">Text destined for a SQL string literal.</param>
    /// <returns>The text with each ' replaced by '', or null when <paramref name="src"/> is null.</returns>
    public static string ToSQL(string src)
    {
        if (src == null)
        {
            return null;
        }

        // NOTE(review): quote doubling alone is not a complete defence against
        // SQL injection; callers should prefer parameterized commands.
        return src.Replace("'", "''");
    }

    /// <summary>
    /// Removes the opening and closing tags of one element type, keeping the
    /// content between them.
    /// </summary>
    /// <param name="content">HTML content.</param>
    /// <param name="tagName">
    /// Tag name to remove, matched case-insensitively. The value is inserted
    /// into the regular expression as-is.
    /// </param>
    /// <returns>The content without the specified tags.</returns>
    public static string DropHtmlTag(string content, string tagName)
    {
        // The negative lookahead stops the name from matching as a prefix of a
        // longer tag name, e.g. tagName "b" must not remove <br> or <body>.
        return DropIgnoreCase(content, "<[/]{0,1}" + tagName + "(?![\\w:-])[^\\>]*\\>");
    }

    /// <summary>
    /// Removes every tag (anything between angle brackets) from the content.
    /// </summary>
    /// <param name="content">HTML content.</param>
    /// <returns>The content with all tags removed.</returns>
    public static string DropHtmlTag(string content)
    {
        return Drop(content, "<[^\\>]*>");
    }

    /// <summary>
    /// Builds a random string from <see cref="RANDOM_STRING_SOURCE"/>.
    /// </summary>
    /// <param name="num">Number of characters to generate.</param>
    /// <returns>
    /// A string of <paramref name="num"/> characters, or an empty string when
    /// <paramref name="num"/> is zero or negative. Not suitable for secrets:
    /// <see cref="Random"/> is not a cryptographic generator.
    /// </returns>
    public static string GetRandomString(int num)
    {
        if (num <= 0)
        {
            return "";
        }

        StringBuilder sb = new StringBuilder(num);
        lock (RandomGate)
        {
            for (int i = 0; i < num; i++)
            {
                // Next(n) returns 0..n-1, so the index is always valid and every
                // character is equally likely. The former Math.Round(x * 36)
                // could produce 36 and make Substring throw.
                sb.Append(RANDOM_STRING_SOURCE[SharedRandom.Next(RANDOM_STRING_SOURCE.Length)]);
            }
        }
        return sb.ToString();
    }

    /// <summary>
    /// Checks whether a string consists solely of the digits 0-9.
    /// </summary>
    /// <param name="inputData">String to test.</param>
    /// <returns>
    /// True when the string is non-empty and contains only ASCII digits;
    /// false otherwise, including for null.
    /// </returns>
    public static bool IsNumeric(string inputData)
    {
        if (inputData == null)
        {
            return false;
        }

        // \z anchors at the very end; '$' would also accept a trailing newline.
        return Regex.IsMatch(inputData, @"^[0-9]+\z");
    }

    /// <summary>
    /// Escapes angle brackets so that markup is displayed as text on a web page.
    /// </summary>
    /// <param name="src">Text to escape.</param>
    /// <returns>
    /// The text with &gt; and &lt; replaced by their entities, or null when
    /// <paramref name="src"/> is null. Ampersands are left untouched.
    /// </returns>
    public static string EscapeHtml(string src)
    {
        if (src == null)
        {
            return null;
        }

        // The replacement targets must be the entity forms; replacing ">" with
        // ">" (and "<" with "<") left the input unchanged.
        return src.Replace(">", "&gt;").Replace("<", "&lt;");
    }

    /// <summary>
    /// Formats plain text as HTML: escapes markup characters, turns line breaks
    /// into &lt;br&gt; and spaces into non-breaking spaces.
    /// </summary>
    /// <param name="str">Text to format.</param>
    /// <returns>The formatted text; null or empty input is returned unchanged.</returns>
    public static String ToHtml(string str)
    {
        if (str == null || str.Equals(""))
        {
            return str;
        }

        StringBuilder sb = new StringBuilder(str);
        // '&' must be escaped first so the entities produced below are not escaped again.
        sb.Replace("&", "&amp;");
        sb.Replace("<", "&lt;");
        sb.Replace(">", "&gt;");
        sb.Replace("\r\n", "<br>");
        sb.Replace("\n", "<br>");
        sb.Replace("\t", " ");
        sb.Replace(" ", "&nbsp;");
        return sb.ToString();
    }

    /// <summary>
    /// Converts text produced by <see cref="ToHtml"/> back to plain text.
    /// </summary>
    /// <param name="str">HTML-formatted text.</param>
    /// <returns>The plain text; null or empty input is returned unchanged.</returns>
    public static String ToTxt(String str)
    {
        if (str == null || str.Equals(""))
        {
            return str;
        }

        StringBuilder sb = new StringBuilder(str);
        sb.Replace("&nbsp;", " ");
        sb.Replace("<br>", "\r\n");
        sb.Replace("&lt;", "<");
        sb.Replace("&gt;", ">");
        // '&amp;' is decoded last so that e.g. "&amp;lt;" yields "&lt;", not "<".
        sb.Replace("&amp;", "&");
        return sb.ToString();
    }

    /// <summary>
    /// Reads a UTF-8 HTML file and returns the text of its body with script
    /// blocks, style blocks, tags and all whitespace removed.
    /// </summary>
    /// <param name="path">Path of the HTML file to read.</param>
    /// <returns>
    /// The stripped text. When the file has no body tag the whole file is processed.
    /// </returns>
    public static string HtmlFileToText(string path)
    {
        // File.ReadAllText opens and closes the file itself, so no reader is left open.
        string html = System.IO.File.ReadAllText(path, Encoding.UTF8);

        // IndexOf returns -1 when there is no body tag; Substring(-1) would throw.
        int bodyStart = html.IndexOf("<body", StringComparison.OrdinalIgnoreCase);
        string text = bodyStart >= 0 ? html.Substring(bodyStart) : html;

        // Lazy quantifiers keep content between two separate script/style blocks.
        text = Regex.Replace(text, @"<script[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<style[\s\S]*?</style>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "<[^>]+>", "");
        return Regex.Replace(text, @"\s", "");
    }
}
但是,如果输入的HTML标记本身有误,问题就大了。
方法三:
using System.Text.RegularExpressions;
/// <summary>
/// Strips HTML markup from a string, decodes a small set of entities and
/// HTML-encodes the remaining text.
/// </summary>
/// <param name="Htmlstring">Text that may contain HTML markup.</param>
/// <returns>
/// The trimmed, HTML-encoded text with markup removed. A null or empty input
/// is returned unchanged.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return Htmlstring;
    }

    // Remove script elements. [\s\S] is used instead of '.' so that scripts
    // spanning several lines are removed together with their body.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);

    // Remove tags, collapse line breaks followed by whitespace, and drop
    // leftover comment delimiters.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the supported entities; any other numeric entity is dropped.
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

    // Strings are immutable: String.Replace returns a new string, so the
    // result has to be assigned back. This also removes '<' and '>' characters
    // produced by decoding &lt; / &gt; above.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");

    // NOTE(review): HttpContext.Current is presumably null outside an ASP.NET
    // request, in which case this line would throw - confirm against callers.
    Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
    return Htmlstring;
}

浙公网安备 33010602011771号