/// <summary>
/// Usage example: runs a sample HTML document through HtmlRemoveTagsExcept with a whitelist
/// for the a and img tags, and writes the filtered markup to the console.
/// </summary>
public static void HtmlRemove()
{
    // Whitelist: tag name -> attribute names that may remain on that tag.
    var whitelist = new Dictionary<string, string[]>
    {
        ["a"] = new[] { "href", "data-luo" },
        ["img"] = new[] { "src", "lay-her", "data-luo" }
    };

    // Sample markup mixing allowed tags, disallowed tags and disallowed attributes.
    const string sampleHtml = "<html><head><title>Test</title></head><body><a lay-her='123' href=\"https://example.com\">Link</a><p>Not allowed</p><span class=\"developer\"><img src=\"https://www.luocore.com/assets/logo-dark.be3794d7.png\"> <span>LuoCore</span></span><img lay-her='123' data-luo='222' src=\"图片路径\" data=\"test\" /> <a data-luo='222' href=\"baidu.com\" /> <div><span>测试标签</span><img src=\"https://pic.cnblogs.com/face/646489/20140908123308.png\" class=\"avatar\" alt=\"博主头像\"></div></body></html>";

    // Filter the markup and print what survives.
    Console.WriteLine(HtmlRemoveTagsExcept(sampleHtml, whitelist));
}
/// <summary>
/// Removes every HTML tag from <paramref name="html"/> except the whitelisted ones, and removes
/// every attribute from the surviving tags except the ones whitelisted for that tag.
/// </summary>
/// <param name="html">Markup to filter. <c>null</c> or empty input yields an empty string.</param>
/// <param name="allowedTags">
/// Map from allowed tag name to the attribute names allowed on that tag. A <c>null</c> or empty
/// attribute array keeps the tag but drops all of its attributes. Names are matched
/// case-insensitively; kept tags are re-emitted using the spelling of the dictionary key.
/// </param>
/// <returns>
/// The filtered markup. Text content of removed elements is kept, runs of two or more whitespace
/// characters are collapsed to a single space, and whitespace before a tag's closing bracket is trimmed.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="allowedTags"/> is <c>null</c>.</exception>
/// <remarks>
/// Only attributes with a quoted value (single or double quotes) are retained. This is a
/// regex-based filter, not an HTML parser: a closing angle bracket inside an attribute value ends
/// the tag early, so it should not be the only defence against hostile markup.
/// </remarks>
public static string HtmlRemoveTagsExcept(string html, Dictionary<string, string[]> allowedTags)
{
    if (allowedTags == null)
    {
        throw new ArgumentNullException(nameof(allowedTags));
    }

    if (string.IsNullOrEmpty(html))
    {
        return string.Empty;
    }

    // CultureInvariant keeps "IMG" matching "img" under cultures with special casing rules
    // (e.g. Turkish). RegexOptions.Compiled is deliberately not used: these patterns are built
    // per call, and compiling a regex that is used once costs more than it saves.
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    string[] escapedTagNames = allowedTags.Keys
        .Where(name => !string.IsNullOrEmpty(name))
        .Select(name => Regex.Escape(name))
        .ToArray();
    string tagAlternation = string.Join("|", escapedTagNames);

    // Step 1: drop every tag that is not whitelisted. The optional '/' lives INSIDE the negative
    // lookahead; if it were matched before the lookahead, the engine could backtrack it to empty
    // and the closing tag of an allowed element (e.g. "</a>") would be removed as well.
    string stripPattern = escapedTagNames.Length == 0
        ? "<[^>]*>"
        : $"<(?!/?(?:{tagAlternation})(?:\\s|/?>))[^>]*>";
    string result = Regex.Replace(html, stripPattern, string.Empty, options);

    // Step 2: for each allowed tag, rebuild its opening tag from the whitelisted attributes only.
    foreach (KeyValuePair<string, string[]> entry in allowedTags)
    {
        string tag = entry.Key;
        if (tag.Length == 0)
        {
            continue;
        }

        string escapedTag = Regex.Escape(tag);
        string[] escapedAttributes = (entry.Value ?? new string[0])
            .Where(attr => !string.IsNullOrEmpty(attr))
            .Select(attr => Regex.Escape(attr))
            .ToArray();

        // The value must close with the same quote character that opened it, so that a value
        // such as "it's" is kept whole instead of being cut at the apostrophe.
        Regex attributePattern = escapedAttributes.Length == 0
            ? null
            : new Regex(
                $@"\s+(?:{string.Join("|", escapedAttributes)})\s*=\s*(?:""[^""]*""|'[^']*')",
                options);

        result = Regex.Replace(result, $"<{escapedTag}(\\s[^>]*?)?(/?>)", match =>
        {
            string keptAttributes = attributePattern == null
                ? string.Empty
                : string.Concat(attributePattern.Matches(match.Groups[1].Value)
                    .Cast<Match>()
                    .Select(attrMatch => attrMatch.Value));
            return $"<{tag}{keptAttributes}{match.Groups[2].Value}";
        }, options);

        // Closing tags carry no attributes; normalize whatever was written there to "</tag>".
        string closingTag = $"</{tag}>";
        result = Regex.Replace(result, $"</{escapedTag}(?:\\s[^>]*)?/?>", match => closingTag, options);
    }

    // Step 3: collapse whitespace runs and trim whitespace left in front of '>' or '/>'.
    result = Regex.Replace(result, @"\s{2,}", " ");
    result = Regex.Replace(result, @"<(\w+)(\s[^>]*?)?\s*(/?>)", "<$1$2$3");
    return result;
}