using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

namespace HtmlStripTags


{
/// <summary>
/// Helpers for turning an HTML document into plain text.
/// </summary>
class HtmlHelper
{
    /// <summary>
    /// Named character references (HTML 4 set) and the text each one is replaced with.
    /// Each row is { entity, replacement }. Entity names are case-sensitive
    /// (for example &amp;Alpha; and &amp;alpha; are different characters).
    /// </summary>
    /// <remarks>
    /// Soft hyphen, zero-width joiners and directional marks are deliberately mapped to the
    /// empty string so that they vanish from the extracted text.
    /// NOTE(review): the whitespace entities (nbsp, ensp, emsp, thinsp) are mapped to a plain
    /// U+0020 space here -- confirm this matches the intended normalisation.
    /// </remarks>
    private static readonly string[][] htmlNamedEntities = new string[][]
    {
        new string[] { "&quot;", "\"" },
        new string[] { "&lt;", "<" },
        new string[] { "&gt;", ">" },
        new string[] { "&nbsp;", " " },
        new string[] { "&iexcl;", "¡" },
        new string[] { "&cent;", "¢" },
        new string[] { "&pound;", "£" },
        new string[] { "&curren;", "¤" },
        new string[] { "&yen;", "¥" },
        new string[] { "&brvbar;", "¦" },
        new string[] { "&sect;", "§" },
        new string[] { "&uml;", "¨" },
        new string[] { "&copy;", "©" },
        new string[] { "&ordf;", "ª" },
        new string[] { "&laquo;", "«" },
        new string[] { "&not;", "¬" },
        new string[] { "&shy;", "" },
        new string[] { "&reg;", "®" },
        new string[] { "&macr;", "¯" },
        new string[] { "&deg;", "°" },
        new string[] { "&plusmn;", "±" },
        new string[] { "&sup2;", "²" },
        new string[] { "&sup3;", "³" },
        new string[] { "&acute;", "´" },
        new string[] { "&micro;", "µ" },
        new string[] { "&para;", "¶" },
        new string[] { "&middot;", "·" },
        new string[] { "&cedil;", "¸" },
        new string[] { "&sup1;", "¹" },
        new string[] { "&ordm;", "º" },
        new string[] { "&raquo;", "»" },
        new string[] { "&frac14;", "¼" },
        new string[] { "&frac12;", "½" },
        new string[] { "&frac34;", "¾" },
        new string[] { "&iquest;", "¿" },
        new string[] { "&Agrave;", "À" },
        new string[] { "&Aacute;", "Á" },
        new string[] { "&Acirc;", "Â" },
        new string[] { "&Atilde;", "Ã" },
        new string[] { "&Auml;", "Ä" },
        new string[] { "&Aring;", "Å" },
        new string[] { "&AElig;", "Æ" },
        new string[] { "&Ccedil;", "Ç" },
        new string[] { "&Egrave;", "È" },
        new string[] { "&Eacute;", "É" },
        new string[] { "&Ecirc;", "Ê" },
        new string[] { "&Euml;", "Ë" },
        new string[] { "&Igrave;", "Ì" },
        new string[] { "&Iacute;", "Í" },
        new string[] { "&Icirc;", "Î" },
        new string[] { "&Iuml;", "Ï" },
        new string[] { "&ETH;", "Ð" },
        new string[] { "&Ntilde;", "Ñ" },
        new string[] { "&Ograve;", "Ò" },
        new string[] { "&Oacute;", "Ó" },
        new string[] { "&Ocirc;", "Ô" },
        new string[] { "&Otilde;", "Õ" },
        new string[] { "&Ouml;", "Ö" },
        new string[] { "&times;", "×" },
        new string[] { "&Oslash;", "Ø" },
        new string[] { "&Ugrave;", "Ù" },
        new string[] { "&Uacute;", "Ú" },
        new string[] { "&Ucirc;", "Û" },
        new string[] { "&Uuml;", "Ü" },
        new string[] { "&Yacute;", "Ý" },
        new string[] { "&THORN;", "Þ" },
        new string[] { "&szlig;", "ß" },
        new string[] { "&agrave;", "à" },
        new string[] { "&aacute;", "á" },
        new string[] { "&acirc;", "â" },
        new string[] { "&atilde;", "ã" },
        new string[] { "&auml;", "ä" },
        new string[] { "&aring;", "å" },
        new string[] { "&aelig;", "æ" },
        new string[] { "&ccedil;", "ç" },
        new string[] { "&egrave;", "è" },
        new string[] { "&eacute;", "é" },
        new string[] { "&ecirc;", "ê" },
        new string[] { "&euml;", "ë" },
        new string[] { "&igrave;", "ì" },
        new string[] { "&iacute;", "í" },
        new string[] { "&icirc;", "î" },
        new string[] { "&iuml;", "ï" },
        new string[] { "&eth;", "ð" },
        new string[] { "&ntilde;", "ñ" },
        new string[] { "&ograve;", "ò" },
        new string[] { "&oacute;", "ó" },
        new string[] { "&ocirc;", "ô" },
        new string[] { "&otilde;", "õ" },
        new string[] { "&ouml;", "ö" },
        new string[] { "&divide;", "÷" },
        new string[] { "&oslash;", "ø" },
        new string[] { "&ugrave;", "ù" },
        new string[] { "&uacute;", "ú" },
        new string[] { "&ucirc;", "û" },
        new string[] { "&uuml;", "ü" },
        new string[] { "&yacute;", "ý" },
        new string[] { "&thorn;", "þ" },
        new string[] { "&yuml;", "ÿ" },
        new string[] { "&OElig;", "Œ" },
        new string[] { "&oelig;", "œ" },
        new string[] { "&Scaron;", "Š" },
        new string[] { "&scaron;", "š" },
        new string[] { "&Yuml;", "Ÿ" },
        new string[] { "&fnof;", "ƒ" },
        new string[] { "&circ;", "ˆ" },
        new string[] { "&tilde;", "˜" },
        new string[] { "&Alpha;", "Α" },
        new string[] { "&Beta;", "Β" },
        new string[] { "&Gamma;", "Γ" },
        new string[] { "&Delta;", "Δ" },
        new string[] { "&Epsilon;", "Ε" },
        new string[] { "&Zeta;", "Ζ" },
        new string[] { "&Eta;", "Η" },
        new string[] { "&Theta;", "Θ" },
        new string[] { "&Iota;", "Ι" },
        new string[] { "&Kappa;", "Κ" },
        new string[] { "&Lambda;", "Λ" },
        new string[] { "&Mu;", "Μ" },
        new string[] { "&Nu;", "Ν" },
        new string[] { "&Xi;", "Ξ" },
        new string[] { "&Omicron;", "Ο" },
        new string[] { "&Pi;", "Π" },
        new string[] { "&Rho;", "Ρ" },
        new string[] { "&Sigma;", "Σ" },
        new string[] { "&Tau;", "Τ" },
        new string[] { "&Upsilon;", "Υ" },
        new string[] { "&Phi;", "Φ" },
        new string[] { "&Chi;", "Χ" },
        new string[] { "&Psi;", "Ψ" },
        new string[] { "&Omega;", "Ω" },
        new string[] { "&alpha;", "α" },
        new string[] { "&beta;", "β" },
        new string[] { "&gamma;", "γ" },
        new string[] { "&delta;", "δ" },
        new string[] { "&epsilon;", "ε" },
        new string[] { "&zeta;", "ζ" },
        new string[] { "&eta;", "η" },
        new string[] { "&theta;", "θ" },
        new string[] { "&iota;", "ι" },
        new string[] { "&kappa;", "κ" },
        new string[] { "&lambda;", "λ" },
        new string[] { "&mu;", "μ" },
        new string[] { "&nu;", "ν" },
        new string[] { "&xi;", "ξ" },
        new string[] { "&omicron;", "ο" },
        new string[] { "&pi;", "π" },
        new string[] { "&rho;", "ρ" },
        new string[] { "&sigmaf;", "ς" },
        new string[] { "&sigma;", "σ" },
        new string[] { "&tau;", "τ" },
        new string[] { "&upsilon;", "υ" },
        new string[] { "&phi;", "φ" },
        new string[] { "&chi;", "χ" },
        new string[] { "&psi;", "ψ" },
        new string[] { "&omega;", "ω" },
        new string[] { "&thetasym;", "ϑ" },
        new string[] { "&upsih;", "ϒ" },
        new string[] { "&piv;", "ϖ" },
        new string[] { "&ensp;", " " },
        new string[] { "&emsp;", " " },
        new string[] { "&thinsp;", " " },
        new string[] { "&zwnj;", "" },
        new string[] { "&zwj;", "" },
        new string[] { "&lrm;", "" },
        new string[] { "&rlm;", "" },
        new string[] { "&ndash;", "–" },
        new string[] { "&mdash;", "—" },
        new string[] { "&lsquo;", "‘" },
        new string[] { "&rsquo;", "’" },
        new string[] { "&sbquo;", "‚" },
        new string[] { "&ldquo;", "“" },
        new string[] { "&rdquo;", "”" },
        new string[] { "&bdquo;", "„" },
        new string[] { "&dagger;", "†" },
        new string[] { "&Dagger;", "‡" },
        new string[] { "&bull;", "•" },
        new string[] { "&hellip;", "…" },
        new string[] { "&permil;", "‰" },
        new string[] { "&prime;", "′" },
        new string[] { "&Prime;", "″" },
        new string[] { "&lsaquo;", "‹" },
        new string[] { "&rsaquo;", "›" },
        new string[] { "&oline;", "‾" },
        new string[] { "&frasl;", "⁄" },
        new string[] { "&euro;", "€" },
        new string[] { "&image;", "ℑ" },
        new string[] { "&weierp;", "℘" },
        new string[] { "&real;", "ℜ" },
        new string[] { "&trade;", "™" },
        new string[] { "&alefsym;", "ℵ" },
        new string[] { "&larr;", "←" },
        new string[] { "&uarr;", "↑" },
        new string[] { "&rarr;", "→" },
        new string[] { "&darr;", "↓" },
        new string[] { "&harr;", "↔" },
        new string[] { "&crarr;", "↵" },
        new string[] { "&lArr;", "⇐" },
        new string[] { "&uArr;", "⇑" },
        new string[] { "&rArr;", "⇒" },
        new string[] { "&dArr;", "⇓" },
        new string[] { "&hArr;", "⇔" },
        new string[] { "&forall;", "∀" },
        new string[] { "&part;", "∂" },
        new string[] { "&exist;", "∃" },
        new string[] { "&empty;", "∅" },
        new string[] { "&nabla;", "∇" },
        new string[] { "&isin;", "∈" },
        new string[] { "&notin;", "∉" },
        new string[] { "&ni;", "∋" },
        new string[] { "&prod;", "∏" },
        new string[] { "&sum;", "∑" },
        new string[] { "&minus;", "−" },
        new string[] { "&lowast;", "∗" },
        new string[] { "&radic;", "√" },
        new string[] { "&prop;", "∝" },
        new string[] { "&infin;", "∞" },
        new string[] { "&ang;", "∠" },
        new string[] { "&and;", "∧" },
        new string[] { "&or;", "∨" },
        new string[] { "&cap;", "∩" },
        new string[] { "&cup;", "∪" },
        new string[] { "&int;", "∫" },
        new string[] { "&there4;", "∴" },
        new string[] { "&sim;", "∼" },
        new string[] { "&cong;", "≅" },
        new string[] { "&asymp;", "≈" },
        new string[] { "&ne;", "≠" },
        new string[] { "&equiv;", "≡" },
        new string[] { "&le;", "≤" },
        new string[] { "&ge;", "≥" },
        new string[] { "&sub;", "⊂" },
        new string[] { "&sup;", "⊃" },
        new string[] { "&nsub;", "⊄" },
        new string[] { "&sube;", "⊆" },
        new string[] { "&supe;", "⊇" },
        new string[] { "&oplus;", "⊕" },
        new string[] { "&otimes;", "⊗" },
        new string[] { "&perp;", "⊥" },
        new string[] { "&sdot;", "⋅" },
        new string[] { "&lceil;", "⌈" },
        new string[] { "&rceil;", "⌉" },
        new string[] { "&lfloor;", "⌊" },
        new string[] { "&rfloor;", "⌋" },
        new string[] { "&lang;", "〈" },
        new string[] { "&rang;", "〉" },
        new string[] { "&loz;", "◊" },
        new string[] { "&spades;", "♠" },
        new string[] { "&clubs;", "♣" },
        new string[] { "&hearts;", "♥" },
        new string[] { "&diams;", "♦" },
        new string[] { "&amp;", "&" }
    };

    // Lookup built once from htmlNamedEntities; must be declared after the table because
    // static field initializers run in textual order.
    private static readonly Dictionary<string, string> namedEntityLookup = BuildNamedEntityLookup();

    // Matches one complete character reference: decimal (&#65;), hexadecimal (&#x41;)
    // or named (&amp;). ASCII classes are spelled out so that \d / \w Unicode matching
    // cannot let non-ASCII digits through.
    private static readonly Regex entityPattern =
        new Regex("&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);");

    /// <summary>
    /// Extracts the visible text from an HTML document.
    /// </summary>
    /// <remarks>
    /// Only the part between the first "&lt;body" and the first "&lt;/body&gt;" is scanned when
    /// those markers exist. Comments and the contents of noscript, script and style elements
    /// are dropped; every other tag is replaced by a single space so that words from adjacent
    /// elements do not run together. The input is trimmed before scanning; the output is not.
    /// Character references are decoded in one pass, so text produced by decoding is never
    /// decoded a second time ("&amp;amp;#65;" yields "&amp;#65;", not "A").
    /// </remarks>
    /// <param name="htmlContent">HTML markup; may be null.</param>
    /// <param name="replaceNamedEntities">
    /// When true, named references listed in the entity table are replaced by their text.
    /// </param>
    /// <param name="replaceNumberedEntities">
    /// When true, decimal and hexadecimal numeric references that denote a valid Unicode
    /// scalar value are replaced by that character; invalid ones are left as written.
    /// </param>
    /// <returns>
    /// null when <paramref name="htmlContent"/> is null, an empty string when it is empty or
    /// whitespace, otherwise the extracted text.
    /// </returns>
    public static string HtmlStripTags(string htmlContent, bool replaceNamedEntities, bool replaceNumberedEntities)
    {
        if (htmlContent == null)
            return null;
        htmlContent = htmlContent.Trim();
        if (htmlContent.Length == 0)
            return string.Empty;

        // Markup keywords are ASCII, so compare ordinally rather than with culture rules.
        int bodyStartTagIdx = htmlContent.IndexOf("<body", StringComparison.OrdinalIgnoreCase);
        int bodyEndTagIdx = htmlContent.IndexOf("</body>", StringComparison.OrdinalIgnoreCase);

        int startIdx = bodyStartTagIdx >= 0 ? bodyStartTagIdx : 0;
        int endIdx = bodyEndTagIdx >= 0 ? bodyEndTagIdx : htmlContent.Length - 1;

        bool insideTag = false,
             insideAttributeValue = false,
             insideHtmlComment = false,
             insideScriptBlock = false,
             insideNoScriptBlock = false,
             insideStyleBlock = false;
        char attributeValueDelimiter = '"';

        StringBuilder sb = new StringBuilder(htmlContent.Length);
        for (int i = startIdx; i <= endIdx; i++)
        {
            // Comments take precedence over everything else, including script/style bodies.
            if (insideHtmlComment)
            {
                if (StartsWithAt(htmlContent, i, "-->"))
                {
                    i += 2; // the loop increment steps past the final '>'
                    insideHtmlComment = false;
                }
                continue;
            }
            if (StartsWithAt(htmlContent, i, "<!--"))
            {
                i += 3;
                insideHtmlComment = true;
                continue;
            }

            // Elements whose whole content is discarded; checked in this fixed order.
            if (SkipDiscardedBlock(htmlContent, ref i, ref insideNoScriptBlock, "<noscript", "</noscript"))
                continue;
            if (SkipDiscardedBlock(htmlContent, ref i, ref insideScriptBlock, "<script", "</script"))
                continue;
            if (SkipDiscardedBlock(htmlContent, ref i, ref insideStyleBlock, "<style", "</style"))
                continue;

            char current = htmlContent[i];

            if (!insideTag)
            {
                if (current == '<')
                    insideTag = true;
                else
                    sb.Append(current);
                continue;
            }

            // Inside a tag: a '>' only closes the tag when it is not within a quoted attribute value.
            if (insideAttributeValue)
            {
                if (current == attributeValueDelimiter)
                    insideAttributeValue = false;
            }
            else if (current == '"' || current == '\'')
            {
                attributeValueDelimiter = current;
                insideAttributeValue = true;
            }
            else if (current == '>')
            {
                insideTag = false;
                sb.Append(' '); // prevent words from different tags (<td>s for example) from joining together
            }
        }

        string text = sb.ToString();
        if (replaceNamedEntities || replaceNumberedEntities)
            text = DecodeEntities(text, replaceNamedEntities, replaceNumberedEntities);
        return text;
    }

    // Builds the entity -> replacement dictionary from the htmlNamedEntities table.
    private static Dictionary<string, string> BuildNamedEntityLookup()
    {
        Dictionary<string, string> lookup =
            new Dictionary<string, string>(htmlNamedEntities.Length, StringComparer.Ordinal);
        foreach (string[] pair in htmlNamedEntities)
            lookup[pair[0]] = pair[1];
        return lookup;
    }

    // True when 'html' contains exactly 'token' starting at 'index' (ordinal, case-sensitive).
    private static bool StartsWithAt(string html, int index, string token)
    {
        return index + token.Length <= html.Length &&
               string.CompareOrdinal(html, index, token, 0, token.Length) == 0;
    }

    // True when 'html' holds 'lowerCasePrefix' at 'index' (ASCII case-insensitive) and the next
    // character is whitespace or '>', i.e. the prefix is a whole tag name and not e.g. "<scripts".
    private static bool IsTagAt(string html, int index, string lowerCasePrefix)
    {
        int afterIdx = index + lowerCasePrefix.Length;
        if (afterIdx >= html.Length)
            return false;

        for (int k = 0; k < lowerCasePrefix.Length; k++)
        {
            char c = html[index + k];
            if (c >= 'A' && c <= 'Z')
                c = (char)(c + ('a' - 'A')); // fold ASCII letters only
            if (c != lowerCasePrefix[k])
                return false;
        }

        char after = html[afterIdx];
        return after == '>' || char.IsWhiteSpace(after);
    }

    // Tracks one kind of discarded element (noscript, script or style). Returns true when the
    // character at 'i' belongs to that element and must be skipped; 'i' may be advanced so that
    // the caller's loop increment lands just past the consumed markup.
    private static bool SkipDiscardedBlock(string html, ref int i, ref bool insideBlock,
                                           string openPrefix, string closePrefix)
    {
        if (insideBlock)
        {
            if (IsTagAt(html, i, closePrefix))
            {
                // Jump to the '>' that ends the closing tag (or to the end of input if it is missing).
                int closeIdx = html.IndexOf('>', i + closePrefix.Length);
                i = closeIdx >= 0 ? closeIdx : html.Length;
                insideBlock = false;
            }
            return true;
        }

        if (IsTagAt(html, i, openPrefix))
        {
            // Any attributes of the opening tag are skipped as part of the block content.
            i += openPrefix.Length;
            insideBlock = true;
            return true;
        }

        return false;
    }

    // Replaces character references in 'text' in a single left-to-right pass.
    private static string DecodeEntities(string text, bool replaceNamed, bool replaceNumbered)
    {
        if (text.IndexOf('&') < 0)
            return text;

        StringBuilder decoded = new StringBuilder(text.Length);
        int copiedUpTo = 0;
        for (Match m = entityPattern.Match(text); m.Success; m = m.NextMatch())
        {
            string replacement = ResolveEntity(m.Value, replaceNamed, replaceNumbered);
            if (replacement == null)
                continue; // unknown or disabled reference: keep it verbatim

            decoded.Append(text, copiedUpTo, m.Index - copiedUpTo);
            decoded.Append(replacement);
            copiedUpTo = m.Index + m.Length;
        }
        decoded.Append(text, copiedUpTo, text.Length - copiedUpTo);
        return decoded.ToString();
    }

    // Returns the replacement text for one reference matched by entityPattern,
    // or null when the reference must be left untouched.
    private static string ResolveEntity(string entity, bool replaceNamed, bool replaceNumbered)
    {
        if (entity[1] != '#')
        {
            string mapped;
            if (replaceNamed && namedEntityLookup.TryGetValue(entity, out mapped))
                return mapped;
            return null;
        }

        if (!replaceNumbered)
            return null;

        bool isHex = entity[2] == 'x' || entity[2] == 'X';
        int digitsStart = isHex ? 3 : 2;
        string digits = entity.Substring(digitsStart, entity.Length - digitsStart - 1);

        int codePoint;
        System.Globalization.NumberStyles style = isHex
            ? System.Globalization.NumberStyles.AllowHexSpecifier
            : System.Globalization.NumberStyles.None;
        if (!int.TryParse(digits, style, System.Globalization.CultureInfo.InvariantCulture, out codePoint))
            return null;

        // Hex parsing can yield a negative int (e.g. FFFFFFFF); surrogates are not scalar values.
        if (codePoint < 0 || codePoint > 0x10FFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
            return null;

        return char.ConvertFromUtf32(codePoint);
    }
}
}
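// Blog page footer (not part of the program): posted on 2007-09-24 23:28 by 迷你软件 ("Mini Software"); 365 views.
// The remaining footer items were page UI links: comments, bookmark, report.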