清理Word生成HTML的冗余;清理与清除HTML标签1
/// <summary>清理Word生成的冗余HTML【使用】 /// /// </summary> /// <param name="html"></param> /// <returns></returns> static string ClearWord(string html) { System.Collections.Specialized.StringCollection sc = new System.Collections.Specialized.StringCollection(); // get rid of unnecessary tag spans (comments and title) sc.Add(@"<!--(\w|\W)+?-->"); sc.Add(@"<title>(\w|\W)+?</title>"); // Get rid of classes and styles sc.Add(@"\s?class=\w+"); sc.Add(@"\s+style='[^']+'"); // Get rid of unnecessary tags //sc.Add(@"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>"); sc.Add(@"<(meta|link|/?o:|/?style|/?font|/?strong|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>"); // Get rid of empty paragraph tags sc.Add(@"(<[^>]+>)+ (</\w+>)+"); // remove bizarre v: element attached to <img> tag sc.Add(@"\s+v:\w+=""[^""]+"""); // remove extra lines sc.Add(@"(\n\r){2,}"); //return sc.Cast<string>().Aggregate(html, (current, s) => Regex.Replace(current, s, "", RegexOptions.IgnoreCase)); foreach (string s in sc) { html = Regex.Replace(html, s, "", RegexOptions.IgnoreCase); } return html; }