清理Word生成HTML的冗余；清理与清除HTML标签1

 /// <summary>
        /// Strips the redundant markup that Microsoft Word adds to the HTML it generates.
        /// </summary>
        /// <remarks>
        /// The cleanup is a fixed sequence of case-insensitive regular-expression removals,
        /// applied in order: HTML comments, the title element, unquoted class attributes,
        /// single-quoted style attributes, a set of wrapper/formatting tags (the tags only,
        /// not the text between them), tag runs that wrap nothing but a single space or
        /// non-breaking space, and v:* attributes. Finally, runs of consecutive line breaks
        /// are collapsed into one.
        /// </remarks>
        /// <param name="html">The HTML fragment or document to clean. May be null or empty.</param>
        /// <returns>
        /// The cleaned HTML. A null or empty <paramref name="html"/> is returned unchanged.
        /// </returns>
        static string ClearWord(string html)
        {
            // Nothing to clean; also avoids the ArgumentNullException Regex.Replace throws on null input.
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // Order matters: the wrapper tags must be stripped before the empty-paragraph
            // pass so that e.g. <p><o:p>&nbsp;</o:p></p> has become <p>&nbsp;</p> by then.
            string[] removalPatterns =
            {
                // Comments and the document title.
                @"<!--(\w|\W)+?-->",
                @"<title>(\w|\W)+?</title>",
                // Class attributes (unquoted) and style attributes (single-quoted).
                @"\s?class=\w+",
                @"\s+style='[^']+'",
                // Unneeded tags; only the tags themselves are removed, not their content.
                @"<(meta|link|/?o:|/?style|/?font|/?strong|/?st\d|/?head|/?html|/?body|/?span|!\[)[^>]*?>",
                // Empty paragraphs: tags wrapping only a space, a non-breaking space
                // character, or the &nbsp; entity.
                @"(<[^>]+>)+(&nbsp;|\u00A0| )(</\w+>)+",
                // v:* attributes attached to <img> tags.
                @"\s+v:\w+=""[^""]+""",
            };

            foreach (string pattern in removalPatterns)
            {
                html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
            }

            // Collapse consecutive line breaks (CRLF or LF) into a single one. "$1" re-emits
            // the last line break of the run, so the text keeps its own line-ending style.
            return Regex.Replace(html, @"(\r?\n){2,}", "$1");
        }

posted @ 2012-07-26 14:12 zhu_xj 阅读(478) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

zhu_xj

清理Word生成HTML的冗余；清理与清除HTML标签1

公告