应用HTMLParser解析并操作HTML内容
至于她为什么要那么命名、为什么要那么做就不用深究了,总之用HTMLParser做HTML解析是挺不错的选择。
根据“笑起来像你”发过来的原装代码展示如下:
dfdasfads.java
1import java.util.Vector;
2
3import org.htmlparser.Attribute;
4import org.htmlparser.Node;
5import org.htmlparser.Parser;
6import org.htmlparser.Tag;
7import org.htmlparser.tags.ScriptTag;
8
9import org.htmlparser.util.NodeList;
10import org.htmlparser.util.ParserException;
11
12public class dfdasfads {
13
14 public static void main(String[] args) throws ParserException {
15 StringBuffer sb = new StringBuffer();
16 sb.append("<p id=\"p1\"><span id=\"s2\" name=\"s22\">vvv</span></p><script>var char='aaaaa';</script>\r\n<input test.>\r\n<h1 dfasdf..>");
17 System.out.println(new dfdasfads().getFilterBody(sb.toString()));
18 }
19
20 public String getFilterBody(String strBody) {
21 // 一、 htmlparser 处理提交
22 Parser parser = Parser.createParser(strBody, "utf-8");
23 NodeList list;
24 String reValue = strBody;
25 try {
26 list = parser.parse(null);
27 visitNodeList(list);
28 reValue = list.toHtml();
29 } catch (ParserException e1) {
30 e1.printStackTrace();
31 }
32 return reValue;
33 }
34
35 /** *//**
36 * 移除所有标签的Id属性
37 * @param list 标签集合
38 */
39 private void visitNodeList(NodeList list) {
40 System.out.println(">>>visitNodeList(list)");
41 for (int i = 0; i < list.size(); i++) {
42 Node node = list.elementAt(i);
43
44 if (node instanceof ScriptTag) {
45 list.remove(i);
46 continue;
47 } else if (node instanceof Tag) {
48 Tag _tag = (Tag) node;
49
50 _tag.removeAttribute("id");
51 }
52
53 NodeList children = node.getChildren();
54 if (children != null && children.size() > 0) {
55 visitNodeList(children);
56 }
57
58 }
59 System.out.println("<<<visitNodeList(list)");
60 }
61}
62
1import java.util.Vector;
2
3import org.htmlparser.Attribute;
4import org.htmlparser.Node;
5import org.htmlparser.Parser;
6import org.htmlparser.Tag;
7import org.htmlparser.tags.ScriptTag;
8
9import org.htmlparser.util.NodeList;
10import org.htmlparser.util.ParserException;
11
12public class dfdasfads {
13
14 public static void main(String[] args) throws ParserException {
15 StringBuffer sb = new StringBuffer();
16 sb.append("<p id=\"p1\"><span id=\"s2\" name=\"s22\">vvv</span></p><script>var char='aaaaa';</script>\r\n<input test.>\r\n<h1 dfasdf..>");
17 System.out.println(new dfdasfads().getFilterBody(sb.toString()));
18 }
19
20 public String getFilterBody(String strBody) {
21 // 一、 htmlparser 处理提交
22 Parser parser = Parser.createParser(strBody, "utf-8");
23 NodeList list;
24 String reValue = strBody;
25 try {
26 list = parser.parse(null);
27 visitNodeList(list);
28 reValue = list.toHtml();
29 } catch (ParserException e1) {
30 e1.printStackTrace();
31 }
32 return reValue;
33 }
34
35 /** *//**
36 * 移除所有标签的Id属性
37 * @param list 标签集合
38 */
39 private void visitNodeList(NodeList list) {
40 System.out.println(">>>visitNodeList(list)");
41 for (int i = 0; i < list.size(); i++) {
42 Node node = list.elementAt(i);
43
44 if (node instanceof ScriptTag) {
45 list.remove(i);
46 continue;
47 } else if (node instanceof Tag) {
48 Tag _tag = (Tag) node;
49
50 _tag.removeAttribute("id");
51 }
52
53 NodeList children = node.getChildren();
54 if (children != null && children.size() > 0) {
55 visitNodeList(children);
56 }
57
58 }
59 System.out.println("<<<visitNodeList(list)");
60 }
61}
62
FilterBody.java
1
2
3import java.util.Vector;
4
5
6import org.htmlparser.Attribute;
7import org.htmlparser.Node;
8import org.htmlparser.Parser;
9import org.htmlparser.Tag;
10
11import org.htmlparser.tags.ScriptTag;
12import org.htmlparser.util.NodeList;
13import org.htmlparser.util.ParserException;
14
15
16
17
18public class FilterBody {
19
20
21 public static void main(String[] args) throws ParserException {
22 String sttt = "<embed allowFullScreen=\"true\" src=\"http://vhead.blog.sina.com.cn/player/outer_player.swf\" quality=\"high\" bgcolor=\"#ffffff\" width=\"424\" height=\"404\" name=\"vsplayer\" align=\"middle\" type=\"application/x-shockwave-flash\" pluginspage=\"http://www.macromedia.com/go/getflashplayer\" />";
23 System.out.println(new FilterBody().getFilterBody(sttt));
24
25 }
26
27 public String getFilterBody(String strBody) {
28
29 // 一、 htmlparser 处理提交
30 Parser parser = Parser.createParser(strBody, "utf-8");
31 NodeList list;
32 String reValue = strBody;
33 try {
34 list = parser.parse(null);
35 visitNodeList(list);
36 reValue = list.toHtml();
37 } catch (ParserException e1) {
38
39 }
40
41 ;
42
43 return reValue;
44 }
45
46
47
48 // 递归
49 private void visitNodeList(NodeList list) {
50 for (int i = 0; i < list.size(); i++) {
51 Node node = list.elementAt(i);
52
53 if (node instanceof Tag) {
54 if (node instanceof ScriptTag) {
55 list.remove(i);
56 continue;
57 }
58 Tag _tag = (Tag) node;
59 _tag.removeAttribute("id");
60 _tag.removeAttribute("onload");
61 _tag.removeAttribute("alt");
62
63 String tagName = _tag.getTagName();
64 if (tagName == null)
65 tagName = "";
66 else
67 tagName = tagName.trim().toUpperCase();
68
69
70 visitTag((Tag) node);
71 }
72
73 NodeList children = node.getChildren();
74 if (children != null && children.size() > 0)
75 visitNodeList(children);
76
77 }
78 }
79
80 // 获取tag
81 private void visitTag(Tag tag) {
82 String tagName = tag.getTagName();
83 if (tagName != null && tagName.equalsIgnoreCase("embed")) {
84 tag.setEmptyXmlTag(false);
85 tag.setAttribute("AllowNetworking", "\"none\"");
86 tag.setAttribute("AllowScriptAccess", "never", '"');
87 tag.setEmptyXmlTag(true);
88 }
89
90 Vector attrs = tag.getAttributesEx();
91
92 for (int i = 0; i < attrs.size(); i++) {
93 Object obj = attrs.elementAt(i);
94 if (obj != null && obj instanceof Attribute) {
95 visitAttribute((Attribute) obj, tag);
96 }
97 }
98 }
99
100 // 获取tag属性
101 private void visitAttribute(Attribute attribute, Tag tag) {
102 String attName = attribute.getName();
103 if (attName == null)
104 attName = "";
105 else
106 attName = attName.trim().toLowerCase();
107 String tagName = tag.getTagName();
108 if (tagName == null)
109 tagName = "";
110 else
111 tagName = tagName.trim().toLowerCase();
112 String tagValue = tag.getText();
113 if (tagValue == null)
114 tagValue = "";
115 else
116 tagValue = tagValue.trim().toLowerCase();
117 String attribValue = attribute.getValue();
118 if (attribValue == null)
119 attribValue = "";
120 else
121 attribValue = attribValue.trim().toLowerCase();
122
123 }
124
125}
126
1
2
3import java.util.Vector;
4
5
6import org.htmlparser.Attribute;
7import org.htmlparser.Node;
8import org.htmlparser.Parser;
9import org.htmlparser.Tag;
10
11import org.htmlparser.tags.ScriptTag;
12import org.htmlparser.util.NodeList;
13import org.htmlparser.util.ParserException;
14
15
16
17
18public class FilterBody {
19
20
21 public static void main(String[] args) throws ParserException {
22 String sttt = "<embed allowFullScreen=\"true\" src=\"http://vhead.blog.sina.com.cn/player/outer_player.swf\" quality=\"high\" bgcolor=\"#ffffff\" width=\"424\" height=\"404\" name=\"vsplayer\" align=\"middle\" type=\"application/x-shockwave-flash\" pluginspage=\"http://www.macromedia.com/go/getflashplayer\" />";
23 System.out.println(new FilterBody().getFilterBody(sttt));
24
25 }
26
27 public String getFilterBody(String strBody) {
28
29 // 一、 htmlparser 处理提交
30 Parser parser = Parser.createParser(strBody, "utf-8");
31 NodeList list;
32 String reValue = strBody;
33 try {
34 list = parser.parse(null);
35 visitNodeList(list);
36 reValue = list.toHtml();
37 } catch (ParserException e1) {
38
39 }
40
41 ;
42
43 return reValue;
44 }
45
46
47
48 // 递归
49 private void visitNodeList(NodeList list) {
50 for (int i = 0; i < list.size(); i++) {
51 Node node = list.elementAt(i);
52
53 if (node instanceof Tag) {
54 if (node instanceof ScriptTag) {
55 list.remove(i);
56 continue;
57 }
58 Tag _tag = (Tag) node;
59 _tag.removeAttribute("id");
60 _tag.removeAttribute("onload");
61 _tag.removeAttribute("alt");
62
63 String tagName = _tag.getTagName();
64 if (tagName == null)
65 tagName = "";
66 else
67 tagName = tagName.trim().toUpperCase();
68
69
70 visitTag((Tag) node);
71 }
72
73 NodeList children = node.getChildren();
74 if (children != null && children.size() > 0)
75 visitNodeList(children);
76
77 }
78 }
79
80 // 获取tag
81 private void visitTag(Tag tag) {
82 String tagName = tag.getTagName();
83 if (tagName != null && tagName.equalsIgnoreCase("embed")) {
84 tag.setEmptyXmlTag(false);
85 tag.setAttribute("AllowNetworking", "\"none\"");
86 tag.setAttribute("AllowScriptAccess", "never", '"');
87 tag.setEmptyXmlTag(true);
88 }
89
90 Vector attrs = tag.getAttributesEx();
91
92 for (int i = 0; i < attrs.size(); i++) {
93 Object obj = attrs.elementAt(i);
94 if (obj != null && obj instanceof Attribute) {
95 visitAttribute((Attribute) obj, tag);
96 }
97 }
98 }
99
100 // 获取tag属性
101 private void visitAttribute(Attribute attribute, Tag tag) {
102 String attName = attribute.getName();
103 if (attName == null)
104 attName = "";
105 else
106 attName = attName.trim().toLowerCase();
107 String tagName = tag.getTagName();
108 if (tagName == null)
109 tagName = "";
110 else
111 tagName = tagName.trim().toLowerCase();
112 String tagValue = tag.getText();
113 if (tagValue == null)
114 tagValue = "";
115 else
116 tagValue = tagValue.trim().toLowerCase();
117 String attribValue = attribute.getValue();
118 if (attribValue == null)
119 attribValue = "";
120 else
121 attribValue = attribValue.trim().toLowerCase();
122
123 }
124
125}
126