
<?xml version="1.0" encoding="UTF-8" ?>
<students>
    <student number="itcast_0001">
        <!--<name id="itcast">tom</name>-->
        <name id="itcast">
            <xing>张</xing>
            <ming>三</ming>
        </name>
        <age>9999</age>
        <sex>male</sex>
    </student>
    <student number="itcast_0002">
        <name>sam</name>
        <age>20</age>
        <sex>female</sex>
    </student>
</students>
1 package com.haifei.jsoup;
2
3 import org.jsoup.Jsoup;
4 import org.jsoup.nodes.Document;
5
6 import java.io.File;
7 import java.io.IOException;
8 import java.net.URL;
9
10 /**
11 * Jsoup对象功能
12 */
13 public class JsoupDemo2 {
14 public static void main(String[] args) throws IOException {
15 //1 parse(File in, String charsetName):解析xml或html文件
16 String path = JsoupDemo1.class.getClassLoader().getResource("student.xml").getPath();
17 Document document = Jsoup.parse(new File(path), "utf-8");
18 // System.out.println(document); //返回字符串形式的xml文档内容
19
20 //2 parse(String html):解析xml或html字符串
21 String str = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n" +
22 "<students>\n" +
23 "\t<student number=\"itcast_0001\">\n" +
24 "\t\t<name>tom</name>\n" +
25 "\t\t<age>9999</age>\n" +
26 "\t\t<sex>male</sex>\n" +
27 "\t</student>\n" +
28 "\t<student number=\"itcast_0002\">\n" +
29 "\t\t<name>sam</name>\n" +
30 "\t\t<age>20</age>\n" +
31 "\t\t<sex>female</sex>\n" +
32 "\t</student>\n" +
33 "</students>";
34 Document document1 = Jsoup.parse(str);
35 // System.out.println(document1);
36
37 //3 parse(URL url, int timeoutMillis):通过网络路径获取指定的html或xml的文档对象,可爬虫
38 URL url = new URL("https://baike.baidu.com/item/jsoup/9012509?fr=aladdin");
39 Document document2 = Jsoup.parse(url, 10000);
40 // System.out.println(document2); //html网页代码-字符串形式
41 }
42 }
1 package com.haifei.jsoup;
2
3 import org.jsoup.Jsoup;
4 import org.jsoup.nodes.Document;
5 import org.jsoup.nodes.Element;
6 import org.jsoup.select.Elements;
7
8 import java.io.File;
9 import java.io.IOException;
10
11 /**
12 * Document/Element对象功能
13 */
14 public class JsoupDemo3 {
15 public static void main(String[] args) throws IOException {
16 String path = JsoupDemo1.class.getClassLoader().getResource("student.xml").getPath();
17 Document document = Jsoup.parse(new File(path), "utf-8");
18
19 //1 获取所有student对象
20 Elements elements = document.getElementsByTag("student");
21 // System.out.println(elements);
22 /*
23 <student number="itcast_0001">
24 <name>
25 tom
26 </name>
27 <age>
28 9999
29 </age>
30 <sex>
31 male
32 </sex>
33 </student>
34 <student number="itcast_0002">
35 <name>
36 sam
37 </name>
38 <age>
39 20
40 </age>
41 <sex>
42 female
43 </sex>
44 </student>
45 */
46
47 //2 获取属性名为id的元素对象
48 Elements elements1 = document.getElementsByAttribute("id");
49 // System.out.println(elements1);
50 /*
51 <name id="itcast">
52 tom
53 </name>
54 */
55
56 //3 获取number属性值为itcast_0002的元素对象
57 Elements elements2 = document.getElementsByAttributeValue("number", "itcast_0002");
58 // System.out.println(elements2);
59 /*
60 <student number="itcast_0002">
61 <name>
62 sam
63 </name>
64 <age>
65 20
66 </age>
67 <sex>
68 female
69 </sex>
70 </student>
71 */
72
73 //4 根据id获取值为itcast的元素对象
74 Element element = document.getElementById("itcast");
75 System.out.println(element);
76 /*
77 <name id="itcast">
78 tom
79 </name>
80 */
81 }
82 }
1 package com.haifei.jsoup;
2
3 import org.jsoup.Jsoup;
4 import org.jsoup.nodes.Document;
5 import org.jsoup.nodes.Element;
6 import org.jsoup.select.Elements;
7
8 import java.io.File;
9 import java.io.IOException;
10
11 /**
12 * Element对象功能
13 */
14 public class JsoupDemo4 {
15 public static void main(String[] args) throws IOException {
16 String path = JsoupDemo1.class.getClassLoader().getResource("student.xml").getPath();
17 Document document = Jsoup.parse(new File(path), "utf-8");
18
19 //通过Document对象获取name标签,获取所有的name标签
20 Elements elements = document.getElementsByTag("name");
21 System.out.println(elements.size()); //2
22
23 Element element_student = document.getElementsByTag("student").get(0);
24 Elements element_name = element_student.getElementsByTag("name");
25 System.out.println(element_name.size()); //1
26
27 //获取student对象的属性值
28 String number = element_student.attr("number");
29 System.out.println(number); //itcast_0001
30
31 //获取文本内容
32 String text = element_name.text(); //仅获取文本内容
33 String html = element_name.html(); //获取标签体的所有内容(包括子标签的字符串内容)
34 System.out.println(text); //tom
35 System.out.println(html); //tom
36 /*
37 张 三
38
39 <xing>
40 张
41 </xing>
42 <ming>
43 三
44 </ming>
45 */
46 }
47 }