HTML、XML 等 DOM 节点类文档的解析库 Jsoup

Jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

特点:

  • HTML、XML、自定义DOM格式文本解析;
  • 可操作HTML元素、属性、文本;
  • 适用于采集解析网站HTML;
  • DOM解析功能强大。

开源库jsoup-1.8.1.jar,基本用法:

 1 package com.zhang.jsoupdemo;
 2 
 3 import android.os.Environment;
 4 import android.support.v7.app.AppCompatActivity;
 5 import android.os.Bundle;
 6 
 7 import org.jsoup.Jsoup;
 8 import org.jsoup.nodes.Document;
 9 import org.jsoup.nodes.Element;
10 import org.jsoup.safety.Whitelist;
11 import org.jsoup.select.Elements;
12 
13 import java.io.File;
14 import java.io.IOException;
15 
16 public class MainActivity extends AppCompatActivity {
17 
18     private String html = "<html><head><title>Jsoup用法</title></head>"
19             + "<body><p><a href='http://baidu.com'>这里是 jsoup 项目的相关文章</a></p></body></html>";
20     private String url = "http://baidu.com";
21 
22     @Override
23     protected void onCreate(Bundle savedInstanceState) {
24         super.onCreate(savedInstanceState);
25         setContentView(R.layout.activity_main);
26 
27         //解析html文本
28         //载入数据
29         Document doc = Jsoup.parse(html);
30         //直接获取title
31         doc.title();
32 
33 
34         //解析并提取 HTML 元素
35         Elements eles = doc.getElementsByTag("a");
36         for (Element ele : eles) {
37             String linkHref = ele.attr("href");
38             String text = ele.text();
39         }
40 
41         //数据筛选、检索
42         Elements elements = doc.select("a[href]");
43         Elements elements1 = doc.select("img[src$=.png]");
44         Element element = doc.select("div.className").first();
45 
46         //修改数据
47         doc.select("div.className").attr("key", "value");
48         doc.select("div.className").addClass("myClass");//class="myClass"
49         //清理数据
50         doc.select("img").removeAttr("onClick");
51         //转换
52         String htmls = "";//不安全的
53         String safe = Jsoup.clean(htmls, Whitelist.basic());//安全的
54 
55 
56         //解析url
57         //get方式
58         try {
59             Document document = Jsoup.connect(url).get();
60         } catch (IOException e) {
61             e.printStackTrace();
62         }
63         //post方式
64         try {
65             Document document = Jsoup.connect(url).data("key", "value").timeout(3000).post();
66         } catch (IOException e) {
67             e.printStackTrace();
68         }
69 
70         //解析本地html
71         File input = new File(Environment.getExternalStorageDirectory() + "/index.html");
72         try {
73             Document document = Jsoup.parse(input,"utf-8","http://baidu.com");// ../baidu.png -> http://baidu.com/baidu.png
74         } catch (IOException e) {
75             e.printStackTrace();
76         }
77     }
78 }

 

解析 HTML 和 EPUB 的应用示例:

 1 package com.zhang.jsoup;
 2 
 3 import android.support.v7.app.AppCompatActivity;
 4 import android.os.Bundle;
 5 
 6 import org.jsoup.Jsoup;
 7 import org.jsoup.nodes.Document;
 8 import org.jsoup.nodes.Element;
 9 import org.jsoup.select.Elements;
10 
11 import java.io.IOException;
12 import java.io.InputStream;
13 
14 public class MainActivity extends AppCompatActivity {
15 
16     private String url = "http://mobile.csdn.net/";
17 
18     @Override
19     protected void onCreate(Bundle savedInstanceState) {
20         super.onCreate(savedInstanceState);
21         setContentView(R.layout.activity_main);
22 
23         new Thread(new Runnable() {
24             @Override
25             public void run() {
26                 parseHtml();
27                 parseEpub();
28             }
29         }).start();
30 
31     }
32 
33     private void parseHtml() {
34         try {
35             Document doc = Jsoup.connect(url).get();
36             Elements eles = doc.select("div.unit");
37             for (Element ele : eles) {
38                 String title = ele.getElementsByTag("h1").first().text();
39                 String href = ele.getElementsByTag("h1").first().getElementsByTag("a").attr("href");
40                 System.out.println(title + "\n" + href);
41             }
42         } catch (IOException e) {
43             e.printStackTrace();
44         }
45     }
46 
47     private void parseEpub() {
48         try {
49             InputStream inputStream = getAssets().open("fb.ncx");
50             int size = inputStream.available();
51             byte[] buffer = new byte[size];
52             inputStream.read(buffer);
53             inputStream.close();
54 
55             String epubText = new String(buffer, "UTF-8");
56 
57             Document document = Jsoup.parse(epubText);
58             String title = document.getElementsByTag("docTitle").first().text();
59             System.out.println(title + "\n");
60 
61             Elements elements = document.getElementsByTag("navPoint");
62             for (Element element : elements) {
63                 String s = element.text();
64                 String imgHref = element.getElementsByTag("content").first().attr("src");
65                 System.out.println(s + ":" + imgHref);
66             }
67 
68 
69         } catch (IOException e) {
70             e.printStackTrace();
71         }
72     }
73 }

 

posted @ 2016-12-21 23:56  changchou  阅读(656)  评论(0)  编辑  收藏  举报