使用Jsoup抓取网页数据

Jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

基本了解参考中文文档:http://www.open-open.com/jsoup/

下面介绍一个具体例子:

比如我们要抓取:http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp上某个时段的所有城市的天气信息并且保存为csv格式的文件。

在Chrome下先执行一次访问,查看页面访问参数:


其中得到请求方式、URL、参数列表等信息。

然后根据文档开始编写代码。

项目结构如下:

CityWeather.java:

 1 package bean;
 2 
 3 public class CityWeather 
 4 {
 5     public final static int ITEM_NUMBER = 6;
 6     private String id;
 7     private String city;
 8     private String date;
 9     private String aqi;
10     private String aql;
11     private String pp;
12     
13     public CityWeather(){}
14     
15     public CityWeather(String id, String city, String date, 
16             String aqi, String aql, String pp)
17     {
18         this.id = id;
19         this.city = city;
20         this.date = date;
21         this.aqi = aqi;
22         this.aql = aql;
23         this.pp = pp;
24     }
25     
26     public String getId() 
27     {
28         return id;
29     }
30     
31     public void setId(String id) 
32     {
33         this.id = id;
34     }
35     
36     public String getCity() 
37     {
38         return city;
39     }
40     
41     public void setCity(String city) 
42     {
43         this.city = city;
44     }
45     
46     public String getDate() 
47     {
48         return date;
49     }
50     
51     public void setDate(String date) 
52     {
53         this.date = date;
54     }
55     
56     public String getAqi() 
57     {
58         return aqi;
59     }
60     
61     public void setAqi(String aqi) 
62     {
63         this.aqi = aqi;
64     }
65     
66     public String getAql() 
67     {
68         return aql;
69     }
70     
71     public void setAql(String aql) 
72     {
73         this.aql = aql;
74     }
75     
76     public String getPp() 
77     {
78         return pp;
79     }
80     
81     public void setPp(String pp) 
82     {
83         this.pp = pp;
84     }
85     
86     @Override
87     public String toString()
88     {
89         return id + ", " + city + ", " + date + ", " + aqi +
90                 ", " + aql + ", " + pp + "\n";
91     }
92 }
View Code

War.java:

 1 package bean;
 2 
 3 public class War 
 4 {
 5     //定义常量
 6     public final static int GET = 0;
 7     public final static int POST = 1;
 8     //待访问网址的url
 9     private String url;
10     //参数列表
11     private String[] params;
12     //对应参数值
13     private String[] values;
14     //请求类型,默认为GET
15     private int requestMethod;
16     //待获取元素的选择器语法表示
17     private String selector;
18     
19     public War(String url, String[] params, String[] values, 
20             int requestMethod, String selector)
21     {
22         this.url = url;
23         this.params = params;
24         this.values = values;
25         this.requestMethod = requestMethod;
26         this.setSelector(selector);
27     }
28     
29     public String getUrl()
30     {
31         return url;
32     }
33     
34     public void setUrl(String url)
35     {
36         this.url = url;
37     }
38     
39     public String[] getParams()
40     {
41         return params;
42     }
43     
44     public void setParams(String[] params)
45     {
46         this.params = params;
47     }
48     
49     public String[] getValues()
50     {
51         return values;
52     }
53     
54     public void setValues(String[] values)
55     {
56         this.values = values;
57     }
58     
59     public int getRequestMethod()
60     {
61         return requestMethod;
62     }
63     
64     public void setRequestMethod(int requestMethod)
65     {
66         this.requestMethod = requestMethod;
67     }
68 
69     public String getSelector() 
70     {
71         return selector;
72     }
73 
74     public void setSelector(String selector) 
75     {
76         this.selector = selector;
77     }
78 }
View Code

Op.java:

 1 package io;
 2 import java.io.BufferedWriter;
 3 import java.io.FileWriter;
 4 import java.io.IOException;
 5 import java.io.Writer;
 6 import java.util.ArrayList;
 7 import bean.CityWeather;
 8 
 9 public class Op 
10 {
11     public static void List2File(ArrayList<CityWeather> list, String file) throws IOException
12     {
13           Writer w = new FileWriter(file, true);
14         BufferedWriter buffWriter = new BufferedWriter(w);
15         for(CityWeather cw : list)
16         {
17             buffWriter.write(cw.toString());
18         }
19         buffWriter.close();
20         w.close();
21     }
22 }
View Code

Crawler.java:

 1 package service;
 2 import bean.CityWeather;
 3 import bean.War;
 4 import org.jsoup.Jsoup;
 5 import org.jsoup.nodes.Document;
 6 import org.jsoup.nodes.Element;
 7 import org.jsoup.select.Elements;
 8 import org.jsoup.Connection;
 9 import java.io.IOException;
10 import java.util.ArrayList;
11 
12 public class Crawler 
13 {
14     public static ArrayList<CityWeather> getResults(War war) throws IOException  
15     {
16         Connection conn = Jsoup.connect(war.getUrl());
17         if(war.getParams() != null)
18         {
19             for(int i = 0; i < war.getParams().length; i++)
20             {
21                 conn.data(war.getParams()[i], war.getValues()[i]);
22             }
23         }
24         Document doc = null;
25         if(war.getRequestMethod() == War.GET)
26         {
27             doc = conn.timeout(10000).get();
28         }
29         else
30         {
31             doc = conn.timeout(10000).post();
32         }
33         if(doc == null)
34         {
35             System.out.print("unavailable\n");
36             return null;
37         }
38         Elements results = doc.select(war.getSelector());
39         ArrayList<CityWeather> list = new ArrayList<CityWeather>();
40         int i = 0;
41         CityWeather cw = null;
42         for(Element e: results)
43         {
44             switch(i % CityWeather.ITEM_NUMBER)
45             {
46             case 0:
47                 cw = new CityWeather();
48                 cw.setId(e.text());
49                 break;
50             case 1:
51                 cw.setCity(e.text());
52                 break;
53             case 2:
54                 cw.setDate(e.text());
55                 break;
56             case 3:
57                 cw.setAqi(e.text());
58                 break;
59             case 4:
60                 cw.setAql(e.text());
61                 break;
62             default:
63                 cw.setPp(e.text().replace(',', '|'));
64                 list.add(cw);
65             }
66             i++;
67         }
68         return list;
69     }
70 }
View Code

Test.java:

 1 package test;
 2 import java.text.DateFormat;
 3 import java.util.ArrayList;
 4 import java.util.Date;
 5 import service.Crawler;
 6 import bean.CityWeather;
 7 import bean.War;
 8 import io.Op;
 9 
10 public class Test 
11 {
12     public static void main(String[] args) throws Exception
13     {
14         System.out.println(DateFormat.getDateTimeInstance().format(new Date()) + " 开始运行");
15         String url = "http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp";
16         String[] params = { "city", "startdate", "enddate", "page" };
17         String[] values = { "", "2016-05-01", "2016-05-01", "2" };
18         int requestMethod = War.GET;
19         String selector = "td.report1_6";
20         War war = new War(url, params, values, requestMethod, selector);
21         ArrayList<CityWeather> list = Crawler.getResults(war);
22         Op.List2File(list, "data/file.csv");
23         System.out.println(DateFormat.getDateTimeInstance().format(new Date()) + " 运行结束");
24     }
25 }
View Code

经过测试:抓取14年到16年的天气信息,跑的非常慢...大概跑了一天?...

而且访问有时候会失败,需要对ArrayList<CityWeather> list = Crawler.getResults(war);循环调用直到list的size不为0表示访问成功才可以访问下一页。

没有异常处理...

我在考虑是不是可以多线程同时访问一页最后再把结果merge起来,这样肯定快很多吧。

有空实现一下。

今天试了一下,开了1000个线程网站就暂时性bug了...

看来还要适当控制一下...果然多线程、分布式之类的还需要花时间仔细研究一下。

代码下载:https://github.com/SplayHuo/JavaProject

posted @ 2016-05-04 17:15  hxy_has_been_used  阅读(464)  评论(0编辑  收藏  举报