网页数据获取小案例(Java 控制台程序)

  1 /**
  2  * 实体类(网页数据的实体类)
  3  * @author DSH
  4  */
  5 public class Content {
  6     
  7     private String name;//名称
  8     private String address;//地址
  9     private String phone;//电话
 10     private String type;//所属分类
 11     private String province;//所在省份
 12     private String area;//所在区县
 13     private String address_detail;//详细地址
 14     private String label;//所属标签
 15     private String dd_x;//大地坐标_X
 16     private String dd_y;//大地坐标_Y
 17     private String hx_x;//火星坐标_X
 18     private String hx_y;//火星坐标_Y
 19     private String bd_x;//百度坐标_X
 20     private String bd_y;//百度坐标_Y
 21     
 22     private String record;//条目
 23     
 24     public String getRecord() {
 25         return record;
 26     }
 27     public void setRecord(String record) {
 28         this.record = record;
 29     }
 30     public String getName() {
 31         return name;
 32     }
 33     public void setName(String name) {
 34         this.name = name;
 35     }
 36     public String getAddress() {
 37         return address;
 38     }
 39     public void setAddress(String address) {
 40         this.address = address;
 41     }
 42     public String getPhone() {
 43         return phone;
 44     }
 45     public void setPhone(String phone) {
 46         this.phone = phone;
 47     }
 48     public String getType() {
 49         return type;
 50     }
 51     public void setType(String type) {
 52         this.type = type;
 53     }
 54     public String getProvince() {
 55         return province;
 56     }
 57     public void setProvince(String province) {
 58         this.province = province;
 59     }
 60     public String getArea() {
 61         return area;
 62     }
 63     public void setArea(String area) {
 64         this.area = area;
 65     }
 66     public String getAddress_detail() {
 67         return address_detail;
 68     }
 69     public void setAddress_detail(String address_detail) {
 70         this.address_detail = address_detail;
 71     }
 72     public String getLabel() {
 73         return label;
 74     }
 75     public void setLabel(String label) {
 76         this.label = label;
 77     }
 78     public String getDd_x() {
 79         return dd_x;
 80     }
 81     public void setDd_x(String dd_x) {
 82         this.dd_x = dd_x;
 83     }
 84     public String getDd_y() {
 85         return dd_y;
 86     }
 87     public void setDd_y(String dd_y) {
 88         this.dd_y = dd_y;
 89     }
 90     public String getHx_x() {
 91         return hx_x;
 92     }
 93     public void setHx_x(String hx_x) {
 94         this.hx_x = hx_x;
 95     }
 96     public String getHx_y() {
 97         return hx_y;
 98     }
 99     public void setHx_y(String hx_y) {
100         this.hx_y = hx_y;
101     }
102     public String getBd_x() {
103         return bd_x;
104     }
105     public void setBd_x(String bd_x) {
106         this.bd_x = bd_x;
107     }
108     public String getBd_y() {
109         return bd_y;
110     }
111     public void setBd_y(String bd_y) {
112         this.bd_y = bd_y;
113     }
114   
115 }
  1 import java.io.IOException;
  2 import java.util.ArrayList;
  3 import java.util.List;
  4 
  5 import org.jsoup.Connection;
  6 import org.jsoup.Jsoup;
  7 import org.jsoup.nodes.Document;
  8 import org.jsoup.nodes.Element;
  9 import org.jsoup.select.Elements;
 10 /**
 11  * 导出工具类
 12  * @author DSH
 13  *
 14  */
 15 public class HtmlParseUtil {
 16     /**
 17      * 请求网络加载数据,得到记录信息
 18      * @param url:网站地址
 19      * @param page:分页页码
 20      */
 21     public List<Content> getContents(String url,Integer page){
 22         List<Content> contents = new ArrayList<Content>();
 23         Connection conn = Jsoup.connect(url);
 24         try {
 25             // 10秒超时时间,发起get请求,也可以是post
 26             Document doc = conn.timeout(60000).get();
 27             
 28             Element table = doc.select(".panel-body tbody").get(0);
 29             //获得所有的tr
 30             Elements trs = table.getElementsByTag("tr");
 31             for(int i=1;i<trs.size();i++){ // 通过FileBug发现这个网页里面第一个li不是我们要的类型,所以从1开始
 32                 Content content = new Content();
 33                 Element tr = trs.get(i);
 34                 
 35                 //获取到名称
 36                 Element e_name = tr.getElementsByTag("td").first();
 37                 String name = e_name.getElementsByTag("a").text();
 38                 //获取到地址
 39                 Element e_address = tr.getElementsByTag("td").get(1);
 40                 String address = e_address.getElementsByTag("td").text();
 41                 //获取电话
 42                 Element e_phone = tr.getElementsByTag("td").get(2);
 43                 String phone = e_phone.getElementsByTag("td").text();
 44                 //获取所属类型
 45                 Element e_type = tr.getElementsByTag("td").get(3);
 46                 String type = e_type.getElementsByTag("td").text();
 47                 
 48                 //获取详细页面地址
 49                 String str = "http://www.poi86.com" + e_name.getElementsByTag("a").attr("href");
 50                 Content con = this.getDetail(str);
 51                 
 52                 content.setName(name);
 53                 content.setAddress(address);
 54                 content.setPhone(phone);
 55                 content.setType(type);
 56                 content.setProvince(con.getProvince());
 57                 content.setArea(con.getArea());
 58                 content.setAddress_detail(con.getAddress_detail());
 59                 content.setLabel(con.getLabel());
 60                 content.setDd_x(con.getDd_x());
 61                 content.setDd_y(con.getDd_y());
 62                 content.setHx_x(con.getHx_x());
 63                 content.setHx_y(con.getHx_y());
 64                 content.setBd_x(con.getBd_x());
 65                 content.setBd_y(con.getBd_y());
 66                 content.setRecord("第"+page+"页-第"+i+"条");
 67                
 68                 contents.add(content);
 69                 System.out.println("=====第"+page+"页-第"+i+"条=====");
 70             }
 71         } catch (Exception ex) {
 72             ex.printStackTrace();
 73         } 
 74         return contents;
 75     }
 76     
 77     //获取详细页面信息
 78     private Content getDetail(String url){
 79         Connection conn = Jsoup.connect(url);
 80         Content content = new Content();
 81         try {
 82             Document doc = conn.timeout(60000).get();
 83             Element ul = doc.select(".list-group").get(0);
 84             Elements lis = ul.getElementsByTag("li");
 85             for (int i = 0; i < lis.size(); i++) {
 86                 Element li = lis.get(i);
 87                 String str = "";
 88                 Element obj = li.getElementsByTag("a").first();
 89                 if(obj != null){
 90                     str = obj.getElementsByTag("a").text();
 91                     switch (i) {
 92                     case 0://所属省份
 93                         content.setProvince(str);
 94                         break;
 95                     case 1://所属区县
 96                         content.setArea(str);
 97                         break;
 98                     case 4://所属分类
 99                         content.setType(str);
100                         break;
101                     }
102                 }else{
103                     Element obj1 = li.getElementsByTag("li").first();
104                     str = obj1.getElementsByTag("li").text();
105                     switch (i) {
106                     case 2://详细地址
107                         content.setAddress_detail(str.substring(6));
108                         break;
109                     case 3://号码
110                         content.setPhone(str.substring(5));
111                         break;
112                     case 5://所属标签
113                         content.setLabel(str.substring(5));
114                         break;
115                     case 6://大地坐标
116                         content.setDd_x(str.substring(6, 16));
117                         content.setDd_y(str.substring(17));
118                         break;
119                     case 7://火星坐标
120                         content.setHx_x(str.substring(6, 16));
121                         content.setHx_y(str.substring(17));
122                         break;
123                     case 8://百度坐标
124                         content.setBd_x(str.substring(6, 16));
125                         content.setBd_y(str.substring(17));
126                         break;
127                     }
128                 }
129             }
130         } catch (IOException e) {
131             e.printStackTrace();
132         }
133         return content;
134     }
135 }
  1 import java.io.FileOutputStream;
  2 import java.text.SimpleDateFormat;
  3 import java.util.ArrayList;
  4 import java.util.Date;
  5 import java.util.List;
  6 
  7 import org.apache.poi.hssf.usermodel.HSSFCell;
  8 import org.apache.poi.hssf.usermodel.HSSFCellStyle;
  9 import org.apache.poi.hssf.usermodel.HSSFRow;
 10 import org.apache.poi.hssf.usermodel.HSSFSheet;
 11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 12 
 13 /**
 14  * 程序入口
 15  * @author DSH
 16  *
 17  */
 18 public class Output {
 19 
 20     @SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
 21     public static void main(String[] args) throws Exception {
 22 
 23         //创建一个webbook,对应一个Excel文件
 24         HSSFWorkbook wb = new HSSFWorkbook();
 25         //在webbook中添加一个sheet,对应Excel文件中的sheet
 26         HSSFSheet sheet = wb.createSheet("POI数据");
 27         //在sheet中添加表头第0行,注意老版本poi对Excel的行数列数有限制short
 28         HSSFRow row = sheet.createRow((int) 0);
 29         //创建单元格,并设置值表头 设置表头居中
 30         HSSFCellStyle style = wb.createCellStyle();
 31         style.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 创建一个居中格式
 32 
 33         //设置表头
 34         HSSFCell cell = row.createCell((short) 0);
 35         cell.setCellValue("名称");
 36         cell.setCellStyle(style);
 37         cell = row.createCell((short) 1);
 38         cell.setCellValue("地址");
 39         cell.setCellStyle(style);
 40         cell = row.createCell((short) 2);
 41         cell.setCellValue("电话号码");
 42         cell.setCellStyle(style);
 43         cell = row.createCell((short) 3);
 44         cell.setCellValue("所属分类");
 45         cell.setCellStyle(style);
 46         cell = row.createCell((short) 4);
 47         cell.setCellValue("所在省份");
 48         cell.setCellStyle(style);
 49         cell = row.createCell((short) 5);
 50         cell.setCellValue("所在区县");
 51         cell.setCellStyle(style);
 52         cell = row.createCell((short) 6);
 53         cell.setCellValue("详细地址");
 54         cell.setCellStyle(style);
 55         cell = row.createCell((short) 7);
 56         cell.setCellValue("所属标签");
 57         cell.setCellStyle(style);
 58         cell = row.createCell((short) 8);
 59         cell.setCellValue("大地坐标——X");
 60         cell.setCellStyle(style);
 61         cell = row.createCell((short) 9);
 62         cell.setCellValue("大地坐标——Y");
 63         cell.setCellStyle(style);
 64         cell = row.createCell((short) 10);
 65         cell.setCellValue("火星坐标——X");
 66         cell.setCellStyle(style);
 67         cell = row.createCell((short) 11);
 68         cell.setCellValue("火星坐标——Y");
 69         cell.setCellStyle(style);
 70         cell = row.createCell((short) 12);
 71         cell.setCellValue("百度坐标——X");
 72         cell.setCellStyle(style);
 73         cell = row.createCell((short) 13);
 74         cell.setCellValue("百度坐标——Y");
 75         cell.setCellStyle(style);
 76         cell = row.createCell((short) 14);
 77         cell.setCellValue("条目");
 78         cell.setCellStyle(style);
 79 
 80         
 81         List list = null;
 82         List listAll = new ArrayList();//保存全部数据
 83         for (int i = 1; i <= 2; i++) {//从第一页开始,,,,第二页截止(包含第二页)
 84             HtmlParseUtil htmlParseUtil = new HtmlParseUtil();
 85             //获取详细页
 86             list = htmlParseUtil.getContents("http://www.poi86.com/poi/district/1550/"+i+".html",i);
 87             listAll.addAll(list);//将所有数据放到一个list集合中
 88         }
 89         
 90         for (int i = 0; i < listAll.size(); i++) {
 91             row = sheet.createRow((int) i + 1);
 92             
 93             Content con = (Content) listAll.get(i);
 94             // 创建单元格,并设置值
 95             row.createCell((short) 0).setCellValue(con.getName());
 96             row.createCell((short) 1).setCellValue(con.getAddress());
 97             row.createCell((short) 2).setCellValue(con.getPhone());
 98             row.createCell((short) 3).setCellValue(con.getType());
 99             row.createCell((short) 4).setCellValue(con.getProvince());
100             row.createCell((short) 5).setCellValue(con.getArea());
101             row.createCell((short) 6).setCellValue(con.getAddress_detail());
102             row.createCell((short) 7).setCellValue(con.getLabel());
103             row.createCell((short) 8).setCellValue(con.getDd_x());
104             row.createCell((short) 9).setCellValue(con.getDd_y());
105             row.createCell((short) 10).setCellValue(con.getHx_x());
106             row.createCell((short) 11).setCellValue(con.getHx_y());
107             row.createCell((short) 12).setCellValue(con.getBd_x());
108             row.createCell((short) 13).setCellValue(con.getBd_y());
109             row.createCell((short) 14).setCellValue(con.getRecord());
110         }
111         //将文件存到指定位置
112         try {
113             Date currentTime = new Date();
114             SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
115             String date = formatter.format(currentTime);
116             
117             FileOutputStream fout = new FileOutputStream("F:/content" + date + ".xls");
118             wb.write(fout);
119             fout.close();
120         } catch (Exception e) {
121             e.printStackTrace();
122         }
123     }
124 }

该程序用到的四个jar包下载地址:http://pan.baidu.com/s/1c207Q6k

 

本文仅作为网页数据获取的学习案例,不得将获取到的网页数据用于任何恶意用途;如有违反,本人不承担任何责任!

posted @ 2017-05-22 12:02  大瘦猴  阅读(245)  评论(0)    收藏  举报