网页数据获取小案例(Java 控制台程序)
/**
 * Entity class holding the data scraped from the web pages for one record.
 *
 * <p>All properties are plain strings with a getter and a setter; a property
 * that has never been set reads back as {@code null}.
 *
 * @author DSH
 */
public class Content {

    /** Name. */
    private String name;
    /** Address. */
    private String address;
    /** Phone number. */
    private String phone;
    /** Category the record belongs to. */
    private String type;
    /** Province. */
    private String province;
    /** District / county. */
    private String area;
    /** Detailed address. */
    private String address_detail;
    /** Label (tag) the record belongs to. */
    private String label;
    /** Geodetic coordinate, X. */
    private String dd_x;
    /** Geodetic coordinate, Y. */
    private String dd_y;
    /** "Mars" coordinate, X. */
    private String hx_x;
    /** "Mars" coordinate, Y. */
    private String hx_y;
    /** Baidu coordinate, X. */
    private String bd_x;
    /** Baidu coordinate, Y. */
    private String bd_y;

    /** Entry marker (which page / which row the record came from). */
    private String record;

    /** @return the entry marker */
    public String getRecord() {
        return record;
    }

    /** @param record the entry marker */
    public void setRecord(String record) {
        this.record = record;
    }

    /** @return the name */
    public String getName() {
        return name;
    }

    /** @param name the name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the address */
    public String getAddress() {
        return address;
    }

    /** @param address the address */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return the phone number */
    public String getPhone() {
        return phone;
    }

    /** @param phone the phone number */
    public void setPhone(String phone) {
        this.phone = phone;
    }

    /** @return the category */
    public String getType() {
        return type;
    }

    /** @param type the category */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the province */
    public String getProvince() {
        return province;
    }

    /** @param province the province */
    public void setProvince(String province) {
        this.province = province;
    }

    /** @return the district / county */
    public String getArea() {
        return area;
    }

    /** @param area the district / county */
    public void setArea(String area) {
        this.area = area;
    }

    /** @return the detailed address */
    public String getAddress_detail() {
        return address_detail;
    }

    /** @param address_detail the detailed address */
    public void setAddress_detail(String address_detail) {
        this.address_detail = address_detail;
    }

    /** @return the label */
    public String getLabel() {
        return label;
    }

    /** @param label the label */
    public void setLabel(String label) {
        this.label = label;
    }

    /** @return the geodetic X coordinate */
    public String getDd_x() {
        return dd_x;
    }

    /** @param dd_x the geodetic X coordinate */
    public void setDd_x(String dd_x) {
        this.dd_x = dd_x;
    }

    /** @return the geodetic Y coordinate */
    public String getDd_y() {
        return dd_y;
    }

    /** @param dd_y the geodetic Y coordinate */
    public void setDd_y(String dd_y) {
        this.dd_y = dd_y;
    }

    /** @return the "Mars" X coordinate */
    public String getHx_x() {
        return hx_x;
    }

    /** @param hx_x the "Mars" X coordinate */
    public void setHx_x(String hx_x) {
        this.hx_x = hx_x;
    }

    /** @return the "Mars" Y coordinate */
    public String getHx_y() {
        return hx_y;
    }

    /** @param hx_y the "Mars" Y coordinate */
    public void setHx_y(String hx_y) {
        this.hx_y = hx_y;
    }

    /** @return the Baidu X coordinate */
    public String getBd_x() {
        return bd_x;
    }

    /** @param bd_x the Baidu X coordinate */
    public void setBd_x(String bd_x) {
        this.bd_x = bd_x;
    }

    /** @return the Baidu Y coordinate */
    public String getBd_y() {
        return bd_y;
    }

    /** @param bd_y the Baidu Y coordinate */
    public void setBd_y(String bd_y) {
        this.bd_y = bd_y;
    }

}
1 import java.io.IOException; 2 import java.util.ArrayList; 3 import java.util.List; 4 5 import org.jsoup.Connection; 6 import org.jsoup.Jsoup; 7 import org.jsoup.nodes.Document; 8 import org.jsoup.nodes.Element; 9 import org.jsoup.select.Elements; 10 /** 11 * 导出工具类 12 * @author DSH 13 * 14 */ 15 public class HtmlParseUtil { 16 /** 17 * 请求网络加载数据,得到记录信息 18 * @param url:网站地址 19 * @param page:分页页码 20 */ 21 public List<Content> getContents(String url,Integer page){ 22 List<Content> contents = new ArrayList<Content>(); 23 Connection conn = Jsoup.connect(url); 24 try { 25 // 10秒超时时间,发起get请求,也可以是post 26 Document doc = conn.timeout(60000).get(); 27 28 Element table = doc.select(".panel-body tbody").get(0); 29 //获得所有的tr 30 Elements trs = table.getElementsByTag("tr"); 31 for(int i=1;i<trs.size();i++){ // 通过FileBug发现这个网页里面第一个li不是我们要的类型,所以从1开始 32 Content content = new Content(); 33 Element tr = trs.get(i); 34 35 //获取到名称 36 Element e_name = tr.getElementsByTag("td").first(); 37 String name = e_name.getElementsByTag("a").text(); 38 //获取到地址 39 Element e_address = tr.getElementsByTag("td").get(1); 40 String address = e_address.getElementsByTag("td").text(); 41 //获取电话 42 Element e_phone = tr.getElementsByTag("td").get(2); 43 String phone = e_phone.getElementsByTag("td").text(); 44 //获取所属类型 45 Element e_type = tr.getElementsByTag("td").get(3); 46 String type = e_type.getElementsByTag("td").text(); 47 48 //获取详细页面地址 49 String str = "http://www.poi86.com" + e_name.getElementsByTag("a").attr("href"); 50 Content con = this.getDetail(str); 51 52 content.setName(name); 53 content.setAddress(address); 54 content.setPhone(phone); 55 content.setType(type); 56 content.setProvince(con.getProvince()); 57 content.setArea(con.getArea()); 58 content.setAddress_detail(con.getAddress_detail()); 59 content.setLabel(con.getLabel()); 60 content.setDd_x(con.getDd_x()); 61 content.setDd_y(con.getDd_y()); 62 content.setHx_x(con.getHx_x()); 63 content.setHx_y(con.getHx_y()); 64 
content.setBd_x(con.getBd_x()); 65 content.setBd_y(con.getBd_y()); 66 content.setRecord("第"+page+"页-第"+i+"条"); 67 68 contents.add(content); 69 System.out.println("=====第"+page+"页-第"+i+"条====="); 70 } 71 } catch (Exception ex) { 72 ex.printStackTrace(); 73 } 74 return contents; 75 } 76 77 //获取详细页面信息 78 private Content getDetail(String url){ 79 Connection conn = Jsoup.connect(url); 80 Content content = new Content(); 81 try { 82 Document doc = conn.timeout(60000).get(); 83 Element ul = doc.select(".list-group").get(0); 84 Elements lis = ul.getElementsByTag("li"); 85 for (int i = 0; i < lis.size(); i++) { 86 Element li = lis.get(i); 87 String str = ""; 88 Element obj = li.getElementsByTag("a").first(); 89 if(obj != null){ 90 str = obj.getElementsByTag("a").text(); 91 switch (i) { 92 case 0://所属省份 93 content.setProvince(str); 94 break; 95 case 1://所属区县 96 content.setArea(str); 97 break; 98 case 4://所属分类 99 content.setType(str); 100 break; 101 } 102 }else{ 103 Element obj1 = li.getElementsByTag("li").first(); 104 str = obj1.getElementsByTag("li").text(); 105 switch (i) { 106 case 2://详细地址 107 content.setAddress_detail(str.substring(6)); 108 break; 109 case 3://号码 110 content.setPhone(str.substring(5)); 111 break; 112 case 5://所属标签 113 content.setLabel(str.substring(5)); 114 break; 115 case 6://大地坐标 116 content.setDd_x(str.substring(6, 16)); 117 content.setDd_y(str.substring(17)); 118 break; 119 case 7://火星坐标 120 content.setHx_x(str.substring(6, 16)); 121 content.setHx_y(str.substring(17)); 122 break; 123 case 8://百度坐标 124 content.setBd_x(str.substring(6, 16)); 125 content.setBd_y(str.substring(17)); 126 break; 127 } 128 } 129 } 130 } catch (IOException e) { 131 e.printStackTrace(); 132 } 133 return content; 134 } 135 }
1 import java.io.FileOutputStream; 2 import java.text.SimpleDateFormat; 3 import java.util.ArrayList; 4 import java.util.Date; 5 import java.util.List; 6 7 import org.apache.poi.hssf.usermodel.HSSFCell; 8 import org.apache.poi.hssf.usermodel.HSSFCellStyle; 9 import org.apache.poi.hssf.usermodel.HSSFRow; 10 import org.apache.poi.hssf.usermodel.HSSFSheet; 11 import org.apache.poi.hssf.usermodel.HSSFWorkbook; 12 13 /** 14 * 程序入口 15 * @author DSH 16 * 17 */ 18 public class Output { 19 20 @SuppressWarnings({ "deprecation", "unchecked", "rawtypes" }) 21 public static void main(String[] args) throws Exception { 22 23 //创建一个webbook,对应一个Excel文件 24 HSSFWorkbook wb = new HSSFWorkbook(); 25 //在webbook中添加一个sheet,对应Excel文件中的sheet 26 HSSFSheet sheet = wb.createSheet("POI数据"); 27 //在sheet中添加表头第0行,注意老版本poi对Excel的行数列数有限制short 28 HSSFRow row = sheet.createRow((int) 0); 29 //创建单元格,并设置值表头 设置表头居中 30 HSSFCellStyle style = wb.createCellStyle(); 31 style.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 创建一个居中格式 32 33 //设置表头 34 HSSFCell cell = row.createCell((short) 0); 35 cell.setCellValue("名称"); 36 cell.setCellStyle(style); 37 cell = row.createCell((short) 1); 38 cell.setCellValue("地址"); 39 cell.setCellStyle(style); 40 cell = row.createCell((short) 2); 41 cell.setCellValue("电话号码"); 42 cell.setCellStyle(style); 43 cell = row.createCell((short) 3); 44 cell.setCellValue("所属分类"); 45 cell.setCellStyle(style); 46 cell = row.createCell((short) 4); 47 cell.setCellValue("所在省份"); 48 cell.setCellStyle(style); 49 cell = row.createCell((short) 5); 50 cell.setCellValue("所在区县"); 51 cell.setCellStyle(style); 52 cell = row.createCell((short) 6); 53 cell.setCellValue("详细地址"); 54 cell.setCellStyle(style); 55 cell = row.createCell((short) 7); 56 cell.setCellValue("所属标签"); 57 cell.setCellStyle(style); 58 cell = row.createCell((short) 8); 59 cell.setCellValue("大地坐标——X"); 60 cell.setCellStyle(style); 61 cell = row.createCell((short) 9); 62 cell.setCellValue("大地坐标——Y"); 63 cell.setCellStyle(style); 64 cell = 
row.createCell((short) 10); 65 cell.setCellValue("火星坐标——X"); 66 cell.setCellStyle(style); 67 cell = row.createCell((short) 11); 68 cell.setCellValue("火星坐标——Y"); 69 cell.setCellStyle(style); 70 cell = row.createCell((short) 12); 71 cell.setCellValue("百度坐标——X"); 72 cell.setCellStyle(style); 73 cell = row.createCell((short) 13); 74 cell.setCellValue("百度坐标——Y"); 75 cell.setCellStyle(style); 76 cell = row.createCell((short) 14); 77 cell.setCellValue("条目"); 78 cell.setCellStyle(style); 79 80 81 List list = null; 82 List listAll = new ArrayList();//保存全部数据 83 for (int i = 1; i <= 2; i++) {//从第一页开始,,,,第二页截止(包含第二页) 84 HtmlParseUtil htmlParseUtil = new HtmlParseUtil(); 85 //获取详细页 86 list = htmlParseUtil.getContents("http://www.poi86.com/poi/district/1550/"+i+".html",i); 87 listAll.addAll(list);//将所有数据放到一个list集合中 88 } 89 90 for (int i = 0; i < listAll.size(); i++) { 91 row = sheet.createRow((int) i + 1); 92 93 Content con = (Content) listAll.get(i); 94 // 创建单元格,并设置值 95 row.createCell((short) 0).setCellValue(con.getName()); 96 row.createCell((short) 1).setCellValue(con.getAddress()); 97 row.createCell((short) 2).setCellValue(con.getPhone()); 98 row.createCell((short) 3).setCellValue(con.getType()); 99 row.createCell((short) 4).setCellValue(con.getProvince()); 100 row.createCell((short) 5).setCellValue(con.getArea()); 101 row.createCell((short) 6).setCellValue(con.getAddress_detail()); 102 row.createCell((short) 7).setCellValue(con.getLabel()); 103 row.createCell((short) 8).setCellValue(con.getDd_x()); 104 row.createCell((short) 9).setCellValue(con.getDd_y()); 105 row.createCell((short) 10).setCellValue(con.getHx_x()); 106 row.createCell((short) 11).setCellValue(con.getHx_y()); 107 row.createCell((short) 12).setCellValue(con.getBd_x()); 108 row.createCell((short) 13).setCellValue(con.getBd_y()); 109 row.createCell((short) 14).setCellValue(con.getRecord()); 110 } 111 //将文件存到指定位置 112 try { 113 Date currentTime = new Date(); 114 SimpleDateFormat formatter = new 
SimpleDateFormat("yyyyMMddHHmmss"); 115 String date = formatter.format(currentTime); 116 117 FileOutputStream fout = new FileOutputStream("F:/content" + date + ".xls"); 118 wb.write(fout); 119 fout.close(); 120 } catch (Exception e) { 121 e.printStackTrace(); 122 } 123 } 124 }
该程序用到的四个jar包下载地址:http://pan.baidu.com/s/1c207Q6k
本文仅作为网页数据获取的学习案例,不得将获取的网页数据用于恶意用途,如有违反,本人不承担任何责任!
成功不是终点,失败也并非末日,重要的是前行的勇气!