java利用爬虫技术抓取(省、市(区号\邮编)、县)数据

近期项目需要用到城市的地址信息,但从网上下载的 xml 数据没有几个是最新的……数据太老,导致有些地区不全。所以才想到天气预报官网肯定有最新最全的数据。贴出代码,希望能帮有相同困惑的朋友节省一些时间。

	/**
	 * @param var  城市名称
	 * @return	string数组。0表示邮编	1表示区号
	 */
	@SuppressWarnings("deprecation")
	private String[] getZipCode(String var) {
		String[] code = new String[2];
		String zipCode_S = "邮编:";
		String zipCode_E = " ";
		String qhCode_S = "区号:";
		String qhCode_E = "</td>";
		String encode = URLEncoder.encode(var);
		try {
			URL url = new URL("http://www.ip138.com/post/search.asp?

area=" + encode + "&action=area2zone"); BufferedReader br = new BufferedReader(new InputStreamReader( url.openStream(), "GBK")); for (String line; (line = br.readLine()) != null;) { int zipNum = line.indexOf(zipCode_S); if (zipNum > 1) { String str = line.substring(zipNum + zipCode_S.length()); str = str.substring(0, str.indexOf(zipCode_E)); code[0] = str; } int qhNum = line.indexOf(qhCode_S); if(qhNum > 1) { String str = line.substring(qhNum + qhCode_S.length()); str = str.substring(0, str.indexOf(qhCode_E)); code[1] = str; break; } } } catch (Exception e) { System.out.println(var +"\t错误"+e.toString()); } return code; } /** * 主程序 * @throws Exception */ @Test public void main() throws Exception { //1:获取全部省份 TreeMap<String,String> provincesBuffer = getAddressInfo("http://www.weather.com.cn//data/city3jdata/china.html"); Element prcEle = DocumentHelper.createElement("Provinces"); //2:依据省份获取城市 Element citysEle = DocumentHelper.createElement("Citys"); //3:依据省份城市获取区、县 Element distEle = DocumentHelper.createElement("Districts"); int p = 1; int c = 1; int d = 1; for(Entry<String, String> prc : provincesBuffer.entrySet()) { Element province = DocumentHelper.createElement("Province"); province.addAttribute("ID",""+(p)).addAttribute("ProvinceName", prc.getValue()).addText(prc.getValue()); //获取邮政编号 TreeMap<String,String> cityBuffer = getAddressInfo("http://www.weather.com.cn/data/city3jdata/provshi/"+prc.getKey()+".html"); for(Entry<String, String> citys : cityBuffer.entrySet()) { Element city = DocumentHelper.createElement("City"); String[] zipCode = getZipCode(citys.getValue()); if(zipCode[0]==null||zipCode[1]==null) System.out.println("缺少"+citys.getValue()+"邮政或区号!"); city.addAttribute("ID", ""+c).addAttribute("CityName", citys.getValue()).addAttribute("PID",p+"").addAttribute("ZipCode", zipCode[0]).addAttribute("AreaCode", zipCode[1]).addText(citys.getValue()); TreeMap<String, String> distsBuffer = 
getAddressInfo("http://www.weather.com.cn/data/city3jdata/station/"+prc.getKey()+""+citys.getKey()+".html"); for(Entry<String, String> dists : distsBuffer.entrySet()) { String value = dists.getValue(); if(value.equals(citys.getValue())) continue; Element district = DocumentHelper.createElement("District"); district.addAttribute("ID",""+(d++)).addAttribute("DistrictName", dists.getValue()).addAttribute("CID", c+"").addText(dists.getValue()); distEle.add(district); } citysEle.add(city); c++; } prcEle.add(province); p++; } //4:保存到本地 saveInf("f:\\Provinces.xml",prcEle); saveInf("f:\\Citys.xml",citysEle); saveInf("f:\\Districts.xml",distEle); } /** 保存xml * @param savePath xml保存路径 * @param varEle 根元素 */ private void saveInf(String savePath, Element varEle) { Document varDoc = DocumentHelper.createDocument(); varDoc.add(varEle); try { XMLWriter xmlwri = new XMLWriter(new FileOutputStream(new File(savePath)), new OutputFormat("\t", true, "UTF-8")); xmlwri.write(varDoc); xmlwri.close(); } catch (Exception e) { System.out.println(savePath +"失败,原因例如以下"); throw new RuntimeException(e); } } /** * 获取信息 * @param address url路径 * @return key :信息编号 value:信息名称 */ private TreeMap<String, String> getAddressInfo(String address) { TreeMap<String,String> china = new TreeMap<String, String>(); BufferedReader br = null; String buffer = null; try { URL url = new URL(address); br = new BufferedReader(new InputStreamReader(url.openStream(),"UTF-8")); buffer = br.readLine(); } catch (Exception e) { System.out.println("错误:"+e.getMessage()); }finally{ if(br != null) try { br.close(); } catch (IOException e) { e.printStackTrace(); } } if(buffer==null) return china; buffer = buffer.replaceAll("\\{|\\}|\"",""); String[] splits = buffer.split(","); for(String sp : splits) { String[] split = sp.split(":"); if(split!=null && split.length == 2) china.put(split[0], split[1]); else System.out.println(address); } buffer = null; return china; }


下载xml数据

posted @ 2017-05-03 13:04  wzjhoutai  阅读(981)  评论(0编辑  收藏  举报