Java 爬取国家统计局省市区级联关系

爬取网址 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html

因为数据比较大,存储为一个json,会内存溢出。

所以按照每个省市进行存储。

同时因为需要远程访问链接拿取数据,所以会将已经拿到的网页缓存到本地,以便下次使用。

package com.witwicky.jsoup;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.witwicky.vo.CrawlingVo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Crawls the 2018 administrative-division code pages published under
 * {@link #BASE_URL} and writes one JSON tree per province.
 *
 * <p>The tree has five levels: province, city, county, town, village. Every
 * page that is downloaded is also stored in {@link #BASE_SAVE_DIR}, and later
 * runs read that local copy instead of hitting the network again.
 */
public class Crawling {
    private static final String BASE_SAVE_DIR = "E:\\工作\\extract";
    private static final String RESULT_SAVE_DIR = "E:\\工作\\extract_result";

    /** Root of the crawled site; every page path is relative to it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /**
     * Charset used for both writing and reading cached pages. The two sides
     * must agree, otherwise non-ASCII text in a cached page is garbled on
     * machines whose platform charset is not UTF-8.
     */
    private static final String CACHE_CHARSET = "UTF-8";

    /** Row selector for each level below the province, in descent order. */
    private static final String[] LEVEL_SELECTORS = {"tr.citytr", "tr.countytr", "tr.towntr", "tr.villagetr"};

    /** Index of the deepest level; its rows are leaves and use a different name column. */
    private static final int VILLAGE_LEVEL = LEVEL_SELECTORS.length - 1;

    private static final Random RANDOM = new Random();

    /**
     * Walks the province index and, for each province, crawls its subtree and
     * saves it as {@code <name>.json} (compact) and {@code <name>_pretty.json}
     * (indented) in {@link #RESULT_SAVE_DIR}.
     *
     * <p>Each file contains a JSON array holding only that province, so memory
     * use is bounded by the largest single province rather than the whole
     * country. Existing result files are not overwritten (see
     * {@link #save2File}); delete them to regenerate.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched or parsed, or the polite
     *                   delay between requests is interrupted
     */
    public static void main(String[] args) throws Exception {
        Gson gsonPretty = new GsonBuilder().setPrettyPrinting().create();
        Gson gsonSimple = new GsonBuilder().create();

        for (Element provinceLink : getElements("index.html", "tr.provincetr > td > a")) {
            String provincePage = provinceLink.attr("href");
            String provinceName = provinceLink.text();

            List<CrawlingVo> cities = crawlLevel(provincePage, 0);

            // Serialize just this province; keep the top-level JSON value an array.
            List<CrawlingVo> provinceOnly = new ArrayList<CrawlingVo>();
            provinceOnly.add(new CrawlingVo(stripExtension(provincePage), provinceName, cities));

            save2File(gsonSimple.toJson(provinceOnly), provinceName + ".json", RESULT_SAVE_DIR);
            save2File(gsonPretty.toJson(provinceOnly), provinceName + "_pretty.json", RESULT_SAVE_DIR);

            System.out.println(provinceName + " is complete!");
        }
    }

    /**
     * Builds the nodes listed on one page and, recursively, their descendants.
     *
     * <p>The code is read from the first cell. The name is read from the
     * second cell, or from the third cell on village pages. Text is taken from
     * the cell itself rather than from a nested link, so rows without a link
     * still get their code and name; such rows simply have no children.
     *
     * <p>NOTE(review): if a page skips a level (for example a city page that
     * lists towns directly), the selector for the expected level matches
     * nothing and that subtree stays empty - confirm whether the crawled data
     * contains such pages.
     *
     * @param pagePath path of the page to read, relative to {@link #BASE_URL}
     * @param level    index into {@link #LEVEL_SELECTORS} for the rows on that page
     * @return the nodes found on the page, in document order; never {@code null}
     */
    private static List<CrawlingVo> crawlLevel(String pagePath, int level) throws IOException, InterruptedException {
        boolean leaf = level == VILLAGE_LEVEL;
        String nameCell = leaf ? "td:eq(2)" : "td:eq(1)";

        List<CrawlingVo> nodes = new ArrayList<CrawlingVo>();
        for (Element row : getElements(pagePath, LEVEL_SELECTORS[level])) {
            List<CrawlingVo> children = new ArrayList<CrawlingVo>();
            if (!leaf) {
                String href = row.select("td:eq(1) a").attr("href");
                // An empty href means the row has no child page to follow.
                if (href.length() > 0) {
                    children = crawlLevel(resolve(pagePath, href), level + 1);
                }
            }
            nodes.add(new CrawlingVo(row.select("td:eq(0)").text(), row.select(nameCell).text(), children));
        }
        return nodes;
    }

    /**
     * Resolves a link found on a page against that page's own directory, since
     * the links on each page are relative to the page they appear on.
     *
     * @param parentPath path of the page containing the link, relative to {@link #BASE_URL}
     * @param href       the relative link taken from that page
     * @return the link's path relative to {@link #BASE_URL}
     */
    private static String resolve(String parentPath, String href) {
        // lastIndexOf returns -1 for a top-level page, giving an empty directory prefix.
        return parentPath.substring(0, parentPath.lastIndexOf('/') + 1) + href;
    }

    /** Returns {@code href} up to (not including) its first dot, or unchanged if it has none. */
    private static String stripExtension(String href) {
        int dot = href.indexOf('.');
        return dot < 0 ? href : href.substring(0, dot);
    }

    /**
     * Loads a page and returns the elements matching {@code selector}.
     *
     * <p>The locally cached copy is used when present. Otherwise the page is
     * downloaded after a random pause of 1 to 5 seconds and then written to
     * the cache.
     *
     * @param u        page path relative to {@link #BASE_URL}
     * @param selector jsoup CSS selector to apply to the page
     * @return the matching elements (possibly empty)
     * @throws IOException          if the cached file cannot be read or the download fails
     * @throws InterruptedException if the pause before downloading is interrupted
     */
    private static Elements getElements(String u, String selector) throws IOException, InterruptedException {
        String url = BASE_URL + u;
        String cacheName = cleanName(url);
        File localFile = new File(BASE_SAVE_DIR, cacheName);

        Document page;
        if (localFile.exists()) {
            page = Jsoup.parse(localFile, CACHE_CHARSET);
        } else {
            int seconds = RANDOM.nextInt(5) + 1;
            Thread.sleep(seconds * 1000);
            page = Jsoup.connect(url).get();
            save2File(page.toString(), cacheName, BASE_SAVE_DIR);
        }
        return page.select(selector);
    }

    /**
     * Turns a URL into a flat file name by replacing backslashes, slashes,
     * dots and colons with underscores.
     */
    private static String cleanName(String name) {
        return name
                .replace("\\", "_")
                .replace("/", "_")
                .replace(".", "_")
                .replace(":", "_");
    }

    /**
     * Writes {@code content} to {@code saveDir/fileName} encoded as
     * {@link #CACHE_CHARSET}.
     *
     * <p>This is best-effort: nothing is written if the directory cannot be
     * created or the file already exists, and I/O failures are printed rather
     * than propagated.
     *
     * @param content  text to write
     * @param fileName name of the file inside {@code saveDir}
     * @param saveDir  target directory; created if missing
     */
    private static void save2File(String content, String fileName, String saveDir) {
        File dir = new File(saveDir);
        if (!dir.exists() && !dir.mkdirs()) {
            return;
        }

        File file = new File(dir, fileName);
        if (file.exists()) {
            return;
        }

        OutputStream out = null;
        try {
            out = new BufferedOutputStream(new FileOutputStream(file));
            out.write(content.getBytes(CACHE_CHARSET));
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failed write does not leak the file handle.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

 

package com.witwicky.vo;

import java.util.List;

/**
 * One node of the region tree: a code ({@code value}), a display name
 * ({@code label}) and the nodes directly beneath it ({@code children}).
 *
 * <p>A mutable bean with a no-argument constructor and plain accessors.
 */
public class CrawlingVo {
    private String value;
    private String label;
    private List<CrawlingVo> children;

    /** Creates an empty node; all properties start as {@code null}. */
    public CrawlingVo() {
    }

    /**
     * Creates a fully populated node.
     *
     * @param value    the node's code
     * @param label    the node's display name
     * @param children the node's direct children; stored as given, not copied
     */
    public CrawlingVo(String value, String label, List<CrawlingVo> children) {
        this.children = children;
        this.label = label;
        this.value = value;
    }

    // ---- accessors ----

    /** @return the node's code */
    public String getValue() {
        return this.value;
    }

    /** @return the node's display name */
    public String getLabel() {
        return this.label;
    }

    /** @return the list of children held by this node (the same instance that was set) */
    public List<CrawlingVo> getChildren() {
        return this.children;
    }

    // ---- mutators ----

    /** @param value the new code */
    public void setValue(String value) {
        this.value = value;
    }

    /** @param label the new display name */
    public void setLabel(String label) {
        this.label = label;
    }

    /** @param children the new list of children; stored as given, not copied */
    public void setChildren(List<CrawlingVo> children) {
        this.children = children;
    }
}

 

\\审判系统
[\\Shěnpàn xìtǒng]
\\ trial system
posted @ 2019-04-03 18:06  GordonDicaprio  阅读(607)  评论(0编辑  收藏  举报