爬虫Jsoup

 /**
  * Crawls the runoob.com home page and then every absolute link found in its
  * {@code #index-nav} list, printing each entry's text, its link and the body
  * of the linked page to standard output.
  *
  * <p>Only links that are protocol-relative ({@code //host/path}) or that
  * already start with {@code http://} or {@code https://} are followed; all
  * other hrefs (empty, site-relative, {@code javascript:} and so on) are skipped.
  *
  * @throws IOException if a request fails at the network level or a response
  *                     cannot be parsed
  * @throws IllegalStateException if the home page has no {@code #index-nav} element
  */
 @Test
    public void jsoup1() throws IOException {
        String targetUrl = "https://www.runoob.com/";
        // Open the connection to the start page.
        Connection connect = Jsoup.connect(targetUrl);
        // Send browser-like request headers.
        connect.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
        // Do not advertise "br": a Brotli-compressed response would not be
        // decoded before parsing, producing a garbage document.
        connect.header("accept-encoding", "gzip, deflate");
        connect.header("accept-language", "zh-CN,zh;q=0.9");
        connect.header("cache-control", "max-age=0");
        connect.header("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36");
        // Parse the response even when the server answers with an error status.
        connect.ignoreHttpErrors(true);
        Connection.Response response = connect.method(Connection.Method.GET).execute();
        Document document = response.parse();
        // The <body> element of the fetched page.
        Element body = document.body();

        Element nav = body.getElementById("index-nav");
        if (nav == null) {
            // Fail with a clear message instead of a bare NullPointerException.
            throw new IllegalStateException("Element #index-nav not found at " + targetUrl);
        }

        Elements li = nav.getElementsByTag("li");
        for (Element element : li) {
            // text() returns the combined text of the element and its children.
            System.out.println(element.text());
            String href = element.getElementsByTag("a").attr("href");
            System.out.println(href);
            if (href.startsWith("//")) {
                // Protocol-relative link: pick https explicitly.
                href = "https:" + href;
            } else if (!href.startsWith("http://") && !href.startsWith("https://")) {
                // Not an absolute http(s) URL; nothing to fetch.
                continue;
            }
            System.out.println(href);
            // Same error policy as the first request, so that one broken link
            // does not abort the remaining iterations.
            Connection con = Jsoup.connect(href).ignoreHttpErrors(true);
            System.out.println(con.execute().parse().body());
            System.out.println("-------------------------------------------------");
        }

    }
posted @ 2021-07-19 08:37 王^.^令阅读(41) 评论(0) 收藏举报
刷新页面返回顶部
王^.^令

爬虫Jsoup

公告