爬虫Jsoup
/**
 * Crawls the runoob.com home page, lists every entry of the {@code #index-nav}
 * navigation bar and, for each entry that links to an absolute or
 * protocol-relative URL, fetches the linked page and prints its body.
 *
 * @throws IOException if the home page or one of the linked pages cannot be fetched or parsed
 * @throws IllegalStateException if the home page contains no {@code #index-nav} element
 */
@Test
public void jsoup1() throws IOException {
    String targetUrl = "https://www.runoob.com/";

    Document document = openBrowserLikeConnection(targetUrl)
            .method(Connection.Method.GET)
            .execute()
            .parse();

    // Work on the <body> of the fetched page.
    Element body = document.body();
    Element indexNav = body.getElementById("index-nav");
    if (indexNav == null) {
        // Fail with a clear message instead of a bare NullPointerException.
        throw new IllegalStateException("Element #index-nav not found on " + targetUrl);
    }

    for (Element element : indexNav.getElementsByTag("li")) {
        // text() returns the visible text of the tag.
        System.out.println(element.text());
        String href = element.getElementsByTag("a").attr("href");
        System.out.println(href);

        String absoluteUrl = toAbsoluteHttpUrl(href);
        if (absoluteUrl == null) {
            // Relative or non-HTTP link: nothing to crawl.
            continue;
        }
        System.out.println(absoluteUrl);

        // Sub-pages are fetched with the same headers and error handling as the
        // home page, so a single 4xx/5xx link does not abort the whole crawl.
        Document linked = openBrowserLikeConnection(absoluteUrl).execute().parse();
        System.out.println(linked.body());
        System.out.println("-------------------------------------------------");
    }
}

/**
 * Creates a jsoup connection that sends browser-like request headers and does
 * not throw on HTTP error status codes.
 *
 * @param url the URL to connect to
 * @return the configured, not yet executed connection
 */
private static Connection openBrowserLikeConnection(String url) {
    Connection connect = Jsoup.connect(url);
    // Mimic the request headers of a desktop browser.
    connect.header("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
    // "br" is deliberately not advertised: the response body could not be
    // decoded if the server answered with brotli compression.
    connect.header("accept-encoding", "gzip, deflate");
    connect.header("accept-language", "zh-CN,zh;q=0.9");
    connect.header("cache-control", "max-age=0");
    connect.header("user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36");
    // Parse the response even when the server returns an error status.
    connect.ignoreHttpErrors(true);
    return connect;
}

/**
 * Normalises a raw {@code href} value into an absolute HTTP(S) URL.
 *
 * @param href the raw attribute value, possibly empty
 * @return the URL unchanged when it already starts with {@code http://} or
 *         {@code https://}; the URL prefixed with {@code https:} when it is
 *         protocol-relative ({@code //host/path}); {@code null} for anything else
 */
private static String toAbsoluteHttpUrl(String href) {
    if (href == null) {
        return null;
    }
    if (href.startsWith("//")) {
        return "https:" + href;
    }
    if (href.startsWith("https://") || href.startsWith("http://")) {
        return href;
    }
    return null;
}
挣钱养媳妇儿^.^

浙公网安备 33010602011771号