Jsoup爬虫的简单使用
添加POM依赖
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.7.3</version> </dependency>
Java代码示例
/**
 * Demo entry point: fetches so.com search-result pages for the query "java" and
 * prints the title and link of every result.
 *
 * <p>The landing-page URL is fetched first, followed by the paging URL with its
 * {@code pn} query parameter set to 2 through 10. The first {@link IOException}
 * stops the run; its stack trace is printed and the remaining pages are skipped.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared in the signature; failures of the fetches below
 *         are caught and reported inside this method
 */
public static void main(String[] args) throws IOException{
    // Tianyancha open API sample (disabled):
    // String result1= HttpRequest.get("http://open.api.tianyancha.com/services/open/cb/ic/2.0?keyword=XXXX公司").header("Authorization", "").execute().body();;
    // System.err.println(result1);

    // Tianyancha search-page sample (disabled):
    /*Document doc = Jsoup.connect("https://www.tianyancha.com/search?key=北京百度网讯科技有限公司").timeout(3000).get();
    System.err.println(doc.title());
    Elements newsHeadlines = doc.select(".cate_menu_lk");
    System.err.println(newsHeadlines.size());
    for (Element headline : newsHeadlines) {
        System.err.println(
                headline.text());
    }
    */
    try {
        // Landing page of the search.
        printSearchResults("https://www.so.com/s?ie=utf-8&fr=so.com&src=home_so.com&ssid=&q=java");

        // Follow-up pages use a different URL shape with an explicit page number.
        for (int page = 2; page <= 10; page++) {
            printSearchResults("https://www.so.com/s?q=java&pn=" + page + "&src=srp_paging&fr=so.com");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/**
 * Fetches a single page and prints every element matched by the CSS selector
 * {@code .res-title a}, followed by a separator line.
 *
 * <p>The element text goes to standard output; the {@code href} attribute and the
 * separator go to standard error.
 * NOTE(review): because two different streams are used, a console may show titles
 * and links out of order — confirm whether the split is intentional.
 *
 * @param url absolute URL of the page to fetch
 * @throws IOException if the page cannot be fetched
 */
private static void printSearchResults(String url) throws IOException {
    Document page = Jsoup.connect(url)
            .timeout(5000) // jsoup interprets this value as milliseconds
            .get();
    Elements resultLinks = page.select(".res-title a");
    resultLinks.forEach(link -> {
        System.out.println(link.text());
        System.err.println(link.attr("href"));
    });
    System.err.println("---------------------");
}
浙公网安备 33010602011771号