Jsoup爬虫的简单使用

添加POM依赖

<!-- jsoup HTML parser; use a current release, since old versions such as 1.7.3 lack later security fixes. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>

Java 代码示例

/**
 * Crawls the so.com search results for the query "java" and prints them.
 *
 * <p>The first result page is fetched from the home-page search URL, then
 * result pages 2 through 10 are fetched from the paging URL. For every
 * matched result link the link text is written to standard output and its
 * {@code href} attribute to standard error, followed by a separator line on
 * standard error after each page.
 *
 * <p>An {@link IOException} raised while fetching any page is caught here and
 * its stack trace printed; the remaining pages are then skipped.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared for compatibility with existing callers; the
 *         fetch failures themselves are handled inside this method
 */
public static void main(String[] args) throws IOException {
    try {
        printSearchResults(FIRST_PAGE_URL);
        for (int page = FIRST_PAGED_RESULT; page <= LAST_PAGED_RESULT; page++) {
            printSearchResults("https://www.so.com/s?q=java&pn=" + page + "&src=srp_paging&fr=so.com");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/** URL of the first result page, as issued from the so.com home page. */
private static final String FIRST_PAGE_URL =
        "https://www.so.com/s?ie=utf-8&fr=so.com&src=home_so.com&ssid=&q=java";

/** First page number requested through the paging URL. */
private static final int FIRST_PAGED_RESULT = 2;

/** Last page number requested through the paging URL (inclusive). */
private static final int LAST_PAGED_RESULT = 10;

/** Timeout value passed to each jsoup connection. */
private static final int TIMEOUT_MILLIS = 5000;

/** CSS selector matching the anchor inside each search-result title. */
private static final String RESULT_LINK_SELECTOR = ".res-title a";

/** Line printed to standard error after each page of results. */
private static final String PAGE_SEPARATOR = "---------------------";

/**
 * Fetches one result page and prints every matched result link.
 *
 * @param url the URL of the result page to fetch
 * @throws IOException if the page cannot be fetched
 */
private static void printSearchResults(String url) throws IOException {
    Jsoup.connect(url)
            .timeout(TIMEOUT_MILLIS)
            .get()
            .select(RESULT_LINK_SELECTOR)
            .forEach(link -> {
                // Title goes to stdout and the target URL to stderr, as in the original example.
                System.out.println(link.text());
                System.err.println(link.attr("href"));
            });
    System.err.println(PAGE_SEPARATOR);
}

  

posted on 2022-10-20 16:37  -韩帅  阅读(35)  评论(0)  编辑  收藏  举报

导航