jsoup获取文章内容

jsoup爬取文章内容

/**
 * Fetches one page of news links from the {@code data_szyjglj} table, scrapes each
 * article with jsoup, and stores the extracted source and body text back into the table.
 *
 * <p>For every (id, link) pair returned by {@code ManageMySQL.getNewsLinkInTable}:
 * <ul>
 *   <li>the article source is the text of the first {@code div.source span}, taken only
 *       when the selector matches exactly two elements;</li>
 *   <li>the article body is the text of {@code div.view_box}, taken only when the selector
 *       matches exactly one element;</li>
 *   <li>both values (empty strings when the page layout did not match) are passed to
 *       {@code ManageMySQL.updateContextAndPublishDate}.</li>
 * </ul>
 *
 * <p>A failure while processing one link is reported and that link is skipped; the
 * remaining links are still processed. Nothing is written to {@code response}.
 *
 * @param request  the incoming HTTP request (not read by this method)
 * @param response the HTTP response (not written by this method)
 * @throws ServletException declared by the servlet contract; not thrown by this body
 * @throws IOException      declared by the servlet contract; not thrown by this body
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    final String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36";
    final String tableName = "data_szyjglj";

    // NOTE(review): the page index and page size are fixed, so every request processes the
    // same slice of the table. Presumably these were adjusted by hand between runs — confirm.
    final int page = 277;
    final int pageSize = 899;

    Map<Integer, String> linksById;
    try {
        linksById = ManageMySQL.getNewsLinkInTable(page, pageSize, tableName);
    } catch (Exception e) {
        System.err.println("Failed to load news links (page " + page + ", size " + pageSize + ") from " + tableName);
        e.printStackTrace();
        return;
    }
    if (linksById == null) {
        return;
    }

    for (Map.Entry<Integer, String> entry : linksById.entrySet()) {
        Integer newsId = entry.getKey();
        String newsLink = entry.getValue();
        System.out.println(newsId + "  " + newsLink);

        // Each link gets its own try block so that one unreachable or malformed page
        // does not abort the rest of the batch.
        try {
            Document page1 = Jsoup.connect(newsLink).userAgent(userAgent).get();

            String source = "";
            Elements sourceSpans = page1.select("div.source span");
            if (sourceSpans.size() == 2) {
                source = sourceSpans.get(0).text();
            }

            String content = "";
            Elements contentBoxes = page1.select("div.view_box");
            if (contentBoxes.size() == 1) {
                content = contentBoxes.get(0).text();
            }

            // NOTE(review): quotes are stripped before the update, which suggests the helper
            // builds its SQL by string concatenation. If so, it should be switched to a
            // PreparedStatement with bound parameters; the stripping is kept here unchanged.
            String sanitizedContent = content.replace("'", "").replace("\"", "");
            ManageMySQL.updateContextAndPublishDate(newsId, sanitizedContent, source, tableName);
        } catch (Exception e) {
            System.err.println("Failed to process news id " + newsId + " (" + newsLink + ")");
            e.printStackTrace();
        }
    }
}

 

posted @ 2019-10-22 19:39  西北逍遥  阅读(572)  评论(0)  编辑  收藏  举报