记录某次对某盗版书的数据抓取

 /**
  * Scrapes one novel-listing page and persists every novel row found on it.
  *
  * <p>The page is fetched once with HttpClient. The second table matched by
  * {@code #centerm table} is treated as the listing; each {@code tr} whose
  * {@code odd}/{@code even} cells are all present is converted into a
  * {@link TblNovel}, inserted through {@code BxwxChapterDaoImpl.tblNovelMapper},
  * printed, and appended to {@code arr}.
  *
  * <p>Every non-null {@code httpUri} is recorded in {@code map} before it is
  * fetched, so a later call with the same URI is a no-op (this also holds when
  * the first attempt failed).
  *
  * @param httpUri        absolute URL of the listing page; {@code null} is ignored
  * @param defaultCharset charset used to decode the body when the response does
  *                       not declare one
  * @return the shared {@code arr} list, including the novels added by this call
  * @throws IOException if the HTTP request or reading the response body fails
  */
 public List<TblNovel> captureWeb(String httpUri, String defaultCharset) throws IOException {
     // Nothing to fetch, or this page was already handled by an earlier call.
     if (httpUri == null || map.get(httpUri) != null) {
         return arr;
     }
     map.put(httpUri, new Object());

     // Single request; try-with-resources releases the connection and the client
     // on every exit path, including exceptions.
     String html;
     try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
          CloseableHttpResponse response = httpClient.execute(new HttpGet(httpUri))) {
         if (response.getStatusLine().getStatusCode() != HttpURLConnection.HTTP_OK) {
             return arr;
         }
         html = EntityUtils.toString(response.getEntity(), defaultCharset);
     }

     // The base URI lets absUrl("href") resolve relative links on the page.
     Document page = Jsoup.parse(html, httpUri);
     Elements tables = page.select("#centerm table");
     if (tables.size() < 2) {
         // Unexpected layout: the listing table (index 1) is missing.
         return arr;
     }

     SimpleDateFormat formatter = new SimpleDateFormat();
     for (Element row : tables.get(1).getElementsByTag("tr")) {
         Elements odd = row.getElementsByClass("odd");
         Elements even = row.getElementsByClass("even");
         // Header or malformed rows do not carry all three odd and even cells.
         if (odd.size() < 3 || even.size() < 3) {
             continue;
         }
         Elements titleLinks = odd.get(0).getElementsByTag("a");
         if (titleLinks.isEmpty()) {
             continue;
         }

         TblNovel novel = new TblNovel();
         novel.setName(titleLinks.text());
         String url = titleLinks.get(0).absUrl("href");
         novel.setUrl(url);
         novel.setAuthor(odd.get(1).text());

         // The formatted last-update time is stored through setLastUpdateChapterUrl,
         // as in the original code. getTimeString is defined elsewhere; the null
         // guard is defensive in case it returns null for unparseable text.
         Date lastUpdate = getTimeString(odd.get(2).text());
         if (lastUpdate != null) {
             novel.setLastUpdateChapterUrl(formatter.format(lastUpdate));
         }

         novel.setLastUpdateChapter(even.get(0).getElementsByTag("a").text());
         // TODO: the second "even" cell (even.get(1)) is currently not persisted.

         String statusText = even.get(2).text();
         if ("完本".equals(statusText)) {
             novel.setStatus(1);
         } else if ("连载".equals(statusText)) {
             novel.setStatus(2);
         }

         novel.setAddTime(new Date());
         String category = category(url, defaultCharset);
         if (category != null) {
             novel.setType(category);
         }

         BxwxChapterDaoImpl.tblNovelMapper.insert(novel);
         System.out.println(novel);
         arr.add(novel);
     }

     return arr;
 }

 

posted @ 2021-11-16 11:00  小福gui  阅读(52)  评论(0)    收藏  举报