Java WebClient 总结

/**
     * Builds a new {@link WebClient} that presents itself as Firefox 24.
     *
     * <p>The client is configured with a timeout value of 20000 (milliseconds, per HtmlUnit's
     * options API), with CSS and JavaScript processing disabled, and with exceptions on failing
     * HTTP status codes and on script errors turned off. Cookie handling is left at the
     * client's default. A fixed set of browser-like request headers is registered on the client.
     *
     * <p>NOTE(review): the {@code Host} header is pinned to {@code www.amazon.com} for every
     * request made through this client, whatever URL is fetched; confirm the client is only
     * ever used against that host.
     *
     * @return a freshly constructed, configured client
     */
    private WebClient getAWebClient() {
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
        webClient.getOptions().setTimeout(20000);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(false);
        // "text/html" was previously garbled to "textml", which is not a valid media type.
        webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
        webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
        webClient.addRequestHeader("Cache-Control", "max-age=0");
        webClient.addRequestHeader("Connection", "keep-alive");
        webClient.addRequestHeader("Host", "www.amazon.com");
        webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
        return webClient;
    }
/**
     * Crawls one page and returns its XML serialization.
     *
     * <p>The client's cookies are cleared first; if the file named by
     * {@code cookiePathAppendRandom()} exists, the cookies stored in it are loaded into the
     * client. The page is then fetched through {@code MyGetPage}.
     *
     * <p>When no page could be fetched, or the serialized page is 300 characters or shorter,
     * the URL is reported through {@code exceptionFun} and a builder holding the sentinel text
     * {@code "getNullPage"} is returned instead of page content.
     *
     * @param url address of the page to crawl
     * @return the page as XML, or a builder containing {@code "getNullPage"} on failure
     */
    public StringBuilder crawlPage(String url) {
        StringBuilder builder = new StringBuilder();
        logger.info(Thread.currentThread().getName() + " crawl " + url);
        // Start every crawl from an empty cookie jar, then restore the persisted cookies.
        webClient.getCookieManager().clearCookies();
        logger.info(Thread.currentThread().getName() + " webClient.getCookieManager().clearCookies();");
        File file = new File(cookiePathAppendRandom());
        logger.info(Thread.currentThread().getName() + " File file = new File(cookiePathAppendRandom());");
        if (file.exists()) {
            restoreCookiesFromFile(file);
        }
        logger.info(Thread.currentThread().getName() + " MyGetPage start,url:" + url);
        HtmlPage page = MyGetPage(new StringBuffer(url));
        logger.info(Thread.currentThread().getName() + " MyGetPage end,url:" + url);
        if (page == null) {
            // Models that failed during crawling are handed to exceptionFun; per the original
            // author's note they can be collected and sent back to the server for re-queueing.
            logger.info("Page null!");
            AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
            exceptionFun(model);
            return (new StringBuilder("getNullPage"));
        }
        logger.info(Thread.currentThread().getName() + " builder.append(page.asXml());");
        builder.append(page.asXml());
        logger.info(Thread.currentThread().getName() + " return builder;");
        // builder.length() avoids copying the whole page into a String just to measure it.
        logger.info(Thread.currentThread().getName() + " CrawlPage $Length=" + builder.length());
        if (builder.length() <= 300) {
            // Very short responses are treated the same way as a missing page.
            AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
            exceptionFun(model);
            return (new StringBuilder("getNullPage"));
        }
        return builder;
    }

    /**
     * Loads a serialized {@code CookieStore} from {@code file} and copies its cookies into the
     * client's cookie manager. Any read failure is logged and the crawl simply proceeds
     * without the stored cookies; the streams are always closed.
     *
     * <p>NOTE(review): this uses native Java deserialization, which is only safe as long as
     * the cookie file is produced by a trusted writer — confirm where the file comes from.
     *
     * @param file existing file expected to hold a serialized {@code CookieStore}
     */
    private void restoreCookiesFromFile(File file) {
        CookieStore cookieStore = null;
        FileInputStream fin = null;
        ObjectInputStream in = null;
        try {
            fin = new FileInputStream(file);
            in = new ObjectInputStream(fin);
            Object stored = in.readObject();
            if (stored instanceof CookieStore) {
                cookieStore = (CookieStore) stored;
            } else {
                logger.error(Thread.currentThread().getName() + " cookie file " + file
                        + " does not contain a CookieStore");
            }
        } catch (IOException e) {
            logger.error(Thread.currentThread().getName() + " cannot read cookie file " + file + ": " + e);
        } catch (ClassNotFoundException e) {
            logger.error(Thread.currentThread().getName() + " cannot read cookie file " + file + ": " + e);
        } finally {
            // Closing the object stream also closes the file stream it wraps; close the file
            // stream directly only when the wrapper was never created.
            try {
                if (in != null) {
                    in.close();
                } else if (fin != null) {
                    fin.close();
                }
            } catch (IOException e) {
                logger.error(Thread.currentThread().getName() + " cannot close cookie file " + file + ": " + e);
            }
        }
        if (cookieStore == null) {
            return;
        }
        List<org.apache.http.cookie.Cookie> storedCookies = cookieStore.getCookies();
        for (org.apache.http.cookie.Cookie temp : storedCookies) {
            // The secure flag is always passed as false here, regardless of the stored cookie.
            Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(),
                    temp.getExpiryDate(), false);
            webClient.getCookieManager().addCookie(cookie);
        }
    }

 

/**
     * Custom page fetch: loads the URL and, for as long as the returned page is titled
     * "Robot Check", solves the captcha via {@code AmazonGetCaptcha}, submits it through the
     * first form on the page and re-examines the result.
     *
     * <p>Failure handling:
     * <ul>
     *   <li>A malformed URL returns {@code null} immediately, since retrying cannot help.</li>
     *   <li>An unknown host sleeps for the configured {@code unknowhost_sleeptime} minutes and
     *       retries, up to the configured {@code unknowhost_maxtrytime}.</li>
     *   <li>Any other exception (including a failing HTTP status or a captcha submit that keeps
     *       failing) is retried after one second; the fifth such failure returns
     *       {@code null}.</li>
     *   <li>If the thread is interrupted while waiting, the interrupt status is restored and the
     *       fetch stops: the page is returned if it was already fetched successfully,
     *       otherwise {@code null}.</li>
     * </ul>
     *
     * <p>NOTE(review): the number of captcha rounds itself is not limited; the loop runs until
     * a page with a different title comes back.
     *
     * @param URL address to fetch
     * @return the fetched page, or {@code null} when fetching was given up
     */
    private HtmlPage MyGetPage(StringBuffer URL) {
        // Upper bound for consecutive failed submit-button clicks on one captcha page.
        final int maxClickAttempts = 5;
        HtmlPage page = null;
        boolean retry = true;
        int tryTimeCnt = 1;
        int unknownHostTryTimeCnt = 1;
        while (retry) {
            retry = false;
            try {
                logger.info(Thread.currentThread().getName() + " webClient.getPage : " + URL + ",CrawlURL_id:"
                        + crawlURLId);
                page = webClient.getPage(URL.toString());
                Document doc = Jsoup.parse(page.asXml());
                int robotCheckNum = 1;
                while (doc.select("title").text().equals("Robot Check")) {
                    logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())
                            + " [Robot Check,URL:" + URL + "]");
                    String captchaStr = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
                    logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())
                            + " end AmazonGetCaptcha.GetCaptcha");
                    logger.info(dayformat1.format(new Date()) + " " + Thread.currentThread().getName() + " : "
                            + captchaStr);

                    logger.info(Thread.currentThread().getName() + " page.getForms().get(0) Start");
                    HtmlForm form = page.getForms().get(0);
                    logger.info(Thread.currentThread().getName() + " page.getForms().get(0) End");

                    logger.info(Thread.currentThread().getName() + " form.getElementsByTagName(button).get(0) Start");
                    HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
                    logger.info(Thread.currentThread().getName() + " form.getElementsByTagName(button).get(0) End");

                    logger.info(Thread.currentThread().getName() + " setValueAttribute Start");
                    form.getInputByName("field-keywords").setValueAttribute(captchaStr);
                    logger.info(Thread.currentThread().getName() + " setValueAttribute End");

                    logger.info(Thread.currentThread().getName() + " button.click Start");
                    int clickAttempts = 0;
                    boolean clicked = false;
                    while (!clicked) {
                        try {
                            page = button.click();
                            clicked = true;
                        } catch (Exception e1) {
                            logger.error(Thread.currentThread().getName() + " button.click出错了: " + e1);
                            clickAttempts++;
                            if (clickAttempts >= maxClickAttempts) {
                                // Stop spinning on a click that keeps failing; the outer handlers
                                // count this as one failed fetch attempt.
                                throw e1;
                            }
                        }
                    }
                    logger.info(Thread.currentThread().getName() + " button.click end");
                    while (page.asXml() == null) {
                        logger.info(Thread.currentThread().getName() + " page xml null");
                        logger.info(Thread.currentThread().getName() +" "+ page.asXml());
                        // refresh() returns the reloaded page; keep it, otherwise this loop
                        // would re-test the same stale page forever.
                        page = (HtmlPage) page.refresh();
                        logger.info(Thread.currentThread().getName() + " refresh End!");
                    }
                    logger.info(Thread.currentThread().getName() + " button.click End");

                    logger.info(Thread.currentThread().getName() + " Start ParsePage!");
                    doc = Jsoup.parse(page.asXml());
                    if (!doc.select("title").text().equals("Robot Check")) {
                        logger.info(Thread.currentThread().getName() + " " + doc.select("title").text());
                        logger.info(Thread.currentThread().getName() + " "
                                + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
                                + captchaStr + ",try num:" + robotCheckNum + "]");
                    }
                    robotCheckNum++;
                }

            } catch (MalformedURLException e) {
                // A malformed URL stays malformed on every retry, so give up at once.
                logger.error(Thread.currentThread().getName() +" "+ e);
                return null;
            } catch (UnknownHostException e) {
                logger.error(Thread.currentThread().getName() +" "+ e);
                retry = true;
                logger.info("found UnknownHostException,start sleep 20 min");
                try {
                    // long arithmetic so a large configured minute count cannot overflow int
                    Thread.sleep(60000L * Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));
                } catch (InterruptedException e1) {
                    logger.error(Thread.currentThread().getName() +" "+ e1);
                    Thread.currentThread().interrupt();
                    return null;
                }
                logger.info("found UnknownHostException,end sleep 20 min");
                unknownHostTryTimeCnt++; // one more unknown-host failure
                logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())
                        + " [UnknowHostTryTimeCnt:" + unknownHostTryTimeCnt + "]");
                if (unknownHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                    return null;
                }
            } catch (Exception eq) {
                // Also covers failing HTTP status codes, so they share the same retry budget
                // instead of being retried without limit.
                logger.error(Thread.currentThread().getName() + " "+eq);
                tryTimeCnt++; // one more failed attempt
                logger.info(Thread.currentThread().getName() + " " + dayformat1.format(System.currentTimeMillis())
                        + " [TryTimeCnt:" + tryTimeCnt + "]");
                if (tryTimeCnt > 5) {
                    return null;
                }
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    logger.error(Thread.currentThread().getName() + e);
                    Thread.currentThread().interrupt();
                    return null;
                }
                retry = true;
            }
            try {
                // Randomised 1.5-2 s pause after every attempt, successful or not.
                Thread.sleep(random.nextInt(500) + 1500);
            } catch (InterruptedException e) {
                logger.error(Thread.currentThread().getName() + e);
                Thread.currentThread().interrupt();
                // Do not re-fetch because of an interrupt: hand back the page if this attempt
                // succeeded, otherwise report failure.
                return retry ? null : page;
            }
        }
        return page;
    }

 

posted @ 2016-11-08 15:42  陈泽泽  阅读(6398)  评论(0编辑  收藏  举报