懒码农。。。。。。

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::
package com.test.crawler.service;
import com.test.crawler.htmlHandler.CompanyDetailHtmlHandler;
import com.test.db.po.Tb_test_company_info;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

/**
 * Downloads the detail page of every company in a list and hands each page to
 * {@link CompanyDetailHtmlHandler} for parsing and persistence.
 *
 * <p>Companies are processed in batches of at most {@link #MAX_THREAD_NUM}; every batch gets
 * its own fixed thread pool (one thread per company) and all batches share one pooled
 * {@link CloseableHttpClient}.
 */
public class ViewCompanyDetailService {

    /** Upper bound for the batch size, the per-batch thread count and the connection pool size. */
    private static final int MAX_THREAD_NUM = 100;

    /** Pause inserted between two consecutive batches, in milliseconds. */
    private static final long BATCH_PAUSE_MILLIS = 15000L;

    /**
     * Fetches and processes the detail page of every company in {@code companyList}.
     *
     * <p>Does nothing when the list is {@code null} or empty. The shared HTTP client is always
     * closed before this method returns.
     *
     * @param companyList companies whose detail pages should be fetched
     * @throws InterruptedException if the calling thread is interrupted while waiting for a batch
     *         to finish or while pausing between batches
     */
    public void ViewCompanyDetail(List<Tb_test_company_info> companyList) throws InterruptedException {
        if (companyList == null || companyList.isEmpty()) {
            return;
        }
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager();
        connManager.setMaxTotal(MAX_THREAD_NUM);
        // Without this the per-route limit stays at the library default, so the worker threads
        // of a batch would queue for a handful of connections when the URLs share one host.
        connManager.setDefaultMaxPerRoute(MAX_THREAD_NUM);
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(connManager).build();
        try {
            int total = companyList.size();
            for (int offset = 0; offset < total; offset += MAX_THREAD_NUM) {
                int batchSize = Math.min(MAX_THREAD_NUM, total - offset);
                runBatch(httpClient, companyList, offset, batchSize);
                System.out.println(MAX_THREAD_NUM + " Over !!");
                // Pause only between batches; there is nothing to wait for after the last one.
                if (offset + batchSize < total) {
                    Thread.sleep(BATCH_PAUSE_MILLIS);
                }
            }
        } finally {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Runs one batch: submits one task per company in {@code [offset, offset + batchSize)} and
     * blocks until all of them have finished.
     */
    private void runBatch(CloseableHttpClient httpClient, List<Tb_test_company_info> companyList,
            int offset, int batchSize) throws InterruptedException {
        ExecutorService exe = Executors.newFixedThreadPool(batchSize);
        try {
            for (int i = 0; i < batchSize; i++) {
                Tb_test_company_info company = companyList.get(offset + i);
                if (company == null) {
                    continue;
                }
                HttpGet httpget;
                try {
                    httpget = new HttpGet(company.getCompanyUrl());
                } catch (RuntimeException e) {
                    // A null or malformed URL must not abort the remaining companies.
                    System.out.println(company.getId() + " - error: " + e);
                    continue;
                }
                exe.execute(new ViewCompanyDetailThread(httpClient, httpget, company.getId()));
            }
            exe.shutdown();
            boolean finished = false;
            while (!finished) {
                finished = exe.awaitTermination(1, TimeUnit.MINUTES);
            }
        } finally {
            // No-op after normal termination; on interruption or failure it stops the workers
            // so the pool's non-daemon threads are not leaked.
            exe.shutdownNow();
        }
    }

    /**
     * Task that downloads a single company detail page and passes its content to
     * {@link CompanyDetailHtmlHandler#CompanyInfoParseAndSave(int, String)}.
     *
     * <p>It extends {@link Thread} but is only ever handed to an executor as a
     * {@link Runnable}; the executor's own worker threads call {@link #run()}.
     */
    static class ViewCompanyDetailThread extends Thread {
        private final CloseableHttpClient httpClient;
        private final HttpContext context;
        private final HttpGet httpget;
        private final int shopId;

        /**
         * @param httpClient shared client used to execute the request
         * @param httpget request for the company's detail page
         * @param shopId identifier passed to the handler together with the page content
         */
        public ViewCompanyDetailThread(CloseableHttpClient httpClient, HttpGet httpget, int shopId) {
            this.httpClient = httpClient;
            this.context = new BasicHttpContext();
            this.httpget = httpget;
            this.shopId = shopId;
        }

        /**
         * Executes the request, decodes the body as UTF-8 and forwards it to the handler.
         * Any exception is reported on standard output and otherwise swallowed so that one
         * failing page does not affect the other tasks.
         */
        @Override
        public void run() {
            try {
                CloseableHttpResponse response = httpClient.execute(httpget, context);
                try {
                    HttpEntity entity = response.getEntity();
                    if (entity != null) {
                        String pageContent = EntityUtils.toString(entity, "UTF-8");
                        CompanyDetailHtmlHandler companyDetailHtmlHandler = new CompanyDetailHtmlHandler();
                        if (!companyDetailHtmlHandler.CompanyInfoParseAndSave(shopId, pageContent)) {
                            System.out.println(shopId + " - CompanyInfoParseAndSave Failure");
                        }
                    }
                } finally {
                    // Closing the response releases the pooled connection.
                    response.close();
                }
            } catch (Exception e) {
                System.out.println(shopId + " - error: " + e);
            }
        }
    }
}

 

package com.test.crawler.htmlHandler;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.test.db.dao.CompanyInfoDao;

/**
 * Extracts company contact details from a detail page's HTML and stores them through
 * {@link CompanyInfoDao}.
 */
public class CompanyDetailHtmlHandler {

    /** Matches the QQ chat link; group 1 captures the digits after {@code uin=}. */
    private static final Pattern QQ_HREF_PATTERN =
            Pattern.compile("http://wpa.qq.com/msgrd\\?v=3\\&uin=(\\d*?)\\&site=qq\\&menu=yes");

    /** Matches every character that is not an ASCII digit. */
    private static final Pattern NON_DIGIT_PATTERN = Pattern.compile("[^0-9]");

    /** Matches every character that is neither an ASCII digit nor a hyphen. */
    private static final Pattern NON_PHONE_CHAR_PATTERN = Pattern.compile("[^0-9\\-]");

    /**
     * Parses the company detail page and saves the extracted fields.
     *
     * <p>Every field is optional: a value that cannot be located in the page is stored as an
     * empty string, except the boss name, which falls back to {@code "-"}. The method
     * synchronizes on this handler instance.
     *
     * @param shopId identifier of the record to update; must be positive
     * @param pageContent raw HTML of the detail page
     * @return {@code false} when {@code shopId <= 0} or {@code pageContent} is {@code null};
     *         otherwise the result of {@code CompanyInfoDao.Update}
     */
    public synchronized boolean CompanyInfoParseAndSave(int shopId, String pageContent) {
        if (shopId <= 0 || pageContent == null) {
            return false;
        }
        Document doc = Jsoup.parse(pageContent);
        String CompanyPhone = "";
        String CompanyBoss = "";
        String CompanyMobil = "";
        String CompanyAddr = "";
        String QQ = "";
        String Jyms = "";
        String createDatetime = "";

        Elements eleContents = doc.select("省略...");
        if (eleContents != null && eleContents.size() > 0) {
            // first() yields null on an empty selection; check the size so a page without a
            // boss element still reaches the "-" fallback below instead of throwing.
            Elements bossNodes = eleContents.first().select("省略...");
            if (bossNodes.size() > 0) {
                CompanyBoss = bossNodes.first().text();
            }
            try {
                String qqHref = eleContents.first().select("省略...").first().attr("href");
                Matcher m = QQ_HREF_PATTERN.matcher(qqHref);
                if (m.find()) {
                    QQ = m.group(1);
                }
            } catch (RuntimeException ignored) {
                // Best effort: the QQ link is optional, keep the empty default.
            }
            try {
                Jyms = eleContents.first().select("省略...").get(0).text();
                CompanyAddr = eleContents.first().select("省略...").get(1).text();
                createDatetime = eleContents.first().select("省略...").get(2).text();
            } catch (RuntimeException ignored) {
                // Best effort: fields assigned before the failure are kept, the rest stay empty.
            }
        }

        Elements eleContents2 = doc.select("div.wp-colsub div.wp-mdl div.wp-contact ul.contact-lst");
        if (eleContents2 != null && eleContents2.size() > 0) {
            try {
                Elements items = eleContents2.select("li");
                // Second list item: keep digits only.
                CompanyMobil = NON_DIGIT_PATTERN.matcher(items.get(1).text()).replaceAll("").trim();
                // Third list item: keep digits and hyphens.
                CompanyPhone = NON_PHONE_CHAR_PATTERN.matcher(items.get(2).text()).replaceAll("").trim();
            } catch (RuntimeException ignored) {
                // Best effort: the contact list may have fewer entries than expected.
            }
        }

        if (CompanyBoss == null || "".equals(CompanyBoss.trim())) {
            CompanyBoss = "-";
        }
        CompanyInfoDao dao = new CompanyInfoDao();
        return dao.Update(shopId, CompanyPhone, CompanyBoss, CompanyMobil, CompanyAddr, QQ, Jyms, createDatetime);
    }
}

 

 

package com.test.crawler.main;

import com.test.crawler.service.ViewCompanyDetailService;
import com.test.db.dao.CompanyInfoDao;

/**
 * Command-line entry point: loads the companies that still need their detail page viewed and
 * runs {@link ViewCompanyDetailService} over them.
 */
public class TestMain {

    /**
     * Configures logging, then runs the crawl. Any exception raised by the crawl is printed
     * to standard error and not rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        configureLogging();
        try {
            ViewCompanyDetailService detailService = new ViewCompanyDetailService();
            CompanyInfoDao companyDao = new CompanyInfoDao();
            detailService.ViewCompanyDetail(companyDao.ListForViewDetail());
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Sets the commons-logging system properties; per the original comment the intent is to
     * turn off HttpClient's superfluous log output.
     */
    private static void configureLogging() {
        System.setProperty("org.apache.commons.logging.Log",
                "org.apache.commons.logging.impl.SimpleLog");
        System.setProperty("org.apache.commons.logging.simplelog.showdatetime",
                "true");
        System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient",
                "stdout");
    }

}

 

 

Over

 

posted on 2015-11-02 11:14  阿彬  阅读(288)  评论(0)  编辑  收藏  举报