采用java,phantomjs页面渲染爬取阿里巴巴会员信息

阿里巴巴对企业名录早有保护措施:比如需要登录才能采集,非登录状态下可采集的数据有限,而且时不时还会弹出登录页面;采集过程中还会出现验证码,不过这个好控制,只要对接打码平台,花费一笔小小的费用即可。

至于如何突破阿里巴巴的防采集措施、规避验证码,下面这段代码可供借鉴。

用java,phantomjs爬取阿里巴巴企业名录

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.jcraft.jsch.JSch;
import com.jcraft.jsch.Session;

import cn.outshine.crawler.util.JDBCHelper;


/**
 * Crawls Alibaba member pages by rendering them with an external PhantomJS
 * process and handing the resulting HTML to {@code AlibabaPageAnalysis}.
 *
 * <p>Work items are rows of the table named by {@link #tableName}; the
 * {@code remark} column of a row is updated as the row moves through the
 * crawl ("lock", "delete", "false").
 */
public class AlibabaDetailCrawlerSimple {
    private static final Logger logger = LoggerFactory.getLogger(AlibabaDetailCrawlerSimple.class);

    /** Running count of processed rows; only used in the progress log line. */
    protected static int pageID = 0;

    private static final String projectPath = System.getProperty("user.dir");
    /** PhantomJS script, expected in the working directory. */
    private static final String jsPath = projectPath + File.separator + "crawl.js";
    /**
     * PhantomJS executable, expected in the working directory. The original
     * listing carried a commented-out Linux alternative,
     * {@code /usr/phantomjs/bin/phantomjs}.
     */
    private static final String exePath = projectPath + File.separator + "phantomjs1.9.exe";

    static String tableName = "alibaba_new";

    /** Name under which the JDBC template is registered with {@code JDBCHelper}. */
    private static final String TEMPLATE_NAME = "temp_alibaba";

    /**
     * Registers the MySQL template and starts the crawl loop.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): getRomoteMysql() is not defined anywhere in this listing.
        // Given the JSch imports it presumably opens an SSH tunnel to the remote
        // MySQL server - restore its definition or remove this call.
        getRomoteMysql();
        JDBCHelper.createMysqlTemplate(TEMPLATE_NAME, "jdbc:mysql://127.0.0.1:3306/alibaba?useUnicode=true&characterEncoding=UTF-8", "root", "", 5, 20);
        new AlibabaDetailCrawlerSimple().run();
    }

    /**
     * Runs the crawl worker on a single background thread until the calling
     * thread is interrupted.
     *
     * <p>Exactly one worker is in flight at a time: this method waits for the
     * submitted worker to finish (normally or with an exception) and only then,
     * after a one-second pause, submits a replacement. The executor is shut
     * down when the loop exits.
     */
    public void run() {
        logger.info("Spider started!");
        ExecutorService executor = Executors.newFixedThreadPool(1);
        try {
            while (!Thread.currentThread().isInterrupted()) {
                sleep(1000);
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                Future<?> future = executor.submit(new Runnable() {
                    @Override
                    public void run() {
                        crawlPendingCompanies();
                    }
                });
                try {
                    // Block until the worker ends so tasks do not pile up in the
                    // executor's unbounded queue.
                    future.get();
                } catch (ExecutionException e) {
                    logger.error("Crawl worker failed; it will be restarted", e.getCause());
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        } finally {
            executor.shutdownNow();
        }
    }

    /**
     * Worker loop: repeatedly fetches one row, marks it as locked and crawls it.
     * Stops when the fetched row has no {@code memberid} or the worker thread
     * is interrupted.
     */
    private void crawlPendingCompanies() {
        // The original computed this as new Random().nextInt(1), which is always 0.
        final int offset = 0;
        while (!Thread.currentThread().isInterrupted()) {
            // NOTE(review): this selects rows that are ALREADY marked 'lock' and then
            // marks them 'lock' again; a filter such as "remark is null" may have
            // been intended - confirm against the table's workflow.
            // NOTE(review): if this is Spring's JdbcTemplate, queryForMap throws when
            // no row matches, so "no more rows" surfaces as an exception rather than
            // through the null check below - confirm.
            Map<?, ?> company = JDBCHelper.getJdbcTemplate(TEMPLATE_NAME).queryForMap(
                    "SELECT * from " + tableName + " where remark='lock' limit ?,1", offset);

            // Check for null before dereferencing; the original called toString() first.
            Object rawMemberId = company.get("memberid");
            if (rawMemberId == null) {
                break;
            }
            String memberid = rawMemberId.toString();

            JDBCHelper.getJdbcTemplate(TEMPLATE_NAME).update(
                    "update " + tableName + " set remark=? where memberid=?", "lock", memberid);

            logger.info("序号:" + ++pageID + " 搜索:" + memberid);
            getCompanyDetail(memberid);
        }
    }

    /**
     * Renders one member page and passes the HTML on for analysis.
     *
     * <p>If the rendered output contains {@code "Unable to post"} the row is
     * marked {@code "delete"}; if any exception occurs the row is marked
     * {@code "false"}. Exceptions are logged, not propagated.
     *
     * @param company_url value passed to the PhantomJS script as the page to
     *        load; it is also used as the {@code memberid} key for the row
     *        updates (the caller in this class passes the row's memberid)
     */
    public void getCompanyDetail(String company_url) {
        final String url = company_url;
        final String memberid = company_url;

        try {
            String html = getAjaxContnent(url);
            if (html.contains("Unable to post")) {
                logger.error("无效网址:" + url);
                JDBCHelper.getJdbcTemplate(TEMPLATE_NAME).update(
                        "update " + tableName + " set remark=?,update_on=now() where memberid=?", "delete", memberid);
                return;
            }
            AlibabaPageAnalysis.alibabaMobilePage(html, memberid, tableName);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the worker loop.
                Thread.currentThread().interrupt();
            }
            logger.error(e.toString(), e);
            JDBCHelper.getJdbcTemplate(TEMPLATE_NAME).update(
                    "update " + tableName + " set remark=? where memberid=?", "false", memberid);
        }
    }

    /**
     * Runs {@code phantomjs crawl.js <url>} and returns everything the process
     * wrote to standard output, decoded as UTF-8, with line breaks removed.
     *
     * @param url argument handed to the PhantomJS script
     * @return the concatenated stdout lines of the PhantomJS process
     * @throws IOException if the process cannot be started or its output cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting for the process to exit
     */
    public static String getAjaxContnent(String url) throws IOException, InterruptedException {
        // Pass arguments as a list: Runtime.exec(String) splits on whitespace, which
        // breaks on paths containing spaces and lets the url inject extra arguments.
        ProcessBuilder builder = new ProcessBuilder(exePath, jsPath, url);
        // stderr is not part of the returned HTML; inherit it so an undrained pipe
        // cannot stall the child process.
        builder.redirectError(ProcessBuilder.Redirect.INHERIT);
        Process p = builder.start();
        try {
            StringBuilder htmlString = new StringBuilder();
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(p.getInputStream(), Charset.forName("UTF-8")))) {
                String tmp;
                while ((tmp = br.readLine()) != null) {
                    htmlString.append(tmp);
                }
            }
            p.waitFor();
            return htmlString.toString();
        } finally {
            // Release the process handle; harmless if it has already exited.
            p.destroy();
        }
    }

    /**
     * Sleeps for the given time. If interrupted, logs the event and restores
     * the thread's interrupt flag so callers polling it can stop.
     *
     * @param time duration in milliseconds
     */
    protected static void sleep(int time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            logger.error("Thread interrupted when sleep", e);
            Thread.currentThread().interrupt();
        }
    }
}
posted @ 2018-09-26 10:16 outshinecn 阅读(...) 评论(...) 编辑 收藏