代码改变世界

爬虫获取数据

2017-08-01 10:21  sihao560  阅读(540)  评论(0)  编辑  收藏  举报

1.pom.xml文件

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <!--
    Maven build descriptor for the crawler demo (artifact test01:test01:1.0, packaged as a jar).
    NOTE(review): the Java sources that accompany this POM import org.jsoup, org.apache.http,
    org.apache.commons.io and org.apache.commons.codec, none of which is declared below;
    presumably they are resolved transitively (e.g. through WebCollector / selenium-java).
    Confirm, or declare them explicitly.
  -->
  <modelVersion>4.0.0</modelVersion>
  <groupId>test01</groupId>
  <artifactId>test01</artifactId>
	<version>1.0</version>
	<packaging>jar</packaging>
	
 	<!-- Shared version numbers; target.version is not referenced anywhere else in this POM. -->
 	<properties> 
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 
		<target.version>1.0</target.version>
		<spring.version>4.2.3.RELEASE</spring.version>
 		<quartz.version>1.8.6</quartz.version> 
	</properties> 
	
	<dependencies>
		<!-- NOTE(review): junit is declared without <scope>test</scope>, so it lands on the compile classpath. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.11</version>
		</dependency>
		<!-- Logging: log4j 1.x with the SLF4J binding for it -->
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>1.2.17</version>
		</dependency>
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-log4j12</artifactId>
			<version>1.7.5</version>
		</dependency>
		<!-- WebCollector dependency -->
		<dependency>
			<groupId>cn.edu.hfut.dmic.webcollector</groupId>
			<artifactId>WebCollector</artifactId>
			<version>2.09</version>
		</dependency>
		<!-- selenium -->
		<dependency>
			<groupId>org.seleniumhq.selenium</groupId>
			<artifactId>selenium-java</artifactId>
			<version>2.44.0</version>
		</dependency>
		<!-- phantomjsdriver (third-party support for selenium webdriver) -->
		<dependency>
			<groupId>com.github.detro</groupId>
			<artifactId>phantomjsdriver</artifactId>
			<version>1.2.0</version>
		</dependency>
		<!-- Database access: Druid and the MySQL JDBC driver -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>druid</artifactId>
			<version>1.0.31</version>
		</dependency>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>6.0.6</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-context</artifactId>
			<version>${spring.version}</version>
			<exclusions>
				<!-- Exclude Commons Logging in favor of SLF4j -->
				<exclusion>
					<groupId>commons-logging</groupId>
					<artifactId>commons-logging</artifactId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- jsonpath -->
		<dependency>
		    <groupId>net.minidev</groupId>
		    <artifactId>json-smart</artifactId>
		    <version>2.2.1</version>
		</dependency>
		<dependency>
		    <groupId>com.jayway.jsonpath</groupId>
		    <artifactId>json-path</artifactId>
		    <version>2.2.0</version>
		</dependency>
		<dependency><!-- 3.0.7 does not have this artifact -->
			<groupId>org.springframework</groupId>
			<artifactId>spring-context-support</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-webmvc</artifactId>
			<version>${spring.version}</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-orm</artifactId>
			<version>${spring.version}</version>
			<type>jar</type>
			<scope>compile</scope>
		</dependency>

		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-test</artifactId>
			<version>${spring.version}</version>
			<type>jar</type>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.quartz-scheduler</groupId>
			<artifactId>quartz</artifactId>
			<version>${quartz.version}</version>
		</dependency>
		<!-- Additional JSON libraries: json-lib and fastjson -->
		<dependency>
			<groupId>net.sf.json-lib</groupId>
			<artifactId>json-lib</artifactId>
			<version>2.4</version>
		</dependency>
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.16.sec01</version>
		</dependency>
	</dependencies>
	<build>
    <finalName>test01</finalName>
  </build>
</project>
  

  2.测试文件

package test01;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal Jsoup demo: scrapes the NBA rolling-news list of zhibo8 ("直播吧") and turns every
 * list entry into a string-keyed map.
 */
public class test {
	/**
	 * Routes HTTP and HTTPS traffic through the hard-coded proxy, runs the scrape once and
	 * prints a marker line when it has finished.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.setProperty("http.maxRedirects", "50");
		System.getProperties().setProperty("proxySet", "true");
		System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
		System.getProperties().setProperty("http.proxyPort", "8080");
		System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
		System.getProperties().setProperty("https.proxyPort", "8080");
		getCountry();
		System.out.println(111);
	}

	/**
	 * Downloads the news list page and extracts one map per list item.
	 *
	 * <p>Each map holds the keys {@code fromStation}, {@code channel}, {@code colImg},
	 * {@code title}, {@code time}, {@code ReferenceSource} and {@code newsURL}. The result is
	 * also printed to standard output.
	 *
	 * @return the extracted entries; empty when the request fails or the page does not contain
	 *         the {@code boxlist} element
	 */
	public static List<Map<String, Object>> getCountry() {
		List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
		try {
			Document doc = Jsoup
					.connect("https://news.zhibo8.cc/nba/more.htm")
					.timeout(3000)
					.get();

			// getElementById returns null when the id is absent; without this guard a layout
			// change on the site would surface as a NullPointerException.
			Element box = doc.getElementById("boxlist");
			if (box != null) {
				for (Element item : box.select("div.dataList ul li")) {
					Map<String, Object> map = new HashMap<String, Object>();
					// Source web site.
					map.put("fromStation", "直播吧");
					// Scraped channel. This used to be stored under "fromStation" as well,
					// which silently overwrote the site name above.
					map.put("channel", "NBA新闻滚动");
					// List thumbnail: the page offers none, so a placeholder is stored.
					map.put("colImg", "无");
					// Title.
					map.put("title", item.select(".articleTitle a").html());
					// Publication time.
					map.put("time", item.select(".postTime").html());
					// Source the article cites.
					map.put("ReferenceSource", item.select(".source").html());
					// Article URL.
					map.put("newsURL", item.select(".articleTitle a").attr("href"));
					list.add(map);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		System.out.println(list);
		return list;
	}
}

  

package test01;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.suning.web.service.NewerService;
import com.suning.web.util.JDBCUtil;
import com.suning.web.util.JsonpUntil;

/**
 * Scraper for the "体坛+" sports news site (ttplus.cn): the 24-hour feed, the channel list pages
 * and the article detail pages including their comments.
 *
 * <p>All results are returned as string-keyed maps. Multi-part article bodies are flattened into
 * one string in which every text or image fragment is followed by the separator {@code "@/"}.
 */
public class SportsTest {
	public static JDBCUtil jdbcutil;
	public static NewerService newerService = new NewerService();

	/** Separator appended after every text/image fragment of an article body. */
	private static final String PART_SEPARATOR = "@/";
	/** Output pattern for every formatted timestamp. */
	private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";
	/** Selector of the article headline on a detail page. */
	private static final String TITLE_SELECTOR = ".d-title .h1-title";
	/** Selector of the author name used by the "pubtime3" and "pubtime1" page layouts. */
	private static final String AUTHOR_SELECTOR =
			"#author_id #authorMass .m-detail-source-cnt .m-detail-source-cnt-inner span";

	/**
	 * Routes HTTP and HTTPS traffic through the hard-coded proxy and scrapes one sample article.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.setProperty("http.maxRedirects", "50");
		System.getProperties().setProperty("proxySet", "true");
		System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
		System.getProperties().setProperty("http.proxyPort", "8080");
		System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
		System.getProperties().setProperty("https.proxyPort", "8080");
		/*Runnable runnable1 = new Runnable() {
            public void run() {
            	String[] keyword = {"day.html","interfb.html","innerfb.html","nba.html","cba.html","sports.html"};
            	for(String key : keyword){
            		getSportsList(key);
            	}
            }
        };
        ScheduledExecutorService service = Executors
                .newSingleThreadScheduledExecutor();
        // second argument: delay before the first run; third argument: interval between runs
        service.scheduleAtFixedRate(runnable1, 0, 86400, TimeUnit.SECONDS);*/
		//getSportsList("day.html");
		// home page detail
		//getMainContent("http://resource.ttplus.cn/publish/app/data/2017/07/20/67522/share1.html");
		// news detail
		getSportContent("http://www.ttplus.cn/publish/app/data/2017/07/20/67559/share1.html");
        //getRealTime();
	}

	/**
	 * Fetches the 24-hour news feed and converts every entry of its {@code content} array.
	 *
	 * <p>The payload is processed once. Earlier revisions walked it a second time through
	 * JsonPath, which added every entry to the result twice.
	 *
	 * @return one map per feed entry; empty when the request fails or the feed's {@code type}
	 *         is not {@code "success"}
	 */
	@SuppressWarnings("unchecked")
	private static List<Map<String,Object>> getRealTime() {
		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
		SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_PATTERN);
		String getUrl = "http://www.ttplus.cn/24h?lastid=";
		String key = "";
		try {
			// JsonpUntil.encode returns null when the server answers 404.
			Object raw = JsonpUntil.encode(getUrl, key);
			if (raw != null) {
				String commentDe = raw.toString();
				System.out.println(commentDe);
				Map<String,Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
				// Status flag of the response.
				String type = parseData == null ? null : String.valueOf(parseData.get("type"));
				System.out.println(type);
				Object content = parseData == null ? null : parseData.get("content");
				if ("success".equals(type) && content != null) {
					List<Map<String,Object>> pData = (List<Map<String, Object>>) JSON.parse(content.toString());
					if (pData != null) {
						for (Map<String,Object> comm : pData) {
							list.add(toRealTimeEntry(comm, formatter));
						}
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		System.out.println(list);
		return list;
	}

	/**
	 * Scrapes one channel page of the site: the carousel slides followed by the news list.
	 *
	 * <p>Carousel slides are kept only when they link to
	 * {@code http://resource.ttplus.cn/publish/app/data/}; list items linking to a
	 * {@code video.html} page are skipped. Every kept item gets its own map (earlier revisions
	 * reused one map per section, so all returned entries aliased the last item).
	 *
	 * @param val page name appended to {@code http://www.ttplus.cn/}, e.g. {@code "nba.html"};
	 *            also stored as the {@code keyword} of every list item
	 * @return the scraped items; empty when the page cannot be fetched
	 */
	public static List<Map<String,Object>> getSportsList(String val){
		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
		String url = "http://www.ttplus.cn/";
		try {
			Document doc = Jsoup.connect(url+val).timeout(3000).get();

			// Carousel slides.
			for (Element li : doc.select("#swiper-wrapper .swiper-slide")) {
				String newsURL = li.select("a").attr("href");
				if (!newsURL.contains("http://resource.ttplus.cn/publish/app/data/")) {
					continue;
				}
				Map<String, Object> slide = new HashMap<String, Object>();
				slide.put("title", li.select("a p").text());
				slide.put("fromStation", "体坛+");
				slide.put("channel", "首页滚动");
				// The carousel shows neither author, time nor comment count.
				slide.put("author", "");
				slide.put("time_info", "");
				slide.put("imgUrl", li.select("a img").attr("src"));
				slide.put("commentsNumber", "");
				slide.put("keyword", "main");
				slide.put("newsURL", newsURL);
				slide.put("detail", getSportContent(newsURL));
				list.add(slide);
			}

			// News list section.
			for (Element li : doc.select("#newsListBox #newsList li")) {
				String newsURL = li.select("a").attr("href");
				if (newsURL.contains("video.html")) {
					continue;
				}
				// The spans hold author, time and comment count, in that order.
				Elements deta = li.select("a .newsBox-bd p span");
				Map<String, Object> item = new HashMap<String, Object>();
				item.put("title", li.select("a .newsBox-bd h3").text());
				item.put("fromStation", "体坛+");
				item.put("channel", "首页滚动");
				item.put("author", textAt(deta, 0));
				item.put("time", textAt(deta, 1));
				item.put("newsURL", newsURL);
				item.put("imgUrl", li.select("a .newsBox-hd img").attr("src"));
				item.put("commentsNumber", textAt(deta, 2));
				// Used by consumers to store the channels separately.
				item.put("keyword", val);
				item.put("commentsList", getSportContent(newsURL));
				list.add(item);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}

	/**
	 * Scrapes one article detail page together with its comments.
	 *
	 * <p>The page layout is identified by the id of the {@code h6} inside {@code #author_id}
	 * ({@code pubtime3}, {@code pubtime1}, {@code pubtime} or {@code pubtime4}); for any other
	 * value only the body and the comments are extracted. The returned map holds
	 * {@code title}, {@code author}, {@code article_info}, {@code tags} (known layouts only),
	 * {@code detail}, {@code commentsList} and, when there are comments, {@code commentNumber}.
	 *
	 * <p>A failure while loading the comments no longer discards the article; it is returned
	 * with an empty comment list instead.
	 *
	 * @param newsURL absolute URL of the article page
	 * @return a list with exactly one map, or an empty list when the page cannot be scraped
	 */
	public static List<Map<String,Object>> getSportContent(String newsURL){
		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
		// NOTE(review): the page text is prefixed with the CURRENT year, so it apparently
		// carries no year of its own; articles from earlier years get a wrong date. Confirm.
		int year = Calendar.getInstance().get(Calendar.YEAR);
		try {
			Map<String, Object> map = new HashMap<String, Object>();
			Document doc = Jsoup.connect(newsURL).timeout(3000).get();
			String pubtime = doc.select("#author_id h6").attr("id");
			StringBuilder detail = new StringBuilder();

			if ("pubtime3".equals(pubtime)) {
				putHeader(map,
						doc.select(TITLE_SELECTOR).text(),
						doc.select(AUTHOR_SELECTOR).text(),
						year + "-" + doc.select("#author_id #pubtime3 .pull-left").text(),
						doc.select("#author_id #pubtime3 .original").text());
			} else if ("pubtime1".equals(pubtime)) {
				// NOTE(review): this layout is detected as "pubtime1" but the time is read from
				// "#pubtime"; kept as found, confirm against a live page.
				putHeader(map,
						doc.select(TITLE_SELECTOR).text(),
						doc.select(AUTHOR_SELECTOR).text(),
						year + "-" + doc.select("#author_id #pubtime").text(),
						"");
			} else if ("pubtime".equals(pubtime)) {
				// Spans: [0] author, [1] time.
				Elements spans = doc.select("#author_id #pubtime span");
				putHeader(map,
						doc.select(TITLE_SELECTOR).text(),
						textAt(spans, 0),
						year + "-" + textAt(spans, 1),
						"");
			} else if ("pubtime4".equals(pubtime)) {
				// Spans: [0] author, [1] time, [2] tag.
				Elements spans = doc.select("#author_id #pubtime4 span");
				String tags = textAt(spans, 2);
				String tag = doc.select(".m-detail .m-detail-hd-ft .m-detail-type span").text();
				if (!tag.isEmpty()) {
					tags = tags + ";" + tag;
				}
				putHeader(map,
						doc.select(TITLE_SELECTOR).text(),
						textAt(spans, 0),
						year + "-" + textAt(spans, 1),
						tags);
				// The headline image, when present, becomes the first body fragment.
				String titleImg = doc.select(".m-detail .m-detail-hd img").attr("src");
				if (!titleImg.isEmpty()) {
					detail.append(titleImg).append(PART_SEPARATOR);
				}
			}

			// Body: one fragment per paragraph.
			for (Element p : doc.select(".m-detail-bd p")) {
				String imgSrc = p.select("img").attr("src");
				String strong = p.select("strong").text();
				if (imgSrc.isEmpty()) {
					detail.append(p.text());
				} else if (!strong.isEmpty()) {
					// Paragraph with an image and a bold caption: the caption is kept.
					detail.append(strong);
				} else {
					// The former condition here was always true, so this branch was
					// unreachable and image URLs never made it into the body.
					detail.append(imgSrc);
				}
				detail.append(PART_SEPARATOR);
			}
			map.put("detail", detail.toString());

			List<Map<String,Object>> commentList = fetchComments(newsURL);
			if (!commentList.isEmpty()) {
				map.put("commentNumber", commentList.size());
			}
			map.put("commentsList", commentList);
			list.add(map);
		} catch (Exception e) {
			// Deliberately broad: this method reports failures and returns an empty list
			// rather than propagating them to the list scrapers.
			e.printStackTrace();
		}
		return list;
	}

	/** Builds the result map for one entry of the 24-hour feed. */
	@SuppressWarnings("unchecked")
	private static Map<String, Object> toRealTimeEntry(Map<String,Object> comm, SimpleDateFormat formatter) {
		String title = (String) comm.get("title");
		String author = (String) comm.get("authorName");
		String time = formatMillis(formatter, comm.get("newstime"));

		Map<String, Object> entry = new HashMap<String, Object>();
		entry.put("title", title);
		entry.put("fromStation", "体坛+");
		entry.put("channel", "24H");
		entry.put("author", author);
		entry.put("time", time);
		// The feed provides no article URL, thumbnail or comment count.
		entry.put("newsURL", "");
		entry.put("imgUrl", "");
		entry.put("commentsNumber", "");
		// Used by consumers to store the channels separately.
		entry.put("keyword", "");

		// Article body: all image URLs followed by the text content.
		StringBuilder detail = new StringBuilder();
		Object imgRaw = comm.get("img");
		if (imgRaw != null) {
			List<Map<String,Object>> imgS = (List<Map<String, Object>>) JSON.parse(imgRaw.toString());
			if (imgS != null) {
				for (Map<String,Object> img : imgS) {
					detail.append((String) img.get("imgurl")).append(PART_SEPARATOR);
				}
			}
		}
		detail.append((String) comm.get("content")).append(PART_SEPARATOR);

		Map<String, Object> article = new HashMap<String, Object>();
		article.put("title", title);
		article.put("author", author);
		article.put("article_info", time);
		article.put("tags", "");
		article.put("detail", detail.toString());
		// Feed entries carry no comments.
		article.put("commentsList", new ArrayList<Map<String,Object>>());

		List<Map<String,Object>> commentsList = new ArrayList<Map<String,Object>>();
		commentsList.add(article);
		entry.put("commentsList", commentsList);
		return entry;
	}

	/** Stores the four header fields of an article in {@code map}. */
	private static void putHeader(Map<String, Object> map, String title, String author,
			String articleInfo, String tags) {
		map.put("title", title);
		map.put("author", author);
		map.put("article_info", articleInfo);
		map.put("tags", tags);
	}

	/**
	 * Loads the comments of an article through the JSONP comment endpoint; returns an empty
	 * list when the article id cannot be derived from the URL or the request fails.
	 */
	@SuppressWarnings("unchecked")
	private static List<Map<String,Object>> fetchComments(String newsURL) {
		List<Map<String,Object>> commentList = new ArrayList<Map<String,Object>>();
		// In ".../publish/app/data/yyyy/MM/dd/<id>/share1.html" the id is path segment 9.
		String[] segments = newsURL.split("/");
		if (segments.length <= 9) {
			return commentList;
		}
		String aid = segments[9];
		String getUrl = "http://app.ttplus.cn:1102/v2/commpent/news/www/"+aid+"/0";
		String key = "callback=callback_cmt&_="+System.currentTimeMillis();
		SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_PATTERN);
		try {
			// JsonpUntil.encode returns null when the server answers 404.
			Object raw = JsonpUntil.encode(getUrl, key);
			if (raw == null) {
				return commentList;
			}
			String commentDe = stripJsonpPadding(raw.toString());
			System.out.println(commentDe);

			Map<String,Object> parseData = (Map<String, Object>) JSON.parse(commentDe);
			if (parseData == null) {
				return commentList;
			}
			// "count" is re-parsed from its text form so numeric and quoted values both work.
			Object parsedCount = JSON.parse(String.valueOf(parseData.get("count")));
			int count = parsedCount instanceof Number ? ((Number) parsedCount).intValue() : 0;
			Object comments = parseData.get("comment");
			if (count <= 0 || comments == null) {
				return commentList;
			}
			List<Map<String,Object>> pData = (List<Map<String, Object>>) JSON.parse(comments.toString());
			if (pData == null) {
				return commentList;
			}
			for (Map<String,Object> comm : pData) {
				Map<String, Object> commentMap = new HashMap<String, Object>();
				commentMap.put("comment_user", (String) comm.get("username"));
				commentMap.put("comment_time", formatMillis(formatter, comm.get("time")));
				commentMap.put("comment_content", (String) comm.get("content"));
				commentList.add(commentMap);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return commentList;
	}

	/**
	 * Returns the text between the first {@code '('} and the last {@code ')'} of a JSONP
	 * response, or the input unchanged when it has no such wrapper.
	 */
	private static String stripJsonpPadding(String jsonp) {
		int open = jsonp.indexOf('(');
		int close = jsonp.lastIndexOf(')');
		if (open >= 0 && close > open) {
			return jsonp.substring(open + 1, close);
		}
		return jsonp;
	}

	/**
	 * Formats a timestamp that the JSON parser may deliver as {@code Integer} or {@code Long};
	 * returns an empty string for anything that is not a number.
	 */
	private static String formatMillis(SimpleDateFormat formatter, Object millis) {
		if (!(millis instanceof Number)) {
			return "";
		}
		return formatter.format(new Date(((Number) millis).longValue()));
	}

	/** Returns the text of the element at {@code index}, or an empty string when out of range. */
	private static String textAt(Elements elements, int index) {
		return index < elements.size() ? elements.get(index).text() : "";
	}
}

  

package test01;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;
import com.suning.web.util.JsonpUntil;
import com.suning.web.util.StringUtil;

/**
 * Scraper for the OnFire basketball site (bbonfire.com): channel list pages, article detail
 * pages and article comments.
 *
 * <p>Article bodies and tag lists are flattened into single strings; body fragments are each
 * followed by {@code "@/"} and tags by {@code ";"}.
 */
public class OnFiresTest {
	/** Output pattern for every formatted timestamp. */
	private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";
	/** Separator appended after every text/image fragment of an article body. */
	private static final String PART_SEPARATOR = "@/";

	/**
	 * Routes HTTP and HTTPS traffic through the hard-coded proxy and scrapes one sample article.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.setProperty("http.maxRedirects", "50");
		System.getProperties().setProperty("proxySet", "true");
		System.getProperties().setProperty("http.proxyHost", "10.19.110.55");
		System.getProperties().setProperty("http.proxyPort", "8080");
		System.getProperties().setProperty("https.proxyHost", "10.19.110.55");
		System.getProperties().setProperty("https.proxyPort", "8080");
		System.out.println("onfire");
		// Scrape the OnFire basketball app
		//Set aids = new HashSet();
		//getOnFireList(1,aids);
		getContent("http://www.bbonfire.com/news/detail?p=pc&aid=56374");
		/*Runnable runnable1 = new Runnable() {
			Set aids = new HashSet();
            public void run() {
            	getOnFireList(1,aids);
            	//System.out.println(aids);
            }
        };
        Runnable runnable2 = new Runnable() {
        	Set aids = new HashSet();
            public void run() {
            	getOnFireList(2,aids);
            	getOnFireList(3,aids);
            }
        };
        ScheduledExecutorService service = Executors
                .newSingleThreadScheduledExecutor();
        // second argument: delay before the first run; third argument: interval between runs
        service.scheduleAtFixedRate(runnable1, 0, 1800, TimeUnit.SECONDS);
        service.scheduleAtFixedRate(runnable2, 0, 86400, TimeUnit.SECONDS);*/
	}

	/**
	 * Scrapes one channel list of the site and the detail page of every new article in it.
	 *
	 * <p>Channel {@code 1} is "推荐" (recommended, intended to be polled every 30 minutes),
	 * {@code 2} is "专栏" (columns) and any other value is treated as "精译" (translations),
	 * both intended to be polled every 24 hours. Articles whose detail page yields no content
	 * are left out of the result. The result is also printed to standard output.
	 *
	 * @param i    channel number sent as the {@code c} query parameter
	 * @param aids ids of articles seen in earlier runs; matching articles are skipped and the
	 *             id of every newly encountered article is added to this set
	 * @return the newly scraped articles; empty when the list page cannot be fetched
	 */
	@SuppressWarnings({"rawtypes", "unchecked"})
	public static List<Map<String,Object>> getOnFireList(int i,Set aids){
		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
		String url = "http://www.bbonfire.com";
		String channel;
		if (i == 1) {
			channel = "推荐";
		} else if (i == 2) {
			channel = "专栏";
		} else {
			channel = "精译";
		}
		try {
			Document doc = Jsoup
					.connect(url+"/news/index?c="+i+"&p=pc")
					.timeout(3000)
					.get();

			for (Element item : doc.select(".news-list .news-item")) {
				String newsURL = item.select(".news-title a").attr("href");
				// Id of the article in the source site's database, taken from the link.
				String aid = StringUtil.getNumbers(newsURL);
				if (aids.contains(aid)) {
					continue;
				}
				aids.add(aid);

				Map<String, Object> map = new HashMap<String, Object>();
				map.put("title", item.select(".news-title a").text());
				map.put("fromStation", "OnFire");
				map.put("channel", channel);
				// The list page shows no author.
				map.put("author", "");
				map.put("time_info", item.select(".news-info .time-info").text());
				map.put("newsURL", url+newsURL);
				map.put("aid", aid);
				// Thumbnail shown next to the title.
				map.put("imgUrl", item.select(".news-thumb a img").attr("src"));
				map.put("commentsNumber",
						item.select(".news-rel .news-comment").text().replace("评论", "").replace(" ", ""));
				// Used by consumers to store the channels separately.
				map.put("keyword", "");

				// An empty detail result means the page is not an image/text article.
				List<Map<String,Object>> content = getContent(url+newsURL);
				if (!content.isEmpty()) {
					map.put("content", content);
					list.add(map);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		System.out.println(list);
		return list;
	}

	/**
	 * Scrapes one article detail page together with its comments.
	 *
	 * <p>Pages whose {@code embed} element has {@code wmode="transparent"} are treated as
	 * non-article pages and produce an empty result. The returned map holds {@code title},
	 * {@code article_info}, {@code author}, {@code detail}, {@code tags} and, when comments
	 * exist, {@code commentsList} and {@code commentNumber}. A failure while loading the
	 * comments no longer discards the article. The result is also printed to standard output.
	 *
	 * @param contentUrl absolute URL of the article page
	 * @return a list with exactly one map, or an empty list when the page is not an article
	 *         or cannot be scraped
	 */
	private static List<Map<String, Object>> getContent(String contentUrl) {
		List<Map<String, Object>> list = new ArrayList<Map<String,Object>>();
		SimpleDateFormat formatter = new SimpleDateFormat(TIMESTAMP_PATTERN);

		try {
			Document doc = Jsoup
					.connect(contentUrl)
					.timeout(3000)
					.get();
			if (!"transparent".equals(doc.select("embed").attr("wmode"))) {
				Map<String, Object> map = new HashMap<String, Object>();
				map.put("title", doc.select(".article h1").text());

				// The page shows the time as e.g. "2017年07月20日 10:21"; it is re-formatted.
				SimpleDateFormat pageFormat = new SimpleDateFormat("yyyy年MM月dd日 HH:mm");
				map.put("article_info",
						formatter.format(pageFormat.parse(doc.select(".article-info .time").text())));
				map.put("author", doc.select(".article-info .author").text());

				// Body: per paragraph either the lazy-loaded image URL or the text.
				StringBuilder detail = new StringBuilder();
				for (Element p : doc.select(".article-content p")) {
					String imgSrc = p.select("img").attr("data-src");
					detail.append(imgSrc.isEmpty() ? p.text() : imgSrc).append(PART_SEPARATOR);
				}
				map.put("detail", detail.toString());

				StringBuilder tags = new StringBuilder();
				for (Element span : doc.select(".article-tag span")) {
					tags.append(span.text()).append(";");
				}
				map.put("tags", tags.toString());

				// Id of the current article as exposed by the comment widget.
				String aid = doc.select("#commentHTML").attr("data-articleid");
				List<Map<String,Object>> commentList = fetchComments(aid, formatter);
				if (!commentList.isEmpty()) {
					map.put("commentsList", commentList);
					map.put("commentNumber", commentList.size());
				}

				list.add(map);
			}
		} catch (Exception e) {
			// Deliberately broad (network, date parsing, unexpected markup): failures are
			// reported and answered with an empty list.
			e.printStackTrace();
		}
		System.out.println(list);
		return list;
	}

	/**
	 * Loads the comments of an article from the comment API; returns the comments converted
	 * so far (possibly none) when the request or the conversion fails.
	 */
	@SuppressWarnings("unchecked")
	private static List<Map<String, Object>> fetchComments(String aid, SimpleDateFormat formatter) {
		List<Map<String,Object>> commentList = new ArrayList<Map<String,Object>>();
		String getUrl = "http://www.bbonfire.com/api/list";
		String key = "p=comment&isjs=1&articleid="+aid+"&len=15&hotlen=5";
		try {
			// JsonpUntil.encode returns null when the server answers 404.
			Object raw = JsonpUntil.encode(getUrl, key);
			if (raw == null) {
				return commentList;
			}
			Map<String,Object> parseData = (Map<String, Object>) JSON.parse(raw.toString());
			Object data = parseData == null ? null : parseData.get("data");
			if (data == null) {
				return commentList;
			}
			List<Map<String,Object>> pData = (List<Map<String, Object>>) JSON.parse(data.toString());
			if (pData == null) {
				return commentList;
			}
			// "ctime" arrives in the java.util.Date#toString() layout; parsed once per call.
			SimpleDateFormat apiFormat = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.US);
			for (Map<String,Object> comm : pData) {
				Map<String, Object> commentMap = new HashMap<String, Object>();
				Map<String,Object> user = (Map<String, Object>) comm.get("userInfo");
				Object screenName = user == null ? null : user.get("screen_name");
				commentMap.put("comment_user", screenName == null ? "" : screenName.toString());
				commentMap.put("comment_time", formatter.format(apiFormat.parse((String) comm.get("ctime"))));
				commentMap.put("comment_content", (String) comm.get("content"));
				commentList.add(commentMap);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return commentList;
	}
}

  3.ajax请求

package com.suning.web.util;

import java.io.StringWriter;

import org.apache.commons.codec.Charsets;
import org.apache.commons.io.output.WriterOutputStream;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;

/**
 * Small HTTP helper that performs a GET request through the hard-coded proxy and returns the
 * response body as UTF-8 text. Used by the scrapers to call JSON/JSONP endpoints.
 */
public class JsonpUntil {

	private static final String PROXY_HOST = "10.19.110.55";
	private static final int PROXY_PORT = 8080;
	private static final int HTTP_NOT_FOUND = 404;

	/**
	 * Fetches {@code url}, optionally extended by the query string {@code key}.
	 *
	 * @param url request URL
	 * @param key query string without a leading separator; ignored when null or empty. It is
	 *            appended with {@code '?'}, or with {@code '&'} when {@code url} already
	 *            contains a query string
	 * @return the response body decoded as UTF-8 (empty when the response has no body), or
	 *         {@code null} when the server answers 404 - callers must check for null
	 * @throws Exception when the request or reading the response fails
	 */
	public static StringWriter encode(String url,String key) throws  Exception{
		HttpClient httpClient = new DefaultHttpClient();
		try {
			httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY,
					new HttpHost(PROXY_HOST, PROXY_PORT));
			if (key != null && !key.isEmpty()) {
				url = url + (url.indexOf('?') >= 0 ? "&" : "?") + key;
			}
			HttpGet httpGet = new HttpGet(url);
			// NOTE(review): hard-coded session cookie including auth tokens; secrets should
			// not live in source control. Move to configuration and rotate these values.
			httpGet.addHeader(new BasicHeader("Cookie", "_snma=1%7C149567342565754882%7C1495673425657%7C1495673446005%7C1495714227730%7C3%7C3; idsLoginUserIdLastTime=16030136; authId=si9343022161FCD46A3745D6F3A1BCB180; secureToken=5E769A7ADD32F1977AC2104266C010F3"));
			HttpResponse response = httpClient.execute(httpGet);

			// Compare the numeric status code; matching the whole status line breaks as soon
			// as the protocol version or reason phrase differs.
			if (response.getStatusLine().getStatusCode() == HTTP_NOT_FOUND) {
				System.out.println(url);
				System.out.println("此条信息异常!");
				return null;
			}

			StringWriter sw = new StringWriter();
			HttpEntity entity = response.getEntity();
			// getEntity() is null for responses without a body.
			if (entity != null) {
				try (WriterOutputStream out = new WriterOutputStream(sw, Charsets.UTF_8)) {
					entity.writeTo(out);
				}
			}
			return sw;
		} finally {
			// Releases the pooled connection; a new client is created on every call, so
			// without this each request leaked its connection.
			httpClient.getConnectionManager().shutdown();
		}
	}
}