package com.microbekb.crawler.cnki;
import com.microbekb.crawler.cnki.jpa.CnkiSpacePaper;
import com.microbekb.crawler.cnki.jpa.CnkiSpacePaperRepository;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
/**
* Created by DELL on 2017/2/17.
*/
@Component
public class CnkiConferencePaperCrawler implements PageProcessor {
// Crawl settings handed to WebMagic through getSite(): 3 retries and a sleep time of 500
// (presumably milliseconds between requests - confirm against the WebMagic Site API).
private Site site = Site.me().setRetryTimes(3).setSleepTime(500);
// Seed URLs for the spider. Static, so shared by every instance of this class;
// addUrlTest() adds to it and start() feeds its contents to the spider.
private static HashSet<String> urls=new HashSet<>();
private static final org.slf4j.Logger log = LoggerFactory.getLogger(CnkiConferencePaperCrawler.class);
// Database access, injected by Spring. Not referenced in the visible part of this class.
@Autowired
private CnkiSpacePaperRepository cnkiSpacePaperRepository;
// Pipeline that processes the extracted data, injected by Spring.
// NOTE(review): the field is named "Journal" but holds the conference-paper pipeline.
@Autowired
private CnkiConferencePaperPipeline cnkiJournalPaperPipeline;
/**
 * Parses one CNKI conference-paper detail page and publishes the extracted values
 * as result fields for the pipeline.
 *
 * <p>Fields written with {@code page.putField}: {@code url}, {@code title_cn},
 * {@code title_en}, {@code organization}, {@code abstract_cn}, {@code abstract_en},
 * {@code proceedings_name}, {@code conference_name}, {@code date}, {@code place},
 * {@code code}, {@code organizor}, {@code foundation}, {@code authors_cn} and
 * {@code authors_en}.
 *
 * <p>{@code authors_cn} and {@code organization} are the link texts found in the
 * matching summary paragraph, each followed by {@code ';'}. {@code abstract_en} and
 * {@code authors_en} are always published as empty strings because this method
 * contains no extraction for them. Values taken from a single XPath result are passed
 * on exactly as the selector returns them (presumably {@code null} when nothing
 * matches - confirm against the WebMagic Selectable contract).
 *
 * @param page the downloaded detail page
 */
@Override
public void process(Page page) {
    String url = page.getUrl().toString();
    // Hold the document once instead of calling page.getHtml() for every query.
    Selectable html = page.getHtml();

    String titleCn = html.xpath("//span[@id='chTitle']/text()").toString();
    String titleEn = html.xpath("//span[@id='enTitle']/text()").toString();

    StringBuilder authorsCn = new StringBuilder();
    StringBuilder organization = new StringBuilder();
    String abstractCn = "";
    for (Selectable paragraph : html.xpath("//div[@class='summary']/p").nodes()) {
        // Serialise the paragraph once; the label checks below all look at the same text.
        String paragraphText = paragraph.toString();
        if (paragraphText.contains("作者")) {
            for (String name : paragraph.xpath("//a[@class='KnowledgeNetLink']/text()").all()) {
                authorsCn.append(name).append(';');
            }
        }
        if (paragraphText.contains("机构")) {
            for (String name : paragraph.xpath("//a[@class='KnowledgeNetLink']/text()").all()) {
                organization.append(name).append(';');
            }
        }
        if (paragraphText.contains("摘要")) {
            abstractCn = paragraph.xpath("//span/text()").toString();
        }
    }

    String proceedingsName = html.xpath("//div[@class='summary']/ul/li/a/text()").toString();

    String conferenceName = "";
    String date = "";
    String place = "";
    String code = "";
    String organizor = "";
    // Each list item carries a bracketed label followed by its value. The checks are
    // deliberately independent (not else-if), so an item containing several labels
    // feeds several fields. The labels hold no regex metacharacters, so a literal
    // replace() is equivalent to the former replaceAll().
    for (String item : html.xpath("//div[@class='summary']/ul/li/text()").all()) {
        if (item.contains("会议名称")) {
            conferenceName = item.replace("【会议名称】", "").trim();
        }
        if (item.contains("会议时间")) {
            date = item.replace("【会议时间】", "").trim();
        }
        if (item.contains("会议地点")) {
            place = item.replace("【会议地点】", "").trim();
        }
        if (item.contains("分类号")) {
            code = item.replace("【分类号】", "").trim();
        }
        if (item.contains("主办单位")) {
            organizor = item.replace("【主办单位】", "").trim();
        }
    }

    // NOTE(review): "foundation" is read from the element with class 'keywords' - confirm
    // that this is the intended node.
    String foundation = html.xpath("//div[@class='summary']/div[@class='keywords']/text()").toString();

    page.putField("url", url);
    page.putField("title_cn", titleCn);
    page.putField("title_en", titleEn);
    page.putField("organization", organization.toString());
    page.putField("abstract_cn", abstractCn);
    page.putField("abstract_en", "");
    page.putField("proceedings_name", proceedingsName);
    page.putField("conference_name", conferenceName);
    page.putField("date", date);
    page.putField("place", place);
    page.putField("code", code);
    page.putField("organizor", organizor);
    page.putField("foundation", foundation);
    page.putField("authors_cn", authorsCn.toString());
    page.putField("authors_en", "");
}
/**
 * Supplies the crawl settings held in the {@code site} field.
 *
 * @return the {@link Site} configured for this crawler
 */
@Override
public Site getSite() {
    return this.site;
}
/**
 * Adds a single sample detail-page URL to the shared seed set. Intended for testing;
 * change the address as needed.
 */
private void addUrlTest() {
    final String sampleUrl = "http://www.cnki.net/KCMS/detail/detail.aspx?dbcode=CPFD&filename=KAXH201012001028";
    urls.add(sampleUrl);
}
/**
 * Entry point of the crawler: seeds the URL set, creates a single-threaded spider
 * over this processor, attaches the injected pipeline and runs it.
 *
 * <p>{@code run()} is called directly, so this method returns only when the spider
 * has finished. A failure during the run is logged rather than propagated.
 */
public void start(){
    // addTest() is not defined in the visible part of this class; presumably it fills
    // `urls` with the pages to crawl. To crawl only the built-in sample page, call
    // addUrlTest() here instead.
    addTest();
    Spider spider = Spider.create(this);
    for (String url : urls) {
        spider = spider.addUrl(url);
    }
    try {
        spider.thread(1).addPipeline(cnkiJournalPaperPipeline).run();
    } catch (Exception e) {
        // Previously swallowed silently, which hid every crawl failure. Still not
        // rethrown, so callers keep the same best-effort behaviour.
        log.error("CNKI conference paper crawl failed", e);
    }
}
package com.microbekb.crawler.cnki;
import com.microbekb.crawler.cnki.jpa.CnkiConferencePaper;
import com.microbekb.crawler.cnki.jpa.CnkiConferencePaperRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Created by DELL on 2017/2/17.
 *
 * <p>WebMagic pipeline that copies the fields of one crawled conference paper into a
 * {@link CnkiConferencePaper} and saves it through the injected repository.
 */
@Component
@ComponentScan
public class CnkiConferencePaperPipeline implements Pipeline {
    /**
     * Number of records saved so far. Plain static counter without synchronisation.
     */
    public static int count=0;

    /** Repository used to store the records; injected by Spring. */
    @Autowired
    CnkiConferencePaperRepository cnkiConferencePaperRepository;

    /**
     * Builds an entity from the result fields, derives its ID from the {@code url}
     * field, saves it and prints the running count.
     *
     * @param resultItems fields published for one page; {@code url} must contain a
     *                    {@code filename} part
     * @param task        the running task; not used
     * @throws IllegalArgumentException if {@code url} is missing or has no
     *                                  {@code filename} part
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.get("url");
        String titleCn = resultItems.get("title_cn");
        String titleEn = resultItems.get("title_en");
        String organization = resultItems.get("organization");
        String abstractCn = resultItems.get("abstract_cn");
        String abstractEn = resultItems.get("abstract_en");
        String proceedingsName = resultItems.get("proceedings_name");
        String conferenceName = resultItems.get("conference_name");
        String date = resultItems.get("date");
        String place = resultItems.get("place");
        String code = resultItems.get("code");
        String organizor = resultItems.get("organizor");
        String foundation = resultItems.get("foundation");
        String authorsCn = resultItems.get("authors_cn");
        String authorsEn = resultItems.get("authors_en");

        CnkiConferencePaper paper = new CnkiConferencePaper();
        paper.setCnkiId(extractCnkiId(url));
        paper.setUrl(url);
        paper.setTitleCn(titleCn);
        paper.setTitleEn(titleEn);
        paper.setOrganization(organization);
        paper.setAbstractCn(abstractCn);
        paper.setAbstractEn(abstractEn);
        paper.setProceedingsName(proceedingsName);
        paper.setConferenceName(conferenceName);
        paper.setDate(date);
        paper.setPlace(place);
        paper.setCode(code);
        // The "organizor" result field is stored in the entity's "association" property.
        paper.setAssociation(organizor);
        paper.setAuthorsCn(authorsCn);
        paper.setAuthorsEn(authorsEn);
        // Previously read from the result items but never copied, so it was always lost.
        paper.setFoundation(foundation);

        cnkiConferencePaperRepository.save(paper);
        // Post-increment: the first saved record prints 0.
        System.out.println("已完成"+ (count++)+"条" );
    }

    /**
     * Derives the record ID from a detail-page URL: the text after the first
     * {@code filename} occurrence with every {@code '='} removed.
     *
     * <p>NOTE(review): if other query parameters follow {@code filename}, they become
     * part of the ID. Left unchanged so that IDs stay stable.
     */
    private static String extractCnkiId(String url) {
        if (url == null) {
            throw new IllegalArgumentException("Result items contain no 'url' field");
        }
        String[] parts = url.split("filename");
        if (parts.length < 2) {
            throw new IllegalArgumentException("No 'filename' value in url: " + url);
        }
        return parts[1].replace("=", "");
    }
}
package com.microbekb.crawler.cnki.jpa;
import javax.persistence.Entity;
import javax.persistence.Id;
/**
 * Created by DELL on 2017/2/18.
 *
 * <p>Entity class mapped to the database table of CNKI conference papers. Every
 * property is a plain string with a getter and a setter; {@code cnkiId} is the
 * primary key.
 */
@Entity
public class CnkiConferencePaper {

    /** Primary key of the record. */
    @Id
    private String cnkiId;
    private String url;
    private String titleCn;
    private String titleEn;
    private String organization;
    private String abstractCn;
    private String abstractEn;
    private String proceedingsName;
    private String conferenceName;
    private String date;
    private String place;
    private String code;
    private String association;
    private String authorsCn;
    private String authorsEn;
    private String foundation;

    /** @return the primary key */
    public String getCnkiId() {
        return cnkiId;
    }

    /** @param cnkiId the primary key */
    public void setCnkiId(String cnkiId) {
        this.cnkiId = cnkiId;
    }

    /** @return the URL of the paper */
    public String getUrl() {
        return url;
    }

    /** @param url the URL of the paper */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the Chinese title */
    public String getTitleCn() {
        return titleCn;
    }

    /** @param titleCn the Chinese title */
    public void setTitleCn(String titleCn) {
        this.titleCn = titleCn;
    }

    /** @return the English title */
    public String getTitleEn() {
        return titleEn;
    }

    /** @param titleEn the English title */
    public void setTitleEn(String titleEn) {
        this.titleEn = titleEn;
    }

    /** @return the organization text */
    public String getOrganization() {
        return organization;
    }

    /** @param organization the organization text */
    public void setOrganization(String organization) {
        this.organization = organization;
    }

    /** @return the Chinese abstract */
    public String getAbstractCn() {
        return abstractCn;
    }

    /** @param abstractCn the Chinese abstract */
    public void setAbstractCn(String abstractCn) {
        this.abstractCn = abstractCn;
    }

    /** @return the English abstract */
    public String getAbstractEn() {
        return abstractEn;
    }

    /** @param abstractEn the English abstract */
    public void setAbstractEn(String abstractEn) {
        this.abstractEn = abstractEn;
    }

    /** @return the name of the proceedings */
    public String getProceedingsName() {
        return proceedingsName;
    }

    /** @param proceedingsName the name of the proceedings */
    public void setProceedingsName(String proceedingsName) {
        this.proceedingsName = proceedingsName;
    }

    /** @return the name of the conference */
    public String getConferenceName() {
        return conferenceName;
    }

    /** @param conferenceName the name of the conference */
    public void setConferenceName(String conferenceName) {
        this.conferenceName = conferenceName;
    }

    /** @return the date, kept as text */
    public String getDate() {
        return date;
    }

    /** @param date the date, kept as text */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the place */
    public String getPlace() {
        return place;
    }

    /** @param place the place */
    public void setPlace(String place) {
        this.place = place;
    }

    /** @return the code */
    public String getCode() {
        return code;
    }

    /** @param code the code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the association text */
    public String getAssociation() {
        return association;
    }

    /** @param association the association text */
    public void setAssociation(String association) {
        this.association = association;
    }

    /** @return the Chinese author names */
    public String getAuthorsCn() {
        return authorsCn;
    }

    /** @param authorsCn the Chinese author names */
    public void setAuthorsCn(String authorsCn) {
        this.authorsCn = authorsCn;
    }

    /** @return the English author names */
    public String getAuthorsEn() {
        return authorsEn;
    }

    /** @param authorsEn the English author names */
    public void setAuthorsEn(String authorsEn) {
        this.authorsEn = authorsEn;
    }

    /** @return the foundation text */
    public String getFoundation() {
        return foundation;
    }

    /** @param foundation the foundation text */
    public void setFoundation(String foundation) {
        this.foundation = foundation;
    }
}