jeecms 强大的采集功能优化 转载 https://blog.csdn.net/jeff06143132/article/details/7099003

========================================================= 
 
由于附件无法上传,这里直接贴出 AcquisitionSvcImpl.java 类的完整代码:
//---------------------------------------------------------------------------- 
package com.jeecms.cms.service; 
 
import java.io.IOException; 
import java.net.URI; 
import java.util.ArrayList; 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
import org.apache.commons.lang.StringUtils; 
import org.apache.http.HttpEntity; 
import org.apache.http.HttpResponse; 
import org.apache.http.StatusLine; 
import org.apache.http.client.ClientProtocolException; 
import org.apache.http.client.HttpClient; 
import org.apache.http.client.HttpResponseException; 
import org.apache.http.client.ResponseHandler; 
import org.apache.http.client.methods.HttpGet; 
import org.apache.http.impl.client.DefaultHttpClient; 
import org.apache.http.util.EntityUtils; 
import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; 
import org.springframework.beans.factory.annotation.Autowired; 
import org.springframework.stereotype.Service; 
import com.jeecms.cms.entity.assist.CmsAcquisition; 
import com.jeecms.cms.entity.main.Content; 
import com.jeecms.cms.manager.assist.CmsAcquisitionMng; 
 
/**
 * Content acquisition (page scraping) service.
 *
 * <p>{@link #start(Integer)} looks up an acquisition rule and, if it is not already running,
 * launches a background thread that downloads each configured list page, extracts article links
 * with regular expressions, downloads every article and hands the extracted title and body to
 * {@link CmsAcquisitionMng#saveContent}.
 */
@Service
public class AcquisitionSvcImpl implements AcquisitionSvc {
    private static final Logger log = LoggerFactory.getLogger(AcquisitionSvcImpl.class);

    private CmsAcquisitionMng cmsAcquisitionMng;

    /**
     * Starts the acquisition job with the given id on a new thread.
     *
     * @param id identifier of the acquisition rule
     * @return {@code false} if no rule with that id exists or its status is already
     *         {@link CmsAcquisition#START}; {@code true} once the worker thread has been started
     */
    public boolean start(Integer id) {
        CmsAcquisition acqu = cmsAcquisitionMng.findById(id);
        if (acqu == null || acqu.getStatus() == CmsAcquisition.START) {
            return false;
        }
        Thread thread = new AcquisitionThread(acqu);
        thread.start();
        return true;
    }

    @Autowired
    public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
        this.cmsAcquisitionMng = cmsAcquisitionMng;
    }

    /**
     * Worker thread that performs one acquisition run. It is an inner (non-static) class because
     * it uses the enclosing service's {@code cmsAcquisitionMng}.
     */
    private class AcquisitionThread extends Thread {
        private CmsAcquisition acqu;

        /**
         * @param acqu rule to run; the thread is named {@code <rule class name>#<rule id>}
         */
        public AcquisitionThread(CmsAcquisition acqu) {
            super(acqu.getClass().getName() + "#" + acqu.getId());
            this.acqu = acqu;
        }

        /**
         * Walks the list pages from the last plan towards the first and, inside each page, the
         * extracted links from the last towards the first, saving every article. Progress is
         * reported to {@code isNeedBreak} before each article; when that returns {@code true}
         * the run stops without calling {@code end}.
         */
        @Override
        public void run() {
            if (acqu == null) {
                return;
            }
            Integer requestedId = acqu.getId();
            acqu = cmsAcquisitionMng.start(requestedId);
            if (acqu == null) {
                // Defensive: the manager's contract is not visible here, and the result is
                // dereferenced immediately below.
                log.warn("Acquisition#{} could not be started", requestedId);
                return;
            }
            Integer acquId = acqu.getId();
            String[] plans = acqu.getAllPlans();
            HttpClient client = new DefaultHttpClient();
            try {
                CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());
                int currNum = acqu.getCurrNum();
                int currItem = acqu.getCurrItem();

                // The stored progress counters appear to be 1-based (currItem is reset to 1 after
                // each page). Clamp the start index so that a stored value of 0 or less starts at
                // the last entry instead of indexing one past the end.
                for (int i = Math.min(plans.length - currNum, plans.length - 1); i >= 0; i--) {
                    String url = plans[i];
                    List<String> contentList = getContentList(client, handler, url,
                            acqu.getLinksetStart(), acqu.getLinksetEnd(),
                            acqu.getLinkStart(), acqu.getLinkEnd());

                    int firstItem = Math.min(contentList.size() - currItem, contentList.size() - 1);
                    for (int j = firstItem; j >= 0; j--) {
                        if (cmsAcquisitionMng.isNeedBreak(acquId, plans.length - i,
                                contentList.size() - j, contentList.size())) {
                            log.info("Acquisition#{} breaked", acquId);
                            return;
                        }
                        if (acqu.getPauseTime() > 0) {
                            try {
                                Thread.sleep(acqu.getPauseTime());
                            } catch (InterruptedException e) {
                                // Treat interruption as a request to stop: restore the flag for
                                // whoever owns this thread and abandon the run.
                                Thread.currentThread().interrupt();
                                log.warn("Acquisition#" + acquId
                                        + " interrupted while pausing; stopping", e);
                                return;
                            }
                        }
                        String link = contentList.get(j);
                        saveContent(client, handler, acquId, link, acqu.getTitleStart(),
                                acqu.getTitleEnd(), acqu.getContentStart(), acqu.getContentEnd());
                    }
                    // Only the first processed page resumes mid-list; later pages start at item 1.
                    currItem = 1;
                }
                cmsAcquisitionMng.end(acquId);
                log.info("Acquisition#{} complete", acquId);
            } finally {
                // Release pooled connections on every exit path, including unexpected exceptions.
                client.getConnectionManager().shutdown();
            }
        }

        /**
         * Downloads a list page and extracts the article links from it.
         *
         * <p>{@code linksetStart} is compiled as a regular expression; if it matches, only the
         * matched region is searched for links, otherwise the whole page is searched.
         *
         * @param client       HTTP client used for the request
         * @param handler      response handler that decodes the body to a string
         * @param url          address of the list page
         * @param linksetStart regular expression selecting the region that contains the links
         * @param linksetEnd   not used by this implementation
         * @param linkStart    regular expression whose first capturing group is a link
         * @param linkEnd      not used by this implementation
         * @return the extracted links in page order; empty (never {@code null}) if the page
         *         could not be fetched or parsed
         */
        private List<String> getContentList(HttpClient client,
                CharsetHandler handler, String url, String linksetStart,
                String linksetEnd, String linkStart, String linkEnd) {
            List<String> list = new ArrayList<String>();
            try {
                HttpGet httpget = new HttpGet(new URI(url));
                String html = client.execute(httpget, handler);
                if (html == null) {
                    // The handler returns null when the response carries no entity.
                    log.warn("Empty response for list page {}", url);
                    return list;
                }

                Matcher m = Pattern.compile(linksetStart.trim()).matcher(html);
                if (m.find()) {
                    html = m.group();
                }
                list = getUrlsList(html, linkStart);
            } catch (Exception e) {
                // Broad catch on purpose: a bad URL, network error or invalid user-supplied
                // regex on one page must not abort the whole run.
                log.warn("Failed to collect content links from " + url, e);
            }
            return list;
        }

        /**
         * Collects every link matched by {@code linkStart} in {@code html}.
         *
         * @param html      text to search
         * @param linkStart regular expression; its first capturing group is taken as the link,
         *                  so it must contain at least one group
         * @return non-empty captured links in order of appearance
         */
        private List<String> getUrlsList(String html, String linkStart) {
            List<String> list = new ArrayList<String>();
            Matcher m = Pattern.compile(linkStart).matcher(html);
            while (m.find()) {
                String link = m.group(1);
                if (null != link && !"".equals(link)) {
                    list.add(link);
                }
            }
            return list;
        }

        /**
         * Downloads one article page, extracts its title and body, and saves them.
         *
         * <p>The title is the first capturing group of the first {@code titleStart} match; the
         * body is the entire first {@code contentStart} match. Either is the empty string when
         * its pattern does not match.
         *
         * @param client       HTTP client used for the request
         * @param handler      response handler that decodes the body to a string
         * @param acquId       id of the acquisition rule the content belongs to
         * @param url          address of the article page
         * @param titleStart   regular expression with one capturing group for the title
         * @param titleEnd     not used by this implementation
         * @param contentStart regular expression matching the article body
         * @param contentEnd   not used by this implementation
         * @return the value returned by {@code cmsAcquisitionMng.saveContent}, or {@code null}
         *         if the page could not be fetched, parsed or saved
         */
        private Content saveContent(HttpClient client, CharsetHandler handler,
                Integer acquId, String url, String titleStart, String titleEnd,
                String contentStart, String contentEnd) {
            try {
                HttpGet httpget = new HttpGet(new URI(url));
                String html = client.execute(httpget, handler);
                if (html == null) {
                    // The handler returns null when the response carries no entity.
                    log.warn("Empty response for content page {}", url);
                    return null;
                }

                String title = "";
                Matcher mt = Pattern.compile(titleStart.trim()).matcher(html);
                if (mt.find()) {
                    title = mt.group(1);
                }

                String txt = "";
                mt = Pattern.compile(contentStart.trim()).matcher(html);
                if (mt.find()) {
                    txt = mt.group();
                }

                return cmsAcquisitionMng.saveContent(title, txt, acquId);
            } catch (Exception e) {
                // Broad catch on purpose: one failing article must not abort the whole run.
                log.warn("Failed to acquire content from " + url, e);
                return null;
            }
        }
    }

    /**
     * Response handler that returns the response body as a string, decoded with a configured
     * charset when one is given.
     */
    private static class CharsetHandler implements ResponseHandler<String> {
        private final String charset;

        /**
         * @param charset charset name used to decode bodies; when blank, the
         *                {@code EntityUtils.toString(entity)} overload without a charset is used
         */
        public CharsetHandler(String charset) {
            this.charset = charset;
        }

        /**
         * @return the decoded body, or {@code null} if the response has no entity
         * @throws HttpResponseException if the status code is 300 or higher
         * @throws IOException           if reading the body fails
         */
        public String handleResponse(HttpResponse response)
                throws ClientProtocolException, IOException {
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() >= 300) {
                throw new HttpResponseException(statusLine.getStatusCode(),
                        statusLine.getReasonPhrase());
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            if (!StringUtils.isBlank(charset)) {
                return EntityUtils.toString(entity, charset);
            }
            return EntityUtils.toString(entity);
        }
    }
}
//-------------------------------------------------------------------------------- 

1:将AcquisitionSvcImpl.java 替换原工程项目com.jeecms.cms.service包下的对应文件。 

2:编译工程即可 

3:登录后台配置相关采集规则,参数示例如下:

==================================== 
*采集名称: 韩寒博客 

*页面编码: UTF-8 

  动态地址: http://blog.sina.com.cn/s/articlelist_1191258123_0_[page].html 

                        页码 从   1  到:  2 

内容地址集:   <!-- 列表 START -->.*?<!-- 列表END --> 

内容地址: target="_blank" href="(.*?)">(.*?)</a></span> 

标题:         <title>(.*?)_韩寒_新浪博客</title> 

内容:         <!-- 正文开始 -->(.*?)<!-- 正文结束 -->

posted @ 2019-07-16 17:07  DarJeely  阅读(355)  评论(0)  编辑  收藏  举报