java爬虫笔记:使用WebCollector增量采集www.baiduyunsousou.com
WebCollector可以配置断点爬取,历史数据根据Key去重,也就是url
最近在采集百度云网盘,记录一下
/**
* @author Liu
* @create 2022-08-02 11:48
*/
@Component
@Slf4j
public class DeepCrawler extends BaseCrawler {
private CrawlerConfig crawlerConfig;
@Override
public void execute() {
List<CrawlerConfig> crawlerConfigs = new ArrayList<>();
if (this.crawlerConfig != null) {
crawlerConfigs.add(this.crawlerConfig);
} else {
crawlerConfigs = this.crawlerConfigService.getDeepCrawlerConfig();
}
super.initCrawlerConfig(crawlerConfigs);
//多站点多线程爬取
for (CrawlerConfig config : crawlerConfigs) {
try {
if (SimpleCrawlerStoreMap.deepCrawlerThreadMap.get(config.getId()) == null) {
simpleCrawlerPool.execute(() -> {
DeepCrawlerThread deepCrawlerThread = new DeepCrawlerThread(config);
SimpleCrawlerStoreMap.deepCrawlerThreadMap.put(config.getId(), deepCrawlerThread);
deepCrawlerThread.setNextFilter(new HashSetNextFilter());
try {
deepCrawlerThread.start(config.getDeep());
} catch (Exception e) {
e.printStackTrace();
log.error(config.getSiteName() + "=>爬取任务异常");
log.error(e.getMessage(), e);
}
});
} else {
log.info(config.getSiteName() + "=>爬取任务进行中……");
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
public CrawlerConfig getCrawlerConfig() {
return crawlerConfig;
}
public void setCrawlerConfig(CrawlerConfig crawlerConfig) {
this.crawlerConfig = crawlerConfig;
}
}

浙公网安备 33010602011771号