爬取数据,es搜索渲染实战
实战
爬虫
爬取数据:获取请求返回的页面信息,筛选出我们想要的数据
Java 中使用 jsoup 包来解析网页。
jsoup 只能爬取网页内容;爬取电影、音乐等资源需要使用 tika 包。
<!-- Maven dependency for jsoup, the HTML parsing library used by the ParseJdUtils crawler -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
爬数据工具类
/**
 * Scrapes product entries from the JD.com search results page using jsoup.
 */
@Component
public class ParseJdUtils {

    /** Timeout, in milliseconds, allowed for fetching the search page. */
    private static final int FETCH_TIMEOUT_MILLIS = 30000;

    /**
     * Fetches the JD search page for the given keyword and builds one {@link Good}
     * per product entry found in the {@code J_goodsList} container.
     *
     * @param keywords search keyword; must not be null. It is URL-encoded as UTF-8
     *                 before being appended to the query string.
     * @return the parsed products; empty if the page has no {@code J_goodsList} element
     * @throws IllegalArgumentException if {@code keywords} is null
     * @throws Exception if the page cannot be fetched or parsed
     */
    public List<Good> parseGoods(String keywords) throws Exception {
        if (keywords == null) {
            throw new IllegalArgumentException("keywords must not be null");
        }
        // Encode the keyword so spaces, '&' and non-ASCII characters (e.g. Chinese)
        // do not corrupt the query string. Fully qualified to avoid a new import.
        String url = "https://search.jd.com/Search?enc=utf-8&keyword="
                + java.net.URLEncoder.encode(keywords, "UTF-8");
        Document document = Jsoup.parse(new URL(url), FETCH_TIMEOUT_MILLIS);

        List<Good> list = new ArrayList<>();
        Element element = document.getElementById("J_goodsList");
        if (element == null) {
            // The container is missing (page layout changed, or a non-result page
            // was returned): report "no products" instead of failing with an NPE.
            return list;
        }

        for (Element el : element.getElementsByTag("li")) {
            Elements imgBoxes = el.getElementsByClass("p-img");
            Elements priceBoxes = el.getElementsByClass("p-price");
            Elements nameBoxes = el.getElementsByClass("p-name");
            // getElementsByTag also returns nested <li> elements; skip any entry that
            // lacks one of the expected sections rather than throwing on get(0).
            if (imgBoxes.isEmpty() || priceBoxes.isEmpty() || nameBoxes.isEmpty()) {
                continue;
            }

            // The image URL is read from the "data-lazy-img" attribute, not "src".
            String img = imgBoxes.get(0).getElementsByTag("img").attr("data-lazy-img");
            String price = priceBoxes.get(0).text();
            String title = nameBoxes.get(0).text();

            Good good = new Good();
            good.setTitle(title);
            good.setImg(img);
            good.setPrice(price);
            list.add(good);
        }
        return list;
    }
}
service爬取数据放入es
/**
 * Crawls products for the given keyword and bulk-indexes them into the
 * {@code jd_good} Elasticsearch index.
 *
 * @param keywords search keyword passed to the crawler
 * @return {@code true} if every document was indexed without failure;
 *         {@code false} if the crawl produced no products or any bulk item failed
 * @throws Exception if crawling fails or the bulk request cannot be executed
 */
@Override
public boolean addGood(String keywords) throws Exception {
    List<Good> goods = parseJdUtils.parseGoods(keywords);
    if (goods == null || goods.isEmpty()) {
        // A BulkRequest with no actions is rejected by client-side validation
        // ("no requests added"), so report "nothing indexed" instead of sending it.
        return false;
    }

    BulkRequest bulkRequest = new BulkRequest("jd_good");
    bulkRequest.timeout("2m");
    // NOTE(review): ids restart at 1 on every call, so a later crawl overwrites the
    // documents written by an earlier one. Confirm this is intended.
    int nextId = 1;
    for (Good good : goods) {
        bulkRequest.add(new IndexRequest()
                .id(String.valueOf(nextId++))
                .source(JSONObject.toJSONString(good), XContentType.JSON));
    }

    BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
    return !bulk.hasFailures();
}
搜索和高亮搜索
/**
 * Runs a paginated term query on the {@code title} field of the {@code jd_good}
 * index and returns each hit's source with the title replaced by its highlighted form.
 *
 * @param keywords term to look up in the {@code title} field
 * @param pageNo   1-based page number; values below 1 are treated as page 1
 * @param pageSize number of hits per page
 * @return one source map per hit; when a highlight for {@code title} exists, the
 *         {@code title} entry holds the concatenated highlighted fragments
 * @throws IOException if the search request fails
 */
@Override
public List<Map<String, Object>> search(String keywords, int pageNo, int pageSize) throws IOException {
    // Search request against the product index.
    SearchRequest searchRequest = new SearchRequest("jd_good");

    // Query: use the caller's keyword (it was previously hard-coded to "java").
    // NOTE(review): a term query is not analyzed, so the keyword must equal an
    // indexed token exactly; confirm this matches the index mapping.
    SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
    TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keywords);
    sourceBuilder.query(termQueryBuilder);
    sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

    // Highlighting.
    HighlightBuilder highlightBuilder = new HighlightBuilder();
    highlightBuilder.field("title"); // field to highlight
    // false: highlight the field even when the query did not match on it.
    highlightBuilder.requireFieldMatch(false);
    highlightBuilder.preTags("<span style='color:red'>"); // opening tag with style
    highlightBuilder.postTags("</span>"); // closing tag
    sourceBuilder.highlighter(highlightBuilder);

    // Pagination: offset = pageSize * (pageNo - 1). Clamp the page so that a
    // pageNo below 1 does not produce a negative offset, which the builder rejects.
    int page = Math.max(pageNo, 1);
    sourceBuilder.from((page - 1) * pageSize);
    sourceBuilder.size(pageSize);
    searchRequest.source(sourceBuilder);

    SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

    // Build the result. Without highlighting, hit.getSourceAsMap() could be
    // returned unchanged; here the title is swapped for its highlighted version.
    List<Map<String, Object>> result = new ArrayList<>();
    for (SearchHit hit : searchResponse.getHits().getHits()) {
        Map<String, Object> sourceAsMap = hit.getSourceAsMap();
        HighlightField title = hit.getHighlightFields().get("title");
        if (title != null) {
            // Join all highlighted fragments back into a single title string.
            StringBuilder newTitle = new StringBuilder();
            for (Text fragment : title.fragments()) {
                newTitle.append(fragment);
            }
            sourceAsMap.put("title", newTitle.toString());
        }
        result.add(sourceAsMap);
    }
    return result;
}

浙公网安备 33010602011771号