Elasticsearch7和Elasticsearch8数据批量写入优化
Elasticsearch7和Elasticsearch8数据批量写入优化
背景
最近写的项目涉及了ES的检索,需要将大量的数据在短时间内写入到ES中。第一版的检索使用的是ES7,第二版需要做语义检索,所以升级为了ES8。
遇到的问题
写入速度极慢,一天最多百万级数据。
当前配置
JDK8 ES7(3台128G的节点) 使用Bulk批量API写入
优化措施
升级JDK到11 使用BulkProcessor API进行数据写入
默认情况下,使用Bulk API进行写入的时候,为了保障数据写入之后能立即被检索到,es会进行refresh操作。而refresh操作是比较影响ES写入速度的。下面是具体的Java代码
/**
 * Creates the Elasticsearch 7 {@link RestHighLevelClient} bean.
 *
 * <p>{@code esAddress} is read as a comma-separated list of host addresses. Each entry is
 * trimmed before parsing because {@code HttpHost.create} rejects strings containing blanks,
 * so a value such as {@code "http://a:9200, http://b:9200"} would otherwise fail at startup.
 * Basic-auth credentials come from {@code username} / {@code password}.
 *
 * @return a client configured with 600000 ms connect, socket and connection-request timeouts
 */
@Bean
public RestHighLevelClient restHighLevelClient() {
    BasicCredentialsProvider credentialsProvider = new BasicCredentialsProvider();
    credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));

    // RestClient.builder accepts varargs, so one host and many hosts share the same path.
    String[] addresses = esAddress.split(",");
    HttpHost[] httpHosts = new HttpHost[addresses.length];
    for (int i = 0; i < addresses.length; i++) {
        httpHosts[i] = HttpHost.create(addresses[i].trim());
    }
    RestClientBuilder builder = RestClient.builder(httpHosts);

    // All three timeouts are in milliseconds.
    builder.setRequestConfigCallback(requestConfig -> requestConfig
            .setConnectTimeout(600000)
            .setSocketTimeout(600000)
            .setConnectionRequestTimeout(600000));
    builder.setHttpClientConfigCallback(
            httpClient -> httpClient.setDefaultCredentialsProvider(credentialsProvider));
    return new RestHighLevelClient(builder);
}
/**
 * Creates the {@link BulkProcessor} bean that batches write requests and sends them
 * asynchronously through the given client.
 *
 * <p>Flush thresholds, concurrency and retry count are read from the {@code bulkActions},
 * {@code bulkSize}, {@code flushInterval}, {@code concurrentRequests} and
 * {@code maxNumberOfRetries} fields.
 *
 * @param restHighLevelClient client used to execute each bulk request
 * @return the configured processor
 */
@Bean
public BulkProcessor bulkProcessor(RestHighLevelClient restHighLevelClient) {
    BulkProcessor.Listener listener = new BulkProcessor.Listener() {
        @Override
        public void beforeBulk(long executionId, BulkRequest request) {
            LOGGER.info("【beforeBulk】批次[{}] 携带 {} 请求数量", executionId, request.numberOfActions());
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
            if (!response.hasFailures()) {
                LOGGER.info("【afterBulk-成功】批量 [{}] 完成在 {} ms", executionId, response.getTook().getMillis());
                return;
            }
            // Only the first failed item is reported, to keep a bad batch from flooding the log.
            for (BulkItemResponse item : response.getItems()) {
                if (item.isFailed()) {
                    LOGGER.error("【afterBulk-失败】批量 [{}] 出现异常的原因 : {}", executionId, item.getFailureMessage());
                    break;
                }
            }
        }

        @Override
        public void afterBulk(long executionId, BulkRequest bulkRequest, Throwable throwable) {
            // The whole request failed; go through the logger so the stack trace reaches the
            // configured appenders instead of stderr.
            LOGGER.error("【afterBulk-异常】批量 [{}] 执行失败", executionId, throwable);
        }
    };

    BulkProcessor.Builder builder = BulkProcessor.builder(
            (bulkRequest, bulkResponseActionListener) ->
                    restHighLevelClient.bulkAsync(bulkRequest, RequestOptions.DEFAULT, bulkResponseActionListener),
            listener);
    // Flush once this many actions are queued; per the original note, -1 disables this trigger.
    builder.setBulkActions(bulkActions);
    // Flush once the queued requests reach this size in megabytes.
    builder.setBulkSize(new ByteSizeValue(bulkSize, ByteSizeUnit.MB));
    // Flush at this interval, in seconds.
    // NOTE(review): the original note says -1 disables this trigger — confirm for TimeValue.
    builder.setFlushInterval(TimeValue.timeValueSeconds(flushInterval));
    // Number of bulk requests allowed to execute concurrently.
    builder.setConcurrentRequests(concurrentRequests);
    // Retry with a constant 1 s delay, up to maxNumberOfRetries times.
    builder.setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(1), maxNumberOfRetries));
    return builder.build();
}
如何使用
@Resource
private BulkProcessor bulkProcessor;
// Serialize the model to JSON and queue it as an index request; the processor batches
// queued requests and decides when to send them.
String documentJson = JSON.toJSONString(esTopicCollectModel, serializeConfig);
IndexRequest indexRequest = new IndexRequest()
        .index(tableToEs.getIndexName())
        .id(id)
        .source(documentJson, XContentType.JSON);
bulkProcessor.add(indexRequest);
速度提升:每个小时千万级数据写入,完美!
项目中添加了语义检索,总体来说就是数据在写入到es的时候,将数据进行向量化;然后在检索的时候,先将keyword进行向量化,然后再进行向量检索,以此达到语义检索的效果。
由于ES8中废弃了RestHighLevelClient,BulkProcessor类也没有了 所以经过查看es文档后 整理了一份最新的api
代码示例
/**
 * Creates the Elasticsearch 8 {@link ElasticsearchClient} bean.
 *
 * <p>{@code esAddress} is read as a comma-separated list of host addresses. Each entry is
 * trimmed before parsing because {@code HttpHost.create} rejects strings containing blanks.
 * Basic-auth credentials come from {@code username} / {@code password}.
 *
 * <p><strong>Security note:</strong> the SSL context trusts every certificate chain and
 * hostname verification is disabled, so the server's identity is never checked.
 *
 * @return a client backed by a {@link RestClientTransport} with a Jackson JSON mapper
 * @throws Exception if the SSL context cannot be built
 */
@Bean
public ElasticsearchClient elasticsearchClient() throws Exception {
    String[] split = esAddress.split(",");
    HttpHost[] httpHosts = new HttpHost[split.length];
    for (int i = 0; i < split.length; i++) {
        httpHosts[i] = HttpHost.create(split[i].trim());
    }

    // Username/password credentials.
    CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
    credentialsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(username, password));

    // SSL context whose trust strategy accepts every certificate chain (verification skipped).
    SSLContext sslContext = SSLContextBuilder.create()
            .loadTrustMaterial((chain, authType) -> true)
            .build();

    // HTTP client: trust-all SSL context, no hostname verification, a single I/O reactor thread.
    RestClientBuilder builder = RestClient.builder(httpHosts)
            .setHttpClientConfigCallback(httpClientBuilder ->
                    httpClientBuilder
                            .setSSLContext(sslContext)
                            .setDefaultCredentialsProvider(credentialsProvider)
                            .setSSLHostnameVerifier(NoopHostnameVerifier.INSTANCE)
                            .setDefaultIOReactorConfig(
                                    IOReactorConfig.custom()
                                            .setIoThreadCount(1)
                                            .build()));

    RestClientTransport transport = new RestClientTransport(builder.build(), new JacksonJsonpMapper());
    return new ElasticsearchClient(transport);
}
/**
 * Creates the {@link BulkIngester} bean that batches bulk operations and sends them
 * asynchronously.
 *
 * <p>The {@code String} type parameter is the type of the optional per-operation context
 * that is handed back to the listener. Original author's note: it had not yet been verified
 * whether adding operations requires the document itself to be a String.
 *
 * <p>Flushing is triggered by accumulated size or by a one-minute interval; the
 * operation-count trigger is disabled with {@code -1}.
 *
 * @return the configured ingester
 * @throws Exception if the underlying client cannot be created
 */
@Bean
public BulkIngester<String> bulkIngester() throws Exception {
    BulkListener<String> listener = new BulkListener<String>() {
        /**
         * Called before a bulk request is sent.
         *
         * @param executionId id of this request
         * @param request the bulk request about to be sent
         * @param contexts per-operation contexts
         */
        @Override
        public void beforeBulk(long executionId, BulkRequest request, List<String> contexts) {
            LOGGER.info("【beforeBulk】批次[{}】 携带 【{}】 请求数量", executionId, contexts.size());
        }

        /**
         * Called after a bulk request returned a response; logs every failed item.
         *
         * @param executionId id of this request
         * @param request the bulk request that was sent
         * @param contexts per-operation contexts
         * @param response the bulk response
         */
        @Override
        public void afterBulk(long executionId, BulkRequest request, List<String> contexts, BulkResponse response) {
            LOGGER.info("【afterBulk】批次[{}】", executionId);
            List<BulkResponseItem> items = response.items();
            for (int i = 0; i < items.size(); i++) {
                BulkResponseItem item = items.get(i);
                if (item.error() == null) {
                    continue;
                }
                // The context may be absent (operations can be added without one), so the
                // failed document is identified by its index and id as well.
                String context = i < contexts.size() ? contexts.get(i) : null;
                LOGGER.error("Failed to index document [index={}, id={}, context={}] - {}",
                        item.index(), item.id(), context, item.error().reason());
            }
        }

        /**
         * Called when the bulk request could not be sent to Elasticsearch.
         *
         * @param executionId id of this request
         * @param request the bulk request that was to be sent
         * @param contexts per-operation contexts
         * @param failure the cause
         */
        @Override
        public void afterBulk(long executionId, BulkRequest request, List<String> contexts, Throwable failure) {
            LOGGER.error("Bulk request {} failed", executionId, failure);
        }
    };

    ElasticsearchClient elasticsearchClient = elasticsearchClient();
    // maxSize is expressed in bytes. bulkSize is converted from megabytes here, matching how
    // the BulkProcessor bean interprets the same setting (ByteSizeUnit.MB); passing the raw
    // number would flush after almost every document.
    // NOTE(review): assumes bulkSize is a megabyte count — confirm against the configuration.
    long maxSizeInBytes = bulkSize * 1024L * 1024L;
    BulkIngester<String> ingester = BulkIngester.of(b -> b
            .client(elasticsearchClient)
            .maxOperations(-1)
            .maxSize(maxSizeInBytes)
            .maxConcurrentRequests(concurrentRequests)
            .flushInterval(1, TimeUnit.MINUTES)
            .listener(listener)
    );
    return ingester;
}
如何使用BulkIngester
@Resource
private BulkIngester<String> bulkIngester;
// Document id: "<table name>_<primary-key value>", with an empty string when the key is absent.
String documentId = tableToEs.getTableName() + "_"
        + data.getOrDefault(StrUtil.toCamelCase(tableToEs.getPkColumn()), "");
// Index operation (creates the document, or replaces it if the id already exists),
// wrapped in a bulk operation and queued on the ingester.
BulkOperation bulkOperation = new BulkOperation.Builder()
        .index(new IndexOperation.Builder<EsTopicCollectModel>()
                .index(tableToEs.getIndexName())
                .id(documentId)
                .document(esTopicCollectModel)
                .build())
        .build();
bulkIngester.add(bulkOperation);
最后
未经作者允许 请勿转载!!!
本文章来自于博客园 实习小生 博客地址:https://www.cnblogs.com/sxxs

浙公网安备 33010602011771号