CDP业务搜索数据库调研之组合index
背景:
ES不支持跨index的关联(join)查询,只能通过如下方案实现:
- 定义一个组合的index,将多个index的值合并到一个index中
- 以嵌套文档的形式存储:通过数据冗余的方式来提高性能
- 以父子文档的形式存储: 通过在同个文档中定义父子关系来模拟实现多个index的联合查询
- 客户端将多个index查询出来的数据进行组合
-
使用场景浅析:什么场景下使用嵌套文档,什么场景下使用父子文档,什么场景下使用客户端组合?
A. 组合文档(包括嵌套文档和父子文档)与客户端组合
- 当各个index联系非常紧密,共同属于某个模块或服务,不可分割的情况下,优先选择组合文档,比如在商城订单系统中,有母订单和子订单的说法,母订单可以认为是同一批次的订单,而子订单表示每一种订单的商品,这种母子订单关联比较紧密,建议使用组合文档.
- 当某个index与其他index有一定的关联,但它可以作为一个独立的系统或服务,这种,建议使用客户端组合.如,活动系统中,有活动信息,和活动参与信息(参与信息一般只存关联关系),还有活动参与的用户的信息,这里用户的基本信息与活动参与信息有一定的关联,但是用户信息不只是单独为活动系统服务的,它可以认为是一个独立的服务,因此,比较适合使用客户端组合
B.嵌套文档与父子文档
- 当某个index与另外一个index是1:1或者1:n(且n很小)的关系时,比较适合嵌套文档
- 当某个index与另外一个index是典型的1:n关系(且n比较大)时,比较适合使用父子文档
以上摘自:
ElasticSearch扫盲之十二:ElasticSearch关联查询即父子文档查询和嵌套文档nested查询 诗心博客
综上所述,CDP的用户-事件场景属于典型的1:n关系(一个用户对应大量事件,n比较大),因此选取B中的父子文档方案来测试。
工具和数据准备
Docker-ElasticSearch - 7.14.1
Docker-rally - latest
步骤1:
拉取docker镜像:
docker pull elasticsearch:7.14.1
docker pull elastic/rally:latest
修改属性:
wsl -d docker-desktop
sysctl -w vm.max_map_count=262144
步骤2:
造数据,使用python,核心代码逻辑如下:
import functools
import time

import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from faker import Faker
# Elasticsearch client pointed at 127.0.0.1:9200, shared by every function below.
es = Elasticsearch(
    hosts=["127.0.0.1:9200"],
    timeout=30,
    maxsize=4,
)
# Faker instance configured with the zh_CN locale; source of all sample values.
fake = Faker(locale='zh_CN')
# Number of parent (user) documents to generate; their ids run 1..client_num.
client_num = 10 ** 7
# Number of child (event) documents to generate.
event_num = 10 ** 8
def timer(func):
    """Decorate ``func`` so that every call prints how long it took.

    The wrapped function's arguments and return value pass through
    unchanged. ``functools.wraps`` keeps the original ``__name__`` and
    ``__doc__`` on the wrapper.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that calls ``func``, prints the elapsed seconds and
        returns ``func``'s result.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so system clock adjustments during a
        # long run cannot distort the measurement.
        start = time.perf_counter()
        res = func(*args, **kwargs)
        print('共耗时约 {:.2f} 秒'.format(time.perf_counter() - start))
        return res
    return wrapper
def gen_client(corp_id, i):
    """Build one synthetic parent ("user") document.

    Args:
        corp_id: Value stored in both ``corp_id`` and ``app_id``.
        i: Zero-based sequence number; the document's ``uid`` is ``i + 1``.

    Returns:
        A dict of faker-generated profile fields whose ``user_event`` join
        field is named ``"user"``.
    """
    document = {
        "follow_id": fake.random_number(),
        "corp_id": corp_id,
        "app_id": corp_id,
        "ext_user_id": fake.random_number(),
        "user_type": fake.random_digit(),
        "name": fake.name(),
        "gender": fake.boolean(),
        "deleted_by_ext_contact": fake.boolean(),
        "uid": i + 1,
        "add_way": fake.random_digit(),
        "channel_id": fake.numerify(),
        "add_at": fake.past_date(),
        "tags": fake.words(),
        "corp_name": fake.company_prefix(),
        "corp_full_name": fake.company(),
        "deleted_by_staff": fake.boolean(),
    }
    # Join field goes last: this document is on the "user" side of the relation.
    document["user_event"] = {"name": "user"}
    return document
def gen_event(corp_id, i):
    """Build one synthetic child ("event") document for a random user.

    Args:
        corp_id: Integer appended to the end of the ``origin`` log line.
        i: Sequence number of the event; not used by this function.

    Returns:
        A dict with the event fields. Its ``user_event`` join field is named
        ``"event"`` and points at the randomly chosen parent id, which is
        also stored in ``uid``.
    """
    # User ids run 1..client_num and randint's upper bound is exclusive,
    # so pass client_num + 1 to make the last user eligible as a parent.
    parent = np.random.randint(1, client_num + 1)
    return {
        "uid": parent,
        "event_type": fake.random_digit(),
        "status": fake.random_digit(),
        "origin": "%s - - [%s] %s %s %d" % (
            fake.ipv4(), fake.past_date(), fake.uri(), fake.user_name(), corp_id),
        "req_id": fake.md5(),
        "user_event": {
            "name": "event",
            "parent": parent
        }
    }
@timer
def gen():
    """Create the ``0_combine`` index and bulk-load users, then events.

    Steps, in order: create the index with an explicit mapping that includes
    the ``user_event`` join field (``user`` -> ``event``), index
    ``client_num`` user documents with ids 1..client_num, then index
    ``event_num`` event documents with auto-generated ids.
    """
    index_name = "{}_combine".format(0)

    def text_with_keyword():
        # Fresh dict per field: text plus a keyword sub-field (ignore_above=256).
        return {
            "type": "text",
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256,
                },
            },
        }

    properties = {
        "add_at": {"type": "date"},
        "add_way": {"type": "long"},
        "app_id": {"type": "long"},
        "channel_id": text_with_keyword(),
        "corp_full_name": text_with_keyword(),
        "corp_id": {"type": "long"},
        "corp_name": text_with_keyword(),
        "deleted_by_ext_contact": {"type": "boolean"},
        "deleted_by_staff": {"type": "boolean"},
        "ext_user_id": {"type": "long"},
        "follow_id": {"type": "long"},
        "gender": {"type": "boolean"},
        "name": text_with_keyword(),
        "tags": text_with_keyword(),
        "user_type": {"type": "long"},
        "user_event": {
            "type": "join",
            "relations": {"user": "event"},
        },
    }
    es.indices.create(index=index_name, body={"mappings": {"properties": properties}})
    print("create index done.")

    def client_actions():
        # Lazily yield one bulk action per user; _id matches the document's uid.
        for seq in range(client_num):
            yield {
                '_id': seq + 1,
                "_index": index_name,
                '_op_type': 'index',
                "_source": gen_client(0, seq),
            }

    # Both bulk calls pass routing=1 — presumably so that parents and children
    # land on the same shard, which the join field requires.
    helpers.bulk(es, client_actions(), routing=1, request_timeout=100)
    print("create client done.")

    def event_actions():
        # Lazily yield one bulk action per event; no _id is set on these actions.
        for seq in range(event_num):
            yield {
                "_index": index_name,
                '_op_type': 'index',
                "_source": gen_event(0, seq),
            }

    helpers.bulk(es, event_actions(), routing=1, request_timeout=100)
    print("create event done.")
# Script entry point: run the full data generation when executed directly.
if __name__ == "__main__":
    gen()
生成数据如下:
{
"_index" : "0_combine",
"_type" : "_doc",
"_id" : "5035502",
"_score" : 1.0,
"_routing" : "1",
"_source" : {
"follow_id" : 5059704,
"corp_id" : 0,
"app_id" : 0,
"ext_user_id" : 468,
"user_type" : 0,
"name" : "高洁",
"gender" : true,
"deleted_by_ext_contact" : true,
"uid" : 5035502,
"add_way" : 6,
"channel_id" : "473",
"add_at" : "2021-08-17",
"tags" : [
"系列",
"因为",
"个人"
],
"corp_name" : "通际名联",
"corp_full_name" : "群英信息有限公司",
"deleted_by_staff" : false,
"user_event" : {
"name" : "user"
}
}
}
时间关系,没有生成完1亿数据,最终压测数据集如下:

57017000条数据,共16.2G
压测:
rally的challenge如下:两个默认查询,四个子文档查询,一个父文档查询
{
"name": "search",
"description": "基础的查询压力测试",
"default": true,
"schedule": [
{
"operation": "child_query",
"warmup-iterations": 200,
"iterations": 100,
"target-throughput": 6
},
{
"operation": "default",
"clients": 1,
"warmup-iterations": 500,
"iterations": 1000,
"target-throughput": 90
},
{
"operation": "child_query_agg",
"warmup-iterations": 200,
"iterations": 100
},
{
"operation": "child_query_2",
"warmup-iterations": 200,
"iterations": 100,
"target-throughput": 6
},
{
"operation": "child_query_3",
"warmup-iterations": 200,
"iterations": 100,
"target-throughput": 6
},
{
"operation": "scroll",
"warmup-iterations": 200,
"iterations": 100,
"target-throughput": 6
},
{
"operation": "parent_query",
"warmup-iterations": 200,
"iterations": 100,
"target-throughput": 6
}
]
}
每个task如下:
child_query.json
{
"name": "child_query",
"operation-type": "search",
"body": {
"query": {
"has_child": {
"type": "event",
"query": {
"range": {
"status": {
"gte": 4
}
}
}
}
}
}
}
child_query_2.json
{
"name": "child_query_2",
"operation-type": "search",
"body": {
"query": {
"has_child": {
"type": "event",
"score_mode": "max",
"query": {
"match": {
"event_type": 2
}
}
}
}
}
}
child_query_3.json
{
"name": "child_query_3",
"operation-type": "search",
"body": {
"query": {
"has_child": {
"type": "event",
"min_children": 10,
"query": {
"match_all": {
}
}
}
}
}
}
child_query_agg.json
{
"name": "child_query_agg",
"operation-type": "search",
"body": {
"query": {
"bool": {
"must": [
{
"term": {
"user_type": 2
}
}
]
}
},
"aggs": {
"events": {
"children": {
"type": "event"
},
"aggs": {
"avaiable": {
"filter": {
"term": {
"status": 2
}
},
"aggs": {
"count": {
"terms": {
"field": "uid"
}
}
}
}
}
}
}
}
}
default.json
{
"name": "default",
"operation-type": "search",
"body": {
"query": {
"match_all": {}
}
}
}
parent_query.json
{
"name": "parent_query",
"operation-type": "search",
"body": {
"query": {
"has_parent": {
"parent_type": "user",
"query": {
"match": {
"user_type": 9
}
}
}
}
}
}
scroll.json
{
"name": "scroll",
"operation-type": "search",
"pages": 25,
"results-per-page": 20,
"body": {
"query": {
"match_all": {}
}
}
}
运行压测脚本:
docker network create test-network
docker run -d -p 9200:9200 -p 9300:9300 --network test-network --network-alias es01 --name es01 --ulimit nofile=65535:65535 -v /C/persistent/es:/usr/share/elasticsearch/data -e "discovery.type=single-node" elasticsearch:7.14.1
docker run --network test-network --network-alias rally -v /C/persistent/rally:/rally/.rally elastic/rally race --track=cdp --challenge=search --pipeline=benchmark-only --target-hosts=es01:9200
(本机配置:Intel(R) Core(TM) i7-9700 CPU 16GB内存,压测时CPU在25%左右浮动,内存占用10G)
结论:
| Metric | Task | Value | Unit | 说明 |
|---|---|---|---|---|
| Cumulative indexing time of primary shards | | 0.0254333 | min | |
| Median cumulative indexing time across primary shards | | 0.0127167 | min | |
| Max cumulative indexing time across primary shards | | 0.0254333 | min | |
| Cumulative refresh time of primary shards | | 0.0220333 | min | |
| Cumulative refresh count of primary shards | | 11 | | |
| Median cumulative refresh time across primary shards | | 0.0110167 | min | |
| Max cumulative refresh time across primary shards | | 0.0220333 | min | |
| Cumulative flush time of primary shards | | 0.0167 | min | |
| Cumulative flush count of primary shards | | 3 | | |
| Median cumulative flush time across primary shards | | 0.00835 | min | |
| Max cumulative flush time across primary shards | | 0.0167 | min | |
| Total Young Gen GC time | | 0.449 | s | |
| Total Young Gen GC count | | 19 | | |
| Store size | | 16.3153 | GB | 总存储量 |
| Translog size | | 1.02445e-07 | GB | |
| Heap used for segments | | 0.128006 | MB | |
| Heap used for doc values | | 0.00714111 | MB | |
| Heap used for terms | | 0.091217 | MB | |
| Heap used for norms | | 0.00616455 | MB | |
| Heap used for points | | 0 | MB | |
| Heap used for stored fields | | 0.0234833 | MB | |
| Segment count | | 42 | | |
| Min Throughput | child_query | 0.49 | ops/s | 吞吐量 |
| Mean Throughput | child_query | 0.5 | ops/s | 吞吐量 |
| Median Throughput | child_query | 0.5 | ops/s | 吞吐量 |
| Max Throughput | child_query | 0.51 | ops/s | |
| 50th percentile latency | child_query | 447324 | ms | 延迟:服务时间加上请求在客户端排队等待的时间;实际吞吐量(约0.5 ops/s)远低于目标吞吐量(6 ops/s),请求不断积压,因此延迟远大于服务时间 |
| 90th percentile latency | child_query | 512834 | ms | |
| 99th percentile latency | child_query | 527795 | ms | |
| 100th percentile latency | child_query | 529435 | ms | |
| 50th percentile service time | child_query | 1821.61 | ms | 服务时间:从发出请求到接收完整响应之间的时间,不包含排队等待时间 |
| 90th percentile service time | child_query | 1834.36 | ms | |
| 99th percentile service time | child_query | 1857.19 | ms | |
| 100th percentile service time | child_query | 1944.19 | ms | |
| error rate | child_query | 0 | % | 错误率,错误响应相对于响应总数的比率 |
结论:父子索引结构下,涉及has_child这类复杂查询时性能较差:在约5700万条数据上,child_query的服务时间中位数约为1.8秒,吞吐量仅约0.5 ops/s,远低于目标的6 ops/s。
resources:
Elastic_中国社区官方博客_CSDN博客-Elastic,Elasticsearch,Kibana领域博主
Running Rally with Docker ‒ Rally 2.3.0.dev0 documentation
Install Elasticsearch with Docker | Elasticsearch Guide [7.14] | Elastic

浙公网安备 33010602011771号