# Elasticsearch 翻页
# 1. search_after 翻页
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
# Page through an index with `search_after`: every request resumes right
# after the sort values of the last hit returned by the previous request.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"
page_size = 2  # number of hits per page
sort_field = "timestamp"  # field to sort on, e.g. a timestamp
sort_order = "desc"  # "asc" or "desc"

# Sort values of the last hit seen so far; None until the first page is read.
search_after = None

while True:
    body = {
        "size": page_size,
        "sort": [
            {sort_field: {"order": sort_order}},
            # Tiebreaker: without it, hits sharing the same sort_field value
            # could be skipped when moving from one page to the next.
            # NOTE(review): sorting on _id relies on fielddata for the _id
            # field, which newer Elasticsearch versions can disallow —
            # confirm against the target cluster version.
            {"_id": "desc"},
        ],
    }
    if search_after is not None:
        body["search_after"] = search_after

    res = es.search(index=index_name, body=body)
    hits = res["hits"]["hits"]
    if not hits:
        # An empty page means everything has been read.
        break

    for doc in hits:
        print(doc["_id"], doc["_source"].get(sort_field))

    # The sort values of the last hit are the starting point of the next page.
    search_after = hits[-1]["sort"]
    print(search_after)

# 2. scroll 翻页
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
# Walk through an index with the scroll API: the first search opens a scroll
# context, each es.scroll() call fetches the next batch, and the context is
# released explicitly once iteration ends.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"
scroll_time = "2m"  # how long the scroll cursor stays valid between requests
page_size = 2  # number of hits fetched per batch

# First request: starts the scroll.
res = es.search(
    index=index_name,
    scroll=scroll_time,
    size=page_size,
    body={
        "query": {
            "match_all": {}
        },
        "sort": [
            {"_doc": "asc"}  # _doc order is the recommended, most efficient sort for scroll
        ],
    },
)
scroll_id = res["_scroll_id"]
print(scroll_id)

try:
    hits = res["hits"]["hits"]
    while hits:
        for doc in hits:
            print(doc["_id"], doc["_source"])

        # Keep scrolling to fetch the next batch.
        res = es.scroll(
            scroll_id=scroll_id,
            scroll=scroll_time,
        )
        # Always continue with the id from the latest response.
        scroll_id = res["_scroll_id"]
        hits = res["hits"]["hits"]
finally:
    # Release the scroll context even when a request or the processing above
    # fails; otherwise it keeps occupying cluster resources until it expires.
    es.clear_scroll(scroll_id=scroll_id)
# 3. scroll scan 翻页
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch, helpers
# Iterate over every document of an index with helpers.scan, which drives the
# scroll API internally and tracks the scroll_id itself, so no manual
# scroll loop is needed here.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"

# Query body handed to scan: match every document.
match_all_query = {
    "query": {
        "match_all": {}
    }
}

results = helpers.scan(
    client=es,
    index=index_name,
    query=match_all_query,
    scroll="2m",  # lifetime of the scroll cursor
    size=1000,  # documents pulled per scroll request
    preserve_order=False,  # no ordering required; False gives better performance
)

# Consume the results.
for doc in results:
    print(doc["_id"], doc["_source"])
# 浙公网安备 33010602011771号