Elasticsearch 翻页的三种方式

 

1. search_after 翻页

# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch

# Page through an index with search_after: every request asks for the hits
# that come right after the sort values of the last hit of the previous page.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60
)

index_name = "metric_pl"
page_size = 2  # number of hits per page
sort_field = "timestamp"  # field to sort on, e.g. a timestamp
sort_order = "desc"  # "asc" or "desc"

# Sort values of the last hit of the previous page; None until a page has been read.
search_after = None

while True:
    body = {
        "size": page_size,
        "sort": [
            {sort_field: {"order": sort_order}},
            # Tie-breaker: without it, hits sharing the same sort_field value
            # could be lost between pages.
            # NOTE(review): sorting on _id may be restricted on newer clusters;
            # confirm against the target Elasticsearch version.
            {"_id": "desc"}
        ]
    }

    # The first request has no cursor yet and must not send search_after.
    if search_after is not None:
        body["search_after"] = search_after

    res = es.search(index=index_name, body=body)

    hits = res["hits"]["hits"]
    if not hits:
        # An empty page means everything has been consumed.
        break

    for doc in hits:
        print(doc["_id"], doc["_source"].get(sort_field))

    # The sort values of the last hit are the starting point of the next page.
    search_after = hits[-1]["sort"]
    print(search_after)

 

image

 

2. scroll 翻页

 

# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch

# Page through an index with the scroll API: the first search opens a scroll
# context, subsequent es.scroll calls fetch the following batches, and the
# context is cleared at the end.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60
)

index_name = "metric_pl"
scroll_time = "2m"   # validity period of the scroll cursor
page_size = 2        # number of hits fetched per request

# The first request starts the scroll.
res = es.search(
    index=index_name,
    scroll=scroll_time,
    size=page_size,
    body={
        "query": {
            "match_all": {}
        },
        "sort": [
            {"_doc": "asc"}   # _doc order is the recommended, most efficient sort for scroll
        ]
    }
)

scroll_id = res["_scroll_id"]
print(scroll_id)
hits = res["hits"]["hits"]

try:
    while hits:
        for doc in hits:
            print(doc["_id"], doc["_source"])

        # Keep scrolling to fetch the next batch.
        res = es.scroll(
            scroll_id=scroll_id,
            scroll=scroll_time
        )

        # Always continue with the id returned by the latest response.
        scroll_id = res["_scroll_id"]
        hits = res["hits"]["hits"]
finally:
    # Clear the scroll even when a request or the processing above raises;
    # otherwise the context keeps occupying cluster resources.
    es.clear_scroll(scroll_id=scroll_id)

 

3. helpers.scan 翻页(封装了 scroll)

# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch, helpers

# Walk every document of an index with helpers.scan, which uses scroll by
# default and maintains the scroll_id internally, so no manual loop over
# scroll requests is needed.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"

# Request body selecting every document.
match_all_query = {"query": {"match_all": {}}}

scan_options = {
    "scroll": "2m",           # validity period of the scroll cursor
    "size": 1000,             # number of documents pulled per scroll request
    "preserve_order": False,  # set to False when ordering is not needed; performs better
}

results = helpers.scan(
    client=es,
    index=index_name,
    query=match_all_query,
    **scan_options,
)

# Iterate over the results.
for hit in results:
    print(hit["_id"], hit["_source"])

 

posted @ 2025-09-01 17:44  slnngk  阅读(13)  评论(0)    收藏  举报