es翻页

1. search_after 翻页

# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch

# Page through an index with `search_after`: every request sorts the same way
# and resumes from the sort values of the last hit of the previous page.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"
page_size = 2  # number of documents per page
sort_field = "timestamp"  # field to sort on, e.g. a timestamp
sort_order = "desc"  # "asc" or "desc"

# Sort values of the last hit on the previous page; None until a page is read.
search_after = None

while True:
    body = {
        "size": page_size,
        "sort": [
            {sort_field: {"order": sort_order}},
            # Tie-breaker so that documents sharing the same sort value are
            # not lost between pages.
            # NOTE(review): sorting on _id relies on fielddata for the _id
            # field, which newer Elasticsearch versions may disable by
            # default - confirm against the target cluster version.
            {"_id": "desc"},
        ],
    }

    # The first request has no cursor; later ones resume after the last hit.
    if search_after is not None:
        body["search_after"] = search_after

    res = es.search(index=index_name, body=body)

    hits = res["hits"]["hits"]
    if not hits:
        break

    for doc in hits:
        print(doc["_id"], doc["_source"].get(sort_field))

    # The sort values of the last hit are the starting point of the next page.
    search_after = hits[-1]["sort"]
    print(search_after)

    # A page shorter than page_size means nothing is left after it, so the
    # extra round trip that would only return an empty page is skipped.
    if len(hits) < page_size:
        break

2.scroll翻页

# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch

# Page through an index with the scroll API: the first search opens a scroll
# context, each following es.scroll() call returns the next batch, and the
# context is released explicitly once iteration ends.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"
scroll_time = "2m"   # how long the scroll cursor stays valid
page_size = 2        # number of documents fetched per batch

# First request: starts the scroll.
res = es.search(
    index=index_name,
    scroll=scroll_time,
    size=page_size,
    body={
        "query": {
            "match_all": {}
        },
        "sort": [
            {"_doc": "asc"}   # _doc order is the recommended, cheapest sort for scroll
        ]
    }
)

scroll_id = res["_scroll_id"]
print(scroll_id)

try:
    hits = res["hits"]["hits"]

    while hits:
        for doc in hits:
            print(doc["_id"], doc["_source"])

        # Keep scrolling to fetch the next batch.
        res = es.scroll(
            scroll_id=scroll_id,
            scroll=scroll_time
        )

        # Always continue with the id returned by the latest response.
        scroll_id = res["_scroll_id"]
        hits = res["hits"]["hits"]
finally:
    # Release the scroll when done (otherwise it keeps using cluster
    # resources). Running this in `finally` also frees the context when
    # processing a batch or a scroll request raises.
    es.clear_scroll(scroll_id=scroll_id)

3. helpers.scan 翻页（基于 scroll）

# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch, helpers

# Iterate over every document of an index with helpers.scan.

# Initialise the client.
es = Elasticsearch(
    hosts=["http://192.168.1.134:19200"],
    http_auth=("elastic", "elastic"),  # only needed when authentication is enabled
    timeout=60,
)

index_name = "metric_pl"

# Match every document in the index.
scan_query = {"query": {"match_all": {}}}

scan_options = {
    "scroll": "2m",           # how long the scroll cursor stays valid
    "size": 1000,             # number of documents pulled per scroll request
    "preserve_order": False,  # leave False when ordering is not needed; it performs better
}

# scan walks through all the data; it uses scroll by default and maintains the
# scroll_id internally, so no manual loop over scroll requests is required.
results = helpers.scan(
    client=es,
    index=index_name,
    query=scan_query,
    **scan_options,
)

# Walk through the results.
for doc in results:
    print(doc["_id"], doc["_source"])

posted @ 2025-09-01 17:44 slnngk 阅读(13) 评论(0) 收藏举报

刷新页面返回顶部

es翻页

公告