"""Export Prometheus monitoring data to Elasticsearch.

Periodically pulls GPU metrics from Prometheus and indexes them into Elasticsearch.
"""

import requests
import json
import schedule
import time
from elasticsearch import Elasticsearch
from datetime import datetime

# --- Prometheus configuration ---

prometheus_url = "http://100.64.0.40:9090/api/v1/query"

# --- Elasticsearch configuration ---

es_host = "http://100.64.0.5:9200"
es_index_prefix = "gpu_metrics"  # prefix for all index names created by this script

# Create the Elasticsearch client (shared by all writes below).

es = Elasticsearch([es_host])

# Metrics to query — NVIDIA DCGM exporter series.

queries = [
    'DCGM_FI_DEV_POWER_USAGE',  # GPU power draw
    'DCGM_FI_DEV_SM_CLOCK',     # GPU SM clock frequency
    'DCGM_FI_DEV_GPU_UTIL',     # GPU utilization
    'DCGM_FI_DEV_FB_USED'       # GPU framebuffer memory used
]

# --- Prometheus query helper ---

def query_prometheus(query):
    """Run an instant query against the Prometheus HTTP API.

    Args:
        query: A PromQL expression (here, a DCGM metric name).

    Returns:
        The parsed JSON response dict on success, or None on any
        HTTP/network failure.
    """
    params = {'query': query}
    try:
        # timeout added so a hung Prometheus cannot block the scheduler
        # forever; a Timeout is a RequestException and is caught below.
        response = requests.get(prometheus_url, params=params, timeout=10)
        response.raise_for_status()  # raise on non-2xx status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to query Prometheus for {query}: {e}")
        return None

# --- Process query results and write them to Elasticsearch ---

def write_to_es(query, data):
    """Index each GPU metric sample from a Prometheus response into Elasticsearch.

    Args:
        query: The metric name that produced `data`; used in the index
            name and stored in each document.
        data: Parsed JSON response from the Prometheus HTTP API
            (shape: {"data": {"result": [...]}}), or None.
    """
    if data and 'data' in data and 'result' in data['data']:
        results = data['data']['result']
        for result in results:
            metric = result.get("metric", {})
            gpu_device = metric.get("device", "Unknown")
            gpu_uuid = metric.get("UUID", "Unknown")
            gpu_name = metric.get("modelName", "Unknown")
            pci_bus_id = metric.get("pci_bus_id", "Unknown")
            timestamp = result.get("value", [None])[0]
            value = result.get("value", [None, None])[1]

            # Skip invalid samples; also guard timestamp, since
            # float(None) below would raise TypeError.
            if value is None or timestamp is None:
                continue

            # Elasticsearch index names must be lowercase.
            index_name = f"{es_index_prefix}_{query.lower()}_{gpu_device.lower()}"

            # Build the Elasticsearch document.
            # NOTE(review): datetime.utcfromtimestamp is deprecated since
            # Python 3.12; kept here to preserve the naive-UTC behavior.
            doc = {
                "timestamp": datetime.utcfromtimestamp(float(timestamp)),
                "gpu_device": gpu_device,
                "gpu_uuid": gpu_uuid,
                "gpu_name": gpu_name,
                "pci_bus_id": pci_bus_id,
                "query": query,
                "value": value
            }

            # Write the document; log and continue on failure so one bad
            # sample does not abort the whole batch.
            try:
                es.index(index=index_name, body=doc)
                print(f"Data for GPU {gpu_device} written to Elasticsearch in index {index_name}.")
            except Exception as e:
                print(f"Error writing to Elasticsearch: {e}")
    else:
        print(f"No valid data found for {query}.")

# --- Scheduled job: collect every metric and forward it to Elasticsearch ---

def job():
    """Query each configured metric and write the results to Elasticsearch."""
    print("Starting the data collection and writing to Elasticsearch...")
    for query in queries:
        data = query_prometheus(query)
        if data is not None:
            write_to_es(query, data)
        else:
            print(f"Skipping {query} due to failed data retrieval.")
    print("Data collection and writing completed.")

# Register the job to run once every hour.

schedule.every(1).hour.do(job)

# --- Start the scheduler loop ---

# BUG FIX: the pasted source had `if name == "main":` — the dunder
# underscores were stripped, so the script never ran.
if __name__ == "__main__":
    while True:
        schedule.run_pending()  # run any jobs that are due
        time.sleep(1)           # poll for due jobs once per second

# Source: blog post by A学无止境A, posted 2025-04-02.