# Python script: export Prometheus metrics into Elasticsearch.
# Periodically pulls monitoring data from Prometheus and writes it to ES.
import requests
import json
import schedule
import time
from elasticsearch import Elasticsearch
from datetime import datetime
# Prometheus instant-query API endpoint
prometheus_url = "http://100.64.0.40:9090/api/v1/query"
# Elasticsearch connection settings
es_host = "http://100.64.0.5:9200"
es_index_prefix = "gpu_metrics"  # prefix for every index this script writes to
# Create the Elasticsearch client (shared by all writes below)
es = Elasticsearch([es_host])
# DCGM exporter metrics to collect each run
queries = [
    'DCGM_FI_DEV_POWER_USAGE',  # GPU power draw
    'DCGM_FI_DEV_SM_CLOCK',     # GPU SM clock
    'DCGM_FI_DEV_GPU_UTIL',     # GPU utilization
    'DCGM_FI_DEV_FB_USED'       # GPU framebuffer memory used
]
# Fetch one metric from the Prometheus instant-query API.
def query_prometheus(query):
    """Run an instant query against Prometheus.

    Args:
        query: PromQL expression / metric name to evaluate.

    Returns:
        Parsed JSON response as a dict, or None if the request failed.
    """
    params = {'query': query}
    try:
        # timeout prevents the hourly job from hanging forever on a stuck endpoint
        response = requests.get(prometheus_url, params=params, timeout=10)
        response.raise_for_status()  # raise on non-2xx status codes
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to query Prometheus for {query}: {e}")
        return None
# Process a Prometheus query result and index each GPU sample into Elasticsearch.
def write_to_es(query, data):
    """Write one Prometheus query result into Elasticsearch.

    Args:
        query: Metric name that was queried (becomes part of the index name).
        data: Parsed JSON response from the Prometheus query API, or None.

    Returns:
        None. Logs a message per written document or on failure.
    """
    if data and 'data' in data and 'result' in data['data']:
        results = data['data']['result']
        for result in results:
            metric = result.get("metric", {})
            gpu_device = metric.get("device", "Unknown")
            gpu_uuid = metric.get("UUID", "Unknown")
            gpu_name = metric.get("modelName", "Unknown")
            pci_bus_id = metric.get("pci_bus_id", "Unknown")
            # Instant-query vector samples are [unix_timestamp, value_string]
            timestamp = result.get("value", [None])[0]
            value = result.get("value", [None, None])[1]
            # Skip samples with no value
            if value is None:
                continue
            # Elasticsearch index names must be lowercase
            index_name = f"{es_index_prefix}_{query.lower()}_{gpu_device.lower()}"
            # Build the Elasticsearch document
            doc = {
                "timestamp": datetime.utcfromtimestamp(float(timestamp)),
                "gpu_device": gpu_device,
                "gpu_uuid": gpu_uuid,
                "gpu_name": gpu_name,
                "pci_bus_id": pci_bus_id,
                "query": query,
                "value": value
            }
            # Index the document; failures are logged but do not abort the batch
            try:
                es.index(index=index_name, body=doc)
                print(f"Data for GPU {gpu_device} written to Elasticsearch in index {index_name}.")
            except Exception as e:
                print(f"Error writing to Elasticsearch: {e}")
    else:
        print(f"No valid data found for {query}.")
# Scheduled job: collect every configured metric and write it to Elasticsearch.
def job():
    """Run one full collection cycle over all metrics in `queries`."""
    print("Starting the data collection and writing to Elasticsearch...")
    for query in queries:
        data = query_prometheus(query)
        if data is not None:
            write_to_es(query, data)
        else:
            print(f"Skipping {query} due to failed data retrieval.")
    print("Data collection and writing completed.")
# Schedule the collection job to run once every hour
schedule.every(1).hour.do(job)

# Entry point: poll the scheduler forever
if __name__ == "__main__":
    while True:
        schedule.run_pending()  # run any jobs that are due
        time.sleep(1)  # check once per second
