监控工具 - Prometheus 在 Rocky Linux 9 的二进制安装


下载安装包

https://prometheus.io/download/

# Prometheus  选择LTS版本下载
https://github.com/prometheus/prometheus/releases/download/v2.53.2/prometheus-2.53.2.linux-amd64.tar.gz

# alertmanager
https://github.com/prometheus/alertmanager/releases/download/v0.27.0/alertmanager-0.27.0.linux-amd64.tar.gz

# promlens
https://github.com/prometheus/promlens/releases/download/v0.3.0/promlens-0.3.0.linux-amd64.tar.gz

# pushgateway
https://github.com/prometheus/pushgateway/releases/download/v1.9.0/pushgateway-1.9.0.linux-amd64.tar.gz


二进制方式安装配置

[root@node200 ~]# useradd prometheus && echo "prometheus:prometheus"|chpasswd && chage -M 99999 prometheus
[root@node200 ~]# 
[root@node200 ~]# cat /etc/passwd |grep prometheus
prometheus:x:1001:1001::/home/prometheus:/bin/bash
[root@node200 ~]# 
[root@node200 ~]# ll /home
total 4
drwx------. 14 anliven    anliven    4096 Sep 13 14:24 anliven
drwx------   3 prometheus prometheus   78 Sep 13 14:27 prometheus
[root@node200 ~]# 
[root@node200 ~]# systemctl status firewalld.service
○ firewalld.service - firewalld - dynamic firewall daemon
     Loaded: loaded (/usr/lib/systemd/system/firewalld.service; disabled; preset: enabled)
     Active: inactive (dead)
       Docs: man:firewalld(1)
[root@node200 ~]# 
[root@node200 ~]# ll Prometheus/
total 181232
-rw-r--r-- 1 anliven anliven  30866868 Sep 13 13:59 alertmanager-0.27.0.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven  11269099 Sep 13 14:00 blackbox_exporter-0.25.0.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven  10676343 Sep 13 14:00 node_exporter-1.8.2.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven 104212702 Sep 13 13:59 prometheus-2.53.2.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven  17982288 Sep 13 14:03 promlens-0.3.0.linux-amd64.tar.gz
-rw-r--r-- 1 anliven anliven  10563386 Sep 13 14:03 pushgateway-1.9.0.linux-amd64.tar.gz
[root@node200 ~]# cd Prometheus/
[root@node200 Prometheus]#  
[root@node200 Prometheus]# tar -zxf prometheus-2.53.2.linux-amd64.tar.gz -C /opt
[root@node200 Prometheus]# 
[root@node200 Prometheus]# cd /opt
[root@node200 opt]# chown -R prometheus:prometheus /opt/prometheus-2.53.2.linux-amd64
[root@node200 opt]# ln -sv /opt/prometheus-2.53.2.linux-amd64 prometheus
'prometheus' -> '/opt/prometheus-2.53.2.linux-amd64'
[root@node200 opt]# 
[root@node200 opt]# ll |grep prometheus
lrwxrwxrwx  1 root       root        34 Sep 13 14:58 prometheus -> /opt/prometheus-2.53.2.linux-amd64
drwxr-xr-x  4 prometheus prometheus 132 Aug  9 23:16 prometheus-2.53.2.linux-amd64
[root@node200 opt]# cd 
[root@node200 ~]# vim /usr/lib/systemd/system/prometheus.service
[root@node200 ~]# cat /usr/lib/systemd/system/prometheus.service
[Unit]
Description=Prometheus server daemon
After=network.target

[Service]
Type=simple
User=prometheus
Group=prometheus
ExecStart=/opt/prometheus/prometheus \
          --config.file "/opt/prometheus/prometheus.yml" \
          --storage.tsdb.path "/opt/prometheus/data" \
          --storage.tsdb.retention=15d \
          --web.console.templates="/opt/prometheus/consoles" \
          --web.console.libraries="/opt/prometheus/console_libraries" \
          --web.max-connections=512 \
          --web.enable-lifecycle \
          --web.listen-address="0.0.0.0:9090" 
Restart=on-failure

[Install]
WantedBy=multi-user.target
[root@node200 ~]#  
[root@node200 ~]# systemctl daemon-reload
[root@node200 ~]# systemctl enable prometheus.service 
Created symlink /etc/systemd/system/multi-user.target.wants/prometheus.service → /usr/lib/systemd/system/prometheus.service.
[root@node200 ~]# systemctl start prometheus
[root@node200 ~]# systemctl status prometheus
● prometheus.service - Prometheus server daemon
     Loaded: loaded (/usr/lib/systemd/system/prometheus.service; enabled; preset: disabled)
     Active: active (running) since Fri 2024-09-13 15:06:33 CST; 2s ago
   Main PID: 3982 (prometheus)
      Tasks: 8 (limit: 48820)
     Memory: 19.3M
        CPU: 86ms
     CGroup: /system.slice/prometheus.service
             └─3982 /opt/prometheus/prometheus --config.file /opt/prometheus/prometheus.yml --storage.tsdb.path /opt/prometheus/data --stor>

Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.025Z caller=head.go:721 level=info component=tsdb msg="Replaying WAL, this>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.026Z caller=head.go:793 level=info component=tsdb msg="WAL segment loaded">
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.026Z caller=head.go:830 level=info component=tsdb msg="WAL replay complete>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.030Z caller=main.go:1169 level=info fs_type=XFS_SUPER_MAGIC
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.030Z caller=main.go:1172 level=info msg="TSDB started"
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.030Z caller=main.go:1354 level=info msg="Loading configuration file" filen>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.037Z caller=main.go:1391 level=info msg="updated GOGC" old=100 new=75
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.038Z caller=main.go:1402 level=info msg="Completed loading of configuratio>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.038Z caller=main.go:1133 level=info msg="Server is ready to receive web re>
Sep 13 15:06:34 node200 prometheus[3982]: ts=2024-09-13T07:06:34.038Z caller=manager.go:164 level=info component="rule manager" msg="Starti>
lines 1-20/20 (END)
^C
[root@node200 ~]#

访问页面
http://192.168.16.200:9090/ (默认显示Graph页面)


prometheus命令帮助信息

[root@node200 ~]# /opt/prometheus/prometheus -h
usage: prometheus [<flags>]

The Prometheus monitoring server


Flags:
  -h, --[no-]help                Show context-sensitive help (also try --help-long and --help-man).
      --[no-]version             Show application version.
      --config.file="prometheus.yml"  
                                 Prometheus configuration file path.
      --web.listen-address="0.0.0.0:9090"  
                                 Address to listen on for UI, API, and telemetry.
      --auto-gomemlimit.ratio=0.9  
                                 The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory
      --web.config.file=""       [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication.
      --web.read-timeout=5m      Maximum duration before timing out read of the request, and closing idle connections.
      --web.max-connections=512  Maximum number of simultaneous connections.
      --web.external-url=<URL>   The URL under which Prometheus is externally reachable (for example, if Prometheus is served via a reverse
                                 proxy). Used for generating relative and absolute links back to Prometheus itself. If the URL has a path
                                 portion, it will be used to prefix all HTTP endpoints served by Prometheus. If omitted, relevant URL
                                 components will be derived automatically.
      --web.route-prefix=<path>  Prefix for the internal routes of web endpoints. Defaults to path of --web.external-url.
      --web.user-assets=<path>   Path to static asset directory, available at /user.
      --[no-]web.enable-lifecycle  
                                 Enable shutdown and reload via HTTP request.
      --[no-]web.enable-admin-api  
                                 Enable API endpoints for admin control actions.
      --[no-]web.enable-remote-write-receiver  
                                 Enable API endpoint accepting remote write requests.
      --web.console.templates="consoles"  
                                 Path to the console template directory, available at /consoles.
      --web.console.libraries="console_libraries"  
                                 Path to the console library directory.
      --web.page-title="Prometheus Time Series Collection and Processing Server"  
                                 Document title of Prometheus instance.
      --web.cors.origin=".*"     Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1|domain2)\.com'
      --storage.tsdb.path="data/"  
                                 Base path for metrics storage. Use with server mode only.
      --storage.tsdb.retention=STORAGE.TSDB.RETENTION  
                                 [DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use
                                 "storage.tsdb.retention.time" instead. Use with server mode only.
      --storage.tsdb.retention.time=STORAGE.TSDB.RETENTION.TIME  
                                 How long to retain samples in storage. When this flag is set it overrides "storage.tsdb.retention".
                                 If neither this flag nor "storage.tsdb.retention" nor "storage.tsdb.retention.size" is set, the retention
                                 time defaults to 15d. Units Supported: y, w, d, h, m, s, ms. Use with server mode only.
      --storage.tsdb.retention.size=STORAGE.TSDB.RETENTION.SIZE  
                                 Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB,
                                 TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B. Use with server mode only.
      --[no-]storage.tsdb.no-lockfile  
                                 Do not create lockfile in data directory. Use with server mode only.
      --storage.tsdb.head-chunks-write-queue-size=0  
                                 Size of the queue through which head chunks are written to the disk to be m-mapped, 0 disables the queue
                                 completely. Experimental. Use with server mode only.
      --storage.agent.path="data-agent/"  
                                 Base path for metrics storage. Use with agent mode only.
      --[no-]storage.agent.wal-compression  
                                 Compress the agent WAL. Use with agent mode only.
      --storage.agent.retention.min-time=STORAGE.AGENT.RETENTION.MIN-TIME  
                                 Minimum age samples may be before being considered for deletion when the WAL is truncated Use with agent
                                 mode only.
      --storage.agent.retention.max-time=STORAGE.AGENT.RETENTION.MAX-TIME  
                                 Maximum age samples may be before being forcibly deleted when the WAL is truncated Use with agent mode
                                 only.
      --[no-]storage.agent.no-lockfile  
                                 Do not create lockfile in data directory. Use with agent mode only.
      --storage.remote.flush-deadline=<duration>  
                                 How long to wait flushing sample on shutdown or config reload.
      --storage.remote.read-sample-limit=5e7  
                                 Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no
                                 limit. This limit is ignored for streamed response types. Use with server mode only.
      --storage.remote.read-concurrent-limit=10  
                                 Maximum number of concurrent remote read calls. 0 means no limit. Use with server mode only.
      --storage.remote.read-max-bytes-in-frame=1048576  
                                 Maximum number of bytes in a single frame for streaming remote read response types before marshalling.
                                 Note that client might have limit on frame size as well. 1MB as recommended by protobuf by default.
                                 Use with server mode only.
      --rules.alert.for-outage-tolerance=1h  
                                 Max time to tolerate prometheus outage for restoring "for" state of alert. Use with server mode only.
      --rules.alert.for-grace-period=10m  
                                 Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured
                                 "for" time greater than grace period. Use with server mode only.
      --rules.alert.resend-delay=1m  
                                 Minimum amount of time to wait before resending an alert to Alertmanager. Use with server mode only.
      --rules.max-concurrent-evals=4  
                                 Global concurrency limit for independent rules that can run concurrently. When set, "query.max-concurrency"
                                 may need to be adjusted accordingly. Use with server mode only.
      --alertmanager.notification-queue-capacity=10000  
                                 The capacity of the queue for pending Alertmanager notifications. Use with server mode only.
      --query.lookback-delta=5m  The maximum lookback duration for retrieving metrics during expression evaluations and federation. Use with
                                 server mode only.
      --query.timeout=2m         Maximum time a query may take before being aborted. Use with server mode only.
      --query.max-concurrency=20  
                                 Maximum number of queries executed concurrently. Use with server mode only.
      --query.max-samples=50000000  
                                 Maximum number of samples a single query can load into memory. Note that queries will fail if they try to
                                 load more samples than this into memory, so this also limits the number of samples a query can return.
                                 Use with server mode only.
      --enable-feature= ...      Comma separated feature names to enable. Valid options: agent, auto-gomemlimit, exemplar-storage,
                                 expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions,
                                 remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs,
                                 no-default-scrape-port, native-histograms, otlp-write-receiver, created-timestamp-zero-ingestion,
                                 concurrent-rule-eval. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.
      --log.level=info           Only log messages with the given severity or above. One of: [debug, info, warn, error]
      --log.format=logfmt        Output format of log messages. One of: [logfmt, json]


更新Prometheus配置

修改配置文件prometheus.yml

  • 重启服务方式:重启Prometheus服务即可加载配置文件
  • 热加载方式:通过API发送POST请求,例如 curl -X POST http://192.168.16.200:9090/-/reload
  • 热加载方式需要在Prometheus服务启动时指定--web.enable-lifecycle

验证Prometheus配置

通过promtool工具核查配置。

[root@node200 ~]# /opt/prometheus/promtool check config /opt/prometheus/prometheus.yml 
Checking /opt/prometheus/prometheus.yml
 SUCCESS: /opt/prometheus/prometheus.yml is valid prometheus config file syntax

[root@node200 ~]# 
[root@node200 ~]# /opt/prometheus/promtool -h
usage: promtool [<flags>] <command> [<args> ...]

Tooling for the Prometheus monitoring system.


Flags:
  -h, --[no-]help            Show context-sensitive help (also try --help-long and --help-man).
      --[no-]version         Show application version.
      --[no-]experimental    Enable experimental commands.
      --enable-feature= ...  Comma separated feature names to enable (only PromQL related and no-default-scrape-port). See
                             https://prometheus.io/docs/prometheus/latest/feature_flags/ for the options and more details.

Commands:
help [<command>...]
    Show help.

check service-discovery [<flags>] <config-file> <job>
    Perform service discovery for the given job name and report the results, including relabeling.

check config [<flags>] <config-files>...
    Check if the config files are valid or not.

check web-config <web-config-files>...
    Check if the web config files are valid or not.

check healthy [<flags>]
    Check if the Prometheus server is healthy.

check ready [<flags>]
    Check if the Prometheus server is ready.

check rules [<flags>] [<rule-files>...]
    Check if the rule files are valid or not.

check metrics
    Pass Prometheus metrics over stdin to lint them for consistency and correctness.

    examples:

    $ cat metrics.prom | promtool check metrics

    $ curl -s http://localhost:9090/metrics | promtool check metrics

query instant [<flags>] <server> <expr>
    Run instant query.

query range [<flags>] <server> <expr>
    Run range query.

query series --match=MATCH [<flags>] <server>
    Run series query.

query labels [<flags>] <server> <name>
    Run labels query.

query analyze --server=SERVER --type=TYPE --match=MATCH [<flags>]
    Run queries against your Prometheus to analyze the usage pattern of certain metrics.

debug pprof <server>
    Fetch profiling debug information.

debug metrics <server>
    Fetch metrics debug information.

debug all <server>
    Fetch all debug information.

push metrics [<flags>] <remote-write-url> [<metric-files>...]
    Push metrics to a prometheus remote write (for testing purpose only).

test rules [<flags>] <test-rule-file>...
    Unit tests for rules.

tsdb bench write [<flags>] [<file>]
    Run a write performance benchmark.

tsdb analyze [<flags>] [<db path>] [<block id>]
    Analyze churn, label pair cardinality and compaction efficiency.

tsdb list [<flags>] [<db path>]
    List tsdb blocks.

tsdb dump [<flags>] [<db path>]
    Dump samples from a TSDB.

tsdb dump-openmetrics [<flags>] [<db path>]
    [Experimental] Dump samples from a TSDB into OpenMetrics text format, excluding native histograms and staleness markers, which are not
    representable in OpenMetrics.

tsdb create-blocks-from openmetrics <input file> [<output directory>]
    Import samples from OpenMetrics input and produce TSDB blocks. Please refer to the storage docs for more details.

tsdb create-blocks-from rules --start=START [<flags>] <rule-files>...
    Create blocks of data for new recording rules.

promql format <query>
    Format PromQL query to pretty printed form.

promql label-matchers set [<flags>] <query> <name> <value>
    Set a label matcher in the query.

promql label-matchers delete <query> <name>
    Delete a label from the query.


[root@node200 ~]# 

Docker方式快速安装


docker pull prom/prometheus:v2.53.2

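mkdir -p /etc/prometheus/data
# 注意:下面的 -v /etc/prometheus:/etc/prometheus 挂载会遮盖镜像内置的配置目录,
#       启动容器前需先在宿主机 /etc/prometheus 下准备好 prometheus.yml;
#       容器默认以 nobody 用户运行,需保证 /etc/prometheus/data 对该用户可写
#       (例如 chown 65534:65534 /etc/prometheus/data)。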

docker run -itd --name prometheus \
-p 9090:9090 \
-v /etc/prometheus:/etc/prometheus \
-v /etc/prometheus/data:/prometheus \
prom/prometheus:v2.53.2


常用页面

- 默认页面  http://192.168.16.200:9090/graph
- 告警页面  http://192.168.16.200:9090/alerts
- Targets信息  http://192.168.16.200:9090/targets
- 指标信息Metrics  http://<targets-ip>:9090/metrics
- 运行时与构建信息  http://192.168.16.200:9090/status
- TSDB状态  http://192.168.16.200:9090/tsdb-status
- 命令行参数Flags  http://192.168.16.200:9090/flags
- 配置信息  http://192.168.16.200:9090/config
- 规则信息  http://192.168.16.200:9090/rules
- 服务发现  http://192.168.16.200:9090/service-discovery

posted @ 2018-11-30 23:32  Anliven  阅读(2308)  评论(0)    收藏  举报