[zabbix] 服务和配置 - 迁
配置
1 服务器端配置
1 mysql
mysqladmin -uroot password 'mysql'
2 php
./configure --prefix=/usr/local/php-5.6.15 --with-config-file-path=/usr/local/php-5.6.15/etc --with-bz2 --with-curl --enable-ftp --enable-sockets --disable-ipv6 --with-gd --with-jpeg-dir=/usr/local --with-png-dir=/usr/local --with-freetype-dir=/usr/local --enable-gd-native-ttf --with-iconv-dir=/usr/local --enable-mbstring --enable-calendar --with-gettext --with-libxml-dir=/usr/local --with-zlib --with-pdo-mysql=mysqlnd --with-mysqli=mysqlnd --with-mysql=mysqlnd --enable-dom --enable-xml --enable-fpm --with-libdir=lib64 make && make install
缺少php的gettext 模块
http://blog.csdn.net/u010098331/article/details/50750771
php-fpm的配置
[global] pid = run/php-fpm.pid error_log = log/php-fpm.log [www] user=www group=www listen = 127.0.0.1:9000 pm=dynamic pm.max_children=200 # 配置的太少会把子进程消耗完 pm.start_servers=30 pm.min_spare_servers=30 pm.max_spare_servers=60 pm.status_path = /php_fpm-status # 采集php-fpm的状态 slowlog = log/$pool.log.slow php_admin_flag[log_errors] = on php_admin_value[memory_limit] = 512M # 默认值小,如果zabbix admin页面内容多的话会无响应
3 安装 bcmath
cd ext/bcmath/ /usr/local/php-5.6.15/bin/phpize ./configure --with-php-config=/usr/local/php-5.6.15/bin/php-config make && make install
在php.ini中配置extension_dir ,并把bcmath.so 拷贝过去
extension=/usr/local/php-5.6.15/ext/bcmath.so
cp modules/bcmath.so /usr/local/php-5.6.15/ext/
4 nginx
server { listen 80; server_name love.zabbix; index index.html index.php; root /www/zabbix; location ~ .*\.(php|php5)?$ { fastcgi_pass 127.0.0.1:9000; fastcgi_index index.php; include fastcgi.conf; } location ~ .*\.(gif|jpg|jpeg|png|bmp|swf)$ { expires 30d; } location ~ .*\.(js|css)?$ { expires 1h; } # access_log off; }
5 zabbix server
配置页
http://localhost/setup.php
LogFile=/tmp/zabbix_server.log DebugLevel=3 DBHost=localhost DBName=zabbix DBUser=xxxx DBPassword=xxxx DBSocket=/tmp/mysql.sock DBPort=3306 JavaGateway=127.0.0.1 JavaGatewayPort=10052 StartJavaPollers=5 StartDiscoverers=10 AlertScriptsPath=/etc/zabbix/alertscripts ExternalScripts=/etc/zabbix/externalscripts #外部脚本目录 #FpingLocation=/usr/sbin/fping StartPollers=500 StartPingers=150 StartTrappers=100 TrapperTimeout=100 StartDiscoverers=100 StartPollersUnreachable=100 #告警线程总数 3.4 之后有效 StartAlerters=50 Timeout=30 UnreachablePeriod=50 UnavailableDelay=60 UnreachableDelay=20 LogSlowQueries=0 TmpDir=/tmp StartProxyPollers=10 #在zabbix proxy被动模式下用此参数 #ProxyConfigFrequency=1000 ProxyDataFrequency=1 CacheSize=1024M StartDBSyncers=20 TrendCacheSize=1024M ValueCacheSize=1024M TrendCacheSize=256M ValueCacheSize=1024M HistoryCacheSize=1024M HistoryIndexCacheSize=512M VMwareCacheSize=200M StartVMwareCollectors=100 VMwareFrequency=60 HousekeepingFrequency=3 #zabbix执行Housekeeping的频率,单位为hours MaxHousekeeperDelete=50000 #每次最多删除历史数据的行数
zabbix很容易出现历史表过大的问题.zabbix官方也是建议使用表分区
每日执行脚本
/usr/local/mysql/bin/mysql -uzabbix -pXXXXXX zabbix -e "CALL partition_maintenance_all('zabbix');"
执行日志
Tue Nov 6 17:30:01 CST 2018 call partition_maintenance_all function: ======================================== msg partition_create(zabbix,history,p201812050000,1544025600) table partitions_deleted zabbix.history p201810060000 msg partition_create(zabbix,history_log,p201812050000,1544025600) table partitions_deleted zabbix.history_log p201810060000 msg partition_create(zabbix,history_str,p201812050000,1544025600) table partitions_deleted zabbix.history_str p201810060000 msg partition_create(zabbix,history_text,p201812050000,1544025600) table partitions_deleted zabbix.history_text p201810060000 msg partition_create(zabbix,history_uint,p201812050000,1544025600) table partitions_deleted zabbix.history_uint p201810060000 msg partition_create(zabbix,trends,p201812050000,1544025600) table partitions_deleted zabbix.trends p201805090000 msg partition_create(zabbix,trends_uint,p201812050000,1544025600) table partitions_deleted zabbix.trends_uint p201805090000
调用存储过程
DELIMITER $$
CREATE PROCEDURE `partition_maintenance_all`(SCHEMA_NAME VARCHAR(64))
BEGIN
    /*
     * Run partition maintenance for every partitioned Zabbix table in SCHEMA_NAME.
     *
     * SCHEMA_NAME = The DB schema that holds the Zabbix tables. Declared as
     *               VARCHAR(64) because MySQL identifiers may be up to 64
     *               characters long.
     *
     * Arguments of each partition_maintenance call, in order:
     *   schema, table, days of data to keep, partition width in hours,
     *   number of future partitions to create ahead of time.
     */

    /* History tables: keep 30 days, one partition per 24 hours, 30 partitions ahead. */
    CALL partition_maintenance(SCHEMA_NAME, 'history', 30, 24, 30);
    CALL partition_maintenance(SCHEMA_NAME, 'history_log', 30, 24, 30);
    CALL partition_maintenance(SCHEMA_NAME, 'history_str', 30, 24, 30);
    CALL partition_maintenance(SCHEMA_NAME, 'history_text', 30, 24, 30);
    CALL partition_maintenance(SCHEMA_NAME, 'history_uint', 30, 24, 30);

    /* Trend tables: keep 180 days, one partition per 24 hours, 30 partitions ahead. */
    CALL partition_maintenance(SCHEMA_NAME, 'trends', 180, 24, 30);
    CALL partition_maintenance(SCHEMA_NAME, 'trends_uint', 180, 24, 30);
END$$
DELIMITER ;
分区脚本
DELIMITER $$
CREATE PROCEDURE `partition_create`(SCHEMANAME varchar(64), TABLENAME varchar(64), PARTITIONNAME varchar(64), CLOCK int)
BEGIN
    /*
       SCHEMANAME    = The DB schema in which to make changes
       TABLENAME     = The table to which the partition is added
       PARTITIONNAME = The name of the partition to create
       CLOCK         = Upper bound of the new partition; rows with a value
                       LESS THAN this go into it (the caller passes a unix
                       timestamp)
    */
    DECLARE RETROWS INT;

    /*
       Verify that the partition does not already exist: count the partitions
       whose description (for RANGE partitions, the LESS THAN value) is already
       at or beyond CLOCK.
    */
    SELECT COUNT(1) INTO RETROWS
    FROM information_schema.partitions
    WHERE table_schema = SCHEMANAME
      AND table_name = TABLENAME
      AND partition_description >= CLOCK;

    IF RETROWS = 0 THEN
        /*
           1. Print a message indicating that a partition was created.
           2. Create the SQL to create the partition.
           3. Execute the SQL from #2.
        */
        SELECT CONCAT("partition_create(", SCHEMANAME, ",", TABLENAME, ",", PARTITIONNAME, ",", CLOCK, ")") AS msg;

        SET @sql = CONCAT('ALTER TABLE ', SCHEMANAME, '.', TABLENAME, ' ADD PARTITION (PARTITION ', PARTITIONNAME, ' VALUES LESS THAN (', CLOCK, '));');
        PREPARE STMT FROM @sql;
        EXECUTE STMT;
        DEALLOCATE PREPARE STMT;
    END IF;
END$$
DELIMITER ;

DELIMITER $$
CREATE PROCEDURE `partition_drop`(SCHEMANAME VARCHAR(64), TABLENAME VARCHAR(64), DELETE_BELOW_PARTITION_DATE BIGINT)
BEGIN
    /*
       SCHEMANAME = The DB schema in which to make changes
       TABLENAME = The table with partitions to potentially delete
       DELETE_BELOW_PARTITION_DATE = Delete any partitions whose names, read as
                                     a number, are lower than this one
                                     (yyyymmddHHMM, e.g. 201810060000)
    */
    DECLARE done INT DEFAULT FALSE;
    DECLARE drop_part_name VARCHAR(16);

    /*
       Get a list of all the partitions that are older than the date in
       DELETE_BELOW_PARTITION_DATE. All partitions are prefixed with a "p",
       so use SUBSTRING to get rid of that character.
    */
    DECLARE myCursor CURSOR FOR
        SELECT partition_name
        FROM information_schema.partitions
        WHERE table_schema = SCHEMANAME
          AND table_name = TABLENAME
          AND CAST(SUBSTRING(partition_name FROM 2) AS UNSIGNED) < DELETE_BELOW_PARTITION_DATE;
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

    /*
       Create the basics for when we need to drop the partition. Also, create
       @drop_partitions to hold a comma-delimited list of all partitions that
       should be deleted.
    */
    SET @alter_header = CONCAT("ALTER TABLE ", SCHEMANAME, ".", TABLENAME, " DROP PARTITION ");
    SET @drop_partitions = "";

    /* Start looping through all the partitions that are too old. */
    OPEN myCursor;
    read_loop: LOOP
        FETCH myCursor INTO drop_part_name;
        IF done THEN
            LEAVE read_loop;
        END IF;
        SET @drop_partitions = IF(@drop_partitions = "", drop_part_name, CONCAT(@drop_partitions, ",", drop_part_name));
    END LOOP;
    /* Release the cursor explicitly instead of relying on the implicit close at END. */
    CLOSE myCursor;

    IF @drop_partitions != "" THEN
        /*
           1. Build the SQL to drop all the necessary partitions.
           2. Run the SQL to drop the partitions.
           3. Print out the table partitions that were deleted.
        */
        SET @full_sql = CONCAT(@alter_header, @drop_partitions, ";");
        PREPARE STMT FROM @full_sql;
        EXECUTE STMT;
        DEALLOCATE PREPARE STMT;

        SELECT CONCAT(SCHEMANAME, ".", TABLENAME) AS `table`, @drop_partitions AS `partitions_deleted`;
    ELSE
        /*
           No partitions are being deleted, so print out "N/A" (Not applicable)
           to indicate that no changes were made.
        */
        SELECT CONCAT(SCHEMANAME, ".", TABLENAME) AS `table`, "N/A" AS `partitions_deleted`;
    END IF;
END$$
DELIMITER ;

DELIMITER $$
CREATE PROCEDURE `partition_maintenance`(SCHEMA_NAME VARCHAR(64), TABLE_NAME VARCHAR(64), KEEP_DATA_DAYS INT, HOURLY_INTERVAL INT, CREATE_NEXT_INTERVALS INT)
BEGIN
    /*
       SCHEMA_NAME           = The DB schema in which to make changes
       TABLE_NAME            = The table whose partitions are maintained
       KEEP_DATA_DAYS        = Partitions named for a date older than this many
                               days ago are dropped
       HOURLY_INTERVAL       = Width of one partition, in hours
       CREATE_NEXT_INTERVALS = How many intervals, counted from today 00:00:00,
                               to make sure partitions exist for
    */
    DECLARE OLDER_THAN_PARTITION_DATE VARCHAR(16);
    DECLARE PARTITION_NAME VARCHAR(16);
    DECLARE OLD_PARTITION_NAME VARCHAR(16);
    DECLARE LESS_THAN_TIMESTAMP INT;
    DECLARE CUR_TIME INT;

    /* Make sure the table is partitioned at all before adding partitions to it. */
    CALL partition_verify(SCHEMA_NAME, TABLE_NAME, HOURLY_INTERVAL);

    /* Unix timestamp of today at 00:00:00; every interval is counted from here. */
    SET CUR_TIME = UNIX_TIMESTAMP(DATE_FORMAT(NOW(), '%Y-%m-%d 00:00:00'));

    SET @__interval = 1;
    create_loop: LOOP
        IF @__interval > CREATE_NEXT_INTERVALS THEN
            LEAVE create_loop;
        END IF;

        /* The partition is named after the start of its interval and bounded by its end. */
        SET LESS_THAN_TIMESTAMP = CUR_TIME + (HOURLY_INTERVAL * @__interval * 3600);
        SET PARTITION_NAME = FROM_UNIXTIME(CUR_TIME + HOURLY_INTERVAL * (@__interval - 1) * 3600, 'p%Y%m%d%H00');

        /*
           OLD_PARTITION_NAME is NULL on the first pass, and "x != NULL" is
           NULL (treated as false), which used to skip the first interval
           entirely. Check for NULL explicitly so the first interval is also
           handed to partition_create (which is a no-op when a covering
           partition already exists).
        */
        IF OLD_PARTITION_NAME IS NULL OR PARTITION_NAME != OLD_PARTITION_NAME THEN
            CALL partition_create(SCHEMA_NAME, TABLE_NAME, PARTITION_NAME, LESS_THAN_TIMESTAMP);
        END IF;

        SET @__interval=@__interval+1;
        SET OLD_PARTITION_NAME = PARTITION_NAME;
    END LOOP;

    /* Same yyyymmddHHMM layout as the partition names, minus the "p" prefix. */
    SET OLDER_THAN_PARTITION_DATE=DATE_FORMAT(DATE_SUB(NOW(), INTERVAL KEEP_DATA_DAYS DAY), '%Y%m%d0000');
    CALL partition_drop(SCHEMA_NAME, TABLE_NAME, OLDER_THAN_PARTITION_DATE);
END$$
DELIMITER ;

DELIMITER $$
CREATE PROCEDURE `partition_verify`(SCHEMANAME VARCHAR(64), TABLENAME VARCHAR(64), HOURLYINTERVAL INT(11))
BEGIN
    /*
       SCHEMANAME     = The DB schema in which to make changes
       TABLENAME      = The table to check and, if needed, convert to RANGE
                        partitioning on `clock`
       HOURLYINTERVAL = Width, in hours, of the first partition to create
    */
    DECLARE PARTITION_NAME VARCHAR(16);
    DECLARE RETROWS INT(11);
    DECLARE FUTURE_TIMESTAMP TIMESTAMP;

    /*
     * Check if any partitions exist for the given SCHEMANAME.TABLENAME.
     * A row with a NULL partition_name is how a non-partitioned table shows up.
     */
    SELECT COUNT(1) INTO RETROWS
    FROM information_schema.partitions
    WHERE table_schema = SCHEMANAME
      AND table_name = TABLENAME
      AND partition_name IS NULL;

    /*
     * If partitions do not exist, go ahead and partition the table
     */
    IF RETROWS = 1 THEN
        /*
         * Take the current date at 00:00:00 and add HOURLYINTERVAL to it. This is the timestamp below which we will store values.
         * We begin partitioning based on the beginning of a day. This is because we don't want to generate a random partition
         * that won't necessarily fall in line with the desired partition naming (ie: if the hour interval is 24 hours, we could
         * end up creating a partition now named "p201403270600" when all other partitions will be like "p201403280000").
         */
        SET FUTURE_TIMESTAMP = TIMESTAMPADD(HOUR, HOURLYINTERVAL, CONCAT(CURDATE(), " ", '00:00:00'));
        SET PARTITION_NAME = DATE_FORMAT(CURDATE(), 'p%Y%m%d%H00');

        -- Create the partitioning query
        SET @__PARTITION_SQL = CONCAT("ALTER TABLE ", SCHEMANAME, ".", TABLENAME, " PARTITION BY RANGE(`clock`)");
        SET @__PARTITION_SQL = CONCAT(@__PARTITION_SQL, "(PARTITION ", PARTITION_NAME, " VALUES LESS THAN (", UNIX_TIMESTAMP(FUTURE_TIMESTAMP), "));");

        -- Run the partitioning query
        PREPARE STMT FROM @__PARTITION_SQL;
        EXECUTE STMT;
        DEALLOCATE PREPARE STMT;
    END IF;
END$$
DELIMITER ;
2 代理proxy端
安装依赖
yum install curl* curl-devel libpcre libevent-devel -y
安装参数
./configure --prefix=/usr/local/zabbix_proxy --sysconfdir=/etc/zabbix --enable-proxy --enable-agent --enable-java --with-net-snmp --with-libcurl --with-libxml2 --with-openipmi --with-unixodbc --with-ldap --with-ssh2 --with-mysql=locate && make install
配置
ProxyMode=0 LogFileSize=0 LogSlowQueries=3000 Timeout=4 Server=1.1.1.1 #zabbix服务端IP Hostname=proxy-node #必须和WEB页面添加代理时设置的名称一致 LogFile=/tmp/zabbix_proxy.log #日志文件路径 DBHost=localhost #数据库IP DBName=zabbix #数据库名 DBUser=XXXXX #数据库用户名 DBPassword=XXXX #数据库密码 DBSocket=/tmp/mysql.sock DBPort=3306 DataSenderFrequency=5 #数据同步间隔 ConfigFrequency=60 #配置文件同步间隔 ProxyLocalBuffer=0 #当数据发送到Server,还要在本地保留多少小时.不保留 ProxyOfflineBuffer=3 #当数据没有发送到Server,在本地保留多少小时,3小时。 HeartbeatFrequency=60 #心跳检测代理在Server的可用性 ConfigFrequency=300 #代理多久从Server获取一次配置变化,默认3600秒. DataSenderFrequency=3 #代理收集到数据后,多久向Server发送一次.. CacheSize=512M #用来保存监控数据的缓存数,根据监控主机数量适当调整 StartTrappers=20 # 接受客户端主动提交的trapper进程数量 StartPollers=20 # 以主动方式提取客户端监控数据 StartPingers=10 # ping的启动进程数量 #外部脚本目录 FpingLocation=/usr/sbin/fping
进程负载过高应该调整的参数值
Zabbix busy trapper processes, in % StartTrappers=5 Zabbix busy poller processes, in % StartPollers=5 Zabbix busy ipmi poller processes, in % StartIPMIPollers=0 Zabbix busy discoverer processes, in % StartDiscoverers=1 Zabbix busy icmp pinger processes, in % StartPingers=1 Zabbix busy http poller processes, in % StartHTTPPollers=1 Zabbix busy proxy poller processes, in % StartProxyPollers=1 Zabbix busy unreachable poller processes, in % StartPollersUnreachable=1 Zabbix busy java poller processes, in % StartJavaPollers=0 Zabbix busy snmp trapper processes, in % StartSNMPTrapper=0 Zabbix busy vmware collector processes, in % StartVMwareCollectors=0
3 agent 客户端
安装直接使用已编译完的包,解压即可
配置文件
cat >>/usr/local/zabbix-agent/etc/zabbix_agentd.conf << EOF #version 1 LogFile=/tmp/zabbix_agentd.log Server=yum.ops.net ServerActive=yum.ops.net Include=/usr/local/zabbix-agent/etc/zabbix_agentd.conf.d Timeout=15 Hostname=web.base-1-4 EOF systemctl start zabbix-agent
4 添加监控
http://ywzhou.blog.51cto.com/2785388/1580913
注意 在触发器中配置了
手工添加主机
自定义监控项目
查询即时数据
5 配置检测
服务器端的检测
zabbix_get -s 1-16.goldcoin -p 10050 -k 'jb[currentThreadsBusy]'
正常返回会是一个value值
6 各种监控服务
日志监控
一般先使用自动发现来发现日志文件,然后为发现的日志文件创建日志监控项,最后根据日志监控值配置触发器进行触发操作。
自动发现中的发现规则 pyora-discovery[{$ADDRESS},{$DATABASE},show_alert] 自动发现项中的监控项原型: log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,] 自动发现中的触发器 ({Template Pyora-primary:log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,].regexp("ORA-|Checkpoint not complete|Can not allocate log|halt|abort|panic")}=1) and ({Template Pyora-primary:log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,].regexp("ORA-20001|ORA-00235|DBA-|16037|ORA-609|12537|12547|Tns error struct|ORA-48913|12541|ORA-16037")}=0 and {Template Pyora-primary:log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,].nodata(600)}=0)
A:
({Template Pyora-primary:log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,].regexp("ORA-|Checkpoint not complete|Can not allocate log|halt|abort|panic")}=1)
log中包含了 regexp 的字符串,则表达式为真
B:
({Template Pyora-primary:log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,].regexp("ORA-20001|ORA-00235|DBA-|16037|ORA-609|12537|12547|Tns error struct|ORA-48913|12541|ORA-16037")}=0
log中如果匹配到 regexp中的字符串,则为假,用于过滤一些不需要的告警触发
C:
{Template Pyora-primary:log[{#ALERTLOG},"ORA-|Checkpoint not complete|Can not allocate log|fail|halt|abort|panic",,,skip,].nodata(600)}=0)
表示600秒内有新数据产生则表达式为真;如果600秒内没有新数据,nodata(600) 返回1,表达式为假。为假,则告警恢复了。
A、B、C 都为真时,整个表达式才为真。即日志中出现了 A 中的关键字、且不属于 B 中需要过滤的内容时,触发器被触发;600秒内没有新数据的话,触发器恢复。这样就保证了触发器不会一直处于触发状态。
关于log函数
log[/path/to/some/file,<regexp>,<encoding>,<maxlines>,<mode>,<output>] logrt[/path/to/some/filename_format,<regexp>,<encoding>,<maxlines>,<mode>,<output>]
◆ regexp:要匹配内容的正则表达式,或者直接写你要检索的内容也可以,例如我想检索带ERROR关键词的记录
◆ encoding:编码相关,留空即可
◆ maxlines:一次性最多提交多少行,这个参数覆盖配置文件zabbix_agentd.conf中的'MaxLinesPerSecond',我们也可以留空
◆ mode:默认是all,也可以是skip,skip会跳过老数据
◆ output:输出给zabbix server的数据。可以是\1、\2一直到\9,\1表示正则表达式中第一个分组匹配出的内容,\2表示第二个分组匹配出的内容。
logrt的第一个参数可以使用正则表达式。针对日志回滚用的,例如我们每天都切割nginx日志,日志名为www.a.com_2015-01-01.log、www.a.com_2015-01-02.log等等
注意事项:
1、Zabbix Server和Zabbix Agent会追踪日志文件的大小和最后修改时间,并且分别记录在字节计数器和最新的时间计数器中。
2、Agent会从上次读取日志的地方开始读取日志。
3、字节计数器和最新时间计数器的数据会被记录在Zabbix数据库,并且发送给Agent,这样能够保证Agent从上次停止的地方开始读取日志。
4、当日志文件大小小于字节计数器中的数字时,字节计数器会变为0,从头开始读取文件。
5、所有符合配置的文件,都会被监控。
6、一个目录下的多个文件如果修改时间相同,会按照字母顺序来读取。
7、到每个Update interval的时间时,Agent会检查一次目录下的文件。
8、Zabbix Agent每秒发送日志量,有一个日志行数上限,防止网络和CPU负载过高,这个数字在zabbix_agentd.conf中的MaxLinesPerSecond。
9、在logrt中,正则表达式只对文件名有效,对文件目录无效。
磁盘监控
可以将磁盘监控加到里面
https://github.com/Kevalin/ZabbixScripts
nginx监控
http://www.ttlsa.com/zabbix/zabbix-monitor-nginx-performance/
使用grafana 展示
http://8789878.blog.51cto.com/8779878/1681800
7 vmware的支持
支持vmware-scan需要安装libxml2
要装libxml2 ,anaconda2有个坑,把xml2-config 去掉再编译
向zabbix添加esxi的主机监控, 能否通过proxy采集还要测试
esxi主机监控:
模板 TEMPLATE_-_VMWARE_-_STANDALONE_ESXI_HOST
注意auto discovery的时间默认是3600s,调到60s
1.7.1 自动获取后 主机名都是ip地址,使用/etc/hosts 写 ip + 主机名
1.7.2 告警脚本都需要使用zabbix属组用户
1.7.3 测试告警只能通过调整触发器,而触发器都是公共的,没有服务器自定义的
1.7.4 告警确认之后,就不会再发,不然会连续发3
1.7.5 出现zabbix server负载高的问题 ,调整线程:
Zabbix busy trapper processes, in % StartTrappers=5 Zabbix busy poller processes, in % StartPollers=5 Zabbix busy ipmi poller processes, in % StartIPMIPollers=0 Zabbix busy discoverer processes, in % StartDiscoverers=1 Zabbix busy icmp pinger processes, in % StartPingers=1 Zabbix busy http poller processes, in % StartHTTPPollers=1 Zabbix busy proxy poller processes, in % StartProxyPollers=1 Zabbix busy unreachable poller processes, in % StartPollersUnreachable=1 Zabbix busy java poller processes, in % StartJavaPollers=0 Zabbix busy snmp trapper processes, in % StartSNMPTrapper=0 Zabbix busy vmware collector processes, in % StartVMwareCollectors=0
1.7.6 图形乱码问题
/usr/local/nginx/html/zabbix/include/defines.inc.php
中把FONT改为 (simkai)simkai.ttf