dell服务器硬件监控-hwcheck.py
#!/usr/bin/env python
"""A wrapper script with srvadmin and other tools for hardware monitor.

Supported metrics:
    cpu memory raidcard pdisk vdisk raidcard_bat
    bios cmos_bat fan power board_temp cpu_temp
"""

import json
import os
import re
import socket
import subprocess
import sys
import time
from optparse import OptionParser

try:
    import urllib2  # Python 2
except ImportError:  # Python 3
    import urllib.request as urllib2

# Endpoint of the local agent that receives pushed metrics.
PUSH_URL = 'http://127.0.0.1:1988/v1/push'
# Seconds to wait for the agent before giving up on a push.
PUSH_TIMEOUT = 10

# CPU core temperature thresholds used by check_cpu_temp().
CPU_TEMP_WARN = 80
CPU_TEMP_CRIT = 90

# Status strings mapped to the numeric level that is pushed:
# 0 = critical, 1 = warning, 2 = ok.
_STATE_LEVELS = {
    'crit': 0, 'critical': 0,
    'warn': 1, 'warning': 1, 'non-critical': 1,
    'ok': 2, 'ready': 2,
}

# Leading signed decimal number of a token such as "+45.0<degree>C".
_NUMBER_RE = re.compile(r'[-+]?\d+(?:\.\d+)?')


def _detect_ip():
    """Return the first 'inet addr' reported by ifconfig, or '' if none."""
    cmd = (r"/sbin/ifconfig | awk -F [:\ ]++ '/inet addr/ { print $4}'"
           r"| head -1")
    try:
        pipe = os.popen(cmd)
        try:
            return pipe.read().strip()
        finally:
            pipe.close()
    except (OSError, IOError):
        return ''


host = socket.gethostname()
ip = _detect_ip()

# Aggregated per-metric messages (what gets pushed to the agent).
messages = []
# Detailed per-component records (what gets printed by default).
verbs = []


def addverb(metric, model, index, status, info):
    """Append one detailed per-component record to ``verbs``."""
    verbs.append({
        'metric': metric,
        'model': model,
        'index': index,
        'status': status,
        'info': info,
    })


def addmsg(metric, value):
    """Append one aggregated ``hw.<metric>`` message to ``messages``."""
    messages.append({
        'metric': 'hw.%s' % metric,
        'tags': 'host=' + host + ',ip=' + ip,
        'value': value,
        'timestamp': int(time.time()),
    })


def map_value(state):
    """Map a status string to 0 (critical), 1 (warning) or 2 (ok).

    Matching is case-insensitive and ignores surrounding whitespace.
    Returns None when the status is not recognised.
    """
    return _STATE_LEVELS.get(state.strip().lower())


def _worst(current, status):
    """Return the lower (worse) of ``current`` and the level of ``status``.

    Unrecognised statuses are ignored so the aggregate always stays numeric;
    the raw status is still visible in the per-component ``verbs`` record.
    """
    level = map_value(status)
    if level is not None and level < current:
        return level
    return current


def execute(cmd):
    """Run ``cmd`` through the shell and return ``(stdout, stderr)``.

    stderr is not captured, so the second element is always None.  On
    Python 3 the captured bytes are decoded to text so callers can use str
    operations on either interpreter.
    """
    # shell=True is kept because callers pass complete command strings.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    out, err = p.communicate()
    if isinstance(out, bytes) and not isinstance(out, str):
        out = out.decode('utf-8', 'replace')
    return out, err


def _rows(cmd, keywords, min_fields):
    """Run an ``-fmt ssv`` command and return its matching data rows.

    A line is kept when it contains any of ``keywords``.  Each kept line is
    split on ';' into stripped fields; rows with fewer than ``min_fields``
    fields are dropped so callers can index without IndexError.
    """
    stdout, _ = execute(cmd)
    rows = []
    for line in (stdout or '').splitlines():
        if not any(keyword in line for keyword in keywords):
            continue
        fields = [field.strip() for field in line.split(';')]
        if len(fields) >= min_fields:
            rows.append(fields)
    return rows


def _leading_number(text):
    """Return the first whitespace-separated token of ``text`` as a float.

    Returns None when there is no token or it is not numeric.
    """
    tokens = text.split()
    if not tokens:
        return None
    try:
        return float(tokens[0])
    except ValueError:
        return None


# cpu
def check_cpu():
    """Report the worst processor status as ``hw.cpu``."""
    value = 2
    # Columns: 0 Index, 1 Status, 2 Connector Name, 3 Processor Brand,
    # 4 Processor Version, 5 Current Speed, 6 State, 7 Core Count.
    for f in _rows('omreport chassis processors -fmt ssv', ('CPU',), 7):
        status = f[1].lower()
        connector = f[2].lower()
        brand = f[3].lower().split()
        state = f[6].lower()
        # The model is the 4th word of the brand string; fall back to the
        # whole brand when it is shorter than that.
        model = brand[3] if len(brand) > 3 else ' '.join(brand)
        value = _worst(value, status)
        addverb('cpu', model, connector, status, state)
    addmsg('cpu', value)


# memory
def check_memory():
    """Report the worst DIMM status as ``hw.memory``."""
    value = 2
    # Columns: 0 Index, 1 Status, 2 Connector Name, 3 Type, 4 Size.
    for f in _rows('omreport chassis memory -fmt ssv', ('DIMM',), 5):
        status = f[1].lower()
        if status == 'unknown':
            continue
        connector = f[2].lower()
        value = _worst(value, status)
        addverb('memory', f[3].lower(), connector, status, f[4])
    addmsg('memory', value)


# disk raidcard
def check_raidcard():
    """Report the worst RAID controller status as ``hw.raidcard``.

    Returns the list of controller IDs, or None when no controller row was
    found (in which case nothing is reported).
    """
    # Columns used: 0 ID, 1 Status, 2 Name, 4 State.
    rows = _rows('omreport storage controller -fmt ssv', ('Applicable',), 5)
    if not rows:
        return None
    ids = []
    value = 2
    for f in rows:
        ctrl_id, status, name, state = f[0], f[1], f[2], f[4]
        value = _worst(value, status)
        ids.append(ctrl_id)
        addverb('raidcard', name, ctrl_id, status, state)
    addmsg('raidcard', value)
    return ids


# pdisk
def check_pdisk(ctrlers=(0,)):
    """Report the worst physical disk status as ``hw.pdisk``.

    ``ctrlers`` is an iterable of controller IDs to query; when it is empty
    or None nothing is checked or reported.
    """
    if not ctrlers:
        return
    value = 2
    for cid in ctrlers:
        cmd = 'omreport storage pdisk controller=%s -fmt ssv' % cid
        # Columns used: 0 ID, 1 Status, 3 State, 5 Bus Protocol, 6 Media,
        # 17 Progress, 19 Capacity, 23 Vendor ID, 24 Product ID,
        # 25 Serial No.
        for f in _rows(cmd, ('HDD', 'SSD'), 26):
            disk_id, status = f[0], f[1]
            info = {
                'Bus_Protocol': f[5],
                'Media': f[6],
                'Capacity': f[19],
                'State': f[3],
                'Vendor_ID': f[23],
                'Serial_No': f[25],
            }
            if f[17] != 'Not Applicable':
                info['Progress'] = f[17]
            value = _worst(value, status)
            addverb('pdisk', f[24], disk_id, status, info)
    addmsg('pdisk', value)


# vdisk
def check_vdisk(ctrlers=(0,)):
    """Report the worst virtual disk status as ``hw.vdisk``.

    ``ctrlers`` is an iterable of controller IDs to query; when it is empty
    or None nothing is checked or reported.
    """
    if not ctrlers:
        return
    value = 2
    for cid in ctrlers:
        cmd = 'omreport storage vdisk controller=%s -fmt ssv' % cid
        for f in _rows(cmd, ('HDD', 'SSD'), 18):
            # Two layouts are handled: 19 columns (with "Virtual Disk Bad
            # Blocks" at index 5) and 18 columns (without it).  Rows with
            # any other width are skipped because the column positions are
            # unknown.
            if len(f) == 19:
                shift = 1
            elif len(f) == 18:
                shift = 0
            else:
                continue
            disk_id, status, state = f[0], f[1], f[3]
            layout = f[6 + shift]
            info = {
                'Bus_Protocol': f[11 + shift],
                'Media': f[12 + shift],
                'Device_Name': f[10 + shift],
                'Size': f[7 + shift],
                'State': state,
            }
            if shift:
                info['Virtual_Disk_Bad_Blocks'] = f[5]
            value = _worst(value, status)
            addverb('vdisk', layout, disk_id, status, info)
    addmsg('vdisk', value)


# raidcard battery
def check_raidcard_bat():
    """Report the worst RAID battery status as ``hw.raidcard_bat``."""
    # Columns used: 0 ID, 1 Status, 2 Name, 6 Learn State.
    rows = _rows('omreport storage battery -fmt ssv', ('Battery',), 7)
    if not rows:
        return
    value = 2
    for f in rows:
        value = _worst(value, f[1])
        addverb('raidcard_bat', f[2], f[0], f[1], f[6])
    addmsg('raidcard_bat', value)


# bios
def check_bios():
    """Report ``hw.bios``: warn when C State or C1E is enabled in BIOS."""
    rows = _rows('omreport chassis biossetup -fmt ssv',
                 ('C State', 'C1-E', 'C1E'), 2)
    if not rows:
        return
    value = 2
    for f in rows:
        attribute = f[0].lower()
        setting = f[1]
        if setting == 'Enabled':
            status = 'warn'
        elif setting == 'Disabled':
            status = 'ok'
        else:
            continue
        value = _worst(value, status)
        addverb('bios', "bios_setting", attribute, status, setting)
    addmsg('bios', value)


# cmos battery
def check_cmos_bat():
    """Report the worst CMOS battery status as ``hw.cmos_bat``."""
    # Columns: 0 Index, 1 Status, 2 Probe Name, 3 Reading.
    rows = _rows('omreport chassis batteries -fmt ssv', ('CMOS',), 4)
    if not rows:
        return
    value = 2
    for f in rows:
        value = _worst(value, f[1])
        addverb('cmos_bat', f[2], f[0], f[1], f[3])
    addmsg('cmos_bat', value)


# fan
def check_fan():
    """Report the worst fan probe status as ``hw.fan``."""
    # Columns used: 0 Index, 1 Status, 2 Probe Name, 3 Reading.
    rows = _rows('omreport chassis fans -fmt ssv', ('RPM',), 4)
    if not rows:
        return
    value = 2
    for f in rows:
        value = _worst(value, f[1])
        addverb('fan', f[2], f[0], f[1], f[3])
    addmsg('fan', value)


# power
def check_power():
    """Report the highest system-board power reading as ``hw.power``.

    The pushed value is the numeric reading itself, not a 0/1/2 level.
    """
    # Columns used: 0 Index, 1 Status, 2 Probe Name, 3 Reading.
    rows = _rows('omreport chassis pwrmonitoring -fmt ssv',
                 ('System Board',), 4)
    if not rows:
        return
    value = 2
    for f in rows:
        watts = _leading_number(f[3])
        # Compare numerically; non-numeric readings are recorded in verbs
        # but do not take part in the maximum.
        if watts is not None and watts > value:
            value = watts
        addverb('power', f[2], f[0], f[1], f[3])
    addmsg('power', value)


# board temp
def check_board_temp():
    """Report the highest board temperature reading as ``hw.board_temp``.

    The pushed value is the numeric reading itself, not a 0/1/2 level.
    """
    # Columns used: 0 Index, 1 Status, 2 Probe Name, 3 Reading.
    rows = _rows('omreport chassis temps -fmt ssv', ('Board',), 4)
    if not rows:
        return
    value = 2
    for f in rows:
        tokens = f[3].split()
        reading = tokens[0] if tokens else ''
        degrees = _leading_number(f[3])
        if degrees is not None and degrees > value:
            value = degrees
        addverb('board_temp', f[2], f[0], f[1], reading)
    addmsg('board_temp', value)


def _hottest_cores(text):
    """Parse ``sensors`` output into one record per coretemp chip.

    Each record is ``{'id': chip line, 'core': label, 'reading': float}``
    describing the hottest "Core N" line of that chip.  Chips without any
    parsable core reading are omitted.
    """
    chips = []
    current = None
    for line in text.splitlines():
        if line.startswith('coretemp'):
            current = {'id': line}
            chips.append(current)
        elif line.startswith('Core') and current is not None:
            label, _, rest = line.partition(':')
            tokens = rest.split()
            match = _NUMBER_RE.match(tokens[0]) if tokens else None
            if match is None:
                continue
            reading = float(match.group())
            if 'reading' not in current or reading > current['reading']:
                current['core'] = label
                current['reading'] = reading
        elif not line.strip():
            # A blank line ends the current chip section.
            current = None
    return [chip for chip in chips if 'reading' in chip]


# cpu temp
def check_cpu_temp():
    """Report the highest CPU core temperature as ``hw.cpu_temp``.

    Nothing is reported when ``sensors`` yields no coretemp readings.
    """
    stdout, _ = execute('sensors')
    chips = _hottest_cores(stdout or '')
    if not chips:
        return
    value = 2
    for position, chip in enumerate(chips):
        reading = chip['reading']
        if reading >= CPU_TEMP_CRIT:
            status = 'crit'
        elif reading >= CPU_TEMP_WARN:
            status = 'warn'
        else:
            status = 'ok'
        if reading > value:
            value = reading
        addverb('cpu_temp', chip['id'], '%d' % position, status, reading)
    addmsg('cpu_temp', value)


def check(target=False):
    """Run every check, or only the one named by ``target``.

    Returns the module-level ``messages`` list.  An unrecognised ``target``
    runs nothing.
    """
    if not target:
        check_cpu()
        check_memory()
        ctrlers = check_raidcard()
        check_pdisk(ctrlers=ctrlers)
        check_vdisk(ctrlers=ctrlers)
        check_raidcard_bat()
        check_cmos_bat()
        check_bios()
        check_fan()
        check_power()
        check_board_temp()
        check_cpu_temp()
        return messages

    # Disk checks need the controller IDs discovered by check_raidcard().
    if target == 'pdisk':
        check_pdisk(check_raidcard())
        return messages
    if target == 'vdisk':
        check_vdisk(check_raidcard())
        return messages

    simple_checks = {
        'cpu': check_cpu,
        'memory': check_memory,
        'raidcard': check_raidcard,
        'raidcard_bat': check_raidcard_bat,
        'cmos_bat': check_cmos_bat,
        'bios': check_bios,
        'fan': check_fan,
        'power': check_power,
        'board_temp': check_board_temp,
        'cpu_temp': check_cpu_temp,
    }
    handler = simple_checks.get(target)
    if handler is not None:
        handler()
    return messages


def push(message):
    """POST ``message`` as JSON to the local agent (best effort).

    Failures are reported on stderr and otherwise ignored so that a missing
    agent never makes the check itself fail.
    """
    payload = json.dumps(message).encode('utf-8')
    try:
        response = urllib2.urlopen(PUSH_URL, data=payload,
                                   timeout=PUSH_TIMEOUT)
    except Exception as exc:  # best effort: report, never crash
        sys.stderr.write('hwcheck: push to %s failed: %s\n' % (PUSH_URL, exc))
        return
    response.close()


metrics = ['cpu', 'memory', 'raidcard', 'pdisk', 'vdisk', 'raidcard_bat',
           'bios', 'cmos_bat', 'fan', 'power', 'board_temp', 'cpu_temp']


def main():
    """Parse the command line, run the checks and print or push results."""
    parser = OptionParser()
    parser.add_option("-p", "--push", action="store_true",
                      dest="push", help="push result to agent")
    parser.add_option("-d", "--debug", action="store_true",
                      dest="debug", help="output debug info")
    parser.add_option("-m", "--metric", action="store",
                      dest="metric", help="check special metric")
    (options, args) = parser.parse_args()

    metric = None
    if options.metric:
        metric = options.metric
        if metric not in metrics:
            print(__doc__)
            parser.print_help()
            sys.exit(1)

    check(target=metric)
    if options.push:
        push(messages)
    elif options.debug:
        print(json.dumps(messages, indent=2))
    else:
        print(json.dumps(verbs, indent=2))


if __name__ == '__main__':
    main()
如何安装
配置dell官方repo,安装srvadmin等依赖包
# 参考: http://linux.dell.com/repo/hardware/latest/
wget -q -O - http://linux.dell.com/repo/hardware/latest/bootstrap.cgi | bash
yum install srvadmin-omacore srvadmin-omcommon srvadmin-storage-cli smbios-utils-bin lm_sensors dmidecode cronie
# 启动srvadmin服务
/opt/dell/srvadmin/sbin/srvadmin-services.sh enable
/opt/dell/srvadmin/sbin/srvadmin-services.sh restart
# 配置lm-sensors
echo yes | /usr/sbin/sensors-detect
如何使用
参数说明
直接执行hwcheck.py不带参数默认会打印出详细的监控数据
hwcheck.py
  -d    # 打印metrics信息,即是push到munin-agent的数据
  -p    # push数据到munin-agent
  -m    # 指定单个metric
Supported metrics:
cpu memory raidcard pdisk vdisk raidcard_bat
bios cmos_bat fan power board_temp cpu_temp
配置报警策略
hwcheck.py push到munin-agent的metric均以 hw. 打头,如hw.cpu_temp。除温度(hw.board_temp、hw.cpu_temp)和功率(hw.power)上报的是实际读数外,
其他metric的value中 0表示故障,1表示警告,2表示OK,例如在策略模板中配置如下策略:
| metric/tags/note | 触发条件 | 最大报警次数 | 报警级别 |
|---|---|---|---|
| hw.bios [BIOS中C1E/Cstate未禁用] | all(#2)<2 | 1 | 4 |
| hw.board_temp [主板温度过高] | all(#3)>=35 | 1 | 4 |
| hw.cmos_bat [主板电池有问题] | all(#3)<2 | 1 | 4 |
| hw.cpu [CPU可能故障] | all(#2)==1 | 1 | 4 |
| hw.cpu [严重: CPU严重故障] | all(#2)==0 | 2 | 0 |
| hw.fan [风扇出现故障] | all(#3)<2 | 1 | 4 |
| hw.memory [内存可能故障] | all(#1)==1 | 1 | 4 |
| hw.memory [严重: 内存严重故障] | all(#1)==0 | 2 | 0 |
| hw.pdisk [严重: 物理盘严重故障] | all(#1)==0 | 2 | 0 |
| hw.raidcard [阵列卡出现警告] | all(#2)==1 | 1 | 4 |
| hw.raidcard [严重: 阵列卡严重故障] | all(#1)==0 | 2 | 0 |
| hw.raidcard_bat [阵列卡电池出现警告] | all(#2)==1 | 1 | 4 |
| hw.raidcard_bat [严重: 阵列卡电池严重故障] | all(#2)==0 | 2 | 0 |
| hw.vdisk [磁盘阵列出现警告] | all(#2)==1 | 1 | 4 |
| hw.vdisk [严重: 磁盘阵列严重故障] | all(#2)==0 | 2 | 0 |
学习提高自己,能力证明自己,技能创造价值

浙公网安备 33010602011771号