检测openstack云平台是否存在脑裂的虚拟机,加入zabbix告警

在OpenStack运维中,虚拟机热迁移、evacuate等操作有时会导致虚拟机脑裂,即同一个虚拟机同时在两个hypervisor上运行。在使用ceph等共享存储时,这十有八九会造成虚拟机文件系统损伤:运气好的情况下能修复文件系统错误,重则数据混乱、虚拟机无法启动。为此,我写了一个Python脚本,用于检测OpenStack的hypervisor(KVM)上是否存在脑裂虚拟机。其原理是通过libvirt的API获取各个hypervisor上的虚拟机名字,比较不同的hypervisor上是否有相同的虚拟机名字。脚本需要在控制节点运行(需要openrc环境变量文件),且该控制节点须与计算节点打通ssh密钥验证,/etc/hosts能解析所有hypervisor的主机名。笔者测试在Liberty和Mitaka版本能正常运行。

2018年10月25日更新:

1. 脚本内容变更如下

import re
import os
import libvirt
import json
from novaclient import client
from multiprocessing import Pool,Queue
from collections import Counter,defaultdict

# Queue used to pass results between processes: getVM() puts (node, vm_name)
# tuples on it and getVMList() drains it.
q=Queue()
# Path of the openrc file that get_nova_creds() parses for the OS_* variables.
EnvFile='/root/openrc'

def get_nova_creds(env_file=None):
    """Load the OpenStack credentials from an openrc file.

    Every line that starts with ``export`` and contains ``=`` is parsed as
    ``export NAME=VALUE``.  Each pair is written into ``os.environ`` (a side
    effect this function has always had), then the ``OS_*`` variables that
    novaclient needs are collected and returned.

    Args:
        env_file: Path of the openrc file.  When omitted, the module-level
            ``EnvFile`` is used.

    Returns:
        dict with the keys ``username``, ``api_key``, ``auth_url``,
        ``project_id`` and ``region_name``.

    Raises:
        IOError: the file cannot be opened.
        KeyError: a required ``OS_*`` variable is defined neither in the file
            nor in the current environment.
    """
    if env_file is None:
        env_file = EnvFile
    prefix = "export"
    with open(env_file, 'r') as f:
        for line in f:
            line = line.rstrip("\r\n")
            if not line.startswith(prefix) or '=' not in line:
                continue
            # Cut off the literal "export" prefix.  str.strip("export") would
            # remove any of the characters e/x/p/o/r/t from BOTH ends and
            # mangle unquoted values such as RegionOne.
            # partition() splits on the first '=' only, so values that
            # themselves contain '=' are kept intact.
            name, _, value = line[len(prefix):].partition('=')
            name = name.strip()
            if not name:
                continue
            # Accept values wrapped in either single or double quotes.
            os.environ[name] = value.strip().strip("'\"")
    return {
        'username': os.environ['OS_USERNAME'],
        'api_key': os.environ['OS_PASSWORD'],
        'auth_url': os.environ['OS_AUTH_URL'],
        'project_id': os.environ['OS_TENANT_NAME'],
        'region_name': os.environ['OS_REGION_NAME'],
    }

def getHypervisor():
    """Return the hostnames of the hypervisors registered in nova.

    Credentials come from get_nova_creds(); the hypervisor list is fetched
    through novaclient (API version '2').  Only hostnames containing
    ``node-<number>.domain.tld`` are kept, and only that matched part of the
    hostname is returned.

    Returns:
        list of str: matched hypervisor hostnames, in the order nova
        returned them.
    """
    # \d+ (not \d) so that node-10 and above are matched as well; the dots
    # are escaped so they only match a literal '.'.
    pattern = re.compile(r'node-\d+\.domain\.tld')
    creds = get_nova_creds()
    nova = client.Client('2', **creds)
    hostnames = []
    for hypervisor in nova.hypervisors.list():
        match = pattern.search(hypervisor.hypervisor_hostname)
        if match:
            hostnames.append(match.group())
    return hostnames

def getVM(node):
    """Queue the names of the domains running on one hypervisor.

    Opens a libvirt connection to *node* over ``qemu+ssh`` and puts a
    ``(node, domain_name)`` tuple on the module-level queue ``q`` for every
    domain whose state is ``VIR_DOMAIN_RUNNING``.

    Args:
        node: hostname of the hypervisor to inspect.

    Returns:
        None.  When the connection cannot be opened, the error is printed
        and nothing is queued for this node.
    """
    try:
        virtcon = libvirt.open("qemu+ssh://%s/system" % node)
    except libvirt.libvirtError as e:
        # Report which compute node's libvirt API could not be reached, then
        # stop: without a connection there is nothing to enumerate.
        print("wrong to connect %s libvirt api" % node + ' ' + str(e))
        return
    try:
        for domain_id in virtcon.listDomainsID():
            try:
                vminfo = virtcon.lookupByID(domain_id)
                # Only running domains are queued.
                if vminfo.state(0)[0] == libvirt.VIR_DOMAIN_RUNNING:
                    q.put((node, vminfo.name()))
            except libvirt.libvirtError:
                # The domain went away between listing and lookup (e.g. it
                # was stopped or migrated); skip it and keep scanning.
                continue
    finally:
        # Always release the connection, even if the enumeration failed.
        virtcon.close()

def getVMList():
    """Drain the module-level queue ``q`` into two lookup structures.

    Returns:
        tuple: ``(names, vms_by_node)`` where ``names`` is the flat list of
        every queued VM name and ``vms_by_node`` is a ``defaultdict(list)``
        mapping each hypervisor hostname to the VM names queued for it.
    """
    vms_by_node = defaultdict(list)
    names = []
    while True:
        if q.empty():
            break
        hostname, vm_name = q.get()
        names.append(vm_name)
        vms_by_node[hostname].append(vm_name)
    return names, vms_by_node

def VMSplitCheck(instancelist, nodedict):
    """Find VM names that are reported by more than one hypervisor.

    Args:
        instancelist: flat list of VM names collected from all hypervisors.
        nodedict: mapping of hypervisor hostname -> list of VM names on it.

    Returns:
        defaultdict(list): for every name that occurs at least twice in
        *instancelist*, the hostnames in *nodedict* whose list contains it.
        Empty when no name is duplicated.
    """
    SplitDict = defaultdict(list)
    # .items() instead of .iteritems() so the code runs on Python 2 and 3.
    duplicated = [name for name, count in Counter(instancelist).items()
                  if count >= 2]
    if not duplicated:
        return SplitDict
    for node, vms in nodedict.items():
        # Build the set once per node so each membership test is O(1).
        on_node = set(vms)
        for name in duplicated:
            if name in on_node:
                SplitDict[name].append(node)
    return SplitDict

def main():
    """Scan every hypervisor in parallel and report split-brain VMs.

    One getVM() task per hypervisor is submitted to a process pool.  After
    all workers have finished, the queued results are collected and checked;
    a line ``found_split_vm:<json>`` is printed only when at least one VM
    was found on more than one hypervisor.
    """
    hosts = getHypervisor()
    pool = Pool()
    for host in hosts:
        pool.apply_async(getVM, args=(host,))
    pool.close()
    pool.join()

    names, vms_by_node = getVMList()
    split_vms = VMSplitCheck(names, vms_by_node)
    if split_vms:
        # Text is emitted only when a split-brain VM was found.
        print("found_split_vm" + ":" + json.dumps(split_vms))

# Run the check only when this file is executed as a script, not on import.
if __name__ =="__main__":
    main()

 

2 zabbix的item修改

item的Type of information改成text类型

3.zabbix触发器修改

触发器修改为:过滤item输出内容中是否包含关键字found_split_vm(即脚本发现脑裂虚拟机时输出的前缀),包含时触发告警。

 

4 测试情况

 

 ############################################此为分割线###############################################

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : VMSplitCheck.py
import re
import os
import sys
import libvirt
from novaclient import client
from multiprocessing import Pool,Queue
from collections import Counter,defaultdict
import pdb
# Queue used to pass results between processes: getVM() puts (node, vm_name)
# tuples on it and getVMList() drains it.
q=Queue()
# Path of the openrc file that get_nova_creds() parses for the OS_* variables.
EnvFile='/root/openrc'

 

 

# Load the nova credentials (username, password, auth URL, tenant, region) from the openrc file

def get_nova_creds(env_file=None):
    """Load the OpenStack credentials from an openrc file.

    Every line that starts with ``export`` and contains ``=`` is parsed as
    ``export NAME=VALUE``.  Each pair is written into ``os.environ`` (a side
    effect this function has always had), then the ``OS_*`` variables that
    novaclient needs are collected and returned.

    Args:
        env_file: Path of the openrc file.  When omitted, the module-level
            ``EnvFile`` is used.

    Returns:
        dict with the keys ``username``, ``api_key``, ``auth_url``,
        ``project_id`` and ``region_name``.

    Raises:
        SystemExit: with exit status 5 when the file cannot be read or a
            required ``OS_*`` variable is missing.  ``error`` is printed on
            stdout as before; the cause is written to stderr.
    """
    if env_file is None:
        env_file = EnvFile
    prefix = "export"
    try:
        with open(env_file, 'r') as f:
            for line in f:
                line = line.rstrip("\r\n")
                if not line.startswith(prefix) or '=' not in line:
                    continue
                # Cut off the literal "export" prefix.  str.strip("export")
                # would remove any of the characters e/x/p/o/r/t from BOTH
                # ends and mangle unquoted values such as RegionOne.
                # partition() splits on the first '=' only, so values that
                # themselves contain '=' are kept intact.
                name, _, value = line[len(prefix):].partition('=')
                name = name.strip()
                if not name:
                    continue
                # Accept values wrapped in either single or double quotes.
                os.environ[name] = value.strip().strip("'\"")
        return {
            'username': os.environ['OS_USERNAME'],
            'api_key': os.environ['OS_PASSWORD'],
            'auth_url': os.environ['OS_AUTH_URL'],
            'project_id': os.environ['OS_TENANT_NAME'],
            'region_name': os.environ['OS_REGION_NAME'],
        }
    except (IOError, OSError, KeyError, ValueError) as err:
        # Named exceptions instead of a bare except, so SystemExit and
        # KeyboardInterrupt are no longer intercepted; the cause goes to
        # stderr so it is not lost.
        sys.stderr.write("failed to load nova credentials from %s: %r\n"
                         % (env_file, err))
        print("error")
        sys.exit(5)

 

#通过调用novaclient获取hypervisor,笔者环境hypervisor主机名都为node-xxx.domain.tld
def getHypervisor():
    """Return the hostnames of the hypervisors registered in nova.

    Credentials come from get_nova_creds(); the hypervisor list is fetched
    through novaclient (API version '2').  Only hostnames containing
    ``node-<number>.domain.tld`` are kept, and only that matched part of the
    hostname is returned.

    Returns:
        list of str: matched hypervisor hostnames, in the order nova
        returned them.
    """
    # \d+ (not \d) so that multi-digit node numbers such as node-10 are
    # matched too; the dots are escaped so they only match a literal '.'.
    pattern = re.compile(r'node-\d+\.domain\.tld')
    creds = get_nova_creds()
    nova = client.Client('2', **creds)
    hostnames = []
    for hypervisor in nova.hypervisors.list():
        match = pattern.search(hypervisor.hypervisor_hostname)
        if match:
            hostnames.append(match.group())
    return hostnames

 

 

#获取各个hypervisor上面的虚拟机的名字,放入队列
def getVM(node):
    """Queue the names of the domains running on one hypervisor.

    Opens a libvirt connection to *node* over ``qemu+ssh`` and puts a
    ``(node, domain_name)`` tuple on the module-level queue ``q`` for every
    domain whose state is ``VIR_DOMAIN_RUNNING``.

    Args:
        node: hostname of the hypervisor to inspect.

    Returns:
        None.  When the connection cannot be opened, a message is written to
        stderr and nothing is queued for this node.
    """
    try:
        virtcon = libvirt.open("qemu+ssh://%s/system" % node)
    except libvirt.libvirtError as e:
        # The exception class lives in the libvirt module; the bare name
        # ``libvirtError`` is not defined here.  The message goes to stderr
        # so stdout carries only the number printed by main().
        sys.stderr.write("wrong to connect %s libvirt api %s\n" % (node, e))
        return
    try:
        for domain_id in virtcon.listDomainsID():
            try:
                vminfo = virtcon.lookupByID(domain_id)
                # Only running domains are queued, so a domain that is
                # listed but not running (e.g. paused) is not counted as a
                # second running copy.
                if vminfo.state(0)[0] == libvirt.VIR_DOMAIN_RUNNING:
                    q.put((node, vminfo.name()))
            except libvirt.libvirtError:
                # The domain went away between listing and lookup; skip it
                # and keep scanning the remaining domains.
                continue
    finally:
        # Always release the connection.
        virtcon.close()

 

 

# 获取所有运行的虚拟机名字的的列表,和以各个hypervisor的主机名为key,上面运行虚拟机为value的字典
def getVMList():
    """Drain the module-level queue ``q`` into two lookup structures.

    Returns:
        tuple: ``(names, vms_by_node)`` where ``names`` is the flat list of
        every queued VM name and ``vms_by_node`` is a ``defaultdict(list)``
        mapping each hypervisor hostname to the VM names queued for it.
    """
    vms_by_node = defaultdict(list)
    names = []
    while True:
        if q.empty():
            break
        hostname, vm_name = q.get()
        names.append(vm_name)
        vms_by_node[hostname].append(vm_name)
    return names, vms_by_node

 

 

# 检测是否有脑裂虚拟机,如果存在,获取此脑裂虚拟机运行在哪些hypervisor上面.
def VMSplitCheck(instancelist, nodedict):
    """Find VM names that are reported by more than one hypervisor.

    Args:
        instancelist: flat list of VM names collected from all hypervisors.
        nodedict: mapping of hypervisor hostname -> list of VM names on it.

    Returns:
        defaultdict(list): for every name that occurs at least twice in
        *instancelist*, the hostnames in *nodedict* whose list contains it.
        Empty when no name is duplicated.
    """
    SplitDict = defaultdict(list)
    # .items() instead of .iteritems() so the code runs on Python 2 and 3.
    duplicated = [name for name, count in Counter(instancelist).items()
                  if count >= 2]
    if not duplicated:
        # Informational message goes to stderr: main() prints the number of
        # split VMs on stdout, and extra text there would break a numeric
        # consumer of that value.
        sys.stderr.write("no split vm\n")
        return SplitDict
    for node, vms in nodedict.items():
        # Build the set once per node so each membership test is O(1).
        on_node = set(vms)
        for name in duplicated:
            if name in on_node:
                SplitDict[name].append(node)
    return SplitDict

 

 

# 主函数
def main():
    """Scan every hypervisor in parallel and print the split-brain VM count.

    One getVM() task per hypervisor is submitted to a process pool.  After
    all workers have finished, the queued results are collected and checked,
    and the number of VMs found on more than one hypervisor is printed.
    """
    hosts = getHypervisor()
    pool = Pool()
    for host in hosts:
        pool.apply_async(getVM, args=(host,))
    pool.close()
    pool.join()

    names, vms_by_node = getVMList()
    split_vms = VMSplitCheck(names, vms_by_node)
    print(len(split_vms))

# Run the check only when this file is executed as a script, not on import.
if __name__ =="__main__":
    main()

 

 

笔者测试如下:

如下图,有2台虚拟机同时在node-4和node-6运行

 

运行脚本后,返回以脑裂虚拟机名字为key,同时运行的hypervisor主机名为value的字典.

 

 

2 增加zabbix报警设置

      zabbix-agent增加item

         zabbix-dashboard操作

  

 

 

测试item能否正常

 

增加一个触发器

表达式:{node-1.domain.tld:vm.split.status.last(0)}>0


 

 

 

参考:

https://www.ibm.com/developerworks/cn/cloud/library/cl-openstack-pythonapis/

http://blog.csdn.net/gzhouc/article/details/52915822

 

posted @ 2017-10-11 21:04  360linux  阅读(295)  评论(0)  编辑  收藏  举报