Ceph: LVM gone after restarting the cluster (notes on recovering lost LVM)

After restarting the ceph cluster, the LVM volumes on a few machines were just gone.

The cluster promptly blew up, so step one was to slap on the usual nodown / noout / norecover flags.
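
For reference, setting (and later clearing) those flags is just:

ceph osd set nodown      # don't mark OSDs down
ceph osd set noout       # don't mark OSDs out, so no rebalancing kicks in
ceph osd set norecover   # pause recovery traffic

# once things are healthy again:
ceph osd unset nodown; ceph osd unset noout; ceph osd unset norecover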

vgscan and pvscan turn up nothing, and the corresponding OSDs won't start.

[root@ceph-62 ~]# lsblk
NAME            MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
sda               8:0    0   9.1T  0 disk 
sdb               8:16   0   9.1T  0 disk 
sdc               8:32   0   9.1T  0 disk 
sdd               8:48   0   9.1T  0 disk 
sde               8:64   0   9.1T  0 disk 
sdf               8:80   0   9.1T  0 disk 
sdg               8:96   0   9.1T  0 disk 
sdh               8:112  0   9.1T  0 disk 
sdi               8:128  0   9.1T  0 disk 
sdj               8:144  0   9.1T  0 disk 
sdk               8:160  0   9.1T  0 disk 
sdl               8:176  0   9.1T  0 disk 
sdm               8:192  0   9.1T  0 disk 
sdn               8:208  0   9.1T  0 disk 
sdo               8:224  0   9.1T  0 disk 
sdp               8:240  0   9.1T  0 disk 
sdq              65:0    0   9.1T  0 disk 
sdr              65:16   0   9.1T  0 disk 
sds              65:32   0   9.1T  0 disk 
sdt              65:48   0 223.6G  0 disk 
├─sdt1           65:49   0   200M  0 part /boot/efi
├─sdt2           65:50   0     1G  0 part /boot
└─sdt3           65:51   0 222.4G  0 part 
  ├─centos-root 253:0    0    50G  0 lvm  /
  ├─centos-swap 253:1    0     4G  0 lvm  [SWAP]
  └─centos-home 253:2    0 168.4G  0 lvm  /home
sdu              65:64   0   9.1T  0 disk 

One look and you know it's dead. A healthy node looks like this:

[root@ceph-59 ~]# lsblk
NAME                                                                                                 MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
sda                                                                                                    8:0    0   9.1T  0 disk 
└─ceph--ebc2b2fc--be8e--48af--9729--8149f9a3960e-osd--data--c7ac8edc--4187--46ca--9569--b35faaf1dd9c 253:13   0   9.1T  0 lvm  
sdb                                                                                                    8:16   0   9.1T  0 disk 
└─ceph--3e5b8aba--ec7e--4448--b8c5--ea3e3fe7b4a5-osd--data--11b74eff--5a3c--432e--8ec9--aa776c9d4c64 253:14   0   9.1T  0 lvm  
sdc                                                                                                    8:32   0   9.1T  0 disk 
└─ceph--8800da1a--2764--4370--b8d7--8a0b1332bfc4-osd--data--fd68fe83--b142--4b55--a3d4--aa7b728febec 253:15   0   9.1T  0 lvm  
sdd                                                                                                    8:48   0   9.1T  0 disk 
└─ceph--ebd8ee23--cada--4717--8933--3cc5cdbb9840-osd--data--603de199--c6e3--4a78--a2cf--1cda25bd3d02 253:9    0   9.1T  0 lvm  
sde                                                                                                    8:64   0   9.1T  0 disk 
└─ceph--ce85d728--29a1--4faa--92f4--5288801cf7c0-osd--data--4dfb9363--52bf--498b--9417--9624b99f0a95 253:10   0   9.1T  0 lvm  
sdf                                                                                                    8:80   0   9.1T  0 disk 
└─ceph--9dbd5617--aa78--45cb--b80a--e33944890518-osd--data--c1cd41dd--3042--4949--9551--c081b2fe418d 253:11   0   9.1T  0 lvm  
sdg                                                                                                    8:96   0   9.1T  0 disk 
└─ceph--3ce3bcda--0250--44cd--ac93--50d585ef5ad5-osd--data--101b4271--cf4c--4ec5--aa53--4627d4feb698 253:12   0   9.1T  0 lvm  
sdh                                                                                                    8:112  0   9.1T  0 disk 
└─ceph--86368e74--660f--4772--8844--bf9c29e4e730-osd--data--3bc3cc6c--5850--4c83--a67a--51b024addce6 253:19   0   9.1T  0 lvm  
sdi                                                                                                    8:128  0   9.1T  0 disk 
└─ceph--79778284--af9a--416a--814b--22bb1c732794-osd--data--fcd6b24d--e48e--4315--9bcd--d54fba335815 253:20   0   9.1T  0 lvm  
sdj                                                                                                    8:144  0   9.1T  0 disk 
└─ceph--bf7c58a5--a9e5--4b77--81c3--ee75634754f7-osd--data--8e10bfa5--ce46--46fe--ba2d--ba70a78eb17b 253:21   0   9.1T  0 lvm  

  • First, take a look at /etc/lvm/backup/*

    [root@ceph-59 ~]# ls /etc/lvm/backup/
    centos                                     ceph-3e5b8aba-ec7e-4448-b8c5-ea3e3fe7b4a5  ceph-9dbd5617-aa78-45cb-b80a-e33944890518  ceph-ebd8ee23-cada-4717-8933-3cc5cdbb9840
    ceph-0033060c-5010-4bd5-9859-78ffe5ceff27  ceph-6e4dedf0-130c-4b76-a3be-d2520b69f522  ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a  ceph-f3220c11-eb66-45ba-9525-fb7017c45d4f
    ceph-28354762-18d9-4302-b11c-946832b7dceb  ceph-716552a4-af1b-4ceb-b034-a31195152391  ceph-bf7c58a5-a9e5-4b77-81c3-ee75634754f7  ceph-f95e3fc6-c7ee-48e8-bc70-ab40e995a868
    ceph-2be8cb74-f986-4893-b3f6-784e1b128e01  ceph-79778284-af9a-416a-814b-22bb1c732794  ceph-c7094bd4-d4a4-427c-add7-0eabd83ee1ba
    ceph-36ef94cd-7f9a-4ecc-9d12-4268289d60cd  ceph-86368e74-660f-4772-8844-bf9c29e4e730  ceph-ce85d728-29a1-4faa-92f4-5288801cf7c0
    ceph-3ce3bcda-0250-44cd-ac93-50d585ef5ad5  ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4  ceph-ebc2b2fc-be8e-48af-9729-8149f9a3960e
    
    

    **Even better: look at ls /etc/lvm/archive/**

    because it keeps more metadata versions per VG (vgcfgrestore --list <vg> also lists them; see the command sketch after this list).

    [root@ceph-59 ~]# ls /etc/lvm/archive/*
    /etc/lvm/archive/centos_00000-1556371822.vg                                     /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00002-498819197.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00000-191613742.vg   /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00003-196229818.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00001-1071232553.vg  /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00004-1508357692.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00002-1833047220.vg  /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00005-482948107.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00003-1002373647.vg  /etc/lvm/archive/ceph-8800da1a-2764-4370-b8d7-8a0b1332bfc4_00006-695412169.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00004-882636099.vg   /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00000-1186551457.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00005-645028611.vg   /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00001-1556927714.vg
    /etc/lvm/archive/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27_00006-2143351603.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00002-1272062293.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00000-1313439840.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00003-163927944.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00001-1936313617.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00004-427518662.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00002-1765261441.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00005-544615481.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00003-1532647703.vg  /etc/lvm/archive/ceph-9dbd5617-aa78-45cb-b80a-e33944890518_00006-671781823.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00004-1642162971.vg  /etc/lvm/archive/ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a_00000-2137077032.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00005-66156891.vg    /etc/lvm/archive/ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a_00001-533416413.vg
    /etc/lvm/archive/ceph-28354762-18d9-4302-b11c-946832b7dceb_00006-151645212.vg   /etc/lvm/archive/ceph-ab9e39dd-dde1-41c8-8bc1-11936adbf85a_00002-266310388.vg
    /etc/lvm/archive/ceph-2be8cb74-f986-4893-b3f6-784e1b128e01_00000-1665211260.vg 
    
  • Open any one of them and look at the contents

    [root@ceph-59 ~]# cat /etc/lvm/backup/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27 
    # Generated by LVM2 version 2.02.185(2)-RHEL7 (2019-05-13):
    
    contents = "Text Format Volume Group"
    version = 1
    
    description = "Created *after* executing 'vgchange -ay ceph-0033060c-5010-4bd5-9859-78ffe5ceff27'"
    
    creation_host = "ceph-59"       # Linux ceph-59 3.10.0-1062.el7.x86_64 #1 SMP Wed Aug 7 18:08:02 UTC 2019 x86_64
    creation_time = 1611333863     
    
    ceph-0033060c-5010-4bd5-9859-78ffe5ceff27 {
            id = "WIT8wF-kJdI-SS3E-5ue8-e3Ws-Awk3-AnkDkz"
            seqno = 6
            format = "lvm2"                 # informational
            status = ["RESIZEABLE", "READ", "WRITE"]
            flags = []
            extent_size = 8192              # 4 Megabytes
            max_lv = 0
            max_pv = 0
            metadata_copies = 0
    
            physical_volumes {
    
                    pv0 {
                            id = "EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe"
                            device = "/dev/sdn"     # Hint only
    
                            status = ["ALLOCATABLE"]
                            flags = []
                            dev_size = 19532873728  # 9.0957 Terabytes
                            pe_start = 2048
                            pe_count = 2384383      # 9.0957 Terabytes
                    }
            }
    
            logical_volumes {
    
                    osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d {
                            id = "WtB7QP-qOxT-hV2b-3a2X-zPFS-09aP-HVKbc0"
                            status = ["READ", "WRITE", "VISIBLE"]
                            flags = []
                            tags = ["ceph.osdspec_affinity=", "ceph.vdo=0", "ceph.osd_id=208", "ceph.osd_fsid=c71e53d3-97e8-4f97-a356-1a4bb1187d45", "ceph.cluster_name=ceph", "ceph.cluster_fsid=9f4c8519-821e-434a-9e85-cdef908b808c", "ceph.encrypted=0", "ceph.cephx_lockbox_secret=", "ceph.type=block", "ceph.crush_device_class=None", "ceph.block_device=/dev/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27/osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d", "ceph.block_uuid=WtB7QP-qOxT-hV2b-3a2X-zPFS-09aP-HVKbc0"]
                            creation_time = 1601402387      # 2020-09-30 01:59:47 +0800
                            creation_host = "ceph-59"
                            segment_count = 1
    
                            segment1 {
                                    start_extent = 0
                                    extent_count = 2384383  # 9.0957 Terabytes
    
                                    type = "striped"
                                    stripe_count = 1        # linear
    
                                    stripes = [
                                            "pv0", 0
                                    ]
                            }
                    }
            }
    
    }
    
  • Pick out the bits we'll need in a moment (the manual commands they map to are sketched right after this list)

                    pv0 {
                            id = "EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe"
                            device = "/dev/sdn"     # Hint only
    

    device is the device name (a hint only), id is the PV's UUID

                            tags = ["ceph.osdspec_affinity=", "ceph.vdo=0", "ceph.osd_id=208", "ceph.osd_fsid=c71e53d3-97e8-4f97-a356-1a4bb1187d45", "ceph.cluster_name=ceph", "ceph.cluster_fsid=9f4c8519-821e-434a-9e85-cdef908b808c", "ceph.encrypted=0", "ceph.cephx_lockbox_secret=", "ceph.type=block", "ceph.crush_device_class=None", "ceph.block_device=/dev/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27/osd-data-d9fa7e7d-57f6-437c-9b2b-c9652ec0697d", "ceph.block_uuid=WtB7QP-qOxT-hV2b-3a2X-zPFS-09aP-HVKbc0"]
    
    

    ceph.osd_id is the OSD's id, ceph.osd_fsid is the OSD's fsid
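
Put together, recovering one VG/OSD by hand boils down to the commands below (this is what the script that follows automates; the VG name, PV UUID, device and OSD id/fsid are the ones from the example backup file above, so substitute your own):

# optional: see which archived metadata versions exist for this VG
vgcfgrestore --list ceph-0033060c-5010-4bd5-9859-78ffe5ceff27

# wipe the first few KB of the disk so pvcreate will take it
dd if=/dev/zero of=/dev/sdn bs=1k count=6

# recreate the PV with its old UUID, restoring metadata from the backup file
pvcreate --force --uuid EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe \
    --restorefile /etc/lvm/backup/ceph-0033060c-5010-4bd5-9859-78ffe5ceff27 /dev/sdn

# restore and activate the VG
vgcfgrestore ceph-0033060c-5010-4bd5-9859-78ffe5ceff27
vgchange -ay ceph-0033060c-5010-4bd5-9859-78ffe5ceff27

# mount the OSD dir and start the OSD, using ceph.osd_id / ceph.osd_fsid from the tags
systemctl start ceph-volume@lvm-208-c71e53d3-97e8-4f97-a356-1a4bb1187d45
systemctl start ceph-osd@208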

Below is a quick-and-dirty script I threw together. It can skip VGs that still exist and, from the backup metadata, restore the missing VGs, then mount and start the OSD services so they rejoin the cluster.

import os
import re
import time

# pv0 block in the backup file: PV UUID plus the device hint,
# e.g. id = "EIGfcN-..." followed by device = "/dev/sdn"
rea = r'id = "([\w-]+)"\s*device = "([\w/]+)"'
# LV tags: ceph.osd_id=208", "ceph.osd_fsid=c71e53d3-...
reb = r'ceph\.osd_id=(\d+)", "ceph\.osd_fsid=([\w-]+)"'


def getAllLvmBackFile():
    # all ceph-* metadata backups under /etc/lvm/backup/
    backs = os.listdir('/etc/lvm/backup/')
    flist = []
    for i in backs:
        if 'ceph' in i:
            flist.append(os.path.join('/etc/lvm/backup/', i))
    return flist


def getTargetLvms():
    # keep only the backups whose VG is NOT currently known to LVM
    flist = getAllLvmBackFile()
    cmd_get_old_vgs = '''vgs|grep ceph|awk '{print $1}' >/tmp/oldlvms'''
    os.system(cmd_get_old_vgs)
    with open('/tmp/oldlvms', 'r') as f:
        goodlvms = f.readlines()
    for l in goodlvms:
        l = l.strip()
        for i in flist[:]:          # iterate over a copy, we remove while looping
            if i.find(l) != -1:
                print('skip existing vg ', i)
                flist.remove(i)
                break
    return flist


def fixOneOSD(filename):
    with open(filename, 'r') as f:
        content = f.read()

    # PV uuid and device hint from the backup file
    r = re.search(rea, content)
    uuid = ''
    dev = ''
    if r:
        uuid = r.group(1)
        dev = r.group(2)
        print(uuid, dev)

    # recover the PV and the VG
    vgname = os.path.split(filename)[1]
    cmd_clear_part = 'dd if=/dev/zero of={} bs=1k count=6'.format(dev)
    cmd_create_new_pv = 'pvcreate --force --uuid {} --restorefile {} {}'.format(uuid, filename, dev)
    cmd_restore_vg = 'vgcfgrestore {}'.format(vgname)
    cmd_active_vg = 'vgchange -ay {}'.format(vgname)
    os.system(cmd_clear_part)
    print('create pv')
    os.system(cmd_create_new_pv)
    print('restore vg')
    os.system(cmd_restore_vg)
    print('activate vg')
    os.system(cmd_active_vg)

    # mount the OSD dir and start the OSD
    r = re.search(reb, content)
    osdid = ''
    osdfsid = ''
    if r:
        osdid = r.group(1)
        osdfsid = r.group(2)
        print(osdid, osdfsid)

    cmd_start_mount = 'systemctl start ceph-volume@lvm-{osdid}-{osdfsid};systemctl enable ceph-volume@lvm-{osdid}-{osdfsid}'.format(osdid=osdid, osdfsid=osdfsid)
    cmd_start_osd = 'systemctl start ceph-osd@{osdid};systemctl enable ceph-osd@{osdid}'.format(osdid=osdid)
    print('start ceph-volume mount')
    os.system(cmd_start_mount)
    print('start osd')
    os.system(cmd_start_osd)


def main():
    # batch mode: repair every VG that has a backup but is currently missing
    # badlvms = getTargetLvms()
    # for i in badlvms:
    #     fixOneOSD(i)
    #     time.sleep(3)

    # or repair specific VGs one by one
    fixOneOSD(os.path.join('/etc/lvm/backup/', 'ceph-54ced685-fec0-4725-8df3-e78ad313d223'))
    fixOneOSD(os.path.join('/etc/lvm/backup/', 'ceph-891ebdb2-3ba9-47c4-b2d4-788c4b0c1a2c'))


if __name__ == "__main__":
    main()
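
Run it as root on the broken node and then check that things are back (the filename is just whatever you saved the script as):

python fixlvm.py
lsblk
ceph osd tree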

*** One problem showed up in testing: on some machines a reboot shuffles the device names, e.g. /dev/sda can come back as /dev/sdb. ***

The script above can't cope with that. For now I deduce the device names by hand from the info under /var/lib/ceph/osd/ceph-*/.

If the device name doesn't match, the service won't start and reports errors such as a keyring mismatch; running ceph-osd -i $osdid --flush-journal reports the same error.

The dumbest method is simply to try them one by one: when the guess is right, the service starts and the OSD joins the cluster again.
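
Not part of my original workflow, but stable device symlinks can narrow the guessing before touching anything, and a quick status check tells you whether a guess was right (osd.208 is just the example id from above):

# map the shifting /dev/sdX names to stable serial/WWN based names
lsblk -o NAME,SIZE,SERIAL,WWN
ls -l /dev/disk/by-id/

# after fixing one OSD, check whether the guess was right
systemctl is-active ceph-osd@208
ceph osd tree | grep osd.208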

Even after repeated reboots, as long as the data on the disks is intact, pretty much everything can be recovered. Ceph can take quite a beating.

Once the real device name is confirmed, just change /dev/sdn in the backup file to the correct device:

    pv0 {
            id = "EIGfcN-503D-MVDf-AwK1-SDtA-A4Fw-mvBqpe"
            device = "/dev/sdn"     # Hint only

then point the script at that backup file,

 fixOneOSD(os.path.join('/etc/lvm/backup/','ceph-891ebdb2-3ba9-47c4-b2d4-788c4b0c1a2c'))

and re-run the script. Normally you should reboot the machine before re-running it, otherwise the run can hit problems.

posted @ 2021-01-23 07:06  lvusyy