The Ceph CRUSH map

The CRUSH algorithm in detail

1. What problem does CRUSH solve?

  • In a traditional storage system, a metadata mapping table has to be maintained to record which data block lives at which storage location. Once the cluster grows very large, maintaining that table becomes difficult, and the metadata server easily turns into a performance bottleneck and a single point of failure.

  • The CRUSH algorithm computes the storage location of data directly, with no lookup table:

    • fully decentralized

    • clients talk to OSDs directly

    • excellent scalability

  • It also lets you control exactly which OSDs the data is distributed across, as the example below shows
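
A quick way to see "location by computation" in practice is ceph osd map, which calculates an object's placement on the fly; the object does not even have to exist yet (the pool and object names below are placeholders):

# CRUSH computes the PG and OSD set on demand -- no lookup table is consulted;
# this works even for an object that has never been written
[root@ceph01 ~]# ceph osd map <pool> <object-name>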

2. The CRUSH map's tree hierarchy

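For this demo cluster the hierarchy looks as follows (names and weights taken from the decompiled map in the next section):

root default (weight 0.088)
├── host ceph01 (0.029): osd.0, osd.1, osd.2
├── host ceph02 (0.029): osd.3, osd.4, osd.5
└── host ceph03 (0.029): osd.6, osd.7, osd.8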

3. Retrieving the cluster's CRUSH map

# the output is a compiled (binary) file
[root@ceph01 ~]# ceph osd getcrushmap -o  crushmap.bin
60

# it must be decompiled before it is readable
[root@ceph01 ~]# crushtool -d crushmap.bin -o crushmap.txt


[root@ceph01 ~]# cat crushmap.txt 
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54

# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class hdd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class hdd

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 zone
type 10 region
type 11 root

# buckets
host ceph01 {
	id -3		# do not change unnecessarily
	id -4 class hdd		# do not change unnecessarily
	# weight 0.029
	alg straw2
	hash 0	# rjenkins1
	item osd.0 weight 0.010
	item osd.1 weight 0.010
	item osd.2 weight 0.010
}
host ceph02 {
	id -5		# do not change unnecessarily
	id -6 class hdd		# do not change unnecessarily
	# weight 0.029
	alg straw2
	hash 0	# rjenkins1
	item osd.3 weight 0.010
	item osd.4 weight 0.010
	item osd.5 weight 0.010
}
host ceph03 {
	id -7		# do not change unnecessarily
	id -8 class hdd		# do not change unnecessarily
	# weight 0.029
	alg straw2
	hash 0	# rjenkins1
	item osd.6 weight 0.010
	item osd.7 weight 0.010
	item osd.8 weight 0.010
}
root default {
	id -1		# do not change unnecessarily
	id -2 class hdd		# do not change unnecessarily
	# weight 0.088
	alg straw2
	hash 0	# rjenkins1
	item ceph01 weight 0.029
	item ceph02 weight 0.029
	item ceph03 weight 0.029
}

# rules
rule replicated_rule {
	id 0
	type replicated
	min_size 1
	max_size 10
	step take default
	step chooseleaf firstn 0 type host
	step emit
}
rule erasure-code {
	id 1
	type erasure
	min_size 3
	max_size 4
	step set_chooseleaf_tries 5
	step set_choose_tries 100
	step take default
	step chooseleaf indep 0 type host
	step emit
}
rule test2 {
	id 2
	type erasure
	min_size 3
	max_size 5
	step set_chooseleaf_tries 5
	step set_choose_tries 100
	step take default
	step choose indep 0 type osd
	step emit
}

# end crush map

  • tunables: parameters used mainly to fix old bugs, refine the algorithm, and keep backward compatibility with older versions

  • devices: the individual OSDs that store the data

  • types: the bucket types available for the hierarchy; custom types can be defined

  • buckets: the tree nodes (here the hosts and the root) that group devices and carry their weights

  • rules: the placement policies that decide how data is spread across the hierarchy
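
To apply manual edits, the text can be recompiled and injected back into the cluster. A minimal round trip, reusing the files from above (the --test dry run and its parameters are optional and illustrative):

# recompile the edited text into binary form
[root@ceph01 ~]# crushtool -c crushmap.txt -o crushmap-new.bin

# dry-run the map: simulate placements for inputs 0..9 with 3 replicas
[root@ceph01 ~]# crushtool -i crushmap-new.bin --test --min-x 0 --max-x 9 --num-rep 3 --show-mappings

# inject the new map into the cluster
[root@ceph01 ~]# ceph osd setcrushmap -i crushmap-new.bin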

Experiment

Every device in Ceph can be associated with a class; there are typically three class types:

  • hdd
  • ssd
  • nvme

1. Configure a CRUSH class

  • The goal: store the data only on SSDs, i.e. pin data placement to a device class
# all OSDs carry the hdd class by default
[root@ceph01 ~]# ceph osd crush class ls
[
    "hdd"
]

Check the current OSD layout

[root@ceph01 ~]# ceph osd tree
ID  CLASS  WEIGHT   TYPE NAME        STATUS  REWEIGHT  PRI-AFF
-1         0.08817  root default                              
-3         0.02939      host ceph01                           
 0    hdd  0.00980          osd.0        up   1.00000  1.00000
 1    hdd  0.00980          osd.1        up   1.00000  1.00000
 2    hdd  0.00980          osd.2        up   1.00000  1.00000
-5         0.02939      host ceph02                           
 3    hdd  0.00980          osd.3        up   1.00000  1.00000
 4    hdd  0.00980          osd.4        up   1.00000  1.00000
 5    hdd  0.00980          osd.5        up   1.00000  1.00000
-7         0.02939      host ceph03                           
 6    hdd  0.00980          osd.6        up   1.00000  1.00000
 7    hdd  0.00980          osd.7        up   1.00000  1.00000
 8    hdd  0.00980          osd.8        up   1.00000  1.00000

2. Relabel the disks as ssd

  • This is simply re-tagging the devices
# running these commands is equivalent to editing the crushmap file;
# an OSD's existing class must be removed before a new one can be set

ceph osd crush rm-device-class osd.2
ceph osd crush set-device-class ssd osd.2

ceph osd crush rm-device-class osd.5
ceph osd crush set-device-class ssd osd.5

ceph osd crush rm-device-class osd.8
ceph osd crush set-device-class ssd osd.8

[root@ceph01 ~]# ceph osd tree
ID  CLASS  WEIGHT   TYPE NAME        STATUS  REWEIGHT  PRI-AFF
-1         0.08817  root default                              
-3         0.02939      host ceph01                           
 0    hdd  0.00980          osd.0        up   1.00000  1.00000
 1    hdd  0.00980          osd.1        up   1.00000  1.00000
 2    ssd  0.00980          osd.2        up   1.00000  1.00000
-5         0.02939      host ceph02                           
 3    hdd  0.00980          osd.3        up   1.00000  1.00000
 4    hdd  0.00980          osd.4        up   1.00000  1.00000
 5    ssd  0.00980          osd.5        up   1.00000  1.00000
-7         0.02939      host ceph03                           
 6    hdd  0.00980          osd.6        up   1.00000  1.00000
 7    hdd  0.00980          osd.7        up   1.00000  1.00000
 8    ssd  0.00980          osd.8        up   1.00000  1.00000
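
The class list should now show ssd alongside hdd:

[root@ceph01 ~]# ceph osd crush class ls
[
    "hdd",
    "ssd"
]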

3. Create a class-based rule for ssd

[root@ceph01 ~]# ceph osd crush rule create-replicated ssd_rule default host ssd

[root@ceph01 ~]# ceph osd crush rule ls 
replicated_rule
erasure-code
test2
ssd_rule

  • Decompile the crushmap again and the newly added rule shows up
# the devices section reflects the new classes as well

# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class ssd
device 3 osd.3 class hdd
device 4 osd.4 class hdd
device 5 osd.5 class ssd
device 6 osd.6 class hdd
device 7 osd.7 class hdd
device 8 osd.8 class ssd


rule ssd_rule {
        id 3  # the rule was assigned id 3
        type replicated
        min_size 1
        max_size 10
        step take default class ssd  # restricted to the ssd class
        step chooseleaf firstn 0 type host
        step emit
}
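
By the same logic, a rule pinned to the remaining hdd devices could be created as well (the name hdd_rule is illustrative):

# replicated rule: root=default, failure domain=host, device class=hdd
[root@ceph01 ~]# ceph osd crush rule create-replicated hdd_rule default host hdd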

4. Create a pool based on ssd_rule

  • Create a storage pool governed by the ssd_rule placement rule
[root@ceph01 ~]# ceph osd pool create ssd_pool replicated ssd_rule
pool 'ssd_pool' created

# the pool details (e.g. ceph osd pool ls detail) show that ssd_pool uses crush_rule 3
pool 24 'ssd_pool' replicated size 3 min_size 2 crush_rule 3 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 467 flags hashpspool stripe_width 0
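
An existing pool can also be switched to the rule after the fact; a sketch, assuming a pool named mypool already exists (Ceph will then migrate its data onto the ssd OSDs):

[root@ceph01 ~]# ceph osd pool set mypool crush_rule ssd_rule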

5. Test the ssd pool

# upload an object

[root@ceph01 ~]# rados -p ssd_pool put hosts /etc/hosts
[root@ceph01 ~]# rados -p ssd_pool ls
hosts

# check the PG-to-OSD mapping: all replicas landed on the ssd OSDs (osd.2, osd.5, osd.8)
[root@ceph01 ~]# ceph osd map ssd_pool hosts
osdmap e467 pool 'ssd_pool' (24) object 'hosts' -> pg 24.ea1b298e (24.e) -> up ([2,5,8], p2) acting ([2,5,8], p2)
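
To exercise the pool with more than one object, a short benchmark write can be used; a sketch (the 10-second duration and --no-cleanup flag are illustrative):

# write benchmark objects into the pool for 10 seconds and keep them
[root@ceph01 ~]# rados bench -p ssd_pool 10 write --no-cleanup

# then spot-check any of the new objects with ceph osd map as above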

6. How do you delete a rule?

  • A rule that is still referenced by a pool cannot be removed, so delete the pool first
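
Pool deletion itself is gated by a monitor safety flag; if the rm command below is refused, it can be enabled first (a cluster-wide setting, shown via the config subsystem):

# allow pool deletion (disabled by default as a safeguard)
[root@ceph01 ~]# ceph config set mon mon_allow_pool_delete true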

[root@ceph01 ~]# ceph osd pool rm ssd_pool ssd_pool  --yes-i-really-really-mean-it
pool 'ssd_pool' removed
[root@ceph01 ~]# ceph osd crush rule ls
replicated_rule
erasure-code
test2
ssd_rule
[root@ceph01 ~]# ceph osd crush rule rm ssd_rule
[root@ceph01 ~]# ceph osd crush rule ls
replicated_rule
erasure-code
test2
