ceph 010 clustermap ceph調優
- 2022 年 8 月 21 日
- 筆記
- ceph 分散式存儲
cluster map
[ceph: root@clienta /]# ceph mon dump
epoch 4
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
last_changed 2021-10-01T09:33:53.880442+0000
created 2021-10-01T09:30:30.146231+0000
min_mon_release 16 (pacific)
election_strategy: 1
0: [v2:172.25.250.12:3300/0,v1:172.25.250.12:6789/0] mon.serverc.lab.example.com
1: [v2:172.25.250.10:3300/0,v1:172.25.250.10:6789/0] mon.clienta
2: [v2:172.25.250.13:3300/0,v1:172.25.250.13:6789/0] mon.serverd
3: [v2:172.25.250.14:3300/0,v1:172.25.250.14:6789/0] mon.servere
dumped monmap epoch 4 #epoch 是 map 的版本號,數字遞增,方便各節點比對與同步
[ceph: root@clienta /]#
[ceph: root@clienta /]# ceph osd dump
epoch 401
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
created 2021-10-01T09:30:32.028240+0000
modified 2022-08-20T14:56:19.230208+0000
flags sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit
crush_version 77
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
require_min_compat_client luminous
min_compat_client jewel
require_osd_release pacific
stretch_mode_enabled false
pool 1 'device_health_metrics' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 374 flags hashpspool stripe_width 0 pg_num_min 1 application mgr_devicehealth
pool 2 '.rgw.root' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 48 flags hashpspool stripe_width 0 application rgw
pool 3 'default.rgw.log' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 50 flags hashpspool stripe_width 0 application rgw
pool 4 'default.rgw.control' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 52 flags hashpspool stripe_width 0 application rgw
pool 5 'default.rgw.meta' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 8 pgp_num 8 autoscale_mode on last_change 184 lfor 0/184/182 flags hashpspool stripe_width 0 pg_autoscale_bias 4 pg_num_min 8 application rgw
pool 10 'pool1' replicated size 3 min_size 2 crush_rule 1 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 266 flags hashpspool stripe_width 0
pool 11 'ssdpool' replicated size 3 min_size 2 crush_rule 2 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 338 flags hashpspool stripe_width 0
pool 12 'myecpool' erasure profile myprofile1 size 4 min_size 3 crush_rule 3 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 345 flags hashpspool stripe_width 8192
pool 13 'myecpool2' erasure profile myprofile2 size 4 min_size 3 crush_rule 4 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 350 flags hashpspool stripe_width 8192
max_osd 9
osd.0 up in weight 1 up_from 360 up_thru 398 down_at 354 last_clean_interval [243,350) [v2:172.25.250.12:6800/2528022353,v1:172.25.250.12:6801/2528022353] [v2:172.25.249.12:6802/2528022353,v1:172.25.249.12:6803/2528022353] exists,up 5be66be9-8262-4c4b-9654-ed549f6280f7
osd.1 up in weight 1 up_from 359 up_thru 397 down_at 354 last_clean_interval [244,350) [v2:172.25.250.12:6808/3093181835,v1:172.25.250.12:6809/3093181835] [v2:172.25.249.12:6810/3093181835,v1:172.25.249.12:6811/3093181835] exists,up 3f751363-a03c-4b76-af92-8114e38bfa09
osd.2 up in weight 1 up_from 363 up_thru 378 down_at 354 last_clean_interval [242,350) [v2:172.25.250.12:6816/1645468882,v1:172.25.250.12:6817/1645468882] [v2:172.25.249.12:6818/1645468882,v1:172.25.249.12:6819/1645468882] exists,up 68d72b66-4c99-4d54-a7e4-f1cb8f8e5054
osd.3 up in weight 1 up_from 363 up_thru 390 down_at 354 last_clean_interval [236,350) [v2:172.25.250.13:6816/2535000344,v1:172.25.250.13:6817/2535000344] [v2:172.25.249.13:6818/2535000344,v1:172.25.249.13:6819/2535000344] exists,up 21a9ebe9-908d-4026-8a57-8fbee935033e
osd.4 up in weight 1 up_from 354 up_thru 400 down_at 353 last_clean_interval [237,350) [v2:172.25.250.14:6800/408153468,v1:172.25.250.14:6801/408153468] [v2:172.25.249.14:6802/408153468,v1:172.25.249.14:6803/408153468] exists,up 85202210-9298-4443-9140-027792ddc891
osd.5 up in weight 1 up_from 363 up_thru 399 down_at 354 last_clean_interval [235,350) [v2:172.25.250.13:6802/1745131990,v1:172.25.250.13:6803/1745131990] [v2:172.25.249.13:6804/1745131990,v1:172.25.249.13:6805/1745131990] exists,up 252d1668-c4c2-42ca-85fe-87c7419557d6
osd.6 up in weight 1 up_from 353 up_thru 381 down_at 352 last_clean_interval [237,350) [v2:172.25.250.14:6804/1927667266,v1:172.25.250.14:6806/1927667266] [v2:172.25.249.14:6807/1927667266,v1:172.25.249.14:6811/1927667266] exists,up 2d753bfc-32f6-4663-9411-16067f366977
osd.7 up in weight 1 up_from 363 up_thru 378 down_at 354 last_clean_interval [236,350) [v2:172.25.250.13:6800/4217605284,v1:172.25.250.13:6801/4217605284] [v2:172.25.249.13:6806/4217605284,v1:172.25.249.13:6808/4217605284] exists,up fccc62ed-9b04-456a-95c3-5c3cb27e56d4
osd.8 up in weight 1 up_from 357 up_thru 399 down_at 356 last_clean_interval [237,350) [v2:172.25.250.14:6816/3368063169,v1:172.25.250.14:6817/3368063169] [v2:172.25.249.14:6818/3368063169,v1:172.25.249.14:6819/3368063169] exists,up 8b0789f2-f40e-4d63-ac52-343b8e11f24c
blocklist 172.25.250.14:6825/1595923670 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:6824/1595923670 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:0/3491691321 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:0/2738777763 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.12:0/1239900377 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:6825/3912612299 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:6824/3912612299 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/2171541544 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/1139201862 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/2525786376 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.14:0/3949782568 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.12:0/1486113939 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:6825/2537331399 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:0/2290094124 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:6824/2537331399 expires 2022-08-21T08:52:54.506446+0000
[ceph: root@clienta /]#
[ceph: root@clienta /]# ceph pg dump
#忽略輸出,太多了
每個 osd 最多承載 100-200 個 pg(建議值)
[ceph: root@clienta /]# ceph mgr dump | grep "dashboard"
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"dashboard",
"config_dashboard": {
"name": "config_dashboard",
"default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
"name": "dashboard",
"default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
"dashboard": "//172.25.250.14:8443/",
[ceph: root@clienta /]#
Cluster Map基本查詢
ceph mon dump
ceph osd dump
ceph osd crush dump
ceph pg dump all
ceph fs dump
ceph mgr dump
ceph service dump
mon小集群,三節點部署 存放所有map
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# pwd
/var/lib/ceph/2ae6d05a-229a-11ec-925e-52540000fa0c
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# ll
total 292
drwx------. 3 root root 149 Oct 1 2021 alertmanager.serverc
-rw-r--r--. 1 root root 295991 Oct 1 2021 cephadm.d7a73386d1e46cffff151775b8e1d098069c88b89aea56cab15b079c1a1f555f
drwx------. 3 167 167 20 Oct 1 2021 crash
drwx------. 2 167 167 167 Oct 1 2021 crash.serverc
drwx------. 4 472 472 161 Oct 1 2021 grafana.serverc
drwx------. 2 167 167 167 Oct 1 2021 mgr.serverc.lab.example.com.aiqepd
drwx------. 3 167 167 224 Oct 1 2021 mon.serverc.lab.example.com
drwx------. 2 nobody nobody 138 Oct 1 2021 node-exporter.serverc
drwx------. 2 167 167 275 Aug 20 10:54 osd.0
drwx------. 2 167 167 275 Aug 20 10:54 osd.1
drwx------. 2 167 167 275 Aug 20 10:54 osd.2
drwx------. 4 root root 161 Oct 1 2021 prometheus.serverc
drwx------. 2 167 167 167 Oct 29 2021 rgw.realm.zone.serverc.bqwjcv
drwxr-xr-x. 2 root root 6 Oct 1 2021 selinux
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]#
角色相關資訊
mon 節點數量以奇數部署好一些
osd之間會互發消息,確認心跳。某個osd無心跳時,其他osd會彙報給mon
數據恢復:副本丟失情況下,恢復副本的過程
數據回填:當有新的osd加入時 (重平衡)
osd是看使用比率(已用容量佔比)
osd 使用率最大控制在70%左右,再大的話就不好恢復
[ceph: root@clienta /]# ceph osd set noout
noout is set
[ceph: root@clienta /]# ceph osd unset noout
noout is unset
[ceph: root@clienta /]#
nearfull_ratio 0.85 提醒集群容量快滿了 health warn(需要擴容)
backfillfull_ratio 0.9 當osd使用比達到90%,數據禁止回填,但是可以恢復,正常對外提供讀寫
full_ratio 0.95 當osd使用比達到95%,數據禁止寫入,可以讀,可以恢復
[ceph: root@clienta /]# ceph osd set-full-ratio 0.95
osd set-full-ratio 0.95
[ceph: root@clienta /]# ceph osd set-nearfull-ratio 0.85
osd set-nearfull-ratio 0.85
[ceph: root@clienta /]# ceph osd dump
epoch 426
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
created 2021-10-01T09:30:32.028240+0000
modified 2022-08-20T17:56:35.571847+0000
flags sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit
crush_version 82
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
https://docs.ceph.com/en/quincy/?rtd_search=mon_osd_down_out_interval+
可以尋找這些參數
設置osd主親和性(primary affinity,作為主osd的權重)
0就是盡量不讓這個osd擔任主osd(primary),移除時,先改為0
[ceph: root@clienta /]# ceph osd primary-affinity osd.0 0
降低權重
[ceph: root@clienta /]# ceph pg dump pgs_brief
PG_STAT STATE UP UP_PRIMARY ACTING ACTING_PRIMARY
4.8 active+clean [4,3,0] 4 [4,3,0] 4
3.f active+clean [7,4,0] 7 [7,4,0] 7
2.e active+clean [2,4,3] 2 [2,4,3] 2
4.b active+clean [7,0,4] 7 [7,0,4] 7
3.c active+clean [5,0,6] 5 [5,0,6] 5
2.d active+clean [4,3,2] 4 [4,3,2] 4
4.a active+clean [5,1,4] 5 [5,1,4] 5
3.d active+clean [7,6,2] 7 [7,6,2] 7
2.c active+clean [6,0,5] 6 [6,0,5] 6
3.a active+clean [3,1,8] 3 [3,1,8] 3
osd.0不再作為主了
[ceph: root@clienta /]# ceph pg dump pgs_brief | grep "\[6"
dumped pgs_brief
2.c active+clean [6,0,5] 6 [6,0,5] 6
2.a active+clean [6,1,3] 6 [6,1,3] 6
4.3 active+clean [6,7,1] 6 [6,7,1] 6
過濾帶特殊符號的內容時,需要用反斜線轉義(例如上面的 "\[6")
參數
上面的默認值可以去官網查,可能有變化
ceph 調優
ceph對吞吐量要求較高,需要大記憶體,因此numa架構就不太適合
如果你的程式不佔用大記憶體,要求更快的程式運行時間,你應該選擇限制只訪問本numa node的方式來進行處理
Ceph部署最佳實踐
MON的性能對集群總體性能至關重要,應部署於專用節點,為確保正確仲裁,數量應為奇數個
在OSD節點上,作業系統、OSD數據、OSD日誌應當位於獨立的磁碟上,以確保滿意的吞吐量
儘管Ceph具有自愈功能,在集群安裝後,仍需要監控集群、排除故障並維護。如果發生性能問題,首先在磁碟、網路和硬體層面上排查,然後逐步轉向RADOS塊設備和Ceph對象網關
RBD建議
塊設備上的工作負載通常是I/O密集型負載,例如在OpenStack中虛擬機上運行資料庫。
對於RBD,OSD日誌應當位於SSD或者NVMe設備上
對後端存儲,可以使用不同的存儲設備以提供不同級別的服務
OSD建議硬體
將一個raid1磁碟用於作業系統
每個OSD一塊硬碟,將SSD或者NVMe用於日誌
使用多個10Gb網卡,每個網路一個雙鏈路綁定
每個OSD預留1個CPU,每個邏輯核心1GHz
分配16GB記憶體,外加每個OSD 2G記憶體
現在ceph可以自動計算
Ceph PG 計算器(pgcalc):可以去看一下,這個紅帽官網的計算器,還挺有意思
Ceph網路
儘可能使用10Gb網路頻寬
儘可能使用不同的cluster網路和public網路
網路監控
OSD建議硬體
將一個raid1磁碟用於作業系統
每個OSD一塊硬碟,將SSD或者NVMe用於日誌
使用多個10Gb網卡,每個網路一個雙鏈路綁定
每個OSD預留1個CPU,每個邏輯核心1GHz
分配16GB記憶體,外加每個OSD 2G記憶體
其他性能測試工具
dd
echo 3 > /proc/sys/vm/drop_caches
dd if=/dev/zero of=/var/lib/ceph/osd/ceph-0/test.img bs=4M count=1024 oflag=direct
dd if=/var/lib/ceph/osd/ceph-0/test.img of=/dev/null bs=4M count=1024 iflag=direct
fio
https://help.aliyun.com/document_detail/95501.html?spm=a2c4g.11174283.6.640.6e904da23dhdcG
[ceph: root@clienta /]# ceph osd pool create pool1
pool 'pool1' created
[ceph: root@clienta /]# rados bench -p pool1 10 write --no-cleanup
hints = 1
Maintaining 16 concurrent writes of 4194304 bytes to objects of size 4194304 for up to 10 seconds or 0 objects
Object prefix: benchmark_data_clienta.lab.example.com_565
sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
0 0 0 0 0 0 - 0
1 16 16 0 0 0 - 0
2 16 17 1 1.99911 2 1.77034 1.77034
3 16 19 3 3.99849 8 2.89986 2.41562
4 16 20 4 3.99855 4 3.87552 2.78059
5 16 26 10 7.99736 24 4.87003 3.66784
6 16 29 13 8.65267 12 5.8558 3.89705
7 16 36 20 11.4123 28 2.25577 4.20837
8 16 39 23 11.4849 12 3.18817 4.17326
9 16 49 33 14.6481 40 1.93119 3.67961
10 16 54 38 15.1205 20 4.61332 3.71135
11 15 54 39 14.1054 4 4.50752 3.73177
12 14 54 40 13.262 4 3.58412 3.72808
13 11 54 43 13.1608 12 3.9051 3.71667
Total time run: 13.7161
Total writes made: 54
Write size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 15.7479
Stddev Bandwidth: 11.8495
Max bandwidth (MB/sec): 40
Min bandwidth (MB/sec): 0
Average IOPS: 3
Stddev IOPS: 3.00427
Max IOPS: 10
Min IOPS: 0
Average Latency(s): 3.86659
Stddev Latency(s): 1.48435
Max latency(s): 7.45216
Min latency(s): 1.17718
[ceph: root@clienta /]#
[ceph: root@clienta /]# rados bench -p pool1 10 seq
hints = 1
sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
0 0 0 0 0 0 - 0
1 16 24 8 31.8681 32 0.539518 0.460968
2 16 45 29 57.6669 84 0.657187 0.773738
3 5 54 49 65.0267 80 0.595555 0.685997
4 2 54 52 51.6497 12 2.35873 0.808986
Total time run: 4.26827
Total reads made: 54
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 50.606
Average IOPS: 12
Stddev IOPS: 8.90693
Max IOPS: 21
Min IOPS: 3
Average Latency(s): 0.856345
Max latency(s): 3.07995
Min latency(s): 0.0897737
[ceph: root@clienta /]#
[ceph: root@clienta /]# rados bench -p pool1 10 rand
hints = 1
sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
0 0 0 0 0 0 - 0
1 16 26 10 39.8594 40 0.450443 0.523675
2 16 45 29 57.894 76 1.81343 0.569421
3 16 54 38 50.5224 36 2.38602 0.792168
4 16 79 63 62.8348 100 0.0543633 0.813247
5 16 94 78 62.2342 60 2.35538 0.832442
6 16 127 111 73.8291 132 0.141455 0.779658
7 16 158 142 80.881 124 1.5348 0.742651
8 16 188 172 85.4177 120 0.431023 0.71256
9 16 208 192 84.786 80 0.657024 0.690867
10 16 213 197 78.2818 20 0.30201 0.702446
11 11 213 202 72.9987 20 2.83034 0.737541
Total time run: 11.4804
Total reads made: 213
Read size: 4194304
Object size: 4194304
Bandwidth (MB/sec): 74.2134
Average IOPS: 18
Stddev IOPS: 10.4045
Max IOPS: 33
Min IOPS: 5
Average Latency(s): 0.829176
Max latency(s): 3.0047
Min latency(s): 0.0343662
測之前還是得清除快取和對象
[ceph: root@clienta /]# rados -p pool1 cleanup
Removed 54 objects
[root@clienta ~]# sysctl vm.drop_caches=3
我這是虛擬機部署,不是物理機,與物理機比性能高下立判。物理機Bandwidth (MB/sec): 1000 虛擬機Bandwidth (MB/sec): 74.2134
[ceph: root@clienta /]# rbd pool init pool1
[ceph: root@clienta /]# rbd create --size 1G pool1/image1
[ceph: root@clienta /]# rbd info pool1/image1
rbd image 'image1':
size 1 GiB in 256 objects
order 22 (4 MiB objects)
snapshot_count: 0
id: 197ad26b4bdeb
block_name_prefix: rbd_data.197ad26b4bdeb
format: 2
features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
op_features:
flags:
create_timestamp: Sun Aug 21 10:29:30 2022
access_timestamp: Sun Aug 21 10:29:30 2022
modify_timestamp: Sun Aug 21 10:29:30 2022
[ceph: root@clienta /]# rbd bench --io-type write image1 --pool=pool1
bench type write io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
SEC OPS OPS/SEC BYTES/SEC
1 6288 6174.26 24 MiB/s
2 6800 3117.98 12 MiB/s
3 7232 2402.35 9.4 MiB/s
4 7856 1891.83 7.4 MiB/s
5 8336 1666.05 6.5 MiB/s
6 9040 552.049 2.2 MiB/s
7 14160 1514.69 5.9 MiB/s
8 17472 2018.1 7.9 MiB/s
9 23056 3039.35 12 MiB/s
10 26000 3539.12 14 MiB/s
11 28416 3876.7 15 MiB/s
讀
[ceph: root@clienta /]# rbd bench --io-type read image1 --pool=pool1
bench type read io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
SEC OPS OPS/SEC BYTES/SEC
1 400 452.168 1.8 MiB/s
2 816 430.636 1.7 MiB/s
3 1248 431.099 1.7 MiB/s
4 1712 441.599 1.7 MiB/s
5 2144 438.929 1.7 MiB/s
6 2560 429.247 1.7 MiB/s
7 2896 414.255 1.6 MiB/s