ceph 010 clustermap ceph調優

cluster map

[ceph: root@clienta /]# ceph mon dump
epoch 4
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
last_changed 2021-10-01T09:33:53.880442+0000
created 2021-10-01T09:30:30.146231+0000
min_mon_release 16 (pacific)
election_strategy: 1
0: [v2:172.25.250.12:3300/0,v1:172.25.250.12:6789/0] mon.serverc.lab.example.com
1: [v2:172.25.250.10:3300/0,v1:172.25.250.10:6789/0] mon.clienta
2: [v2:172.25.250.13:3300/0,v1:172.25.250.13:6789/0] mon.serverd
3: [v2:172.25.250.14:3300/0,v1:172.25.250.14:6789/0] mon.servere
dumped monmap epoch 4    #epoch 是 map 的版本號,方便各節點比對並同步 map
[ceph: root@clienta /]# 

[ceph: root@clienta /]# ceph osd dump
epoch 401
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
created 2021-10-01T09:30:32.028240+0000
modified 2022-08-20T14:56:19.230208+0000
flags sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit
crush_version 77
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85
require_min_compat_client luminous
min_compat_client jewel
require_osd_release pacific
stretch_mode_enabled false
pool 1 'device_health_metrics' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 374 flags hashpspool stripe_width 0 pg_num_min 1 application mgr_devicehealth
pool 2 '.rgw.root' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 48 flags hashpspool stripe_width 0 application rgw
pool 3 'default.rgw.log' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 50 flags hashpspool stripe_width 0 application rgw
pool 4 'default.rgw.control' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 52 flags hashpspool stripe_width 0 application rgw
pool 5 'default.rgw.meta' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 8 pgp_num 8 autoscale_mode on last_change 184 lfor 0/184/182 flags hashpspool stripe_width 0 pg_autoscale_bias 4 pg_num_min 8 application rgw
pool 10 'pool1' replicated size 3 min_size 2 crush_rule 1 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 266 flags hashpspool stripe_width 0
pool 11 'ssdpool' replicated size 3 min_size 2 crush_rule 2 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 338 flags hashpspool stripe_width 0
pool 12 'myecpool' erasure profile myprofile1 size 4 min_size 3 crush_rule 3 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 345 flags hashpspool stripe_width 8192
pool 13 'myecpool2' erasure profile myprofile2 size 4 min_size 3 crush_rule 4 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 350 flags hashpspool stripe_width 8192
max_osd 9
osd.0 up   in  weight 1 up_from 360 up_thru 398 down_at 354 last_clean_interval [243,350) [v2:172.25.250.12:6800/2528022353,v1:172.25.250.12:6801/2528022353] [v2:172.25.249.12:6802/2528022353,v1:172.25.249.12:6803/2528022353] exists,up 5be66be9-8262-4c4b-9654-ed549f6280f7
osd.1 up   in  weight 1 up_from 359 up_thru 397 down_at 354 last_clean_interval [244,350) [v2:172.25.250.12:6808/3093181835,v1:172.25.250.12:6809/3093181835] [v2:172.25.249.12:6810/3093181835,v1:172.25.249.12:6811/3093181835] exists,up 3f751363-a03c-4b76-af92-8114e38bfa09
osd.2 up   in  weight 1 up_from 363 up_thru 378 down_at 354 last_clean_interval [242,350) [v2:172.25.250.12:6816/1645468882,v1:172.25.250.12:6817/1645468882] [v2:172.25.249.12:6818/1645468882,v1:172.25.249.12:6819/1645468882] exists,up 68d72b66-4c99-4d54-a7e4-f1cb8f8e5054
osd.3 up   in  weight 1 up_from 363 up_thru 390 down_at 354 last_clean_interval [236,350) [v2:172.25.250.13:6816/2535000344,v1:172.25.250.13:6817/2535000344] [v2:172.25.249.13:6818/2535000344,v1:172.25.249.13:6819/2535000344] exists,up 21a9ebe9-908d-4026-8a57-8fbee935033e
osd.4 up   in  weight 1 up_from 354 up_thru 400 down_at 353 last_clean_interval [237,350) [v2:172.25.250.14:6800/408153468,v1:172.25.250.14:6801/408153468] [v2:172.25.249.14:6802/408153468,v1:172.25.249.14:6803/408153468] exists,up 85202210-9298-4443-9140-027792ddc891
osd.5 up   in  weight 1 up_from 363 up_thru 399 down_at 354 last_clean_interval [235,350) [v2:172.25.250.13:6802/1745131990,v1:172.25.250.13:6803/1745131990] [v2:172.25.249.13:6804/1745131990,v1:172.25.249.13:6805/1745131990] exists,up 252d1668-c4c2-42ca-85fe-87c7419557d6
osd.6 up   in  weight 1 up_from 353 up_thru 381 down_at 352 last_clean_interval [237,350) [v2:172.25.250.14:6804/1927667266,v1:172.25.250.14:6806/1927667266] [v2:172.25.249.14:6807/1927667266,v1:172.25.249.14:6811/1927667266] exists,up 2d753bfc-32f6-4663-9411-16067f366977
osd.7 up   in  weight 1 up_from 363 up_thru 378 down_at 354 last_clean_interval [236,350) [v2:172.25.250.13:6800/4217605284,v1:172.25.250.13:6801/4217605284] [v2:172.25.249.13:6806/4217605284,v1:172.25.249.13:6808/4217605284] exists,up fccc62ed-9b04-456a-95c3-5c3cb27e56d4
osd.8 up   in  weight 1 up_from 357 up_thru 399 down_at 356 last_clean_interval [237,350) [v2:172.25.250.14:6816/3368063169,v1:172.25.250.14:6817/3368063169] [v2:172.25.249.14:6818/3368063169,v1:172.25.249.14:6819/3368063169] exists,up 8b0789f2-f40e-4d63-ac52-343b8e11f24c
blocklist 172.25.250.14:6825/1595923670 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:6824/1595923670 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:0/3491691321 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.14:0/2738777763 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.12:0/1239900377 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:6825/3912612299 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:6824/3912612299 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/2171541544 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/1139201862 expires 2022-08-20T16:19:27.333673+0000
blocklist 172.25.250.12:0/2525786376 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.14:0/3949782568 expires 2022-08-21T14:55:16.971863+0000
blocklist 172.25.250.12:0/1486113939 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:6825/2537331399 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:0/2290094124 expires 2022-08-21T08:52:54.506446+0000
blocklist 172.25.250.12:6824/2537331399 expires 2022-08-21T08:52:54.506446+0000
[ceph: root@clienta /]# 


[ceph: root@clienta /]# ceph pg dump 
#忽略輸出,太多了

每個 OSD 承載的 PG 數量建議值為 100-200 個

[ceph: root@clienta /]# ceph mgr dump  | grep "dashboard" 
                    "config_dashboard": {
                            "name": "config_dashboard",
                            "default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
                    "name": "dashboard",
                            "default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
                        "config_dashboard": {
                            "name": "config_dashboard",
                            "default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
                    "name": "dashboard",
                            "default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
                        "config_dashboard": {
                            "name": "config_dashboard",
                            "default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
                    "name": "dashboard",
                            "default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
        "dashboard",
                "config_dashboard": {
                    "name": "config_dashboard",
                    "default_value": "registry.redhat.io/rhceph/rhceph-5-dashboard-rhel8:latest",
            "name": "dashboard",
                    "default_value": "osd,host,dashboard,pool,block,nfs,ceph,monitors,gateway,logs,crush,maps",
        "dashboard": "//172.25.250.14:8443/",
[ceph: root@clienta /]# 


Cluster Map基本查詢
ceph mon dump
ceph osd dump
ceph osd crush dump
ceph pg dump all
ceph fs dump
ceph mgr dump
ceph service dump

mon小集群,三節點部署 存放所有map

[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# pwd
/var/lib/ceph/2ae6d05a-229a-11ec-925e-52540000fa0c
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# ll
total 292
drwx------. 3 root   root      149 Oct  1  2021 alertmanager.serverc
-rw-r--r--. 1 root   root   295991 Oct  1  2021 cephadm.d7a73386d1e46cffff151775b8e1d098069c88b89aea56cab15b079c1a1f555f
drwx------. 3    167    167     20 Oct  1  2021 crash
drwx------. 2    167    167    167 Oct  1  2021 crash.serverc
drwx------. 4    472    472    161 Oct  1  2021 grafana.serverc
drwx------. 2    167    167    167 Oct  1  2021 mgr.serverc.lab.example.com.aiqepd
drwx------. 3    167    167    224 Oct  1  2021 mon.serverc.lab.example.com
drwx------. 2 nobody nobody    138 Oct  1  2021 node-exporter.serverc
drwx------. 2    167    167    275 Aug 20 10:54 osd.0
drwx------. 2    167    167    275 Aug 20 10:54 osd.1
drwx------. 2    167    167    275 Aug 20 10:54 osd.2
drwx------. 4 root   root      161 Oct  1  2021 prometheus.serverc
drwx------. 2    167    167    167 Oct 29  2021 rgw.realm.zone.serverc.bqwjcv
drwxr-xr-x. 2 root   root        6 Oct  1  2021 selinux
[root@serverc 2ae6d05a-229a-11ec-925e-52540000fa0c]# 
上面的目錄存放各角色(mon、mgr、osd 等)的相關資訊


MON 以奇數個部署好一些

osd之間會發消息,確定心跳。osd無心跳時,osd會彙報mon

數據恢復:副本丟失情況下,恢復副本的過程
數據回填:當有新的osd加入時 (重平衡)

OSD 是看使用比率
OSD 使用率最好不要超過 70% 左右,再大的話就不好恢復

[ceph: root@clienta /]# ceph osd set noout
noout is set
[ceph: root@clienta /]# ceph osd unset noout
noout is unset
[ceph: root@clienta /]# 

nearfull_ratio 0.85 提醒集群容量快滿了 health warn(需要擴容)
backfillfull_ratio 0.9 當osd使用比達到90%,數據禁止回填,但是可以恢復,正常對外提供讀寫
full_ratio 0.95 當osd使用比達到95%,數據禁止寫入,可以讀,可以恢復

[ceph: root@clienta /]# ceph osd set-full-ratio 0.95
osd set-full-ratio 0.95
[ceph: root@clienta /]# ceph osd set-nearfull-ratio 0.85
osd set-nearfull-ratio 0.85
[ceph: root@clienta /]# ceph osd dump
epoch 426
fsid 2ae6d05a-229a-11ec-925e-52540000fa0c
created 2021-10-01T09:30:32.028240+0000
modified 2022-08-20T17:56:35.571847+0000
flags sortbitwise,recovery_deletes,purged_snapdirs,pglog_hardlimit
crush_version 82
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.85

https://docs.ceph.com/en/quincy/?rtd_search=mon_osd_down_out_interval+
可以尋找這些參數

設置osd權重
0就是盡量不分配在這個osd上面,移除時,先改為0

[ceph: root@clienta /]# ceph osd primary-affinity osd.0 0
降低 osd.0 的 primary affinity(設為 0 後,osd.0 盡量不會被選為主 OSD)

[ceph: root@clienta /]# ceph pg dump pgs_brief
PG_STAT  STATE         UP       UP_PRIMARY  ACTING   ACTING_PRIMARY
4.8      active+clean  [4,3,0]           4  [4,3,0]               4
3.f      active+clean  [7,4,0]           7  [7,4,0]               7
2.e      active+clean  [2,4,3]           2  [2,4,3]               2
4.b      active+clean  [7,0,4]           7  [7,0,4]               7
3.c      active+clean  [5,0,6]           5  [5,0,6]               5
2.d      active+clean  [4,3,2]           4  [4,3,2]               4
4.a      active+clean  [5,1,4]           5  [5,1,4]               5
3.d      active+clean  [7,6,2]           7  [7,6,2]               7
2.c      active+clean  [6,0,5]           6  [6,0,5]               6
3.a      active+clean  [3,1,8]           3  [3,1,8]               3

osd.0 不再作為主 OSD 了

[ceph: root@clienta /]# ceph pg dump pgs_brief | grep "\[6"
dumped pgs_brief
2.c      active+clean  [6,0,5]           6  [6,0,5]               6
2.a      active+clean  [6,1,3]           6  [6,1,3]               6
4.3      active+clean  [6,7,1]           6  [6,7,1]               6

過濾帶特殊符號的內容時,特殊符號(例如 [)需要用反斜線轉義

參數

上面的默認值可以去官網查,可能有變化

ceph 調優

Ceph 對吞吐量要求較高,需要大記憶體,因此 NUMA 架構就不太適合
如果你的程式不佔用大記憶體,且要求更快的程式運行時間,你應該選擇限制只訪問本 NUMA node 的方式來進行處理

Ceph部署最佳實踐
MON的性能對集群總體性能至關重要,應部署於專用節點,為確保正確仲裁,數量應為奇數個
在OSD節點上,作業系統、OSD數據、OSD日誌應當位於獨立的磁碟上,以確保滿意的吞吐量
儘管 Ceph 具有自愈功能,在集群安裝後仍需要監控集群、排除故障並維護。如果發生性能問題,首先在磁碟、網路和硬體層面上排查,然後逐步轉向RADOS塊設備和Ceph對象網關

RBD建議
塊設備上的工作負載通常是I/O密集型負載,例如在OpenStack中虛擬機上運行資料庫。
對於RBD,OSD日誌應當位於SSD或者NVMe設備上
對後端存儲,可以使用不同的存儲設備以提供不同級別的服務

OSD建議硬體
將一個raid1磁碟用於作業系統
每個OSD一塊硬碟,將SSD或者NVMe用於日誌
使用多個10Gb網卡,每個網路一個雙鏈路綁定
每個OSD預留1個CPU,每個邏輯核心1GHz
分配16GB記憶體,外加每個OSD 2G記憶體


現在 Ceph 可以自動計算 PG 數量(pool 的 autoscale_mode on)
cephpgc(Ceph PG Calculator):可以去看一下,這個紅帽官網的 PG 計算器,還挺有意思

Ceph網路
儘可能使用10Gb網路頻寬
儘可能使用不同的cluster網路和public網路
網路監控

OSD建議硬體
將一個raid1磁碟用於作業系統
每個OSD一塊硬碟,將SSD或者NVMe用於日誌
使用多個10Gb網卡,每個網路一個雙鏈路綁定
每個OSD預留1個CPU,每個邏輯核心1GHz
分配16GB記憶體,外加每個OSD 2G記憶體

其他性能測試工具

dd
 echo 3 > /proc/sys/vm/drop_caches
dd if=/dev/zero of=/var/lib/ceph/osd/ceph-0/test.img bs=4M count=1024 oflag=direct
dd if=/var/lib/ceph/osd/ceph-0/test.img of=/dev/null bs=4M count=1024 oflag=direct  
fio
https://help.aliyun.com/document_detail/95501.html?spm=a2c4g.11174283.6.640.6e904da23dhdcG



[ceph: root@clienta /]# ceph osd pool create pool1
pool 'pool1' created
[ceph: root@clienta /]# rados bench -p pool1 10 write --no-cleanup 
hints = 1
Maintaining 16 concurrent writes of 4194304 bytes to objects of size 4194304 for up to 10 seconds or 0 objects
Object prefix: benchmark_data_clienta.lab.example.com_565
sec Cur ops   started  finished  avg MB/s  cur MB/s last lat(s)  avg lat(s)
    0       0         0         0         0         0           -           0
    1      16        16         0         0         0           -           0
    2      16        17         1   1.99911         2     1.77034     1.77034
    3      16        19         3   3.99849         8     2.89986     2.41562
    4      16        20         4   3.99855         4     3.87552     2.78059
    5      16        26        10   7.99736        24     4.87003     3.66784
    6      16        29        13   8.65267        12      5.8558     3.89705
    7      16        36        20   11.4123        28     2.25577     4.20837
    8      16        39        23   11.4849        12     3.18817     4.17326
    9      16        49        33   14.6481        40     1.93119     3.67961
10      16        54        38   15.1205        20     4.61332     3.71135
11      15        54        39   14.1054         4     4.50752     3.73177
12      14        54        40    13.262         4     3.58412     3.72808
13      11        54        43   13.1608        12      3.9051     3.71667
Total time run:         13.7161
Total writes made:      54
Write size:             4194304
Object size:            4194304
Bandwidth (MB/sec):     15.7479
Stddev Bandwidth:       11.8495
Max bandwidth (MB/sec): 40
Min bandwidth (MB/sec): 0
Average IOPS:           3
Stddev IOPS:            3.00427
Max IOPS:               10
Min IOPS:               0
Average Latency(s):     3.86659
Stddev Latency(s):      1.48435
Max latency(s):         7.45216
Min latency(s):         1.17718
[ceph: root@clienta /]# 


[ceph: root@clienta /]# rados bench -p pool1 10 seq
hints = 1
sec Cur ops   started  finished  avg MB/s  cur MB/s last lat(s)  avg lat(s)
    0       0         0         0         0         0           -           0
    1      16        24         8   31.8681        32    0.539518    0.460968
    2      16        45        29   57.6669        84    0.657187    0.773738
    3       5        54        49   65.0267        80    0.595555    0.685997
    4       2        54        52   51.6497        12     2.35873    0.808986
Total time run:       4.26827
Total reads made:     54
Read size:            4194304
Object size:          4194304
Bandwidth (MB/sec):   50.606
Average IOPS:         12
Stddev IOPS:          8.90693
Max IOPS:             21
Min IOPS:             3
Average Latency(s):   0.856345
Max latency(s):       3.07995
Min latency(s):       0.0897737
[ceph: root@clienta /]# 


[ceph: root@clienta /]# rados  bench -p pool1 10 rand
hints = 1
sec Cur ops   started  finished  avg MB/s  cur MB/s last lat(s)  avg lat(s)
    0       0         0         0         0         0           -           0
    1      16        26        10   39.8594        40    0.450443    0.523675
    2      16        45        29    57.894        76     1.81343    0.569421
    3      16        54        38   50.5224        36     2.38602    0.792168
    4      16        79        63   62.8348       100   0.0543633    0.813247
    5      16        94        78   62.2342        60     2.35538    0.832442
    6      16       127       111   73.8291       132    0.141455    0.779658
    7      16       158       142    80.881       124      1.5348    0.742651
    8      16       188       172   85.4177       120    0.431023     0.71256
    9      16       208       192    84.786        80    0.657024    0.690867
10      16       213       197   78.2818        20     0.30201    0.702446
11      11       213       202   72.9987        20     2.83034    0.737541
Total time run:       11.4804
Total reads made:     213
Read size:            4194304
Object size:          4194304
Bandwidth (MB/sec):   74.2134
Average IOPS:         18
Stddev IOPS:          10.4045
Max IOPS:             33
Min IOPS:             5
Average Latency(s):   0.829176
Max latency(s):       3.0047
Min latency(s):       0.0343662

測之前還是得清除快取和對象

[ceph: root@clienta /]# rados -p pool1 cleanup
Removed 54 objects
[root@clienta ~]# sysctl vm.drop_caches=3

我這是虛擬機部署,不是物理機,與物理機比性能高下立判。物理機Bandwidth (MB/sec): 1000 虛擬機Bandwidth (MB/sec): 74.2134

[ceph: root@clienta /]# rbd pool init pool1
[ceph: root@clienta /]# rbd create --size 1G pool1/image1
[ceph: root@clienta /]# rbd info pool1/image1
rbd image 'image1':
    size 1 GiB in 256 objects
    order 22 (4 MiB objects)
    snapshot_count: 0
    id: 197ad26b4bdeb
    block_name_prefix: rbd_data.197ad26b4bdeb
    format: 2
    features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
    op_features: 
    flags: 
    create_timestamp: Sun Aug 21 10:29:30 2022
    access_timestamp: Sun Aug 21 10:29:30 2022
    modify_timestamp: Sun Aug 21 10:29:30 2022
[ceph: root@clienta /]# rbd bench --io-type write image1 --pool=pool1
bench  type write io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
SEC       OPS   OPS/SEC   BYTES/SEC
    1      6288   6174.26    24 MiB/s
    2      6800   3117.98    12 MiB/s
    3      7232   2402.35   9.4 MiB/s
    4      7856   1891.83   7.4 MiB/s
    5      8336   1666.05   6.5 MiB/s
    6      9040   552.049   2.2 MiB/s
    7     14160   1514.69   5.9 MiB/s
    8     17472    2018.1   7.9 MiB/s
    9     23056   3039.35    12 MiB/s
10     26000   3539.12    14 MiB/s
11     28416    3876.7    15 MiB/s

[ceph: root@clienta /]# rbd bench --io-type read image1 --pool=pool1
bench  type read io_size 4096 io_threads 16 bytes 1073741824 pattern sequential
SEC       OPS   OPS/SEC   BYTES/SEC
    1       400   452.168   1.8 MiB/s
    2       816   430.636   1.7 MiB/s
    3      1248   431.099   1.7 MiB/s
    4      1712   441.599   1.7 MiB/s
    5      2144   438.929   1.7 MiB/s
    6      2560   429.247   1.7 MiB/s
    7      2896   414.255   1.6 MiB/s