frans42 / ceph-goodies

Collection of useful scripts for ceph
GNU General Public License v3.0
3 stars 3 forks source link

Divide by zero #1

Open test-erik opened 3 days ago

test-erik commented 3 days ago

That's the bash -x output. I hope it contains enough information for solving the bc errors:

root@ceph00:~# bash -x pool-scrub-report libvirt-pool
+ ((  1 == 1  ))
+ max_age=300
+ cache_dir=/root/.cache/ceph
+ [[ -d /root/.cache/ceph ]]
+ [[ libvirt-pool =~ ^[[:digit:]]+$ ]]
+ pool_info=($(ceph osd pool ls detail --format json-pretty |           jq -r --arg nm "$1" '.[] | select(.pool_name==$nm)
                | "\(.pool_id) \(.pool_name) \(.pg_num)"'))
++ ceph osd pool ls detail --format json-pretty
++ jq -r --arg nm libvirt-pool '.[] | select(.pool_name==$nm)
                | "\(.pool_id) \(.pool_name) \(.pg_num)"'
+ ((  3==3  ))
++ ceph osd pool get libvirt-pool all --format json-pretty
+ pool_settings='
{
    "pool": "libvirt-pool",
    "pool_id": 2,
    "size": 3,
    "min_size": 2,
    "pg_num": 512,
    "pgp_num": 512,
    "crush_rule": "replicated_rule",
    "hashpspool": true,
    "nodelete": false,
    "nopgchange": false,
    "nosizechange": false,
    "write_fadvise_dontneed": false,
    "noscrub": false,
    "nodeep-scrub": false,
    "use_gmt_hitset": true,
    "fast_read": 0,
    "pg_autoscale_mode": "on",
    "eio": false,
    "bulk": false
}'
+ echo -n 'Scrub info for pool libvirt-pool (id=2): '
Scrub info for pool libvirt-pool (id=2): ++ jq -r '(.scrub_min_interval  // 0)/21600 | ceil'
+ sc_int=0
++ jq -r '(.deep_scrub_interval // 0)/86400 | ceil'
+ ds_int=0
++ jq -r '(.scrub_min_interval // 0)/3600 | round'
+ pool_scrub_min_interval=0
+ rebuild_cache=1
+ [[ -r /root/.cache/ceph/pgs_dump.json ]]
+++ date -r /root/.cache/ceph/pgs_dump.json +%s
++ date -d 'now - 1719926167 seconds' +%s
+ age=82
+ rebuild_cache=0
+ ((  rebuild_cache  ))
+ printf 'using cache (RTTL=%01d:%02d)\n' 3 38
using cache (RTTL=3:38)
+ echo ''

++ jq --arg pool_id 2 -r '[.pg_stats[] | select(.pgid | startswith($pool_id+"."))] | .[0] | .acting[0]' /root/.cache/ceph/pgs_dump.json
+ first_primary=osd.63
++ ceph config get osd.63 osd_scrub_interval_randomize_ratio
+ osd_scrub_interval_randomize_ratio=0.500000
++ ceph config get osd.63 osd_scrub_during_recovery
+ osd_scrub_during_recovery=false
+ ((  rebuild_cache  ))
+ awk -v pool_id=2 -v sis_mn=0 -v dsis_mn=0 -v sim=0 -v sirr=0.500000 -v sdr=false '{
        split($1, pg_id, ".") # [1]=pool_id
        if(pg_in_pool=(pg_id[1]==pool_id)) {
                NPGS++
                if($2>sis_mn*21600) {
                        ST+=$2
                        NPGS_S++
                }
                si=$2>0 ? 1+int($2/21600) : 1
                si_mx=si_mx<si ? si : si_mx
                if($3>dsis_mn*86400) {
                        DST+=$3
                        NPGS_DS++
                }
                dsi=$3>0 ? 1+int($3/86400) : 1
                dsi_mx=dsi_mx<dsi ? dsi : dsi_mx
                pg_sn[si]++
                pg_dsn[dsi]++
                for(i=5; i<=NF; ++i) pg_osds[$1]=pg_osds[$1] " " $i
        }
        if($4 ~ /scrubbing\+deep/) {
                if(pg_in_pool) deep_scrubbing[dsi]++
                if(pg_in_pool) deep_scrubbing_s[si]++
                for(i=5; i<=NF; ++i) osd[$i]="busy"
        } else if($4 ~ /scrubbing/) {
                if(pg_in_pool) scrubbing[si]++
                if(pg_in_pool) scrubbing_d[dsi]++
                for(i=5; i<=NF; ++i) osd[$i]="busy"
        } else if($4 ~ /active\+clean/) {
                if(pg_in_pool) pg_sn_ids[si]=pg_sn_ids[si] " " $1
                if(pg_in_pool) pg_dsn_ids[dsi]=pg_dsn_ids[dsi] " " $1
#       } else if((sdr=="true") && ($4 ~ /active\+remapped\+backfilling/)) {
#               if(pg_in_pool) pg_sn_ids[si]=pg_sn_ids[si] " " $1
#               if(pg_in_pool) pg_dsn_ids[dsi]=pg_dsn_ids[dsi] " " $1
#               for(i=5; i<=NF; ++i) osd[$i]="busy" # Are all OSDs with backfill reservation busy?
        } else if((sdr=="true") && ($4 ~ /active\+remapped/)) {
                if(pg_in_pool) pg_sn_ids[si]=pg_sn_ids[si] " " $1
                if(pg_in_pool) pg_dsn_ids[dsi]=pg_dsn_ids[dsi] " " $1
                for(i=5; i<=NF; ++i) osd[$i]="busy"
        } else {
                if(pg_in_pool) unclean[si]++
                if(pg_in_pool) unclean_d[dsi]++
                for(i=5; i<=NF; ++i) osd[$i]="busy"
        }
}
END {
        print "Scrub report:"
        npgs=0
        npgse=0
        intervals=0
        nscr=0
        nidle=0
        nunc=0
        for(si=1; si<=si_mx; ++si) {
                intervals+=6
                if(pg_sn[si]==0 && scrubbing[si]==0 && unclean[si]==0) continue;
                npgs+=pg_sn[si]
                nscr+=scrubbing[si]
                nunc+=unclean[si]
                printf("%4d%% %7d PGs not scrubbed since %2d intervals (%3dh)", \
                        100.0*npgs/NPGS, pg_sn[si], si, intervals)
                split(pg_sn_ids[si], pgs)
                busy=0
                if(si>sis_mn) {
                        npgse+=pg_sn[si]
                        for(pg in pgs) {
                                split(pg_osds[pgs[pg]], osds)
                                osds_busy=0
                                for(o in osds) if(osd[osds[o]]=="busy") {osds_busy=1; break}
                                if(osds_busy) busy++
                        }
                }
                if(si>sis_mn && idle=length(pgs)-busy) {
                        nidle+=idle;
                        printf(" [%d idle]", idle)
                }
                if(deep_scrubbing_s[si]) printf(" [%d scrubbing+deep]", deep_scrubbing_s[si])
                if(unclean[si])          printf(" [%d unclean]", unclean[si])
                if(scrubbing[si])        printf(" %d scrubbing", scrubbing[si])
                printf("\n")
        }
        if(npgs==NPGS) {
                printf("      %7d PGs, EST=%.2fd (%.2fd), %d scrubbing, %d (%.1f%%) idle, %d unclean.\n", \
                        NPGS, (NPGS_S>0 ? ST/NPGS_S/86400 : sis_mn/4), (sim*(1+sirr/3))/24, \
                        nscr, nidle, nidle/npgse*100, nunc)
        } else {
                printf("      %7d PGs out of %d reported, %d missing, %d scrubbing, %d idle, %d unclean.\n", \
                        npgs, NPGS, NPGS-npgs, nscr, nidle, nunc)
        }
        print ""
        print "Deep-scrub report:"
        npgs=0
        nscr=0
        nunc=0
        intervals=0
        for(dsi=1; dsi<=dsi_mx; ++dsi) {
                intervals+=24
                if(pg_dsn[dsi]==0 && deep_scrubbing[dsi]==0 && unclean_d[dsi]==0) continue;
                npgs+=pg_dsn[dsi]
                nscr+=deep_scrubbing[dsi]
                nunc+=unclean_d[dsi]
                printf("%4d%% %7d PGs not deep-scrubbed since %2d intervals (%3dh)", \
                        100.0*npgs/NPGS, pg_dsn[dsi], dsi, intervals)
                split(pg_dsn_ids[dsi], pgs)
                busy=0
                if(dsi_mn<dsi && pg_dsn[dsi]<=5) for(pg in pgs) {
                        split(pg_osds[pgs[pg]], osds)
                        osds_busy=0
                        for(o in osds) if(osd[osds[o]]=="busy") {osds_busy=1; break}
                        if( osds_busy) busy++
                        if(!osds_busy) printf(" %s", pgs[pg])
                }
                if(busy) printf(" [%d busy]", busy)
                if(scrubbing_d[dsi])    printf(" [%d scrubbing]", scrubbing_d[dsi])
                if(unclean_d[dsi])      printf(" [%d unclean]", unclean_d[dsi])
                if(deep_scrubbing[dsi]) printf(" %d scrubbing+deep", deep_scrubbing[dsi])
                printf("\n")
        }
        if(npgs==NPGS) {
                printf("      %7d PGs, EDST=%.2fd, %d scrubbing+deep, %d unclean.\n", \
                        NPGS, (NPGS_DS>0 ? DST/NPGS_DS/86400 : dsi_mn), nscr, nunc)
        } else {
                printf("      %7d PGs out of %d reported, %d missing, %d scrubbing+deep, %d unclean.\n", \
                        npgs, NPGS, NPGS-npgs, nscr, nunc)
        }
}
' /root/.cache/ceph/pgs_scrub_info.txt
Scrub report:
  18%      94 PGs not scrubbed since  1 intervals (  6h)
  39%     109 PGs not scrubbed since  2 intervals ( 12h)
  58%      96 PGs not scrubbed since  3 intervals ( 18h)
  74%      80 PGs not scrubbed since  4 intervals ( 24h) [1 idle]
  93%     100 PGs not scrubbed since  5 intervals ( 30h)
 100%      33 PGs not scrubbed since  6 intervals ( 36h) 1 scrubbing
          512 PGs, EST=0.66d (0.00d), 1 scrubbing, 1 (0.2%) idle, 0 unclean.

Deep-scrub report:
  16%      82 PGs not deep-scrubbed since  1 intervals ( 24h)
  34%      95 PGs not deep-scrubbed since  2 intervals ( 48h) [1 scrubbing]
  52%      93 PGs not deep-scrubbed since  3 intervals ( 72h)
  64%      58 PGs not deep-scrubbed since  4 intervals ( 96h)
  75%      60 PGs not deep-scrubbed since  5 intervals (120h)
  86%      57 PGs not deep-scrubbed since  6 intervals (144h)
  94%      38 PGs not deep-scrubbed since  7 intervals (168h)
  99%      26 PGs not deep-scrubbed since  8 intervals (192h)
 100%       3 PGs not deep-scrubbed since  9 intervals (216h) [3 busy]
          512 PGs, EDST=3.23d, 0 scrubbing+deep, 0 unclean.
+ echo ''

++ bc
+ smi_eq_occup=80
++ bc
Runtime error (func=(main), adr=22): Divide by zero
+ pgs_per_bucket=
+ echo 'libvirt-pool  scrub_min_interval=0h  (0i/80%/PGs÷i)'
libvirt-pool  scrub_min_interval=0h  (0i/80%/PGs÷i)
++ jq -r '(.scrub_max_interval // 0)/3600 | round'
+ pool_scrub_max_interval=0
+ echo 'libvirt-pool  scrub_max_interval=0h  (0d)'
libvirt-pool  scrub_max_interval=0h  (0d)
++ jq -r '(.deep_scrub_interval // 0)/3600 | round'
+ pool_deep_scrub_interval=0
++ bc
Runtime error (func=(main), adr=39): Divide by zero
+ dsmi_eq_occup=
++ bc
(standard_in) 1: syntax error
+ pgs_per_bucket=
+ echo 'libvirt-pool  deep_scrub_interval=0h  (0d/~%/~PGs÷d)'
libvirt-pool  deep_scrub_interval=0h  (0d/~%/~PGs÷d)
++ bc -l
+ smi_plus_rand=0
+ echo 'osd.63  osd_scrub_interval_randomize_ratio=0.500000  scrubs start after: 0h..0h'
osd.63  osd_scrub_interval_randomize_ratio=0.500000  scrubs start after: 0h..0h
++ ceph config get osd.63 osd_deep_scrub_randomize_ratio
+ osd_deep_scrub_randomize_ratio=0.150000
+ echo 'osd.63  osd_deep_scrub_randomize_ratio=0.150000'
osd.63  osd_deep_scrub_randomize_ratio=0.150000
++ ceph config get osd.63 osd_max_scrubs
+ osd_max_scrubs=1
+ echo 'osd.63  osd_max_scrubs=1'
osd.63  osd_max_scrubs=1
++ ceph config get osd.63 osd_scrub_backoff_ratio
+ osd_backoff=0.660000
++ ceph osd crush get-device-class osd.63
+ osd_class=hdd
++ jq -r .size
+ pool_size=3
++ bc
+ rec_backoff=.7500
+ echo 'osd.63  osd_scrub_backoff_ratio=0.660000'
osd.63  osd_scrub_backoff_ratio=0.660000
++ ceph config get mon.ceph-01 mon_warn_pg_not_scrubbed_ratio
+ mon_warn_pg_not_scrubbed_ratio=0.500000
++ bc -l
+ scrub_warni=0
++ bc -l
+ scrub_warnd=0
+ echo 'mon.ceph-01  mon_warn_pg_not_scrubbed_ratio=0.500000  warn: 0d (0i)'
mon.ceph-01  mon_warn_pg_not_scrubbed_ratio=0.500000  warn: 0d (0i)
++ ceph config get mon.ceph-01 mon_warn_pg_not_deep_scrubbed_ratio
+ mon_warn_pg_not_deep_scrubbed_ratio=0.750000
++ bc -l
+ deeps_warn=0
+ echo 'mon.ceph-01  mon_warn_pg_not_deep_scrubbed_ratio=0.750000  warn: 0d'
mon.ceph-01  mon_warn_pg_not_deep_scrubbed_ratio=0.750000  warn: 0d
frans42 commented 3 days ago

This script assumes that all per-pool scrub settings are set. I thought about it and decided that I will not spend time on supporting other sources of scrub settings and trying to figure out which one has actual precedence or applies to which pool. I did consider adding a check for whether some pool variables are set (non-zero) and removing certain stats from the output, but decided against it because it is a major hassle and doesn't seem to be worth the effort.

To get the entire script to work properly, please set the per-pool scrub settings. I would recommend that in any case.

Otherwise, please ignore the stats and division-by-zero errors; most of the numbers are then meaningless anyway. The only output that should be correct is the actual scrub stamp histogram, and that looks good in your case (everything unreliable removed):

Scrub info for pool libvirt-pool (id=2): using cache (RTTL=3:38)

Scrub report:
  18%      94 PGs not scrubbed since  1 intervals (  6h)
  39%     109 PGs not scrubbed since  2 intervals ( 12h)
  58%      96 PGs not scrubbed since  3 intervals ( 18h)
  74%      80 PGs not scrubbed since  4 intervals ( 24h) [1 idle]
  93%     100 PGs not scrubbed since  5 intervals ( 30h)
 100%      33 PGs not scrubbed since  6 intervals ( 36h) 1 scrubbing

Deep-scrub report:
  16%      82 PGs not deep-scrubbed since  1 intervals ( 24h)
  34%      95 PGs not deep-scrubbed since  2 intervals ( 48h) [1 scrubbing]
  52%      93 PGs not deep-scrubbed since  3 intervals ( 72h)
  64%      58 PGs not deep-scrubbed since  4 intervals ( 96h)
  75%      60 PGs not deep-scrubbed since  5 intervals (120h)
  86%      57 PGs not deep-scrubbed since  6 intervals (144h)
  94%      38 PGs not deep-scrubbed since  7 intervals (168h)
  99%      26 PGs not deep-scrubbed since  8 intervals (192h)
 100%       3 PGs not deep-scrubbed since  9 intervals (216h) [3 busy]
frans42 commented 3 days ago

I reopened it so that I remember it. I am considering fixing it in some way at a later time.

Check if it is possible to pull missing settings from the primary OSD of the first PG. Possibly print a warning that some parameter values are unreliable because settings for other OSDs might conflict (and that will not be checked).

Print recommendation to set per-pool scrub parameters if missing.