Closed kristi closed 4 years ago
Add a /query/meta endpoint to the web UI that returns a JSON version of the status. Also parse the rebalanceStatus and watchdog stats in the web UI.
A JSON status makes it easy for scripts to query the status of QFS.
Example response from http://localhost:22000/query/meta:
{ "evacuatingServers": [], "freeFsSpace": 183537364992, "serversByRack": {}, "retiringServers": [], "config": { "metaServer.chunkServerPort": "20100", "metaServer.clientPort": "20000", "metaServer.clientThreadCount": "4", "metaServer.clusterKey": "myTestCluster", "metaServer.cpDir": "/Users/ktsukida/qfsbase/meta/checkpoints", "metaServer.log.logDir": "/Users/ktsukida/qfsbase/meta/logs", "metaServer.logDir": "/Users/ktsukida/qfsbase/meta/logs", "metaServer.minChunkservers": "1", "metaServer.msgLogWriter.logLevel": "DEBUG", "metaServer.msgLogWriter.maxLogFileSize": "1e6", "metaServer.msgLogWriter.maxLogFiles": "10", "metaServer.pidFile": "/Users/ktsukida/qfsbase/meta/metaserver.pid", "metaServer.recoveryInterval": "1", "metaServer.rootDirGroup": "20", "metaServer.rootDirMode": "0777", "metaServer.rootDirUser": "503" }, "numReallyDownServers": 0, "vrStatus": {}, "canNotBeUsedForPlacement": 0, "upServers": [ { "load": 0, "overloaded": false, "ip": "127.0.0.1", "used": 0, "down": 0, "numDrives": 2, "replay": 0, "chunks": 0, "nblocks": "0", "nevacuate": 0, "numReplications": 0, "total": 250685575168, "retiring": 0, "nchunksToMove": "0", "port": "21001", "numWritableDrives": 2, "tiersCount": 1, "good": "1", "free": 61179121664, "util": 75.6, "nwrites": 0, "host": "127.0.0.1", "connected": 1, "nlost": "0", "tiers": "15:1:0:0:6.12e+10:2.51e+11:75.60", "displayName": "127.0.0.1:21001", "ncorrupt": 0, "md5sum": "e9b59b05d4d5108573e7d6c2b05e7d1b", "lostChunkDirs": "", "bytesevacuate": 0, "rack": 0, "numReadReplications": 0, "stopped": 0, "lastheard": 1 }, { "load": 0, "overloaded": false, "ip": "127.0.0.1", "used": 0, "down": 0, "numDrives": 1, "replay": 0, "chunks": 0, "nblocks": "0", "nevacuate": 0, "numReplications": 0, "total": 250685575168, "retiring": 0, "nchunksToMove": "0", "port": "21002", "numWritableDrives": 1, "tiersCount": 1, "good": "1", "free": 61179121664, "util": 75.6, "nwrites": 0, "host": "127.0.0.1", "connected": 1, "nlost": "0", "tiers": 
"15:1:0:0:6.12e+10:2.51e+11:75.60", "displayName": "127.0.0.1:21002", "ncorrupt": 0, "md5sum": "e9b59b05d4d5108573e7d6c2b05e7d1b", "lostChunkDirs": "", "bytesevacuate": 0, "rack": 0, "numReadReplications": 0, "stopped": 0, "lastheard": 1 }, { "load": 0, "overloaded": false, "ip": "127.0.0.1", "used": 0, "down": 0, "numDrives": 1, "replay": 0, "chunks": 0, "nblocks": "0", "nevacuate": 0, "numReplications": 0, "total": 250685575168, "retiring": 0, "nchunksToMove": "0", "port": "21003", "numWritableDrives": 1, "tiersCount": 1, "good": "1", "free": 61179121664, "util": 75.6, "nwrites": 0, "host": "127.0.0.1", "connected": 1, "nlost": "0", "tiers": "15:1:0:0:6.12e+10:2.51e+11:75.60", "displayName": "127.0.0.1:21003", "ncorrupt": 0, "md5sum": "e9b59b05d4d5108573e7d6c2b05e7d1b", "lostChunkDirs": "", "bytesevacuate": 0, "rack": 0, "numReadReplications": 0, "stopped": 0, "lastheard": 1 } ], "tiersColumnNames": [ "rack", "tier", "devices", "wr-chunks", "chunks", "space-available", "total-space", "%util.", "candidates" ], "downServers": [ { "displayName": "127.0.0.1:21001", "host": "127.0.0.1", "down": "Mon Jul 20 07:03:51 2020", "reason": "replay: start servicing", "port": "21001", "stillDown": 0 }, { "displayName": "127.0.0.1:21002", "host": "127.0.0.1", "down": "Mon Jul 20 07:03:51 2020", "reason": "replay: start servicing", "port": "21002", "stillDown": 0 }, { "displayName": "127.0.0.1:21003", "host": "127.0.0.1", "down": "Mon Jul 20 07:03:51 2020", "reason": "replay: start servicing", "port": "21003", "stillDown": 0 } ], "watchdog": { "wd.polls": "0", "wd.timeouts": "0", "wd.timerOverruns": "0", "wd.timerOverrunsUsecs": "0", "wd.0.name": "main", "wd.0.polls": "0", "wd.0.timeouts": "0", "wd.0.totalTimeouts": "0", "wd.0.changedAgoUsec": "1595253843378996", "wd.1.name": "LogWriter", "wd.1.polls": "0", "wd.1.timeouts": "0", "wd.1.totalTimeouts": "0", "wd.1.changedAgoUsec": "1595253843378998", "wd.2.name": "client", "wd.2.polls": "0", "wd.2.timeouts": "0", 
"wd.2.totalTimeouts": "0", "wd.2.changedAgoUsec": "1595253843379000", "wd.3.name": "client", "wd.3.polls": "0", "wd.3.timeouts": "0", "wd.3.totalTimeouts": "0", "wd.3.changedAgoUsec": "1595253843379002", "wd.4.name": "client", "wd.4.polls": "0", "wd.4.timeouts": "0", "wd.4.totalTimeouts": "0", "wd.4.changedAgoUsec": "1595253843379004", "wd.5.name": "client", "wd.5.polls": "0", "wd.5.timeouts": "0", "wd.5.totalTimeouts": "0", "wd.5.changedAgoUsec": "1595253843379006" }, "systemInfo": { "logDiskWriteByteCount": 774, "csmapEntryBytes": 0, "dentrySize": 64, "maxClients": 13230, "sourceVersion": "cc3ac568-git@github.com:kristi/qfs.git/feature/spelling-timeouts@cc3ac568c3706a0a91211598d11c4e1f109732fd", "dentries": 13, "objStoreDeletes": 0, "totalSpace": 752056725504, "sumOfLogicalFileSizes": 0, "hibernatedServerCount": 0, "internalNodeSize": 4096, "totalBuffers": 262144, "usedSpace": 0, "vrPrimaryNodeId": -1, "fattrs": 5, "bTreeHeight": 1, "log10SecAvgUsec": 30, "csmapEntryAllocs": 0, "fattrAllocSize": 8388624, "cinfos": 0, "appendCacheSize": 0, "isInRecovery": false, "pendingRecovery": 0, "log15SecAvgUsec": 27, "usedBuffers": 6, "freeFsSpace": 183537364992, "csMaxGoodSlaveLoadAvg": 0, "startedAt": " Mon Jul 20 07:03:50 2020", "writableDrives": 4, "internalNodeAllocSize": 8388624, "buildVersion": "cc3ac568-cc3ac568c3706a0a91211598d11c4e1f109732fd-RelWithDebInfo-BF894F67", "log10SecAvgReqRate": 77, "replicationBacklog": 0, "csmapNodeSize": 72, "csMaxGoodMasterLoadAvg": 0, "logDiskWriteCount": 5, "logAvgReqRateDiv": 256, "maxChunkServers": 2016, "csMastersToRestart": 0, "logOpWrite10SecAvgUsec": 1, "objStoreDeletesInFlight": 0, "vrNodeId": -1, "chunks": 0, "logTimeUsec": 2299, "sockets": 17, "totalDrives": 4, "objStoreEnabled": 0, "internalNodes": 1, "allocatedRequests": 6, "logDiskWriteUsec": 145, "csToRestart": 0, "fattrSize": 104, "delayedRecovery": 0, "csMaxGoodCandidateLoadAvg": 0, "objStoreDeletesRetry": 0, "goodMasters": 2, "log5SecAvgUsec": 23, "clients": 1, 
"logPendingOpsCount": 0, "log15SecAvgReqRate": 73, "logExceedQueueDepthFailedCount": 0, "fileCount": 1, "csmapAllocSize": 0, "logTimeOpsCount": 11, "cinfoAllocSize": 0, "goodSlaves": 1, "logOpWrite15SecAvgUsec": 1, "chunkServers": 3, "cinfoSize": 40, "logOpWrite5SecAvgUsec": 0, "uptime": 13, "pendingReplication": 0, "csmapNodes": 0, "logExceedLogQueueDepthFailureCount300SecAvg": 0, "dirCount": 4, "replicationsCheck": 0, "objStoreDeletesStartedAgo": 0, "logTotalRequestCount": 11, "vrPrimaryFlag": 1, "vrActiveFlag": 1, "wormMode": "Disabled", "dentryAllocSize": 8388624, "log5SecAvgReqRate": 53, "replications": 0, "fileSystemId": 6106949049890591000, "logPendingAckByteCount": 0 }, "rebalanceStatus": { "RoundCount": " 0", "NoSource": " 0", "ServerNeeded": " 0", "NoServerFound": " 0", "RackNeeded": " 0", "NoRackFound": " 0", "NonLoadedServerNeeded": " 0", "NoNonLoadedServerFound": " 0", "Ok": " 0", "Scanned": " 0", "Busy": " 0", "BusyOther": " 0", "ReplicationStarted": " 0", "NoReplicationStarted": " 0", "ScanTimeout": " 0", "TotalNoSource": " 0", "TotalServerNeeded": " 0", "TotalNoServerFound": " 0", "TotalRackNeeded": " 0", "TotalNoRackFound": " 0", "TotalNonLoadedServerNeeded": " 0", "TotalNoNonLoadedServerFound": " 0", "TotalOk": " 0", "TotalScanned": " 0", "TotalBusy": " 0", "TotalBusyOther": " 0", "TotalReplicationStarted": " 0", "TotalNoReplicationStarted": " 0", "TotalScanTimeout": " 0", "Plan": " 0", "PlanNoDest": " 0", "PlanTimeout": " 0", "PlanScanned": " 0", "PlanNoChunk": " 0", "PlanNoSrc": " 0", "PlanBusy": " 0", "PlanBusyOther": " 0", "PlanCannotMove": " 0", "PlanReplicationStarted": " 0", "PlanNoReplicationStarted": " 0", "PlanLine": " 0", "PlanNoServer": " 0", "PlanAdded": " 0", "TotalPlanNoDest": " 0", "TotalPlanTimeout": " 0", "TotalPlanScanned": " 0", "TotalPlanNoChunk": " 0", "TotalPlanNoSrc": " 0", "TotalPlanBusy": " 0", "TotalPlanBusyOther": " 0", "TotalPlanCannotMove": " 0", "TotalPlanReplicationStarted": " 0", "TotalPlanNoReplicationStarted": " 
0", "TotalPlanLine": " 0", "TotalPlanNoServer": " 0", "TotalPlanAdded": " 0" }, "tiersInfo": [ "all", "15", "3", "0", "0", "183537364992", "752056725504", "75.60", "3", "0", "15", "3", "0", "0", "183537364992", "752056725504", "75.60", "3" ], "goodNoRackAssignedCount": 0 }
Add a /query/meta endpoint to the web UI that returns a JSON version of the status. Also parse the rebalanceStatus and watchdog stats in the web UI.
A JSON status makes it easy for scripts to query the status of QFS.
Example response from http://localhost:22000/query/meta: