rwynn / monstache

a go daemon that syncs MongoDB to Elasticsearch in realtime. you know, for search.
https://rwynn.github.io/monstache-site/
MIT License
1.29k stars 182 forks source link

Token resume lead to Mongo HIGH CPU usage #721

Open UnderTreeTech opened 6 months ago

UnderTreeTech commented 6 months ago

I use montsache to sync 100 tables data to es.Monstache version 6.7.17, Resume type is token, mongo version is 3.6.21, my linux server is 8 core 16G memory. Everytime I reboot monstache, mongo's cpu usage rise rapidly till nearly 800% and it comes down after a period time. I still not find the problem after investigate serveral days. Hope you can help me @rwynn . Here is my conf

# connect to MongoDB using the following URL
mongo-url = "mongodb://127.0.0.1:27017/?connect=direct"

# connect to the Elasticsearch REST API at the following node URLs
elasticsearch-urls = ["http://127.0.0.1:9200"]
elasticsearch-user = "xxx"
elasticsearch-password = "xxx"

change-stream-namespaces = ["message.message_0","message.message_1","message.message_2","message.message_3","message.message_4","message.message_5","message.message_6","message.message_7","message.message_8","message.message_9","message.message_10","message.message_11","message.message_12","message.message_13","message.message_14","message.message_15","message.message_16","message.message_17","message.message_18","message.message_19","message.message_20","message.message_21","message.message_22","message.message_23","message.message_24","message.message_25","message.message_26","message.message_27","message.message_28","message.message_29","message.message_30","message.message_31","message.message_32","message.message_33","message.message_34","message.message_35","message.message_36","message.message_37","message.message_38","message.message_39","message.message_40","message.message_41","message.message_42","message.message_43","message.message_44","message.message_45","message.message_46","message.message_47","message.message_48","message.message_49","message.message_50","message.message_51","message.message_52","message.message_53","message.message_54","message.message_55","message.message_56","message.message_57","message.message_58","message.message_59","message.message_60","message.message_61","message.message_62","message.message_63","message.message_64","message.message_65","message.message_66","message.message_67","message.message_68","message.message_69","message.message_70","message.message_71","message.message_72","message.message_73","message.message_74","message.message_75","message.message_76","message.message_77","message.message_78","message.message_79","message.message_80","message.message_81","message.message_82","message.message_83","message.message_84","message.message_85","message.message_86","message.message_87","message.message_88","message.message_89","message.message_90","message.message_91","message.message_92","message.message_93","message.message_94","message.message_95","message.message_96","message.message_97","message.message_98","message.message_99"]

direct-read-namespaces = ["message.message_0","message.message_1","message.message_2","message.message_3","message.message_4","message.message_5","message.message_6","message.message_7","message.message_8","message.message_9","message.message_10","message.message_11","message.message_12","message.message_13","message.message_14","message.message_15","message.message_16","message.message_17","message.message_18","message.message_19","message.message_20","message.message_21","message.message_22","message.message_23","message.message_24","message.message_25","message.message_26","message.message_27","message.message_28","message.message_29","message.message_30","message.message_31","message.message_32","message.message_33","message.message_34","message.message_35","message.message_36","message.message_37","message.message_38","message.message_39","message.message_40","message.message_41","message.message_42","message.message_43","message.message_44","message.message_45","message.message_46","message.message_47","message.message_48","message.message_49","message.message_50","message.message_51","message.message_52","message.message_53","message.message_54","message.message_55","message.message_56","message.message_57","message.message_58","message.message_59","message.message_60","message.message_61","message.message_62","message.message_63","message.message_64","message.message_65","message.message_66","message.message_67","message.message_68","message.message_69","message.message_70","message.message_71","message.message_72","message.message_73","message.message_74","message.message_75","message.message_76","message.message_77","message.message_78","message.message_79","message.message_80","message.message_81","message.message_82","message.message_83","message.message_84","message.message_85","message.message_86","message.message_87","message.message_88","message.message_89","message.message_90","message.message_91","message.message_92","message.message_93","message.message_94","message.message_95","message.message_96","message.message_97","message.message_98","message.message_99"]
direct-read-stateful = true

# resume processing from a timestamp saved in a previous run
resume = true
# use a custom resume strategy (tokens) instead of the default strategy (timestamps)
resume-strategy = 1

# log detail
verbose = true

index-as-update = true
delete-strategy = 2

[logs]
info = "./logs/mongo2es.log"
warn = "./logs/mongo2es.log"
error = "./logs/mongo2es.log"
trace = "./logs/mongo2es.log"

[log-rotate]
max-size = 50

[gtm-settings]
# the value is a go duration string
max-await-time = "2s"

There are many slow logs

2024-05-15T15:25:43.354+0800 I COMMAND  [conn1916750] command local.oplog.rs appName: "monstache" command: aggregate { aggregate: "message_79", pipeline: [ { $changeStream: { fullDocument: "updateLookup", resumeAfter: { _data: BinData(0, 82664311940000000146645F69640064664311941BDE944B30D95502005A1004FC599E6C6E9949F9922EE6F06EFFFE3A04) } } } ], cursor: {}, lsid: { id: UUID("701fcd0f-efa7-4866-bbc0-814af4a02c25") }, $clusterTime: { clusterTime: Timestamp(1715757933, 1), signature: { hash: BinData(0, 0000000000000000000000000000000000000000), keyId: 0 } }, $db: "message", $readPreference: { mode: "primaryPreferred" } } planSummary: COLLSCAN cursorid:5013926950783328236 keysExamined:0 docsExamined:71925 numYields:593 nreturned:0 reslen:216 locks:{ Global: { acquireCount: { r: 1194 } }, Database: { acquireCount: { r: 597 } }, Collection: { acquireCount: { r: 1 } }, oplog: { acquireCount: { r: 596 } } } protocol:op_msg 1462ms
2024-05-15T15:25:45.812+0800 I COMMAND  [conn1916760] command local.oplog.rs appName: "monstache" command: aggregate { aggregate: "message_78", pipeline: [ { $changeStream: { fullDocument: "updateLookup", resumeAfter: { _data: BinData(0, 826642DCD30000000546645F696400646642DCD31BDE944B30D95493005A1004EBBECF9650D5467A8BCC31D6D47A4BE904) } } } ], cursor: {}, lsid: { id: UUID("dc94c2af-8b56-492a-82d4-aae5c004c173") }, $clusterTime: { clusterTime: Timestamp(1715757933, 1), signature: { hash: BinData(0, 0000000000000000000000000000000000000000), keyId: 0 } }, $db: "message", $readPreference: { mode: "primaryPreferred" } } planSummary: COLLSCAN cursorid:6541947636423231642 keysExamined:0 docsExamined:83214 numYields:697 nreturned:0 reslen:216 locks:{ Global: { acquireCount: { r: 1402 } }, Database: { acquireCount: { r: 701 } }, Collection: { acquireCount: { r: 1 } }, oplog: { acquireCount: { r: 700 } } } protocol:op_msg 3481ms
2024-05-15T15:26:08.011+0800 I COMMAND  [conn1916759] command local.oplog.rs appName: "monstache" command: aggregate { aggregate: "message_64", pipeline: [ { $changeStream: { fullDocument: "updateLookup", resumeAfter: { _data: BinData(0, 82662387E20000007A46645F69640064661DF3471BDE944B30D906B6005A1004EA6F2AB6AB8F462FA060A14EBFE2351204) } } } ], cursor: {}, lsid: { id: UUID("46570917-13e9-4c08-a9d2-4d33bf3c207a") }, $clusterTime: { clusterTime: Timestamp(1715757933, 1), signature: { hash: BinData(0, 0000000000000000000000000000000000000000), keyId: 0 } }, $db: "message", $readPreference: { mode: "primaryPreferred" } } planSummary: COLLSCAN cursorid:4808403073260753301 keysExamined:0 docsExamined:1773330 numYields:14297 nreturned:0 reslen:216 locks:{ Global: { acquireCount: { r: 28602 } }, Database: { acquireCount: { r: 14301 } }, Collection: { acquireCount: { r: 1 } }, oplog: { acquireCount: { r: 14300 } } } protocol:op_msg 25773ms
2024-05-15T15:26:08.021+0800 I COMMAND  [conn1916729] command local.oplog.rs appName: "monstache" command: aggregate { aggregate: "message_72", pipeline: [ { $changeStream: { fullDocument: "updateLookup", resumeAfter: { _data: BinData(0, 82662387E40000001346645F69640064662084451BDE944B30D90B30005A10042C3C18736B73416CB25C487A7E62E2FA04) } } } ], cursor: {}, lsid: { id: UUID("2ad4d0ab-d1db-4e49-bb6a-565195cd9b94") }, $clusterTime: { clusterTime: Timestamp(1715757933, 1), signature: { hash: BinData(0, 0000000000000000000000000000000000000000), keyId: 0 } }, $db: "message", $readPreference: { mode: "primaryPreferred" } } planSummary: COLLSCAN cursorid:9048699408739661260 keysExamined:0 docsExamined:1773291 numYields:14291 nreturned:0 reslen:216 locks:{ Global: { acquireCount: { r: 28590 } }, Database: { acquireCount: { r: 14295 } }, Collection: { acquireCount: { r: 1 } }, oplog: { acquireCount: { r: 14294 } } } protocol:op_msg 26740ms
2024-05-15T15:26:08.040+0800 I COMMAND  [conn1916761] command local.oplog.rs appName: "monstache" command: aggregate { aggregate: "message_67", pipeline: [ { $changeStream: { fullDocument: "updateLookup", resumeAfter: { _data: BinData(0, 82662387E30000001446645F69640064661DF2041BDE944B30D906A6005A100474DF9EEBB1184748B570CBEFC095387F04) } } } ], cursor: {}, lsid: { id: UUID("c557de53-06e4-49ea-be5e-e9198414c574") }, $clusterTime: { clusterTime: Timestamp(1715757933, 1), signature: { hash: BinData(0, 0000000000000000000000000000000000000000), keyId: 0 } }, $db: "message", $readPreference: { mode: "primaryPreferred" } } planSummary: COLLSCAN cursorid:6435371553498803409 keysExamined:0 docsExamined:1773310 numYields:14313 nreturned:0 reslen:216 locks:{ Global: { acquireCount: { r: 28634 } }, Database: { acquireCount: { r: 14317 } }, Collection: { acquireCount: { r: 1 } }, oplog: { acquireCount: { r: 14316 } } } protocol:op_msg 25750ms