Do you have reproduction steps?
We just start and stop VirtualBox instances in various non-deterministic sequences to monitor how evaluations are caught by our cluster autoscaling solution.
And we have the following evaluation log:
2018-09-27 02:57:28.510000
{u'vault_debug-02/periodic-1538005800': {'evals': {u'1ae37566-8897-5043-4d4f-de5fe885dc8f': {u'AnnotatePlan': False,
u'BlockedEval': u'3aa562f8-9f32-a5fe-0b7d-66affa686106',
u'ClassEligibility': None,
u'CreateIndex': 74,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 121981,
u'ClassExhausted': {u'dockerworker': 2},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 2},
u'NodesAvailable': {u'test': 2},
u'NodesEvaluated': 2,
u'NodesExhausted': 2,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'1ae37566-8897-5043-4d4f-de5fe885dc8f',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 78,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'fe0e887c-db1a-435d-0446-6c7710b3aa99',
u'NodeModifyIndex': 73,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 74,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'316c70a0-ddc2-2a87-b384-2aac664b7504': {u'AnnotatePlan': False,
u'BlockedEval': u'a2ac7c7d-3038-7263-a6bf-93f9dfc027b9',
u'ClassEligibility': None,
u'CreateIndex': 84,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 203174,
u'ClassExhausted': {u'dockerworker': 2},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 2},
u'NodesAvailable': {u'test': 2},
u'NodesEvaluated': 2,
u'NodesExhausted': 2,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'316c70a0-ddc2-2a87-b384-2aac664b7504',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 89,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'ccf3ac8d-6b7e-972e-b9c9-1ee358b876df',
u'NodeModifyIndex': 83,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 85,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'8763f6a9-bc69-fcf6-c05f-90c73970a753': {u'AnnotatePlan': False,
u'BlockedEval': u'a4f9603e-a4dc-3797-c560-4e296e1fb049',
u'ClassEligibility': None,
u'CreateIndex': 86,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 76012,
u'ClassExhausted': {u'dockerworker': 2},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 2},
u'NodesAvailable': {u'test': 2},
u'NodesEvaluated': 2,
u'NodesExhausted': 2,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'8763f6a9-bc69-fcf6-c05f-90c73970a753',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 92,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'fe0e887c-db1a-435d-0446-6c7710b3aa99',
u'NodeModifyIndex': 85,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 89,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'b46c0057-c4ff-10f7-dc5f-172f134eb18d': {u'AnnotatePlan': False,
u'BlockedEval': u'748e28cc-e631-0ba4-a80f-63ad0ea3ec1b',
u'ClassEligibility': None,
u'CreateIndex': 67,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 98738,
u'ClassExhausted': {u'dockerworker': 3},
u'ClassFiltered': None,
u'CoalescedFailures': 0,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 3},
u'NodesAvailable': {u'test': 3},
u'NodesEvaluated': 3,
u'NodesExhausted': 3,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'b46c0057-c4ff-10f7-dc5f-172f134eb18d',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 70,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'ccf3ac8d-6b7e-972e-b9c9-1ee358b876df',
u'NodeModifyIndex': 66,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 1},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 67,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'e0a7f8c2-2c08-9efb-6486-62ebe876f3cf': {u'AnnotatePlan': False,
u'BlockedEval': u'2dc6b6ea-5e41-2df0-d769-fa89bcde4385',
u'ClassEligibility': None,
u'CreateIndex': 97,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 17864,
u'ClassExhausted': {u'dockerworker': 3},
u'ClassFiltered': None,
u'CoalescedFailures': 0,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 3},
u'NodesAvailable': {u'test': 3},
u'NodesEvaluated': 3,
u'NodesExhausted': 3,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'e0a7f8c2-2c08-9efb-6486-62ebe876f3cf',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 100,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'fe0e887c-db1a-435d-0446-6c7710b3aa99',
u'NodeModifyIndex': 96,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 1},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 97,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538006248.51}}
2018-09-27 03:00:40.456000
{u'vault_debug-02/periodic-1538005800': {'evals': {u'1ae37566-8897-5043-4d4f-de5fe885dc8f': {u'AnnotatePlan': False,
u'BlockedEval': u'3aa562f8-9f32-a5fe-0b7d-66affa686106',
u'ClassEligibility': None,
u'CreateIndex': 74,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 121981,
u'ClassExhausted': {u'dockerworker': 2},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 2},
u'NodesAvailable': {u'test': 2},
u'NodesEvaluated': 2,
u'NodesExhausted': 2,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'1ae37566-8897-5043-4d4f-de5fe885dc8f',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 78,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'fe0e887c-db1a-435d-0446-6c7710b3aa99',
u'NodeModifyIndex': 73,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 74,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'316c70a0-ddc2-2a87-b384-2aac664b7504': {u'AnnotatePlan': False,
u'BlockedEval': u'a2ac7c7d-3038-7263-a6bf-93f9dfc027b9',
u'ClassEligibility': None,
u'CreateIndex': 84,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 203174,
u'ClassExhausted': {u'dockerworker': 2},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 2},
u'NodesAvailable': {u'test': 2},
u'NodesEvaluated': 2,
u'NodesExhausted': 2,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'316c70a0-ddc2-2a87-b384-2aac664b7504',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 89,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'ccf3ac8d-6b7e-972e-b9c9-1ee358b876df',
u'NodeModifyIndex': 83,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 85,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'8763f6a9-bc69-fcf6-c05f-90c73970a753': {u'AnnotatePlan': False,
u'BlockedEval': u'a4f9603e-a4dc-3797-c560-4e296e1fb049',
u'ClassEligibility': None,
u'CreateIndex': 86,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 76012,
u'ClassExhausted': {u'dockerworker': 2},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 2},
u'NodesAvailable': {u'test': 2},
u'NodesEvaluated': 2,
u'NodesExhausted': 2,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'8763f6a9-bc69-fcf6-c05f-90c73970a753',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 92,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'fe0e887c-db1a-435d-0446-6c7710b3aa99',
u'NodeModifyIndex': 85,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 89,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'b46c0057-c4ff-10f7-dc5f-172f134eb18d': {u'AnnotatePlan': False,
u'BlockedEval': u'748e28cc-e631-0ba4-a80f-63ad0ea3ec1b',
u'ClassEligibility': None,
u'CreateIndex': 67,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 98738,
u'ClassExhausted': {u'dockerworker': 3},
u'ClassFiltered': None,
u'CoalescedFailures': 0,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 3},
u'NodesAvailable': {u'test': 3},
u'NodesEvaluated': 3,
u'NodesExhausted': 3,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'b46c0057-c4ff-10f7-dc5f-172f134eb18d',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 70,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'ccf3ac8d-6b7e-972e-b9c9-1ee358b876df',
u'NodeModifyIndex': 66,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 1},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 67,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'},
u'e0a7f8c2-2c08-9efb-6486-62ebe876f3cf': {u'AnnotatePlan': False,
u'BlockedEval': u'2dc6b6ea-5e41-2df0-d769-fa89bcde4385',
u'ClassEligibility': None,
u'CreateIndex': 97,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 17864,
u'ClassExhausted': {u'dockerworker': 3},
u'ClassFiltered': None,
u'CoalescedFailures': 0,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 3},
u'NodesAvailable': {u'test': 3},
u'NodesEvaluated': 3,
u'NodesExhausted': 3,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'e0a7f8c2-2c08-9efb-6486-62ebe876f3cf',
u'JobID': u'vault_debug-02/periodic-1538005800',
u'JobModifyIndex': 0,
u'LeaderACL': u'',
u'ModifyIndex': 100,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'fe0e887c-db1a-435d-0446-6c7710b3aa99',
u'NodeModifyIndex': 96,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 1},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 97,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'node-update',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538006248.51}}
2018-09-27 03:00:40.518000
{}
2018-09-27 15:17:45.706000
{}
2018-09-27 15:17:45.715000
{}
2018-09-27 15:17:45.725000
{u'vault_debug-02/periodic-1538050665': {'evals': {u'735c92a7-569c-976d-5b20-75007e9dc5be': {u'AnnotatePlan': False,
u'BlockedEval': u'2a0c13ca-444a-7d4c-fa87-117de285c1e1',
u'ClassEligibility': None,
u'CreateIndex': 867,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 150177,
u'ClassExhausted': {u'dockerworker': 4},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 4},
u'NodesAvailable': {u'test': 4},
u'NodesEvaluated': 4,
u'NodesExhausted': 4,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'735c92a7-569c-976d-5b20-75007e9dc5be',
u'JobID': u'vault_debug-02/periodic-1538050665',
u'JobModifyIndex': 866,
u'LeaderACL': u'',
u'ModifyIndex': 870,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'',
u'NodeModifyIndex': 0,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 867,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'periodic-job',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538050665.725}}
2018-09-27 19:40:52.370000
{u'vault_debug-02/periodic-1538050665': {'evals': {u'735c92a7-569c-976d-5b20-75007e9dc5be': {u'AnnotatePlan': False,
u'BlockedEval': u'2a0c13ca-444a-7d4c-fa87-117de285c1e1',
u'ClassEligibility': None,
u'CreateIndex': 867,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 150177,
u'ClassExhausted': {u'dockerworker': 4},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 4},
u'NodesAvailable': {u'test': 4},
u'NodesEvaluated': 4,
u'NodesExhausted': 4,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'735c92a7-569c-976d-5b20-75007e9dc5be',
u'JobID': u'vault_debug-02/periodic-1538050665',
u'JobModifyIndex': 866,
u'LeaderACL': u'',
u'ModifyIndex': 870,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'',
u'NodeModifyIndex': 0,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 867,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'periodic-job',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538050665.725}}
2018-09-27 19:40:52.480000
{u'vault_debug-02/periodic-1538050665': {'evals': {u'735c92a7-569c-976d-5b20-75007e9dc5be': {u'AnnotatePlan': False,
u'BlockedEval': u'2a0c13ca-444a-7d4c-fa87-117de285c1e1',
u'ClassEligibility': None,
u'CreateIndex': 867,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 150177,
u'ClassExhausted': {u'dockerworker': 4},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 4},
u'NodesAvailable': {u'test': 4},
u'NodesEvaluated': 4,
u'NodesExhausted': 4,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'735c92a7-569c-976d-5b20-75007e9dc5be',
u'JobID': u'vault_debug-02/periodic-1538050665',
u'JobModifyIndex': 866,
u'LeaderACL': u'',
u'ModifyIndex': 870,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'',
u'NodeModifyIndex': 0,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 867,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'periodic-job',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538050665.725}}
2018-09-27 19:40:52.498000
{u'vault_debug-02/periodic-1538050665': {'evals': {u'735c92a7-569c-976d-5b20-75007e9dc5be': {u'AnnotatePlan': False,
u'BlockedEval': u'2a0c13ca-444a-7d4c-fa87-117de285c1e1',
u'ClassEligibility': None,
u'CreateIndex': 867,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 150177,
u'ClassExhausted': {u'dockerworker': 4},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 4},
u'NodesAvailable': {u'test': 4},
u'NodesEvaluated': 4,
u'NodesExhausted': 4,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'735c92a7-569c-976d-5b20-75007e9dc5be',
u'JobID': u'vault_debug-02/periodic-1538050665',
u'JobModifyIndex': 866,
u'LeaderACL': u'',
u'ModifyIndex': 870,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'',
u'NodeModifyIndex': 0,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 867,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'periodic-job',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538050665.725}}
2018-09-27 19:40:57.570000
{u'vault_debug-02/periodic-1538050665': {'evals': {u'735c92a7-569c-976d-5b20-75007e9dc5be': {u'AnnotatePlan': False,
u'BlockedEval': u'2a0c13ca-444a-7d4c-fa87-117de285c1e1',
u'ClassEligibility': None,
u'CreateIndex': 867,
u'DeploymentID': u'',
u'EscapedComputedClass': False,
u'FailedTGAllocs': {u'test': {u'AllocationTime': 150177,
u'ClassExhausted': {u'dockerworker': 4},
u'ClassFiltered': None,
u'CoalescedFailures': 1,
u'ConstraintFiltered': None,
u'DimensionExhausted': {u'network: reserved port collision': 4},
u'NodesAvailable': {u'test': 4},
u'NodesEvaluated': 4,
u'NodesExhausted': 4,
u'NodesFiltered': 0,
u'QuotaExhausted': None,
u'Scores': None}},
u'ID': u'735c92a7-569c-976d-5b20-75007e9dc5be',
u'JobID': u'vault_debug-02/periodic-1538050665',
u'JobModifyIndex': 866,
u'LeaderACL': u'',
u'ModifyIndex': 870,
u'Namespace': u'default',
u'NextEval': u'',
u'NodeID': u'',
u'NodeModifyIndex': 0,
u'PreviousEval': u'',
u'Priority': 50,
u'QueuedAllocations': {u'test': 2},
u'QuotaLimitReached': u'',
u'SnapshotIndex': 867,
u'Status': u'complete',
u'StatusDescription': u'',
u'TriggeredBy': u'periodic-job',
u'Type': u'batch',
u'Wait': 0,
u'WaitUntil': u'0001-01-01T00:00:00Z'}},
'lastupdate': 1538050665.725}}
2018-09-27 19:40:57.587000
{}
2018-09-27 19:42:45.791000
{}
2018-09-27 19:43:03.211000
{}
2018-09-27 19:43:03.319000
{}
2018-09-27 19:43:03.323000
{}
2018-09-27 19:50:00.299000
{}
2018-09-27 19:50:00.420000
{}
2018-09-27 19:50:00.426000
{}
So after some time we see that 2 child jobs were launched despite the fact that prohibit_overlap = true was set, and we see the following status:
vagrant@consulnomad-1:/tmp/nomadserver$ nomad status
ID Type Priority Status Submit Date
vault_debug-00 service 50 running 2018-09-27T02:40:21+03:00
vault_debug-02 batch/periodic 50 running 2018-09-27T02:41:29+03:00
vault_debug-02/periodic-1538005800 batch 50 running 2018-09-27T02:50:00+03:00
vault_debug-02/periodic-1538050665 batch 50 pending 2018-09-27T15:17:45+03:00
Hope this helps. In the log above, {} (empty brackets) means there are no jobs with unplaced allocations.
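For reference, the dump above comes from a small poller of ours that watches for evaluations with unplaced allocations. A minimal sketch of that idea with the official Go API client (illustration only, not our actual autoscaler code; the polling interval and output format are arbitrary):

package main

import (
    "fmt"
    "log"
    "time"

    "github.com/hashicorp/nomad/api"
)

func main() {
    client, err := api.NewClient(api.DefaultConfig())
    if err != nil {
        log.Fatal(err)
    }
    for {
        // List all evaluations and keep only those that report unplaced
        // allocations (FailedTGAllocs), grouped by JobID, which is roughly
        // the {jobID: {'evals': {...}}} shape of the dump above.
        evals, _, err := client.Evaluations().List(nil)
        if err != nil {
            log.Printf("list evaluations: %v", err)
        } else {
            unplaced := map[string][]*api.Evaluation{}
            for _, e := range evals {
                if len(e.FailedTGAllocs) > 0 {
                    unplaced[e.JobID] = append(unplaced[e.JobID], e)
                }
            }
            fmt.Printf("%s %v\n", time.Now().Format("2006-01-02 15:04:05"), unplaced)
        }
        time.Sleep(10 * time.Second)
    }
}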
And on the leader, at the moment the buggy allocation was launched, the logs showed the following:
-- Logs begin at Thu 2018-09-27 02:20:14 MSK, end at Thu 2018-09-27 20:31:30 MSK. --
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [WARN] raft: Failed to contact 192.168.142.101:4647 in 500.48794ms
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [WARN] raft: Failed to contact 192.168.142.103:4647 in 500.512387ms
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [WARN] raft: Failed to contact 192.168.142.101:4647 in 519.347452ms
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [WARN] raft: Failed to contact quorum of nodes, stepping down
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [INFO] raft: Node at 192.168.142.102:4647 [Follower] entering Follower state (Leader: "")
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [INFO] raft: aborting pipeline replication to peer {Voter 192.168.142.103:4647 192.168.142.103:4647}
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44 [INFO] raft: aborting pipeline replication to peer {Voter 192.168.142.101:4647 192.168.142.101:4647}
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44.386976 [INFO] nomad: cluster leadership lost
Sep 27 15:17:44 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:44.387184 [ERR] worker: failed to dequeue evaluation: eval broker disabled
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [WARN] raft: Heartbeat timeout from "" reached, starting election
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 3
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: Election won. Tally: 2
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: Node at 192.168.142.102:4647 [Leader] entering Leader state
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: Added peer 192.168.142.103:4647, starting replication
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: Added peer 192.168.142.101:4647, starting replication
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45.491895 [INFO] nomad: cluster leadership acquired
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: pipelining replication to peer {Voter 192.168.142.101:4647 192.168.142.101:4647}
Sep 27 15:17:45 consulnomad-2 nomad.sh[19505]: 2018/09/27 15:17:45 [INFO] raft: pipelining replication to peer {Voter 192.168.142.103:4647 192.168.142.103:4647}
Sep 27 17:52:25 consulnomad-2 nomad.sh[19505]: 2018/09/27 17:52:25 [WARN] raft: Failed to contact 192.168.142.103:4647 in 500.52684ms
But at this time (15:17) we didn't start/stop any instances, so this looks like a lag in the VirtualBox network.
At the same time, none of the agent nodes had network problems, because their logs are empty at that moment (15:17).
First agent instance
vagrant@dockerworker-1:~$ sudo journalctl -u nomad -S 15:00
-- No entries --
Second agent instance
vagrant@dockerworker-2:~$ sudo journalctl -u nomad -S 15:00
-- No entries --
Third agent instance
vagrant@dockerworker-3:~$ sudo journalctl -u nomad -S 15:00
-- Logs begin at Thu 2018-09-27 02:56:56 MSK, end at Thu 2018-09-27 20:36:54 MSK. --
Sep 27 19:40:53 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:53.167913 [INFO] driver.docker: stopped container 9f4c7ea7cad0923f4c0fa1b182859ac02c1dac025fd94d4990a77739bf7a81d0
Sep 27 19:40:57 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:40:57.175+0300 [DEBUG] plugin.nomad: 2018/09/27 19:40:57 [ERR] plugin: plugin server: accept unix /tmp/plugin889748026: use of closed network connection
Sep 27 19:40:57 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:40:57.176+0300 [DEBUG] plugin: plugin process exited: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:40:57 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:57.181070 [INFO] (runner) stopping
Sep 27 19:40:57 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:57.181163 [INFO] client.gc: marking allocation 094cfb9a-f80b-df2e-de07-6a66cd739a5f for GC
Sep 27 19:40:57 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:57.181687 [INFO] (runner) received finish
Sep 27 19:40:59 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:59.802233 [INFO] (runner) creating new runner (dry: false, once: false)
Sep 27 19:40:59 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:59.802354 [INFO] (runner) creating watcher
Sep 27 19:40:59 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:59.802423 [INFO] (runner) starting
Sep 27 19:40:59 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:40:59.802432 [INFO] (runner) initiating run
Sep 27 19:41:00 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:41:00.190068 [INFO] (runner) initiating run
Sep 27 19:41:00 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:41:00.190545 [INFO] (runner) rendered "(dynamic)" => "/var/lib/nomad/alloc/2b3ceab1-586a-af12-bf5f-9674cebafc53/vault_debug_task/secrets/consul_token.env"
Sep 27 19:41:00 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:41:00.192+0300 [DEBUG] plugin: starting plugin: path=/opt/nomad/nomad_0.8.6-playrix/nomad args="[/opt/nomad/nomad_0.8.6-playrix/nomad executor {"LogFile":"/var/lib/nomad/alloc/2b3ceab1-586a-af12-bf5f-9674cebafc
Sep 27 19:41:00 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:41:00.192+0300 [DEBUG] plugin: waiting for RPC address: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:41:00 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:41:00.421+0300 [DEBUG] plugin.nomad: plugin address: timestamp=2018-09-27T19:41:00.420+0300 address=/tmp/plugin830397527 network=unix
Sep 27 19:41:00 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:41:00.547253 [INFO] driver.docker: created container acaf2e1f2cd5aa8a7f90103a06e942eee6eda0fa34e3c392fc22a4049786da5f
Sep 27 19:41:01 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:41:01.628902 [INFO] driver.docker: started container acaf2e1f2cd5aa8a7f90103a06e942eee6eda0fa34e3c392fc22a4049786da5f
Sep 27 19:43:03 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:43:03.284001 [INFO] driver.docker: stopped container acaf2e1f2cd5aa8a7f90103a06e942eee6eda0fa34e3c392fc22a4049786da5f
Sep 27 19:43:07 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:43:07.286+0300 [DEBUG] plugin.nomad: 2018/09/27 19:43:07 [ERR] plugin: plugin server: accept unix /tmp/plugin830397527: use of closed network connection
Sep 27 19:43:07 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:43:07.287+0300 [DEBUG] plugin: plugin process exited: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:43:07 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:43:07.291237 [INFO] (runner) stopping
Sep 27 19:43:07 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:43:07.291599 [INFO] client.gc: marking allocation 2b3ceab1-586a-af12-bf5f-9674cebafc53 for GC
Sep 27 19:43:07 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:43:07.291881 [INFO] (runner) received finish
Sep 27 19:50:02 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:02.819470 [INFO] (runner) creating new runner (dry: false, once: false)
Sep 27 19:50:02 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:02.819930 [INFO] (runner) creating watcher
Sep 27 19:50:02 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:02.820138 [INFO] (runner) starting
Sep 27 19:50:02 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:02.820273 [INFO] (runner) initiating run
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:03.090304 [INFO] (runner) initiating run
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:03.090602 [INFO] (runner) rendered "(dynamic)" => "/var/lib/nomad/alloc/27119a0c-a2d6-3d39-d196-2cea1150510a/vault_debug_task/secrets/consul_token.env"
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:50:03.091+0300 [DEBUG] plugin: starting plugin: path=/opt/nomad/nomad_0.8.6-playrix/nomad args="[/opt/nomad/nomad_0.8.6-playrix/nomad executor {"LogFile":"/var/lib/nomad/alloc/27119a0c-a2d6-3d39-d196-2cea115051
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:50:03.091+0300 [DEBUG] plugin: waiting for RPC address: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018-09-27T19:50:03.166+0300 [DEBUG] plugin.nomad: plugin address: timestamp=2018-09-27T19:50:03.166+0300 address=/tmp/plugin756062442 network=unix
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:03.198287 [INFO] driver.docker: created container 1aa66cd52b6ebf11d5d778ceb971a55e36bb1c2cba3549f137954fbba5e52dc5
Sep 27 19:50:03 dockerworker-3 nomad.sh[1564]: 2018/09/27 19:50:03.599225 [INFO] driver.docker: started container 1aa66cd52b6ebf11d5d778ceb971a55e36bb1c2cba3549f137954fbba5e52dc5
And the last instance
vagrant@dockerworker-4:~$ sudo journalctl -u nomad -S 15:00
-- Logs begin at Thu 2018-09-27 03:00:07 MSK, end at Thu 2018-09-27 20:37:17 MSK. --
Sep 27 19:40:53 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:53.241353 [INFO] driver.docker: stopped container b1ecb17c3f35087613fb137bd376727f880f6c63b8d62f7383622060cb3b9669
Sep 27 19:40:57 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:40:57.256+0300 [DEBUG] plugin.nomad: 2018/09/27 19:40:57 [ERR] plugin: plugin server: accept unix /tmp/plugin826844264: use of closed network connection
Sep 27 19:40:57 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:40:57.257+0300 [DEBUG] plugin: plugin process exited: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:40:57 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:57.261204 [INFO] (runner) stopping
Sep 27 19:40:57 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:57.261549 [INFO] client.gc: marking allocation 9817586b-3ad7-258b-d566-e9eda02fc7a9 for GC
Sep 27 19:40:57 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:57.261684 [INFO] (runner) received finish
Sep 27 19:40:59 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:59.785721 [INFO] (runner) creating new runner (dry: false, once: false)
Sep 27 19:40:59 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:59.785881 [INFO] (runner) creating watcher
Sep 27 19:40:59 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:59.785957 [INFO] (runner) starting
Sep 27 19:40:59 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:40:59.785967 [INFO] (runner) initiating run
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:41:00.007392 [INFO] (runner) initiating run
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:41:00.007924 [INFO] (runner) rendered "(dynamic)" => "/var/lib/nomad/alloc/da85f761-dbbb-956c-4aa1-9da13a4794d5/vault_debug_task/secrets/consul_token.env"
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:41:00.009+0300 [DEBUG] plugin: starting plugin: path=/opt/nomad/nomad_0.8.6-playrix/nomad args="[/opt/nomad/nomad_0.8.6-playrix/nomad executor {"LogFile":"/var/lib/nomad/alloc/da85f761-dbbb-956c-4aa1-9da13a4794
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:41:00.009+0300 [DEBUG] plugin: waiting for RPC address: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:41:00.098+0300 [DEBUG] plugin.nomad: plugin address: timestamp=2018-09-27T19:41:00.097+0300 network=unix address=/tmp/plugin170929565
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:41:00.128257 [INFO] driver.docker: created container 9076863e566ec9c7f865c07a58c29498f4ae4d1037728d2b623e1bffb21716c5
Sep 27 19:41:00 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:41:00.802449 [INFO] driver.docker: started container 9076863e566ec9c7f865c07a58c29498f4ae4d1037728d2b623e1bffb21716c5
Sep 27 19:43:03 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:43:03.246136 [INFO] driver.docker: stopped container 9076863e566ec9c7f865c07a58c29498f4ae4d1037728d2b623e1bffb21716c5
Sep 27 19:43:07 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:43:07.248+0300 [DEBUG] plugin.nomad: 2018/09/27 19:43:07 [ERR] plugin: plugin server: accept unix /tmp/plugin170929565: use of closed network connection
Sep 27 19:43:07 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:43:07.249+0300 [DEBUG] plugin: plugin process exited: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:43:07 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:43:07.256426 [INFO] (runner) stopping
Sep 27 19:43:07 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:43:07.256667 [INFO] client.gc: marking allocation da85f761-dbbb-956c-4aa1-9da13a4794d5 for GC
Sep 27 19:43:07 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:43:07.257167 [INFO] (runner) received finish
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.549081 [INFO] (runner) creating new runner (dry: false, once: false)
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.549653 [INFO] (runner) creating watcher
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.549974 [INFO] (runner) starting
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.550260 [INFO] (runner) initiating run
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.819835 [INFO] (runner) initiating run
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.820093 [INFO] (runner) rendered "(dynamic)" => "/var/lib/nomad/alloc/f6633bc2-cec5-06f7-f5fb-63978789d56b/vault_debug_task/secrets/consul_token.env"
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:50:02.821+0300 [DEBUG] plugin: starting plugin: path=/opt/nomad/nomad_0.8.6-playrix/nomad args="[/opt/nomad/nomad_0.8.6-playrix/nomad executor {"LogFile":"/var/lib/nomad/alloc/f6633bc2-cec5-06f7-f5fb-63978789d5
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:50:02.821+0300 [DEBUG] plugin: waiting for RPC address: path=/opt/nomad/nomad_0.8.6-playrix/nomad
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018-09-27T19:50:02.878+0300 [DEBUG] plugin.nomad: plugin address: timestamp=2018-09-27T19:50:02.878+0300 address=/tmp/plugin968676666 network=unix
Sep 27 19:50:02 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:02.911677 [INFO] driver.docker: created container 99b5de86d368ba60e436d4fbe487c1588fdb929994c1fe6cd4d5a8525f40e25d
Sep 27 19:50:03 dockerworker-4 nomad.sh[1530]: 2018/09/27 19:50:03.200440 [INFO] driver.docker: started container 99b5de86d368ba60e436d4fbe487c1588fdb929994c1fe6cd4d5a8525f40e25d
@dadgar We reproduced this on a test stand.
What we did: with prohibit_overlap = true set, we got the following status:
vagrant@consulnomad-2:~$ nomad status
ID Type Priority Status Submit Date
vault_debug-00 service 50 running 2018-09-27T02:40:21+03:00
vault_debug-02 batch/periodic 50 running 2018-09-27T02:41:29+03:00
vault_debug-02/periodic-1538067000 batch 50 running 2018-09-27T19:50:00+03:00
vault_debug-02/periodic-1538173994 batch 50 pending 2018-09-29T01:33:14+03:00
And again we see in the logs:
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [WARN] raft: Failed to contact 192.168.142.101:4647 in 505.784124ms
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [WARN] raft: Failed to contact 192.168.142.103:4647 in 503.875346ms
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [WARN] raft: Failed to contact 192.168.142.101:4647 in 558.647808ms
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [WARN] raft: Failed to contact quorum of nodes, stepping down
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [INFO] raft: Node at 192.168.142.102:4647 [Follower] entering Follower state (Leader: "")
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [INFO] raft: aborting pipeline replication to peer {Voter 192.168.142.101:4647 192.168.142.101:4647}
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12 [INFO] raft: aborting pipeline replication to peer {Voter 192.168.142.103:4647 192.168.142.103:4647}
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12.642723 [INFO] nomad: cluster leadership lost
Sep 29 01:33:12 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:12.642745 [ERR] worker: failed to dequeue evaluation: eval broker disabled
Sep 29 01:33:14 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:14 [WARN] raft: Heartbeat timeout from "" reached, starting election
Sep 29 01:33:14 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:14 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 4
Sep 29 01:33:15 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:15 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:15 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:15 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 5
Sep 29 01:33:17 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:17 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:17 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:17 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 6
Sep 29 01:33:17 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:17.696591 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:17 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:17.696622 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:18 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:18 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:18 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:18 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 7
Sep 29 01:33:18 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:18 [INFO] memberlist: Suspect consulnomad-3.global has failed, no acks received
Sep 29 01:33:19 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:19 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:19 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:19 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 8
Sep 29 01:33:20 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:20.492463 [ERR] http: Request /v1/agent/health?type=server, error: {"server":{"ok":false,"message":"No cluster leader"}}
Sep 29 01:33:21 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:21 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:21 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:21 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 9
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22 [ERR] raft: Failed to heartbeat to 192.168.142.101:4647: read tcp 192.168.142.102:58886->192.168.142.101:4647: i/o timeout
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22 [ERR] raft: Failed to heartbeat to 192.168.142.103:4647: read tcp 192.168.142.102:34490->192.168.142.103:4647: i/o timeout
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22 [ERR] yamux: keepalive failed: i/o deadline reached
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22.351969 [ERR] nomad.rpc: multiplex conn accept failed: keepalive timeout
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 10
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22 [ERR] yamux: keepalive failed: i/o deadline reached
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22.502395 [ERR] nomad.rpc: multiplex conn accept failed: keepalive timeout
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22.830493 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:22 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:22.869581 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:23 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:23 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:23 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:23 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 11
Sep 29 01:33:24 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:24 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:24 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:24 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:24 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:24 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:24 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:24 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 12
Sep 29 01:33:25 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:25 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:25 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:25 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:26 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:26 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:26 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:26 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 13
Sep 29 01:33:27 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:27 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:27 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:27 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:27 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:27.975306 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:27 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:27 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:27 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:27 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 14
Sep 29 01:33:28 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:28 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:28 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:28 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:28 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:28.430221 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:28 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:28 [INFO] memberlist: Suspect consulnomad-1.global has failed, no acks received
Sep 29 01:33:29 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:29 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:29 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:29 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 15
Sep 29 01:33:29 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:29 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:29 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:29 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:29 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:29 [ERR] yamux: keepalive failed: i/o deadline reached
Sep 29 01:33:29 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:29.857828 [ERR] nomad.rpc: multiplex conn accept failed: keepalive timeout
Sep 29 01:33:30 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:30.001957 [WARN] server.nomad: failed to query service "nomad" in Consul datacenter "test": Get http://127.0.0.1:8500/v1/catalog/service/nomad?dc=test&near=_agent&stale=&tag=serf&wait=2000ms: net/http: r
Sep 29 01:33:30 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:30 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:30 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:30 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 16
Sep 29 01:33:31 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:31 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:31 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:31 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:32 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:32 [WARN] raft: Election timeout reached, restarting election
Sep 29 01:33:32 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:32 [INFO] raft: Node at 192.168.142.102:4647 [Candidate] entering Candidate state in term 17
Sep 29 01:33:32 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:32 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.101:4647 192.168.142.101:4647}: dial tcp 192.168.142.101:4647: i/o timeout
Sep 29 01:33:32 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:32 [ERR] raft: Failed to make RequestVote RPC to {Voter 192.168.142.103:4647 192.168.142.103:4647}: dial tcp 192.168.142.103:4647: i/o timeout
Sep 29 01:33:33 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:33.366631 [ERR] worker: failed to dequeue evaluation: No cluster leader
Sep 29 01:33:33 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:33 [INFO] raft: Node at 192.168.142.102:4647 [Follower] entering Follower state (Leader: "")
Sep 29 01:33:33 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:33 [WARN] raft: Failed to get previous log: 2980 log not found (last: 2975)
Sep 29 01:33:35 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:35 [WARN] memberlist: Refuting a suspect message (from: consulnomad-1.global)
Sep 29 01:33:43 consulnomad-2 nomad.sh[19505]: 2018/09/29 01:33:43 [WARN] memberlist: Was able to reach consulnomad-1.global via TCP but not UDP, network may be misconfigured and not allowing bidirectional UDP
Our job definition is:
job "vault_debug-02"
{
  datacenters = ["test"]
  priority = 50
  type = "batch"

  periodic
  {
    cron = "*/10 * * * *"
    prohibit_overlap = true
  }

  constraint
  {
    attribute = "${attr.kernel.name}"
    value = "linux"
  }

  meta
  {
    CONTAINER_VERSION = "02"
  }

  group test
  {
    count = 2

    task "vault_debug_task"
    {
      driver = "docker"
      kill_timeout = "1m"

      artifact
      {
        source = "http://docker.service.consul/playrix-vault_debug-${NOMAD_META_CONTAINER_VERSION}.tar.gz"
        options
        {
          archive = false
        }
      }

      config
      {
        image = "playrix/vault_debug:${NOMAD_META_CONTAINER_VERSION}"
        load = "playrix-vault_debug-${NOMAD_META_CONTAINER_VERSION}.tar.gz"
        command = "/sbin/init_plrx"
        args = ["-c", "/sbin/runsvdir -P /etc/service"]
      }

      vault
      {
        policies = ["bind_password", "vault_debug-consul"]
        change_mode = "noop"
      }

      template
      {
        data = <<EOH
CONSUL_TOKEN="{{with secret "secrets/consul/plr/global/creds/vault_debug-consul"}}{{.Data.token}}{{end}}"
EOH
        destination = "secrets/consul_token.env"
        env = true
      }

      logs
      {
        max_files = 3
        max_file_size = 10
      }

      resources
      {
        memory = 300
        cpu = 200

        network
        {
          mbits = 500

          port "statsite"
          {
            static = 8125
          }
        }
      }
    }
  }
}
@dadgar After applying the following patch, we prevent an overlapping periodic job from being launched unnecessarily when the leader moves to another node, but it doesn't fix the statistics:
diff --git a/nomad/leader.go b/nomad/leader.go
index 119e72ebf..64f782394 100644
--- a/nomad/leader.go
+++ b/nomad/leader.go
@@ -388,6 +388,15 @@ func (s *Server) restorePeriodicDispatcher() error {
continue
}
+ if job.Periodic.ProhibitOverlap {
+ running, err := s.RunningChildren(job)
+ if err != nil {
+ s.logger.Printf("[WARN] Can't detect is periodic job have already running childs: %v", err)
+ } else if running {
+ continue
+ }
+ }
+
// If the periodic job has never been launched before, launch will hold
// the time the periodic job was added. Otherwise it has the last launch
// time of the periodic job.
In our case a periodic task may run longer than its launch interval: for example, a task is launched every 10 minutes but runs for 20-30 minutes. In such cases, when the leader moves, unpatched Nomad launches an additional child job, which is wrong, because a launched job already exists.
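For illustration, the check the patch performs server-side (via RunningChildren) corresponds roughly to this hypothetical helper against the public Go API, which just looks for non-terminal children of the periodic parent:

package periodicutil // hypothetical package name for this sketch

import "github.com/hashicorp/nomad/api"

// hasRunningChildren reports whether any child of the given periodic job
// (jobs named "<parent>/periodic-<ts>") is still pending or running.
// Illustration of the idea only; the patch itself relies on the
// server-internal RunningChildren, not the HTTP API.
func hasRunningChildren(client *api.Client, parentID string) (bool, error) {
    stubs, _, err := client.Jobs().List(&api.QueryOptions{Prefix: parentID + "/periodic-"})
    if err != nil {
        return false, err
    }
    for _, s := range stubs {
        if s.Status == "pending" || s.Status == "running" {
            return true, nil
        }
    }
    return false, nil
}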
To fix the statistics, an additional patch is needed:
diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go
index af0f0bfa4..3310aef70 100644
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@@ -2958,19 +2958,39 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error {
if err != nil {
return err
}
- for {
- rawJob := iter.Next()
- if rawJob == nil {
- break
- }
+
+ parentChildSummaryMap := make(map[structs.NamespacedID]*structs.JobChildrenSummary)
+
+ for rawJob := iter.Next(); rawJob != nil; rawJob = iter.Next(){
job := rawJob.(*structs.Job)
+ if job.ParentID != "" {
+ parentChildSummary, l_ok := parentChildSummaryMap[structs.NamespacedID{Namespace: job.Namespace, ID: job.ParentID}]
+ if !l_ok {
+ parentChildSummary = new(structs.JobChildrenSummary)
+ parentChildSummaryMap[structs.NamespacedID{Namespace: job.Namespace, ID: job.ParentID}] = parentChildSummary
+ }
+
+ // Increment new status
+ switch job.Status {
+ case structs.JobStatusPending:
+ parentChildSummary.Pending++
+ case structs.JobStatusRunning:
+ parentChildSummary.Running++
+ case structs.JobStatusDead:
+ parentChildSummary.Dead++
+ default:
+ return fmt.Errorf("unknown new job status %q", job.Status)
+ }
+ }
+
// Create a job summary for the job
summary := &structs.JobSummary{
JobID: job.ID,
Namespace: job.Namespace,
Summary: make(map[string]structs.TaskGroupSummary),
}
+
for _, tg := range job.TaskGroups {
summary.Summary[tg.Name] = structs.TaskGroupSummary{}
}
@@ -3029,6 +3049,27 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error {
}
}
+ for namespacedID, parentChildSummary := range parentChildSummaryMap {
+ summaryRaw, err := txn.First("job_summary", "id", namespacedID.Namespace, namespacedID.ID)
+ if err != nil {
+ return fmt.Errorf("unable to retrieve summary for parent job: %v", err)
+ }
+
+ if summaryRaw != nil {
+ existing := summaryRaw.(*structs.JobSummary)
+ pSummary := existing.Copy()
+ pSummary.Children = parentChildSummary
+
+ // Update the index
+ pSummary.ModifyIndex = index
+
+ // Insert the summary
+ if err := txn.Insert("job_summary", pSummary); err != nil {
+ return fmt.Errorf("job summary insert failed: %v", err)
+ }
+ }
+ }
+
// Update the indexes table for job summary
if err := txn.Insert("index", &IndexEntry{"job_summary", index}); err != nil {
return fmt.Errorf("index update failed: %v", err)
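If I understand the code path correctly, this is the same reconciliation exercised by nomad system reconcile summaries, so once a patched server is running, the broken parent counters can be repaired on demand with that command.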
I'm seeing this issue as well on 1.3.1:
~ > nomad status prod_mash | head -n 25
ID = prod_mash
Name = prod_mash
Submit Date = 2022-07-11T16:16:13+02:00
Type = batch
Priority = 60
Datacenters = meyrin
Namespace = default
Status = running
Periodic = false
Parameterized = true
Parameterized Job
Payload = required
Required Metadata = PARENT_JOB
Optional Metadata = <none>
Parameterized Job Summary
Pending Running Dead
0 -374 965550
Dispatched Jobs
ID Status
prod_mash/dispatch-1657777031-43cf92a7 dead
prod_mash/dispatch-1657780692-5d8b1e72 dead
prod_mash/dispatch-1657780754-84b79f7c dead
...
The number of dead jobs is also a bit surprising, but it might be correct (this job has been deployed for a long time now). I'm sure there are currently 0 running jobs, though.
Some other issues that look related to this one: https://github.com/hashicorp/nomad/issues/13519 https://github.com/hashicorp/nomad/issues/10338 https://github.com/hashicorp/nomad/issues/10222 https://github.com/hashicorp/nomad/issues/13897.
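In case it helps anyone debugging this: the counters shown under "Parameterized Job Summary" can also be read programmatically from the job summary endpoint. A quick sketch with the Go API client (illustrative only; the job ID is the one from my output above):

package main

import (
    "fmt"
    "log"

    "github.com/hashicorp/nomad/api"
)

func main() {
    client, err := api.NewClient(api.DefaultConfig())
    if err != nil {
        log.Fatal(err)
    }
    // The children counters shown by "nomad status" for a parameterized or
    // periodic parent live on the job summary.
    summary, _, err := client.Jobs().Summary("prod_mash", nil)
    if err != nil {
        log.Fatal(err)
    }
    if summary.Children != nil {
        fmt.Printf("pending=%d running=%d dead=%d\n",
            summary.Children.Pending, summary.Children.Running, summary.Children.Dead)
    }
}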
Nomad version
Nomad v0.8.6 (fcc4149c55399eb4979cd3fe3fb983cfe6c8449a+CHANGES)
Issue
Is it normal that pending and running are negative?