open-horizon / anax

Horizon agent control system
https://open-horizon.github.io/docs/anax/docs/
Apache License 2.0
73 stars 98 forks source link

Agreement is not marked as archived when the terminated reason is 'service start timeout' #3860

Open LiilyZhang opened 1 year ago

LiilyZhang commented 1 year ago

This is an intermittent issue found in e2edev.

The hzn unregister command hangs because one agreement still has "archived": false even though it has already been terminated ("agreement_terminated_time": 1691441616).

hzn eventlog:

"2023-08-03 15:17:28:   Start terminating agreement for k8s-service1. Termination reason: service start timeout",
  "2023-08-03 15:17:28:   Complete terminating agreement for k8s-service1. Termination reason: service start timeout",
  "2023-08-03 15:18:28:   Node received Cancel message for e2edev@somecomp.com/k8s-service1 from agbot IBM/agbot.",
  "2023-08-03 15:18:28:   Error handling Cancel message for k8s-service1. ignoring cancel, agreement 41171e31e5bf7262746be5b9ff5a1e01f28327e7ee638c730cb610d5f778f6bd is terminating",

agreement:

 {
        "name": "Policy for userdev/agent-in-kube merged with userdev/bp_k8s",
        "dependent_services": [],
        "archived": false,
        "current_agreement_id": "c2c01fc7d47a832d2242a70460fd7c53d03f6acd08817faf440dd2c86a94bca8",
        "consumer_id": "IBM/agbot",
        "counterparty_address": "IBM/agbot",
        "agreement_creation_time": 1691441005,
        "agreement_accepted_time": 1691441015,
        "agreement_bc_update_ack_time": 0,
        "agreement_finalized_time": 1691441076,
        "agreement_terminated_time": 1691441616,
        "agreement_force_terminated_time": 0,
        "agreement_execution_start_time": 0,
        "agreement_data_received_time": 0,
        "current_deployment": {},
        "extended_deployment": {},
        "proposal": "{\"type\":\"proposal\",\"protocol\":\"Basic\",\"version\":1,\"agreementId\":\"c2c01fc7d47a832d2242a70460fd7c53d03f6acd08817faf440dd2c86a94bca8\",\"tsandcs\":\"{\\\"header\\\":{\\\"name\\\":\\\"Policy for userdev/agent-in-kube merged with userdev/bp_k8s\\\",\\\"version\\\":\\\"2.0\\\"},\\\"agreementProtocols\\\":[{\\\"name\\\":\\\"Basic\\\",\\\"protocolVersion\\\":1}],\\\"workloads\\\":[{\\\"cluster_deployment\\\":\\\"{\\\\\\\"metadata\\\\\\\":{\\\\\\\"namespace\\\\\\\":\\\\\\\"\\\\\\\"},\\\\\\\"operatorYamlArchive\\\\\\\":\\\\\\\"H4sIAPWCxl4AA+1Z3W/bNhDPc/4KIn0IMMySZflj0JuXZluxtjGcoHsMaIm2uVKiRlLu3GH/+47UhyVXkd0tdjCUvwdbOpLH493xeEc57sXJ0QdMRn3zD9j/N8+ePx57w8nY9zR9MvH8CzQ6vWgXF5lUWCB0IThXXf0Otf9P4biSiA0NySMOQ54lytnimD3vHNrA4+HwSft73kDbfzD2++ORD3TPnwwnF6j/vGK04xu3P07pByIk5UmANt7lR5pEAbrPXWKae8RlTBSOsMLBJUIJjkmAFE8Lt+nxlAisuLh86ZVY/Bs4ruCMnGDT13Bg/w/6/m7/TyAWQEAYDT27/8+B+v4XCxw6OFNrLuhnrIDmfPxBOpS7VWSYg7M04kEoiOn5QGMCmozTACUZY52RQmSMyOCyh2DynwXPUqk59dDVFfwJInkmQlLQUh5J81Bwab64S5pgRj/DCgydJFHKaaLyt1QvTCqSqA1nWUxChmlcdNyQslfIkyVdxTgtWcOSTNOGiEUhhVklMY8RYaR4XBFl/hlMkk+IVbg2T1kalQM+GeIXi8WpmXBvuRFJGd/GlXARJjFPJCleBTTTEFfvoHFFlhmT55AYBKFgPpqsnJALwiX8xV8uobBN0XtPrFKAQrxDWnkPPlS0tDlSl/qarlEToVjo13jf3gKOtOW+sZq2PcRzt16nXK+z4Pxzu9Kvv7t+bvu/dGSyOAfy8/9xAdFdb+yT5AGH8n8o/fbz/4E9/8+D3bH+Y+4Cl0dnBMeVBTJb/E5CZc771uqiM1MAueZkqWeo5R9dI1AVRzukt7GtguOGIpKnvQQ68v5nOIKf/tCH/W/Chb3/OQMK+3dlG4+7xrJNPsKoow+LA/HfH/Qnu/pPx//B0PeHNv6fA/VoD8/kTyiX9JvcBfoFxPmy/rsBdfF4XiSfrwkk2VQH1s7DoPIapzOplSkJ9fBVHr0PJMBJWRqUR8MDT4uT5W53GCCT6v76RIe3eRqMUMoygVmrzKZdwsGYMSzaekAHGcJLgEy1kuKQRJqWLRo5OjLFWiYD9NffOlWH2iQyZ1LeCAyS6ezNB/8+XJMY50QEubsMBU1Nv5YFICqRWhOUD0JLoOjXlmUgYF7wTIWmKlrKpVFzg4q2N/u1Fi/vAw1geJLPvclpJAI1GCn4EuggGNQ/oAEod8wya2yR7oITxE1m4JiEAJggueYZi3RBDq8Kxod8lUAFV3KG+biZkkGlUliuBBT9REDBpxWbke+BfYRivAUmeg6UJTVupot00DuoYmHgkgdorRQUXq67oqp0fHCzOAPv3sJTogRdZFqPbkQ2hLmSrnpYhGuqgHsmiAsK7BnBE2V2Txy9qsx/XZNUbbWnSKXL6IpsHPhJvWvv1YbGxbBc/p16NUlrZX57/1DVhcYETZ0bbe+GyZ3itaJAD0TkhlsKHhuO5XWKeQk
ZJUlT6eDjMVXa0n+AQpW2j4NucJJwhRakKCkjB71JgBoTdgPF8MnVrjUse1qlhxVfj1rNjrm2KnIZm1ot9OW+vIfujU0C/anQjqwvbLT7PxGtukTIw8dXCGEGNMTgCx0X/osce6RN7dtB9VZcQeTHgGlAKJ8X4qfISBENucArUlBe+hj8ZnFM/rfxWlJAyACPTQAP1f+Dfl7/Q+7newNf53/Dvq3/z4J6/tflBLtPAC0Royv5q9K6V2gaRWhJCYskWhOhw4CE8zhAnt3+LwbHrUx9sm+AB/a/NxoNyvp/NBlDP28Az3b/nwPN+i+Vu33+uvpccexNX7HRy+8eemPrg59BqsBFnrjE+tPCW7wgrMpkuu7yFIlTnewXg/eyNdbg082pmcLJxhXk+85xSBckCkMOJWpz9Q7MluMVmoMycJGOo09UrU0etsgoU4jGkAAZPrUhhhigCOvI67awDzynX+uv82XcLB+e+ljWmGKWMTbjYKhtgKbsE97WqwWSbOoMd8v9bfpw88vj++m72/vZ9Oa20QflhclPUD0Eew0oj/vFVW5r2wyrdVCZ2EnKWrpVjNndayPEaedvnfpudjufPtzNn5w/QFct6r+yZ5yFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhYWFhcUz4R+dFvclAFAAAA==\\\\\\\"}\\\",\\\"cluster_deployment_signature\\\":\\\"fhddKl3jvls4cLZs+TR10DTysoFRY4dU4EpkzpgVJMxnIU5OE5+t6Mv1N2rrEJF7KSuP/h0df5+yuwX1br/Xx2X9FaLHXPz7SEU7wuK6kg2Qy2kd8Mqc7hXtgB7kgWJEySIN8oaYWYu075F3lpjKEKjfhD+S9U1jg9xhiMqu4v06XVqFUv/UWhAi4lJCuRpr8fk8d13V32UMj2LPemy0O+q/VJbXGVnPhED8ENgw7Rp3hXMXlsghtWENoHxNEGnOYxUtCiHgBmRtFqoM6g+PahGfPzixSJf3PzDUWgvTSZvPV8FAC9NiHarfBgmCojSBR7o2NI46ITW+dVxp7hAM2eIphDus6hxkcU4vWMZR5snawCfMP0p/P+ADlfD7IKFH+MkgY73Zw/T2yTTr9aQU4Zq177FTv9yp+PPIyc7FMDuqGJWEFh+8HDgrxAxLg4K55QmIhMDHVeSlYERmNFnTOmB0H9Cy5FzseIVU1YtysEfFZWuGshX9Tfs5xyy0r38QIIcX1SlzepgM6yLvZvzJkJszqKpXoJ8w80W5ucP286UAw2KSBsgIrTs9LH7+DjA/U7BmYM90iUMbkQaTXCbgXM/lZJqepJbEHDKEgyPiOU8N/9aXgsWJ3sl3IGLOik5pnsySlrMQ6xalRF49vZ59g9cgc6gzFvTaqy68Fk89wAw=\\\",\\\"priority\\\":{},\\\"workloadUrl\\\":\\\"k8s-service1\\\",\\\"organization\\\":\\\"e2edev@somecomp.com\\\",\\\"version\\\":\\\"1.0.0\\\",\\\"arch\\\":\\\"amd64\\\"}],\\\"valueExchange\\\":{},\\\"dataVerification\\\":{\\\"metering\\\":{}},\\\"proposalRejection\\\":{},\\\"properties\\\":[{\\\"name\\\":\\\"iame2edev\\\",\\\"value\\\":\\\"true\\\"},{\\\"name\\\":\\\"NOK8S\\\",\\\"value\\\":false},{\\\"name\\\":\\\"has.service.embedded.ns\\\",\\\"value\\\":false},{\\\"name\\\":\\\"openhorizon.service.url\\\",\\\"value\\\":\\\"k8s-service1\\\"},{\\\"name\\\":\\\"openh
orizon.service.name\\\",\\\"value\\\":\\\"k8s-service1\\\"},{\\\"name\\\":\\\"openhorizon.service.org\\\",\\\"value\\\":\\\"e2edev@somecomp.com\\\"},{\\\"name\\\":\\\"openhorizon.service.version\\\",\\\"value\\\":\\\"1.0.0\\\"},{\\\"name\\\":\\\"openhorizon.service.arch\\\",\\\"value\\\":\\\"amd64\\\"},{\\\"name\\\":\\\"openhorizon.allowPrivileged\\\",\\\"value\\\":false}],\\\"constraints\\\":[\\\"purpose == network-testing\\\"],\\\"nodeHealth\\\":{},\\\"clusterNamespace\\\":\\\"ns-in-policy\\\"}\",\"producerPolicy\":\"{\\\"header\\\":{\\\"name\\\":\\\"Policy for userdev/agent-in-kube\\\",\\\"version\\\":\\\"2.0\\\"},\\\"valueExchange\\\":{},\\\"dataVerification\\\":{\\\"metering\\\":{}},\\\"proposalRejection\\\":{},\\\"properties\\\":[{\\\"name\\\":\\\"purpose\\\",\\\"value\\\":\\\"network-testing\\\"},{\\\"name\\\":\\\"openhorizon.arch\\\",\\\"value\\\":\\\"amd64\\\"},{\\\"name\\\":\\\"openhorizon.cpu\\\",\\\"value\\\":4},{\\\"name\\\":\\\"openhorizon.allowPrivileged\\\",\\\"value\\\":true},{\\\"name\\\":\\\"openhorizon.kubernetesVersion\\\",\\\"value\\\":\\\"v1.18.20\\\"},{\\\"name\\\":\\\"openhorizon.kubernetesNamespace\\\",\\\"value\\\":\\\"agent-namespace\\\"},{\\\"name\\\":\\\"openhorizon.kubernetesNamespaceScoped\\\",\\\"value\\\":false},{\\\"name\\\":\\\"openhorizon.memory\\\",\\\"value\\\":33654}],\\\"constraints\\\":[\\\"has.service.embedded.ns == false\\\"],\\\"nodeHealth\\\":{},\\\"clusterNamespace\\\":\\\"agent-namespace\\\"}\",\"consumerId\":\"IBM/agbot\"}",
        "proposal_sig": "",
        "agreement_protocol": "Basic",
        "protocol_version": 1,
        "requested_cluster_namespace": "",
        "terminated_reason": 104,
        "terminated_description": "service start timeout",
        "agreement_protocol_terminated_time": 1691441616,
        "workload_terminated_time": 0,
        "metering_notification": {
          "amount": 0,
          "start_time": 0,
          "current_time": 0,
          "missed_time": 0,
          "consumer_meter_signature": "",
          "agreement_hash": "",
          "consumer_agreement_signature": "",
          "consumer_address": "",
          "producer_agreement_signature": "",
          "blockchain_type": ""
        },
        "workload_to_run": {
          "url": "k8s-service1",
          "org": "e2edev@somecomp.com",
          "version": "1.0.0",
          "arch": "amd64"
        },
        "agreement_timeout": 600,
        "service_definition_id": "",
        "failed_verification_attempts": 0,
        "last_verification_update_time": 1691441586
      },

This agreement should be marked as "archived" or deleted from the agent database.

LiilyZhang commented 1 year ago

Location of the fix for this issue: change this line to:

    if agreement.TerminatedReason == basicprotocol.CANCEL_NOT_EXECUTED_TIMEOUT || agreement.WorkloadTerminatedTime != 0 { //service timeout, this field is 0