@sujoykr1, can you please provide more logs for the failing test case?
=================================== FAILURES ===================================
__________________________________ test_upgrade_mcg_io __________________________________
mcg_workload_job = <ocs_ci.ocs.resources.ocs.OCS object at 0x3ff95000790>
@post_upgrade
@skipif_managed_service
@pytest.mark.polarion_id("OCS-2207")
@bugzilla("1874243")
@red_squad
def test_upgrade_mcg_io(mcg_workload_job):
"""
Confirm that there is MCG workload job running after upgrade.
"""
assert wait_for_active_pods(
mcg_workload_job, 1 ), f"Job {mcg_workload_job.name} doesn't have any running pod" E AssertionError: Job mcg-workload doesn't have any running pod E assert False E + where False = wait_for_active_pods(<ocs_ci.ocs.resources.ocs.OCS object at 0x3ff95000790>, 1)
tests/ecosystem/upgrade/test_noobaa.py:181: AssertionError
=============================== warnings summary ===============================
tests/ecosystem/upgrade/test_configuration.py: 4 warnings
tests/ecosystem/upgrade/test_noobaa.py: 6 warnings
tests/manage/monitoring/test_workload_with_distruptions.py: 189 warnings
tests/ecosystem/upgrade/test_upgrade.py: 2 warnings
tests/ecosystem/upgrade/test_resources.py: 8 warnings
  /opt/ocs-ci/.venv/lib/python3.10/site-packages/urllib3/util/ssl_.py:260: DeprecationWarning: ssl.PROTOCOL_TLS is deprecated
    context = SSLContext(ssl_version or PROTOCOL_TLS)

tests/ecosystem/upgrade/test_configuration.py: 4 warnings
tests/ecosystem/upgrade/test_noobaa.py: 4 warnings
tests/manage/monitoring/test_workload_with_distruptions.py: 189 warnings
tests/ecosystem/upgrade/test_upgrade.py: 2 warnings
tests/ecosystem/upgrade/test_resources.py: 6 warnings
  /opt/ocs-ci/.venv/lib/python3.10/site-packages/urllib3/connectionpool.py:981: InsecureRequestWarning: Unverified HTTPS request is being made to host 'prometheus-k8s-openshift-monitoring.apps.m42lp41.test.ocs'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
    warnings.warn(

tests/ecosystem/upgrade/test_noobaa.py: 3 warnings
tests/ecosystem/upgrade/test_resources.py: 6 warnings
tests/managed-service/test_ms_upgrade.py: 5 warnings
  /opt/ocs-ci/.venv/lib/python3.10/site-packages/urllib3/connection.py:407: DeprecationWarning: ssl.match_hostname() is deprecated
    match_hostname(cert, asserted_hostname)

tests/ecosystem/upgrade/test_noobaa.py::test_start_upgrade_mcg_io
tests/ecosystem/upgrade/test_noobaa.py::test_start_upgrade_mcg_io
tests/ecosystem/upgrade/test_noobaa.py::test_start_upgrade_mcg_io
tests/ecosystem/upgrade/test_noobaa.py::test_start_upgrade_mcg_io
tests/ecosystem/upgrade/test_noobaa.py::test_start_upgrade_mcg_io
tests/ecosystem/upgrade/test_noobaa.py::test_start_upgrade_mcg_io
  /opt/ocs-ci/.venv/lib/python3.10/site-packages/botocore/httpsession.py:57: DeprecationWarning: ssl.PROTOCOL_TLS is deprecated
    context = SSLContext(ssl_version or ssl.PROTOCOL_SSLv23)
tests/ecosystem/upgrade/test_upgrade.py::test_upgrade
  /opt/ocs-ci/.venv/lib/python3.10/site-packages/_pytest/threadexception.py:75: PytestUnhandledThreadExceptionWarning: Exception in thread Thread-15

  Traceback (most recent call last):
    File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
      self.run()
    File "/opt/ocs-ci/ocs_ci/ocs/cluster.py", line 1043, in run
      self.latest_health_status = self.ceph_cluster.get_ceph_health(detail=True)
    File "/opt/ocs-ci/ocs_ci/ocs/cluster.py", line 581, in get_ceph_health
      return self.toolbox.exec_cmd_on_pod(
    File "/opt/ocs-ci/ocs_ci/ocs/resources/pod.py", line 190, in exec_cmd_on_pod
      return self.ocp.exec_oc_cmd(
    File "/opt/ocs-ci/ocs_ci/ocs/ocp.py", line 170, in exec_oc_cmd
      out = run_cmd(
    File "/opt/ocs-ci/ocs_ci/utility/utils.py", line 479, in run_cmd
      completed_process = exec_cmd(
    File "/opt/ocs-ci/ocs_ci/utility/utils.py", line 658, in exec_cmd
      raise CommandFailed(
  ocs_ci.ocs.exceptions.CommandFailed: Error during execution of command: oc -n openshift-storage rsh rook-ceph-tools-7f9d67b95f-sfk4m ceph health detail. Error is Error from server (NotFound): pods "rook-ceph-tools-7f9d67b95f-sfk4m" not found

  warnings.warn(pytest.PytestUnhandledThreadExceptionWarning(msg))

-- Docs: https://docs.pytest.org/en/stable/warnings.html
---- generated xml file: /opt/ocs-ci-logs/upgrade_4.13.0-219_2023-06-12.xml ----
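For whoever hits this next: the assertion only reports that no pod is active, so a quick way to capture the extra detail being asked for above is to dump the Job's status counters directly. A minimal sketch (not part of ocs-ci; assumes `oc` is on PATH, and the namespace below is just a placeholder that has to match the test namespace of the run):

```python
# Minimal sketch (not ocs-ci code): dump the Job's status counters so we can
# see whether any pod is active, failed, or succeeded. Assumes `oc` is on PATH;
# the namespace is a placeholder and must match the test namespace of the run.
import json
import subprocess

NAMESPACE = "namespace-test-fa8c721e9280420e85d4c844e"  # placeholder


def job_status_counters(name="mcg-workload", namespace=NAMESPACE):
    out = subprocess.run(
        ["oc", "-n", namespace, "get", "job", name, "-o", "json"],
        check=True,
        capture_output=True,
        text=True,
    ).stdout
    status = json.loads(out).get("status", {})
    # Kubernetes omits 'active' (and 'failed'/'succeeded') when the count is zero.
    return {key: status.get(key, 0) for key in ("active", "failed", "succeeded")}


print(job_status_counters())
```

If `active` comes back 0 together with `failed` 1, the pod log (`oc logs job/mcg-workload -n <namespace>`) is the next thing to attach.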
This issue has been automatically marked as stale because it has not had recent activity. It will be closed in 30 days if no further activity occurs.
This issue has been automatically closed due to inactivity. Please re-open if this still requires investigation.
The issue is still seen in ODF 4.14 automation runs.
The mcg-workload job is failing:
2023-11-02 17:00:21,723 - MainThread - INFO - ocs_ci.utility.utils.exec_cmd.624 - Executing command: oc --kubeconfig /home/jenkins/current-cluster-dir/openshift-cluster-dir/auth/kubeconfig -n namespace-test-fa8c721e9280420e85d4c844e get Job mcg-workload -n namespace-test-fa8c721e9280420e85d4c844e -o yaml
2023-11-02 17:00:21,901 - MainThread - DEBUG - ocs_ci.utility.utils.exec_cmd.645 - Command stdout: apiVersion: batch/v1
kind: Job
metadata:
  annotations:
    batch.kubernetes.io/job-tracking: ""
  creationTimestamp: "2023-11-02T12:58:48Z"
  generation: 1
  labels:
    batch.kubernetes.io/controller-uid: ae28220a-5158-4bd4-a713-c445bf628ab5
    batch.kubernetes.io/job-name: mcg-workload
    controller-uid: ae28220a-5158-4bd4-a713-c445bf628ab5
    job-name: mcg-workload
  name: mcg-workload
  namespace: namespace-test-fa8c721e9280420e85d4c844e
  resourceVersion: "153919"
  uid: ae28220a-5158-4bd4-a713-c445bf628ab5
spec:
  backoffLimit: 0
  completionMode: NonIndexed
  completions: 1
  parallelism: 1
  selector:
    matchLabels:
      batch.kubernetes.io/controller-uid: ae28220a-5158-4bd4-a713-c445bf628ab5
  suspend: false
  template:
    metadata:
      creationTimestamp: null
      labels:
        batch.kubernetes.io/controller-uid: ae28220a-5158-4bd4-a713-c445bf628ab5
        batch.kubernetes.io/job-name: mcg-workload
        controller-uid: ae28220a-5158-4bd4-a713-c445bf628ab5
        job-name: mcg-workload
      name: mcg-workload
    spec:
      containers:
      - command:
        - /usr/bin/fio
        - --output-format=json
        - /etc/fio/workload.fio
        image: quay.io/fbalak/fio-fedora:latest
        imagePullPolicy: Always
        name: fio
        resources: {}
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/fio
          name: mcg-workload-config-vol
      dnsPolicy: ClusterFirst
      restartPolicy: Never
      schedulerName: default-scheduler
      securityContext: {}
      terminationGracePeriodSeconds: 30
      volumes:
      - configMap:
          defaultMode: 420
          name: mcg-workload-config
        name: mcg-workload-config-vol
status:
  conditions:
  - lastProbeTime: "2023-11-02T13:30:41Z"
    lastTransitionTime: "2023-11-02T13:30:41Z"
    message: Job has reached the specified backoff limit
    reason: BackoffLimitExceeded
    status: "True"
    type: Failed
  failed: 1
  ready: 0
  startTime: "2023-11-02T12:58:48Z"
  uncountedTerminatedPods: {}
2023-11-02 17:00:21,901 - MainThread - DEBUG - ocs_ci.utility.utils.exec_cmd.654 - Command stderr is empty
2023-11-02 17:00:21,901 - MainThread - DEBUG - ocs_ci.utility.utils.exec_cmd.655 - Command return code: 0
2023-11-02 17:00:21,908 - MainThread - ERROR - ocs_ci.utility.utils.__iter__.1293 - Exception raised during iteration: 'active'
Traceback (most recent call last):
  File "/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/utility/utils.py", line 1290, in __iter__
    yield self.func(*self.func_args, **self.func_kwargs)
  File "/home/jenkins/workspace/qe-deploy-ocs-cluster-prod/ocs-ci/ocs_ci/ocs/mcg_workload.py", line 200, in _retrieve_job_state
    return job_obj["status"]["active"]
KeyError: 'active'
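The KeyError itself is a secondary issue: once the Job has failed, Kubernetes drops the `active` field from `status` (the YAML above only carries `failed: 1`), so `job_obj["status"]["active"]` blows up instead of reporting the real failure. A more defensive lookup could look roughly like this (a sketch only, not the actual `ocs_ci.ocs.mcg_workload` code):

```python
# Sketch of a more defensive lookup than the one that raised the KeyError above
# (illustrative only; not the actual ocs_ci.ocs.mcg_workload._retrieve_job_state):
# treat a missing 'active' field as zero running pods and surface a clearer
# error when the Job has already hit its backoff limit.
def retrieve_active_pod_count(job_obj):
    status = job_obj.get("status", {})
    for condition in status.get("conditions", []):
        if condition.get("type") == "Failed" and condition.get("status") == "True":
            raise RuntimeError(
                f"Job already failed: {condition.get('reason')}: {condition.get('message')}"
            )
    # 'active' is omitted by Kubernetes when no pods are currently running.
    return status.get("active", 0)
```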
We're also seeing this in all upgrade jobs and all environments. It looks like the FIO job failed during upgrade, perhaps because the default backingstore was temporarily rejected:
[2023-10-06T18:12:40.266Z] 18:12:40 - MainThread - ocs_ci.ocs.mcg_workload - INFO - Description of job pod mcg-workload-pkccv:
{
"apiVersion":"v1",
"kind":"Pod",
"metadata":{
"annotations":{
"k8s.v1.cni.cncf.io/network-status":"[{\n \"name\": \"openshift-sdn\",\n \"interface\": \"eth0\",\n \"ips\": [\n \"10.131.2.29\"\n ],\n \"default\": true,\n \"dns\": {}\n}]",
"k8s.v1.cni.cncf.io/networks-status":"[{\n \"name\": \"openshift-sdn\",\n \"interface\": \"eth0\",\n \"ips\": [\n \"10.131.2.29\"\n ],\n \"default\": true,\n \"dns\": {}\n}]",
"openshift.io/scc":"restricted-v2",
"seccomp.security.alpha.kubernetes.io/pod":"runtime/default"
},
"creationTimestamp":"2023-10-06T15:51:52Z",
"generateName":"mcg-workload-",
"labels":{
"controller-uid":"0f0b14df-e513-4d41-9bb5-c0e1691bd9ac",
"job-name":"mcg-workload"
},
"name":"mcg-workload-pkccv",
"namespace":"namespace-test-fce4ca3820d741259933d60f1",
"ownerReferences":[
{
"apiVersion":"batch/v1",
"blockOwnerDeletion":true,
"controller":true,
"kind":"Job",
"name":"mcg-workload",
"uid":"0f0b14df-e513-4d41-9bb5-c0e1691bd9ac"
}
],
"resourceVersion":"336444",
"uid":"1096d398-c502-406c-abac-c637bf5fb757"
},
"spec":{
"containers":[
{
"command":[
"/usr/bin/fio",
"--output-format=json",
"/etc/fio/workload.fio"
],
"image":"quay.io/fbalak/fio-fedora:latest",
"imagePullPolicy":"Always",
"name":"fio",
"resources":{
},
"securityContext":{
"allowPrivilegeEscalation":false,
"capabilities":{
"drop":[
"ALL"
]
},
"runAsNonRoot":true,
"runAsUser":1000760000
},
"terminationMessagePath":"/dev/termination-log",
"terminationMessagePolicy":"File",
"volumeMounts":[
{
"mountPath":"/etc/fio",
"name":"mcg-workload-config-vol"
},
{
"mountPath":"/var/run/secrets/kubernetes.io/serviceaccount",
"name":"kube-api-access-q4k5f",
"readOnly":true
}
]
}
],
"dnsPolicy":"ClusterFirst",
"enableServiceLinks":true,
"imagePullSecrets":[
{
"name":"default-dockercfg-ddkm8"
}
],
"nodeName":"j-054vi1cs33-uba-rxl4g-app-qbqds",
"preemptionPolicy":"PreemptLowerPriority",
"priority":0,
"restartPolicy":"Never",
"schedulerName":"default-scheduler",
"securityContext":{
"fsGroup":1000760000,
"seLinuxOptions":{
"level":"s0:c28,c2"
},
"seccompProfile":{
"type":"RuntimeDefault"
}
},
"serviceAccount":"default",
"serviceAccountName":"default",
"terminationGracePeriodSeconds":30,
"tolerations":[
{
"effect":"NoExecute",
"key":"node.kubernetes.io/not-ready",
"operator":"Exists",
"tolerationSeconds":300
},
{
"effect":"NoExecute",
"key":"node.kubernetes.io/unreachable",
"operator":"Exists",
"tolerationSeconds":300
}
],
"volumes":[
{
"configMap":{
"defaultMode":420,
"name":"mcg-workload-config"
},
"name":"mcg-workload-config-vol"
},
{
"name":"kube-api-access-q4k5f",
"projected":{
"defaultMode":420,
"sources":[
{
"serviceAccountToken":{
"expirationSeconds":3607,
"path":"token"
}
},
{
"configMap":{
"items":[
{
"key":"ca.crt",
"path":"ca.crt"
}
],
"name":"kube-root-ca.crt"
}
},
{
"downwardAPI":{
"items":[
{
"fieldRef":{
"apiVersion":"v1",
"fieldPath":"metadata.namespace"
},
"path":"namespace"
}
]
}
},
{
"configMap":{
"items":[
{
"key":"service-ca.crt",
"path":"service-ca.crt"
}
],
"name":"openshift-service-ca.crt"
}
}
]
}
}
]
},
"status":{
"conditions":[
{
"lastProbeTime":"None",
"lastTransitionTime":"2023-10-06T15:51:52Z",
"status":"True",
"type":"Initialized"
},
{
"lastProbeTime":"None",
"lastTransitionTime":"2023-10-06T16:29:11Z",
"reason":"PodFailed",
"status":"False",
"type":"Ready"
},
{
"lastProbeTime":"None",
"lastTransitionTime":"2023-10-06T16:29:11Z",
"reason":"PodFailed",
"status":"False",
"type":"ContainersReady"
},
{
"lastProbeTime":"None",
"lastTransitionTime":"2023-10-06T15:51:52Z",
"status":"True",
"type":"PodScheduled"
}
],
"containerStatuses":[
{
"containerID":"cri-o://a90cf743fbf1d07c98b9b137b37d9498381d0b530303a2073e6bae5a1bf9cc7d",
"image":"quay.io/fbalak/fio-fedora:latest",
"imageID":"quay.io/fbalak/fio-fedora@sha256:c733784e57d19b34124cca397c330d9094b4e18129f7a546564e1a3de96dc2c4",
"lastState":{
},
"name":"fio",
"ready":false,
"restartCount":0,
"started":false,
"state":{
"terminated":{
"containerID":"cri-o://a90cf743fbf1d07c98b9b137b37d9498381d0b530303a2073e6bae5a1bf9cc7d",
"exitCode":1,
"finishedAt":"2023-10-06T16:29:11Z",
"reason":"Error",
"startedAt":"2023-10-06T15:53:20Z"
}
}
}
],
"hostIP":"10.1.113.156",
"phase":"Failed",
"podIP":"10.131.2.29",
"podIPs":[
{
"ip":"10.131.2.29"
}
],
"qosClass":"BestEffort",
"startTime":"2023-10-06T15:51:52Z"
}
}
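One way to confirm or rule out the rejected-backingstore theory on the next upgrade run would be to poll the default backingstore phase while the upgrade is in progress. A rough sketch (outside ocs-ci; assumes the usual `noobaa-default-backing-store` name and `oc` on PATH):

```python
# Rough check (not ocs-ci code) for the rejected-backingstore theory: sample the
# default backingstore phase while the upgrade runs and log anything that is not
# Ready. Assumes the usual default name 'noobaa-default-backing-store' and `oc`
# on PATH; adjust names for the cluster under test.
import subprocess
import time


def backingstore_phase(name="noobaa-default-backing-store", namespace="openshift-storage"):
    return subprocess.run(
        ["oc", "-n", namespace, "get", "backingstore", name,
         "-o", "jsonpath={.status.phase}"],
        check=True,
        capture_output=True,
        text=True,
    ).stdout.strip()


# Sample once a minute for an hour; a non-Ready phase overlapping the fio pod's
# runtime (15:53-16:29 in the description above) would support the theory.
for _ in range(60):
    phase = backingstore_phase()
    if phase != "Ready":
        print(f"{time.ctime()}: default backingstore phase is {phase}")
    time.sleep(60)
```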
RP links for reference:
Apparently this will get resolved when https://url.corp.redhat.com/c960ed4 is complete; I'll add a skip for this test until it's finished.
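For reference, the skip would look roughly like this on top of the existing markers in tests/ecosystem/upgrade/test_noobaa.py (a sketch; the final marker and reason text may differ):

```python
# Rough shape of the planned skip (final marker/reason text may differ); it would
# sit alongside the existing decorators on test_upgrade_mcg_io.
import pytest


@pytest.mark.skip(
    reason="MCG workload job fails across upgrade; skip until "
    "https://url.corp.redhat.com/c960ed4 is complete"
)
def test_upgrade_mcg_io(mcg_workload_job):
    ...
```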
__________________________________ test_upgrade_mcg_io __________________________________
mcg_workload_job = <ocs_ci.ocs.resources.ocs.OCS object at 0x3ff68c07070>
tests/ecosystem/upgrade/test_noobaa.py:181: AssertionError