red-hat-storage / ocs-ci

https://ocs-ci.readthedocs.io/en/latest/
MIT License
108 stars 166 forks source link

test_pvc_creation_deletion_measurement_performance - all test cases in 4.15 are failing #9215

Closed ypersky1980 closed 8 months ago

ypersky1980 commented 9 months ago

The test case is failing. Re-run the test and determine whether this is a product bug (open a BZ) or a test bug (submit a PR with a fix).

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884039/log

self = <test_pvc_creation_deletion_performance.TestPVCCreationDeletionPerformance object at 0x7fe5e9754fa0> interface_type = 'CephBlockPool', pvc_size = '5Gi'

@pytest.mark.parametrize( argnames=["interface_type", "pvc_size"], argvalues=[ pytest.param( [constants.CEPHBLOCKPOOL, "5Gi"], ), pytest.param( [constants.CEPHBLOCKPOOL, "15Gi"], ), pytest.param( [constants.CEPHBLOCKPOOL, "25Gi"], ), pytest.param( [constants.CEPHFILESYSTEM, "5Gi"], ), pytest.param( [constants.CEPHFILESYSTEM, "15Gi"], ), pytest.param( [constants.CEPHFILESYSTEM, "25Gi"], ), ], ) def test_pvc_creation_deletion_measurement_performance( self, interface_type, pvc_size ): """ Measuring PVC creation and deletion times for pvc samples. filling up each PVC with 70% of data. Verifying that those times are within the required limits

Args:
    interface_type (str): the interface type to run against -
        CephBlockPool or CephFileSystem
    pvc_size (str): the size of the pvc to create
"""

# Initializing test variables
self.interface = interface_type

num_of_samples = 5
if self.dev_mode:
    num_of_samples = 2

accepted_creation_time = 5  # old_value=1
accepted_deletion_time = Interface_Info[self.interface]["delete_time"]
accepted_creation_deviation_percent = 50
accepted_deletion_deviation_percent = 50

all_mesuring_times = {
    "create": [],
    "delete": [],
    "csi_create": [],
    "csi_delete": [],
}

msg_prefix = f"Interface: {self.interface}, PVC size: {pvc_size}."

self.set_results_path_and_file(
    "test_pvc_creation_deletion_measurement_performance"
)

self.start_time = self.get_time()

self.get_env_info()

# Initialize the results doc file.
self.full_results = self.init_full_results(
    ResultsAnalyse(
        self.uuid,
        self.crd_data,
        self.full_log_path,
        "pvc_create_delete_fullres",
    )
)
self.full_results.add_key("pvc_size", pvc_size)
self.full_results.add_key("samples", num_of_samples)

self.create_fio_pod_yaml(pvc_size=int(pvc_size.replace("Gi", "")))

# Creating PVC(s) for creation time mesurment

start_time = self.create_pvcs_and_wait_for_bound( msg_prefix, num_of_samples, pvc_size, burst=False )

tests/e2e/performance/csi_tests/test_pvc_creation_deletion_performance.py:291:

tests/e2e/performance/csi_tests/test_pvc_creation_deletion_performance.py:134: in create_pvcs_and_wait_for_bound performance_lib.wait_for_resource_bulk_status(

resource = 'pvc', resource_count = 5 namespace = 'namespace-pas-test-65cc2254fce9483d9d70f', status = 'Bound' timeout = -5, sleep_time = 5

def wait_for_resource_bulk_status( resource, resource_count, namespace, status, timeout=60, sleep_time=3 ): """ Waiting for bulk of resources (from the same type) to reach the desire status

Args:
    resource (str): the resoure type to wait for
    resource_count (int):  the number of rusource to wait for - to wait for deleteion
        of resources, this should be '0'
    namespace (str): the namespace where the resources should be
    status (str): the status of the resources to be in.
    timeout (int): how much time to wait for the resources (in sec.)- default is 1 Minute
    sleep_time (int): how much time to wait between each iteration check - default is 3 sec.

Return:
    bool : 'True' if all resources reach the desire state

Raise:
    Exception : in case of not all resources reach the desire state.

"""
while timeout >= 0:
    results = 0
    for line in run_oc_command(f"get {resource}", namespace=namespace):
        if status in line:
            results += 1
    if results == resource_count:
        return True
    else:
        logger.info(
            f"{results} {resource} out of {resource_count} are in {status} state !"
        )
        logger.info(f"wait {sleep_time} sec for next iteration")
        time.sleep(sleep_time)
        timeout -= sleep_time

err_msg = f"{resource.upper()} failed reaching {status} on time"
logger.error(err_msg)

raise Exception(err_msg) E Exception: PVC failed reaching Bound on time

ocs_ci/helpers/performance_lib.py:794: Exception

2024-01-13 12:57:04

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884046/log

self = <test_pvc_creation_deletion_performance.TestPVCCreationDeletionPerformance object at 0x7fe67793aaf0>

def setup(self): """ Setting up test parameters """ log.info("Starting the test setup")

super(TestPVCCreationDeletionPerformance, self).setup()

tests/e2e/performance/csi_tests/test_pvc_creation_deletion_performance.py:52:

ocs_ci/ocs/perftests.py:97: in setup self.get_osd_info() ocs_ci/ocs/perftests.py:229: in get_osd_info osd_info = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df") ocs_ci/ocs/resources/pod.py:345: in exec_ceph_cmd out = self.exec_cmd_on_pod( ocs_ci/ocs/resources/pod.py:192: in exec_cmd_on_pod return self.ocp.exec_oc_cmd( ocs_ci/ocs/ocp.py:178: in exec_oc_cmd out = run_cmd( ocs_ci/utility/utils.py:484: in run_cmd completed_process = exec_cmd( ocs_ci/utility/utils.py:633: in exec_cmd completed_process = subprocess.run( /usr/lib64/python3.8/subprocess.py:495: in run stdout, stderr = process.communicate(input, timeout=timeout) /usr/lib64/python3.8/subprocess.py:1028: in communicate stdout, stderr = self._communicate(input, endtime, timeout) /usr/lib64/python3.8/subprocess.py:1869: in _communicate self._check_timeout(endtime, orig_timeout, stdout, stderr)

self = <subprocess.Popen object at 0x7fe60c3f93a0>, endtime = 177427.306202319 orig_timeout = 600, stdout_seq = [], stderr_seq = [] skip_check_and_raise = False

def _check_timeout(self, endtime, orig_timeout, stdout_seq, stderr_seq, skip_check_and_raise=False): """Convenience for checking if a timeout has expired.""" if endtime is None: return if skip_check_and_raise or _time() > endtime:

  raise TimeoutExpired(
            self.args, orig_timeout,
            output=b''.join(stdout_seq) if stdout_seq else None,
            stderr=b''.join(stderr_seq) if stderr_seq else None)

E subprocess.TimeoutExpired: Command '['oc', '--kubeconfig', '/home/jenkins/current-cluster-dir/openshift-cluster-dir/auth/kubeconfig', '-n', 'openshift-storage', 'rsh', 'rook-ceph-tools-7997d9b857-g4kns', 'ceph', 'osd', 'df', '--format', 'json-pretty']' timed out after 600 seconds

/usr/lib64/python3.8/subprocess.py:1072: TimeoutExpired

2024-01-13 13:35:45

ypersky1980 commented 9 months ago

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884044/log

ypersky1980 commented 9 months ago

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884040/log

ypersky1980 commented 9 months ago

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884041/log

ypersky1980 commented 9 months ago

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884042/log

ypersky1980 commented 8 months ago

https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/job/qe-deploy-ocs-cluster/33623/testReport/tests.cross_functional.performance.csi_tests.test_pvc_creation_deletion_performance/TestPVCCreationDeletionPerformance/test_getting_all_results/

In the above job, all the test cases passed except the results test (test_getting_all_results) - this needs to be analyzed!

ypersky1980 commented 8 months ago

Closing the issue since this test has recently passed on 3 different platforms: IBM Cloud 4.15 and 4.14, and VMware LSO 4.14.