red-hat-storage / ocs-ci


test_small_file_workload.py - the test fails in 4.14 - a fix is needed #8643

Closed - ypersky1980 closed this issue 10 months ago

ypersky1980 commented 1 year ago

test_small_file_workload.py - all test cases failed on both the AWS and VMware LSO platforms when running on 4.14. A fix is needed.

The test passed on all platforms in 4.13.

VMware LSO: https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/job/qe-deploy-ocs-cluster/28837/testReport/
AWS: https://ocs4-jenkins-csb-odf-qe.apps.ocp-c1.prod.psi.redhat.com/view/Performance/job/qe-trigger-aws-ipi-3az-rhcos-3m-3w-performance/101/testReport/

ypersky1980 commented 1 year ago

=================================== FAILURES ===================================
_ TestSmallFileWorkload.test_smallfile_workload[4-5000-22-5-33-CephBlockPool] __

self = <test_small_file_workload.TestSmallFileWorkload object at 0x7f82c3c00b50>
file_size = 4, files = 5000, threads = 22, samples = 5, clients = 33
interface = 'CephBlockPool'

@pytest.mark.parametrize(
    argnames=["file_size", "files", "threads", "samples", "clients", "interface"],
    argvalues=[
        pytest.param(*[4, 5000, 22, 5, 33, constants.CEPHBLOCKPOOL]),
        pytest.param(*[16, 5000, 8, 5, 21, constants.CEPHBLOCKPOOL]),
        pytest.param(*[4, 2500, 4, 5, 9, constants.CEPHFILESYSTEM]),
        pytest.param(*[16, 1500, 4, 5, 9, constants.CEPHFILESYSTEM]),
    ],
)
@pytest.mark.polarion_id("OCS-1295")
def test_smallfile_workload(
    self, file_size, files, threads, samples, clients, interface
):
    """
    Run SmallFile Workload

    Args:
        file_size (int) : the size of the file to be used
        files (int) : number of files to use
        threads (int) : number of threads to be used in the test
        samples (int) : how many samples to run for each test
        clients (int) : number of client pods to run the workload on
        interface (str) : the volume type (rbd / cephfs)

    """
    if config.PERF.get("deploy_internal_es"):
        self.es = ElasticSearch()
    else:
        if config.PERF.get("internal_es_server") == "":
            self.es = None
            return
        else:
            self.es = {
                "server": config.PERF.get("internal_es_server"),
                "port": config.PERF.get("internal_es_port"),
                "url": f"http://{config.PERF.get('internal_es_server')}:{config.PERF.get('internal_es_port')}",
            }
            # verify that the connection to the elasticsearch server is OK
            if not super(TestSmallFileWorkload, self).es_connect():
                self.es = None
                return

    # deploy the benchmark-operator
    self.deploy_benchmark_operator()

    # verify that there is an elasticsearch server for the benchmark
    if not self.es:
        log.error("This test must have an Elasticsearch server")
        return False

    # Getting the full path for the test logs
    self.full_log_path = get_full_test_logs_path(cname=self)
    self.results_path = get_full_test_logs_path(cname=self)
    self.full_log_path += (
        f"-{file_size}-{files}-{threads}-{samples}-{clients}-{interface}"
    )
    log.info(f"Logs file path name is : {self.full_log_path}")

    # Loading the main template yaml file for the benchmark
    log.info("Create resource file for small_files workload")
    self.crd_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

    # Saving the Original elastic-search IP and PORT - if defined in yaml
    self.es_info_backup(self.es)

    self.set_storageclass(interface=interface)

    # Setting the data set to 40% of the total storage capacity
    self.setting_storage_usage(file_size, files, threads, samples, clients)

    self.get_env_info()
>   if not self.run():

tests/e2e/performance/io_workload/test_small_file_workload.py:619:


tests/e2e/performance/io_workload/test_small_file_workload.py:517: in run
    self.wait_for_wl_to_finish(sleep=30)


self = <test_small_file_workload.TestSmallFileWorkload object at 0x7f82c3c00b50>
timeout = 18000, sleep = 30

def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
    """
    Wait until the workload is finished and get the test log

    Args:
        timeout (int): time in seconds to wait for the benchmark to finish
        sleep (int): sleep interval in seconds between status checks

    Raises:
        Exception : too many restarts of the benchmark
        ResourceWrongStatusException : test Failed / Error
        TimeoutExpiredError : test did not complete on time

    """
    log.info(f"Waiting for {self.client_pod_name} to complete")

    Finished = 0
    restarts = 0
    total_time = timeout
    while not Finished and total_time > 0:
        results = run_oc_command(
            "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
            namespace=benchmark_operator.BMO_NAME,
        )
        (fname, status) = ["", ""]
        for name in results:
            # looking for the pod which runs the benchmark (not the IO).
            # This pod contains `client` in its name, and there is only one
            # such pod; the other pods have `server` in their names.
            (fname, status) = name.split()
            if re.search("client", fname):
                break
            else:
                (fname, status) = ["", ""]

        if fname == "":  # there is no `client` pod !
            err_msg = f"{self.client_pod} Failed to run !!!"
            log.error(err_msg)
            raise Exception(err_msg)

        if fname != self.client_pod:
            # The client pod name is different from the previous check, so it was restarted
            log.info(
                f"The pod {self.client_pod} was restarted. The new client pod is {fname}"
            )
            self.client_pod = fname
            restarts += 1
            # in case of restarting the benchmark, reset the timeout as well
            total_time = timeout

        if restarts > 3:  # we are tolerating only 3 restarts
            err_msg = f"Too much restarts of the benchmark ({restarts})"
            log.error(err_msg)
            raise Exception(err_msg)

        if status == "Succeeded":
            # Getting the end time of the benchmark - for reporting.
            self.end_time = self.get_time()
            self.test_logs = self.pod_obj.exec_oc_cmd(
                f"logs {self.client_pod}", out_yaml_format=False
            )
            log.info(f"{self.client_pod} completed successfully")
            Finished = 1
        elif (
            status != constants.STATUS_RUNNING
            and status != constants.STATUS_PENDING
        ):
            # if the benchmark pod is not in Running state (and not Completed/Pending),
            # no need to wait for timeout.
            # Note: the pod can be in pending state in case of restart.
            err_msg = f"{self.client_pod} Failed to run - ({status})"
            log.error(err_msg)
>           raise exceptions.ResourceWrongStatusException(
                self.client_pod,
                describe_out=err_msg,
                column="Status",
                expected="Succeeded",
                got=status,
            )
E           ocs_ci.ocs.exceptions.ResourceWrongStatusException: Resource smallfile-client-1-benchmark-abbc7fb5-gr4tj in column Status was in state Failed but expected Succeeded describe output: smallfile-client-1-benchmark-abbc7fb5-gr4tj Failed to run - (Failed)

ocs_ci/ocs/perftests.py:477: ResourceWrongStatusException
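The exception above only tells us that the smallfile client pod ended in the Failed phase, so the pod's describe output and logs from the benchmark-operator namespace are the most useful artifacts to attach when reproducing. A minimal standalone sketch of collecting them by hand (assuming `oc` is on PATH and logged in to the affected cluster, and assuming the namespace is "benchmark-operator", i.e. the value behind benchmark_operator.BMO_NAME; adjust if your deployment differs):

    import subprocess

    NAMESPACE = "benchmark-operator"  # assumption - should match benchmark_operator.BMO_NAME


    def oc(*args):
        """Run an `oc` command in the benchmark namespace and return its stdout."""
        return subprocess.run(
            ["oc", "-n", NAMESPACE, *args], capture_output=True, text=True, check=True
        ).stdout


    # Same listing that wait_for_wl_to_finish() polls: pod name and phase, no headers.
    pods = oc(
        "get", "pod", "--no-headers",
        "-o", "custom-columns=:metadata.name,:status.phase",
    )

    # The benchmark driver is the pod with `client` in its name.
    client_pods = [line.split()[0] for line in pods.splitlines() if "client" in line]

    # Capture describe output and logs of the (failed) client pod for the bug report.
    for pod in client_pods:
        print(oc("describe", "pod", pod))
        print(oc("logs", pod))

This mirrors the pod lookup in wait_for_wl_to_finish(), so it should point at the same smallfile-client-1-benchmark-* pod that the exception names.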

github-actions[bot] commented 11 months ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed in 30 days if no further activity occurs.

github-actions[bot] commented 10 months ago

This issue has been automatically closed due to inactivity. Please re-open if this still requires investigation.