red-hat-storage / ocs-ci

https://ocs-ci.readthedocs.io/en/latest/
MIT License
109 stars 166 forks source link

test_pvc_clone_performance_multiple_files[CephFileSystem-7-1800] in 4.15 is failing #9208

Closed ypersky1980 closed 7 months ago

ypersky1980 commented 8 months ago

Test case is failing - re-run the test and determine whether this is a product bug ( open a bz) or a test bug ( submit a pr with a fix)

https://reportportal-ocs4.apps.ocp-c1.prod.psi.redhat.com/ui/#ocs/launches/all/17989/883997/884036/log?item1Params=page.sort%3Dstatus%252CASC

resource = <ocs_ci.ocs.resources.pvc.PVC object at 0x7fe67726f2b0> state = 'Bound', timeout = 18000

def wait_for_resource_state(resource, state, timeout=60): """ Wait for a resource to get to a given status

Args:
    resource (OCS obj): The resource object
    state (str): The status to wait for
    timeout (int): Time in seconds to wait

Raises:
    ResourceWrongStatusException: In case the resource hasn't
        reached the desired state

"""
if check_cluster_is_compact():
    timeout = 180
if (
    resource.name == constants.DEFAULT_STORAGECLASS_CEPHFS
    or resource.name == constants.DEFAULT_STORAGECLASS_RBD
):
    logger.info("Attempt to default default Secret or StorageClass")
    return
try:

  resource.ocp.wait_for_resource(
        condition=state, resource_name=resource.name, timeout=timeout
    )

ocs_ci/helpers/helpers.py:117:

self = <ocs_ci.ocs.ocp.OCP object at 0x7fe6e8628160>, condition = 'Bound' resource_name = 'clone-pas-test-1', column = 'STATUS', selector = None resource_count = 0, timeout = 18000, sleep = 3 dont_allow_other_resources = False, error_condition = None

def wait_for_resource( self, condition, resource_name="", column="STATUS", selector=None, resource_count=0, timeout=60, sleep=3, dont_allow_other_resources=False, error_condition=None, ): """ Wait for a resource to reach to a desired condition

Args:
    condition (str): The desired state the resource that is sampled
        from 'oc get <kind> <resource_name>' command
    resource_name (str): The name of the resource to wait
        for (e.g.my-pv1)
    column (str): The name of the column to compare with
    selector (str): The resource selector to search with.
        Example: 'app=rook-ceph-mds'
    resource_count (int): How many resources expected to be
    timeout (int): Time in seconds to wait
    sleep (int): Sampling time in seconds
    dont_allow_other_resources (bool): If True it will not allow other
        resources in different state. For example you are waiting for 2
        resources and there are currently 3 (2 in running state,
        1 in ContainerCreating) the function will continue to next
        iteration to wait for only 2 resources in running state and no
        other exists.
    error_condition (str): State of the resource that is sampled
        from 'oc get <kind> <resource_name>' command, which makes this
        method to fail immediately without waiting for a timeout. This
        is optional and makes sense only when there is a well defined
        unrecoverable state of the resource(s) which is not expected to
        be part of a workflow under test, and at the same time, the
        timeout itself is large.

Returns:
    bool: True in case all resources reached desired condition,
        False otherwise

"""
if condition == error_condition:
    # when this fails, this method is used in a wrong way
    raise ValueError(
        f"Condition '{condition}' we are waiting for must be different"
        f" from error condition '{error_condition}'"
        " which describes unexpected error state."
    )
log.info(
    (
        f"Waiting for a resource(s) of kind {self._kind}"
        f" identified by name '{resource_name}'"
        f" using selector {selector}"
        f" at column name {column}"
        f" to reach desired condition {condition}"
    )
)
resource_name = resource_name if resource_name else self.resource_name
selector = selector if selector else self.selector

# actual status of the resource we are waiting for, setting it to None
# now prevents UnboundLocalError raised when waiting timeouts
actual_status = None

try:
    for sample in TimeoutSampler(
        timeout, sleep, self.get, resource_name, True, selector
    ):
        # Only 1 resource expected to be returned
        if resource_name:
            retry = int(timeout / sleep if sleep else timeout / 1)
            status = self.get_resource(
                resource_name,
                column,
                retry=retry,
                wait=sleep,
            )
            if status == condition:
                log.info(
                    f"status of {resource_name} at {column}"
                    " reached condition!"
                )
                return True
            log.info(
                (
                    f"status of {resource_name} at column {column} was {status},"
                    f" but we were waiting for {condition}"
                )
            )
            actual_status = status
            if error_condition is not None and status == error_condition:
                raise ResourceWrongStatusException(
                    resource_name,
                    column=column,
                    expected=condition,
                    got=status,
                )
        # More than 1 resources returned
        elif sample.get("kind") == "List":
            in_condition = []
            in_condition_len = 0
            actual_status = []
            sample = sample["items"]
            sample_len = len(sample)
            for item in sample:
                try:
                    item_name = item.get("metadata").get("name")
                    status = self.get_resource(item_name, column)
                    actual_status.append(status)
                    if status == condition:
                        in_condition.append(item)
                        in_condition_len = len(in_condition)
                    if (
                        error_condition is not None
                        and status == error_condition
                    ):
                        raise ResourceWrongStatusException(
                            item_name,
                            column=column,
                            expected=condition,
                            got=status,
                        )
                except CommandFailed as ex:
                    log.info(
                        f"Failed to get status of resource: {item_name} at column {column}, "
                        f"Error: {ex}"
                    )
                if resource_count:
                    if in_condition_len == resource_count:
                        log.info(
                            f"{in_condition_len} resources already "
                            f"reached condition!"
                        )
                        if (
                            dont_allow_other_resources
                            and sample_len != in_condition_len
                        ):
                            log.info(
                                f"There are {sample_len} resources in "
                                f"total. Continue to waiting as "
                                f"you don't allow other resources!"
                            )
                            continue
                        return True
                elif len(sample) == len(in_condition):
                    return True
            # preparing logging message with expected number of
            # resource items we are waiting for
            if resource_count > 0:
                exp_num_str = f"all {resource_count}"
            else:
                exp_num_str = "all"
            log.info(
                (
                    f"status of {resource_name} at column {column} - item(s) were {actual_status},"
                    f" but we were waiting"
                    f" for {exp_num_str} of them to be {condition}"
                )
            )
except TimeoutExpiredError as ex:
    log.error(f"timeout expired: {ex}")
    # run `oc describe` on the resources we were waiting for to provide
    # evidence so that we can understand what was wrong
    output = self.describe(resource_name, selector=selector)
    log.warning(
        "Description of the resource(s) we were waiting for:\n%s", output
    )
    log.error(
        (
            f"Wait for {self._kind} resource {resource_name} at column {column}"
            f" to reach desired condition {condition} failed,"
            f" last actual status was {actual_status}"
        )
    )

  raise (ex)

ocs_ci/ocs/ocp.py:795:

self = <ocs_ci.ocs.ocp.OCP object at 0x7fe6e8628160>, condition = 'Bound' resource_name = 'clone-pas-test-1', column = 'STATUS', selector = None resource_count = 0, timeout = 18000, sleep = 3 dont_allow_other_resources = False, error_condition = None

def wait_for_resource( self, condition, resource_name="", column="STATUS", selector=None, resource_count=0, timeout=60, sleep=3, dont_allow_other_resources=False, error_condition=None, ): """ Wait for a resource to reach to a desired condition

Args:
    condition (str): The desired state the resource that is sampled
        from 'oc get <kind> <resource_name>' command
    resource_name (str): The name of the resource to wait
        for (e.g.my-pv1)
    column (str): The name of the column to compare with
    selector (str): The resource selector to search with.
        Example: 'app=rook-ceph-mds'
    resource_count (int): How many resources expected to be
    timeout (int): Time in seconds to wait
    sleep (int): Sampling time in seconds
    dont_allow_other_resources (bool): If True it will not allow other
        resources in different state. For example you are waiting for 2
        resources and there are currently 3 (2 in running state,
        1 in ContainerCreating) the function will continue to next
        iteration to wait for only 2 resources in running state and no
        other exists.
    error_condition (str): State of the resource that is sampled
        from 'oc get <kind> <resource_name>' command, which makes this
        method to fail immediately without waiting for a timeout. This
        is optional and makes sense only when there is a well defined
        unrecoverable state of the resource(s) which is not expected to
        be part of a workflow under test, and at the same time, the
        timeout itself is large.

Returns:
    bool: True in case all resources reached desired condition,
        False otherwise

"""
if condition == error_condition:
    # when this fails, this method is used in a wrong way
    raise ValueError(
        f"Condition '{condition}' we are waiting for must be different"
        f" from error condition '{error_condition}'"
        " which describes unexpected error state."
    )
log.info(
    (
        f"Waiting for a resource(s) of kind {self._kind}"
        f" identified by name '{resource_name}'"
        f" using selector {selector}"
        f" at column name {column}"
        f" to reach desired condition {condition}"
    )
)
resource_name = resource_name if resource_name else self.resource_name
selector = selector if selector else self.selector

# actual status of the resource we are waiting for, setting it to None
# now prevents UnboundLocalError raised when waiting timeouts
actual_status = None

try:

  for sample in TimeoutSampler(
        timeout, sleep, self.get, resource_name, True, selector
    ):

ocs_ci/ocs/ocp.py:686:

self = <ocs_ci.utility.utils.TimeoutSampler object at 0x7fe64f36cc40>

def iter(self): if self.start_time is None: self.start_time = time.time() while True: self.last_sample_time = time.time() if self.timeout <= (self.last_sample_time - self.start_time):

      raise self.timeout_exc_cls(*self.timeout_exc_args)

E ocs_ci.ocs.exceptions.TimeoutExpiredError: Timed out after 18000s running get("clone-pas-test-1", True, None)

ocs_ci/utility/utils.py:1288: TimeoutExpiredError

During handling of the above exception, another exception occurred:

self = <test_pvc_clone_performance.TestPVCClonePerformance object at 0x7fe6d1ef55e0> secret_factory = <function secret_factory_fixture..factory at 0x7fe60e874a60> interface = 'CephFileSystem', copies = 7, timeout = 1800

@pytest.mark.parametrize( argnames=["interface", "copies", "timeout"], argvalues=[ pytest.param( [constants.CEPHBLOCKPOOL, 7, 1800], marks=pytest.mark.polarion_id("OCS-2673"), ), pytest.param( [constants.CEPHFILESYSTEM, 7, 1800], marks=[ pytest.mark.polarion_id("OCS-2674"), pytest.mark.bugzilla("2101874"), ], ), ], ) def test_pvc_clone_performance_multiple_files( self, secret_factory, interface, copies, timeout, ): """ Test assign nodeName to a pod using RWX pvc Each kernel (unzipped) is 892M and 61694 files The test creates a pvc and a pods, writes kernel files multiplied by number of copies The test creates number of clones samples, calculates creation and deletion times for each one the clones and calculates the average creation and average deletion times """

# Initialize some variabels
self.interface = interface
self.timeout = timeout
self.pvc_size = "100"
if self.dev_mode:
    self.pvc_size = "10"
    copies = 1
self.results_path = get_full_test_logs_path(cname=self)
# Initialize the results doc file.
full_results = self.init_full_results(
    ClonesResultsAnalyse(
        self.uuid,
        self.crd_data,
        self.full_log_path,
        "test_pvc_clone_performance_multiple_files",
    )
)
files_written = ""
data_written = ""

test_start_time = self.get_time()

# Create new pool and sc only for RBD, for CepgFS use thr default
if self.interface == constants.CEPHBLOCKPOOL:
    # Creating new pool to run the test on it
    self.create_new_pool_and_sc(secret_factory)
else:
    self.sc_obj = ocs.OCS(
        kind="StorageCluster",
        metadata={
            "namespace": self.namespace,
            "name": Interfaces_info[self.interface]["sc"],
        },
    )
    self.pool_name = "ocs-storagecluster-cephfilesystem"
# Create a PVC
self.create_pvc_and_wait_for_bound()
# Create a POD
self.create_pod_and_wait_for_completion(
    command=["/opt/multiple_files.sh"],
    command_args=[f"{copies}", "/mnt"],
)

# Get the number of files and total written data from the pod
for line in self.pod_object.ocp.get_logs(name=self.pod_object.name).split("\n"):
    if "Number Of Files" in line:
        files_written = line.split(" ")[-1]
    if "Total Data" in line:
        data_written = line.split(" ")[-1]
logger.info("Getting the amount of data written to the PVC")
logger.info(f"The amount of written data is {data_written}")
logger.info(
    f"For {self.interface} - The number of files written to the pod is {int(files_written):,}"
)

# increasing the timeout since clone creation time is longer than pod attach time
self.timeout = 18000

# taking the time, so parsing the provision log will be faster.
start_time = self.get_time("csi")

clones_list = self.create_and_delete_clones()

tests/e2e/performance/csi_tests/test_pvc_clone_performance.py:504:

tests/e2e/performance/csi_tests/test_pvc_clone_performance.py:249: in create_and_delete_clones helpers.wait_for_resource_state(

resource = <ocs_ci.ocs.resources.pvc.PVC object at 0x7fe67726f2b0> state = 'Bound', timeout = 18000

def wait_for_resource_state(resource, state, timeout=60): """ Wait for a resource to get to a given status

Args:
    resource (OCS obj): The resource object
    state (str): The status to wait for
    timeout (int): Time in seconds to wait

Raises:
    ResourceWrongStatusException: In case the resource hasn't
        reached the desired state

"""
if check_cluster_is_compact():
    timeout = 180
if (
    resource.name == constants.DEFAULT_STORAGECLASS_CEPHFS
    or resource.name == constants.DEFAULT_STORAGECLASS_RBD
):
    logger.info("Attempt to default default Secret or StorageClass")
    return
try:
    resource.ocp.wait_for_resource(
        condition=state, resource_name=resource.name, timeout=timeout
    )
except TimeoutExpiredError:
    logger.error(f"{resource.kind} {resource.name} failed to reach {state}")
    resource.reload()

  raise ResourceWrongStatusException(resource.name, resource.describe())

E ocs_ci.ocs.exceptions.ResourceWrongStatusException: Resource clone-pas-test-1 describe output: Name: clone-pas-test-1 E Namespace: namespace-pas-test-058960c607af4789baa76 E StorageClass: ocs-storagecluster-cephfs E Status: Pending E Volume: E Labels:
E Annotations: volume.beta.kubernetes.io/storage-provisioner: openshift-storage.cephfs.csi.ceph.com E volume.kubernetes.io/storage-provisioner: openshift-storage.cephfs.csi.ceph.com E Finalizers: [kubernetes.io/pvc-protection] E Capacity: E Access Modes: E VolumeMode: Filesystem E DataSource: E Kind: PersistentVolumeClaim E Name: pvc-pas-test E Used By:
E Events: E Type Reason Age From Message E ---- ------ ---- ---- ------- E Normal ExternalProvisioning 4m56s (x1211 over 5h) persistentvolume-controller Waiting for a volume to be created either by the external provisioner 'openshift-storage.cephfs.csi.ceph.com' or manually by the system administrator. If volume creation is delayed, please verify that the provisioner is running and correctly registered. E Normal Provisioning 4m1s (x87 over 5h) openshift-storage.cephfs.csi.ceph.com_csi-cephfsplugin-provisioner-7569b959fd-t9xzg_8bb4b643-6f55-48f6-97f0-c34ab048e33c External provisioner is provisioning volume for claim "namespace-pas-test-058960c607af4789baa76/clone-pas-test-1"

ocs_ci/helpers/helpers.py:123: ResourceWrongStatusException

2024-01-13 07:25:10

ypersky1980 commented 7 months ago

Since this issue was open - the test has passed on 3 different plaftorms ( ODF 4.14 and ODF 4.15) , therefore closing it. No fixes are needed.