canonical / mir-ci

Mir CI helpers
1 stars 1 forks source link

RuntimeError: Unable to get the cpu time for cgroup #50

Closed Saviq closed 9 months ago

Saviq commented 9 months ago

E.g. https://github.com/MirServer/mir-ci/actions/runs/6325899473/job/17179867323:

_______ TestAppsCanRun.test_app_can_run[ubuntu_frame-mir-kiosk-scummvm] ________

self = <mir_ci.cgroups.Cgroup object at 0x7f84f07808b0>

    def get_cpu_time_microseconds(self) -> int:
        try:
>           for line in self._read_file("cpu.stat"):

cgroups.py:57: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <mir_ci.cgroups.Cgroup object at 0x7f84f07808b0>, file_name = 'cpu.stat'

    def _read_file(self, file_name: str) -> Iterator[str]:
        file_path = f"{self.path}/{file_name}"
>       with open(file_path, "r") as file:
E       FileNotFoundError: [Errno 2] No such file or directory: '/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/app.slice/snap.ubuntu-frame.ubuntu-frame-3789243b-1bb4-4dfb-a14d-e931affafded.scope/cpu.stat'

cgroups.py:52: FileNotFoundError

The above exception was the direct cause of the following exception:

self = <mir_ci.test_apps_can_run.TestAppsCanRun object at 0x7f84f12c1a90>
server = <mir_ci.apps.App object at 0x7f84f0ff9940>
app = <mir_ci.apps.App object at 0x7f84f1354250>
record_property = <function record_property.<locals>.append_property at 0x7f84f1005e50>

    @pytest.mark.smoke
    @pytest.mark.parametrize(
        "app",
        [
            apps.wpe(),
            apps.snap("mir-kiosk-neverputt"),
            apps.snap("mir-kiosk-scummvm"),
            apps.snap("mir-kiosk-kodi"),
            apps.pluma(),
            apps.qterminal(),
        ],
    )
    async def test_app_can_run(self, server, app, record_property) -> None:
        server_instance = DisplayServer(server)
        program = server_instance.program(app)
        benchmarker = Benchmarker({"compositor": server_instance, "client": program}, poll_time_seconds=0.1)
        async with benchmarker:
>           await asyncio.sleep(short_wait_time)

test_apps_can_run.py:29: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
benchmarker.py:45: in __aexit__
    await self.task
benchmarker.py:22: in _run
    await self.backend.poll()
benchmarker.py:76: in poll
    self.data_records[name].cpu_time_microseconds = cgroup.get_cpu_time_microseconds()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <mir_ci.cgroups.Cgroup object at 0x7f84f07808b0>

    def get_cpu_time_microseconds(self) -> int:
        try:
            for line in self._read_file("cpu.stat"):
                split_line = line.split(" ")
                if split_line[0] == "usage_usec":
                    return int(split_line[1])

            raise RuntimeError("usage_usec line not found")
        except Exception as ex:
>           raise RuntimeError(f"Unable to get the cpu time for cgroup with pid: {self.pid}") from ex
E           RuntimeError: Unable to get the cpu time for cgroup with pid: 10442

cgroups.py:64: RuntimeError

It looks like we are reading the cgroup's `cpu.stat` either too early or too late, i.e. at a moment when the file does not exist. We need to decide how to handle this: skip just that one measurement, skip the whole polling iteration, or stop benchmarking that process altogether?

Saviq commented 9 months ago

Another, related problem, likely a race with the cgroup directory being populated:

_____ TestAppsCanRun.test_app_can_run[mir_test_tools-wpe-webkit-mir-kiosk] _____

self = <mir_ci.test_apps_can_run.TestAppsCanRun object at 0x7fd0ac538430>
server = <mir_ci.apps.App object at 0x7fd0abbe8640>
app = <mir_ci.apps.App object at 0x7fd0ac13ca90>
record_property = <function record_property.<locals>.append_property at 0x7fd0ab3db7f0>

    @pytest.mark.smoke
    @pytest.mark.parametrize('app', [
        apps.wpe(),
        apps.snap('mir-kiosk-neverputt'),
        apps.snap('mir-kiosk-scummvm'),
        apps.snap('mir-kiosk-kodi'),
        apps.pluma(),
        apps.qterminal(),
    ])
    async def test_app_can_run(self, server, app, record_property) -> None:
        server_instance = DisplayServer(server)
        program = server_instance.program(app)
        benchmarker = Benchmarker({"compositor": server_instance, "client": program}, poll_time_seconds=0.1)
>       async with benchmarker:

test_apps_can_run.py:25: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
benchmarker.py:46: in __aexit__
    await self.task
benchmarker.py:23: in _run
    await self.backend.poll()
benchmarker.py:76: in poll
    cgroup = await info.program.get_cgroup()
display_server.py:50: in get_cgroup
    return await self.server.get_cgroup()
program.py:58: in get_cgroup
    await self.cgroups_task
cgroups.py:19: in inner
    path = await Cgroup.get_cgroup_dir(pid)
cgroups.py:27: in get_cgroup_dir
    parent_path = Cgroup._get_cgroup_dir_internal(os.getpid())
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

pid = 19550

    @staticmethod
    def _get_cgroup_dir_internal(pid: int) -> pathlib.Path:
        cgroup_file = f"/proc/{pid}/cgroup"

        with open(cgroup_file, "r") as group_file:
            for line in group_file.readlines():
>               assert line.startswith("0::"), f"Line in cgroup file does not start with 0:: for pid: {pid}"
E               AssertionError: Line in cgroup file does not start with 0:: for pid: 19550

cgroups.py:44: AssertionError