facebookresearch / lingua

Meta Lingua: a lean, efficient, and easy-to-hack codebase to research LLMs.
BSD 3-Clause "New" or "Revised" License
1.92k stars 83 forks source link

stool not working due to sinfo schema compatibility? #10

Open zkx06111 opened 1 day ago

zkx06111 commented 1 day ago

I'm trying to launch a debug job using the command provided in the README.

python -m lingua.stool script=apps.main.train config=apps/main/configs/debug.yaml nodes=1 partition=<partition>

The error I got was this:

  File "path/to/lingua/lingua/stool.py", line 112, in retrieve_max_time_per_partition
    sinfo = json.loads(subprocess.check_output("sinfo --json", shell=True))["sinfo"]
            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^
KeyError: 'sinfo'

When I manually run sinfo --json in bash, the output looks like this, which doesn't match the schema used in retrieve_max_time_per_partition: there is no top-level "sinfo" key. I'm assuming this is because of the version of Slurm I'm running (21.08.5).

{
  "meta": {
    "plugin": {
      "type": "openapi\/v0.0.37",
      "name": "Slurm OpenAPI v0.0.37"
    },
    "Slurm": {
      "version": {
        "major": 21,
        "micro": 5,
        "minor": 8
      },
      "release": "21.08.5"
    }
  },
  "errors": [
  ],
  "nodes": [
    {
      "architecture": "x86_64",
      "burstbuffer_network_address": "",
      "boards": 1,
      "boot_time": 1728946930,
      "comment": "",
      "cores": 28,
      "cpu_binding": 0,
      "cpu_load": 2400,
      "extra": "",
      "free_memory": 936621,
      "cpus": 112,
      "last_busy": 1729092962,
      "features": "",
      "active_features": "",
      "gres": "gpu:a6000:8,shard:256",
      "gres_drained": "N\/A",
      "gres_used": "gpu:a6000:2(IDX:2-3),shard:0",
      "mcs_label": "",
      "name": "aries",
      "next_state_after_reboot": "invalid",
      "address": "aries",
      "hostname": "aries",
      "state": "mixed",
      "state_flags": [
      ],
      "next_state_after_reboot_flags": [
      ],
      "operating_system": "Linux 6.8.0-45-generic #45~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Sep 11 15:25:05 UTC 2",
      "owner": null,
      "partitions": [
        "aries"
      ],
      "port": 9001,
      "real_memory": 960000,
      "reason": "",
      "reason_changed_at": 0,
      "reason_set_by_user": null,
      "slurmd_start_time": 1728947062,
      "sockets": 2,
      "threads": 2,
      "temporary_disk": 0,
      "weight": 1,
      "tres": "cpu=112,mem=937.50G,billing=112,gres\/gpu=8,gres\/shard=256",
      "slurmd_version": "21.08.5",
      "alloc_memory": 65536,
      "alloc_cpus": 32,
      "idle_cpus": 80,
      "tres_used": "cpu=32,mem=64G,gres\/gpu=2",
      "tres_weighted": 32.0
    },
    {
      "architecture": "x86_64",
      "burstbuffer_network_address": "",
      "boards": 1,
      "boot_time": 1726099456,
      "comment": "",
      "cores": 32,
      "cpu_binding": 0,
      "cpu_load": 719,
      "extra": "",
      "free_memory": 15879,
      "cpus": 128,
      "last_busy": 1726099613,
      "features": "",
      "active_features": "",
      "gres": "gpu:a6000:8,shard:256",
      "gres_drained": "N\/A",
      "gres_used": "gpu:a6000:0(IDX:N\/A),shard:0",
      "mcs_label": "",
      "name": "gemini",
      "next_state_after_reboot": "invalid",
      "address": "gemini",
      "hostname": "gemini",
      "state": "idle",
      "state_flags": [
      ],
      "next_state_after_reboot_flags": [
      ],
      "operating_system": "Linux 6.5.0-28-generic #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  4 14:39:20 UTC 2",
      "owner": null,
      "partitions": [
        "gemini"
      ],
      "port": 9001,
      "real_memory": 960000,
      "reason": "",
      "reason_changed_at": 0,
      "reason_set_by_user": null,
      "slurmd_start_time": 1726099613,
      "sockets": 2,
      "threads": 2,
      "temporary_disk": 0,
      "weight": 1,
      "tres": "cpu=128,mem=937.50G,billing=128,gres\/gpu=8,gres\/shard=256",
      "slurmd_version": "21.08.5",
      "alloc_memory": 0,
      "alloc_cpus": 0,
      "idle_cpus": 128,
      "tres_used": null,
      "tres_weighted": 0.0
    },
    {
      "architecture": "x86_64",
      "burstbuffer_network_address": "",
      "boards": 1,
      "boot_time": 1726087646,
      "comment": "",
      "cores": 32,
      "cpu_binding": 0,
      "cpu_load": 65173,
      "extra": "",
      "free_memory": 88015,
      "cpus": 128,
      "last_busy": 1726087695,
      "features": "",
      "active_features": "",
      "gres": "gpu:a6000:8,shard:256",
      "gres_drained": "N\/A",
      "gres_used": "gpu:a6000:4(IDX:4-7),shard:0",
      "mcs_label": "",
      "name": "taurus",
      "next_state_after_reboot": "invalid",
      "address": "taurus",
      "hostname": "taurus",
      "state": "mixed",
      "state_flags": [
      ],
      "next_state_after_reboot_flags": [
      ],
      "operating_system": "Linux 6.8.0-40-generic #40~22.04.3-Ubuntu SMP PREEMPT_DYNAMIC Tue Jul 30 17:30:19 UTC 2",
      "owner": null,
      "partitions": [
        "taurus"
      ],
      "port": 9001,
      "real_memory": 960000,
      "reason": "",
      "reason_changed_at": 0,
      "reason_set_by_user": null,
      "slurmd_start_time": 1726087695,
      "sockets": 2,
      "threads": 2,
      "temporary_disk": 0,
      "weight": 1,
      "tres": "cpu=128,mem=937.50G,billing=128,gres\/gpu=8,gres\/shard=256",
      "slurmd_version": "21.08.5",
      "alloc_memory": 418816,
      "alloc_cpus": 18,
      "idle_cpus": 110,
      "tres_used": "cpu=18,mem=409G,gres\/gpu=4",
      "tres_weighted": 18.0
    }
  ]
}
BadrYoubiIdrissi commented 21 hours ago

Thank you for raising this issue! Sadly, the sinfo JSON schema changes between Slurm versions, so we will try to make this more robust!

zkx06111 commented 17 hours ago

I think the changes made to the JSON schema are listed on this page: https://slurm.schedmd.com/openapi_release_notes.html

The version I'm using seems to be deprecated.