carbonscott / maxie

Masked Autoencoder for X-ray Image Encoding (MAXIE)

Change the job launcher to mpirun on summit #1

Closed by carbonscott 4 months ago

carbonscott commented 5 months ago

https://docs.olcf.ornl.gov/software/python/pytorch_frontier.html#torchrun indicates that torchrun is not optimized for launching distributed PyTorch processes on Summit, which uses the IBM LSF job scheduler.
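
When a script is started with mpirun instead of torchrun, each process already carries its rank information in Open MPI's per-process environment variables, which is what the suggested code below relies on. As a rough illustration (a minimal probe script, not part of this repository), each rank could report itself like this:

import os

# Standard Open MPI per-rank environment variables; they are set for every
# process that mpirun starts, with fallbacks for a single-process run.
world_size = os.getenv("OMPI_COMM_WORLD_SIZE", "1")
world_rank = os.getenv("OMPI_COMM_WORLD_RANK", "0")
local_rank = os.getenv("OMPI_COMM_WORLD_LOCAL_RANK", "0")
print(f"world_size={world_size} rank={world_rank} local_rank={local_rank}")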

@frobnitzem has suggested the following changes as a solution:

import os
from datetime import timedelta

import torch
import torch.distributed as dist

## Note: parse_slurm_nodelist() and find_ifname() used below are helper
## functions assumed to be defined elsewhere in the same module.

def init_comm_size_and_rank():
    """Infer world size, global rank, and local rank from the launcher's environment."""
    world_size = 1
    world_rank = 0
    local_rank = 0

    if os.getenv("OMPI_COMM_WORLD_SIZE") and os.getenv("OMPI_COMM_WORLD_RANK"):
        ## Summit (Open MPI ranks launched through LSF)
        world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"])
        world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
        local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
    elif os.getenv("SLURM_NPROCS") and os.getenv("SLURM_PROCID"):
        ## CADES (SLURM)
        world_size = int(os.environ["SLURM_NPROCS"])
        world_rank = int(os.environ["SLURM_PROCID"])
        local_rank = int(os.environ["SLURM_LOCALID"])

    return world_size, world_rank, local_rank

def setup_ddp(backend):
    """ "Initialize DDP"""

    if dist.is_initialized():
        return init_comm_size_and_rank()

    if backend == "":
        if dist.is_nccl_available() and torch.cuda.is_available():
            backend = "nccl"
        elif dist.is_gloo_available():
            backend = "gloo"
        else:
            raise RuntimeError("No parallel backends available")

    world_size, world_rank, local_rank = init_comm_size_and_rank()

    ## Default setting
    master_addr = "127.0.0.1"
    master_port = "8889"

    if os.getenv("LSB_HOSTS") is not None:
        ## source: https://www.olcf.ornl.gov/wp-content/uploads/2019/12/Scaling-DL-on-Summit.pdf
        ## The following is Summit specific
        master_addr = os.environ["LSB_HOSTS"].split()[1]
    elif os.getenv("LSB_MCPU_HOSTS") is not None:
        master_addr = os.environ["LSB_MCPU_HOSTS"].split()[2]
    elif os.getenv("SLURM_NODELIST") is not None:
        ## The following is CADES specific
        master_addr = parse_slurm_nodelist(os.environ["SLURM_NODELIST"])[0]

    try:
        if backend in ["nccl", "gloo"]:
            os.environ["MASTER_ADDR"] = master_addr
            os.environ["MASTER_PORT"] = master_port
            os.environ["WORLD_SIZE"] = str(world_size)
            os.environ["RANK"] = str(world_rank)

        if (backend == "gloo") and ("GLOO_SOCKET_IFNAME" not in os.environ):
            ifname = find_ifname(master_addr)
            if ifname is not None:
                os.environ["GLOO_SOCKET_IFNAME"] = ifname

        if world_rank == 0:
            print(
                "Distributed data parallel: %s master at %s:%s"
                % (backend, master_addr, master_port),
            )

        if not dist.is_initialized():
            dist.init_process_group(
                backend=backend, init_method="env://", timeout=timedelta(seconds=1800)
            )
        print(f"RANK:{world_rank},LOCAL_RANK:{local_rank},WORLD_SIZE:{world_size}")

    except KeyError:
        print("DDP has to be initialized within a job - Running in sequential mode")

    return world_size, world_rank, local_rank

Need to adapt this code to the script train/train.fsdp.py.
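
As a rough sketch of that adaptation (assuming setup_ddp above is available in the script; build_model() and the FSDP wrapping are illustrative placeholders, not taken from train/train.fsdp.py), the training script would call setup_ddp once at startup and pin each rank to its local GPU:

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main():
    # Rank/size come from the launcher environment (Open MPI or SLURM),
    # so this works under an LSF job on Summit without torchrun.
    world_size, world_rank, local_rank = setup_ddp(backend="")

    # Bind this rank to its local GPU before building and wrapping the model.
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")

    model = build_model().to(device)  # build_model() is a placeholder
    model = FSDP(model)

    # ... training loop ...

    dist.destroy_process_group()

if __name__ == "__main__":
    main()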

carbonscott commented 5 months ago

Commit a6f216fd6733e49cdead61e4ca8b55d38e3f6f25 adds a function to initialize the distributed environment variables on Summit.
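
The essential step such a function has to perform is translating the launcher's Open MPI rank variables into the env:// variables that torch.distributed expects. A minimal sketch of that mapping (the function name is hypothetical and this is not necessarily what the commit implements):

import os
import torch.distributed as dist

def init_dist_env_on_summit(master_addr, master_port="8889"):
    # Hypothetical helper: map Open MPI rank info to the env:// variables
    # consumed by torch.distributed; master_addr must be the hostname of the
    # rank-0 node (e.g. extracted from LSB_HOSTS as in the snippet above).
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
    os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"]
    os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"]
    os.environ["LOCAL_RANK"] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]
    dist.init_process_group(backend="nccl", init_method="env://")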

carbonscott commented 5 months ago

Commit 5ce3180aa87c0ae60298b777202ad286e14c1aee updates the BSUB template.

carbonscott commented 4 months ago

Tests confirmed the changes work!