FlagOpen / FlagScale

FlagScale is a large model toolkit based on open-sourced projects.
Other
167 stars 42 forks source link

[Profiler] Outputting process-to-hardware and process-to-communication-group mappings. #177

Closed phoenixdong closed 2 months ago

phoenixdong commented 3 months ago

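Add a print_ranks method that outputs the following mappings during initialization: parallelism_to_groups.json, rank_to_host_name_and_ip.json, and rank_to_parallelism_to_group_id.json.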

Enable this feature by adding the following fields to the YAML configuration file, for example:

system:
  ...
  analyze:
    analyze_save_dir: ./examples/aquila/analyze

If analyze_save_dir is not configured, this information is output to stdout by default.

phoenixdong commented 3 months ago

Example parallelism_to_groups.json

{
    "dp": [
        [
            0,
            4
        ],
        [
            1,
            5
        ],
        [
            2,
            6
        ],
        [
            3,
            7
        ],
        [
            8,
            12
        ],
        [
            9,
            13
        ],
        [
            10,
            14
        ],
        [
            11,
            15
        ]
    ],
    "dp-cp": [
        [
            0,
            4
        ],
        [
            1,
            5
        ],
        [
            2,
            6
        ],
        [
            3,
            7
        ],
        [
            8,
            12
        ],
        [
            9,
            13
        ],
        [
            10,
            14
        ],
        [
            11,
            15
        ]
    ],
    "cp": [
        [
            0
        ],
        [
            1
        ],
        [
            2
        ],
        [
            3
        ],
        [
            4
        ],
        [
            5
        ],
        [
            6
        ],
        [
            7
        ],
        [
            8
        ],
        [
            9
        ],
        [
            10
        ],
        [
            11
        ],
        [
            12
        ],
        [
            13
        ],
        [
            14
        ],
        [
            15
        ]
    ],
    "tp-pp": [
        [
            0,
            1,
            2,
            3,
            8,
            9,
            10,
            11
        ],
        [
            4,
            5,
            6,
            7,
            12,
            13,
            14,
            15
        ]
    ],
    "tp-ep-pp": [
        [
            0,
            1,
            2,
            3,
            8,
            9,
            10,
            11
        ],
        [
            4,
            5,
            6,
            7,
            12,
            13,
            14,
            15
        ]
    ],
    "tp": [
        [
            0,
            1,
            2,
            3
        ],
        [
            4,
            5,
            6,
            7
        ],
        [
            8,
            9,
            10,
            11
        ],
        [
            12,
            13,
            14,
            15
        ]
    ],
    "pp": [
        [
            0,
            8
        ],
        [
            1,
            9
        ],
        [
            2,
            10
        ],
        [
            3,
            11
        ],
        [
            4,
            12
        ],
        [
            5,
            13
        ],
        [
            6,
            14
        ],
        [
            7,
            15
        ]
    ],
    "tp-dp-cp": [
        [
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7
        ],
        [
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15
        ]
    ],
    "tp-dp": [
        [
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7
        ],
        [
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15
        ]
    ],
    "tp-cp": [
        [
            0,
            1,
            2,
            3
        ],
        [
            4,
            5,
            6,
            7
        ],
        [
            8,
            9,
            10,
            11
        ],
        [
            12,
            13,
            14,
            15
        ]
    ],
    "tp-ep": [
        [
            0,
            1,
            2,
            3
        ],
        [
            4,
            5,
            6,
            7
        ],
        [
            8,
            9,
            10,
            11
        ],
        [
            12,
            13,
            14,
            15
        ]
    ],
    "ep": [
        [
            0
        ],
        [
            1
        ],
        [
            2
        ],
        [
            3
        ],
        [
            4
        ],
        [
            5
        ],
        [
            6
        ],
        [
            7
        ],
        [
            8
        ],
        [
            9
        ],
        [
            10
        ],
        [
            11
        ],
        [
            12
        ],
        [
            13
        ],
        [
            14
        ],
        [
            15
        ]
    ]
}
phoenixdong commented 3 months ago

Example rank_to_host_name_and_ip.json

{
    "0": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "1": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "2": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "3": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "4": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "5": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "6": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "7": {
        "host_name:": "node_0",
        "host_ip": "127.0.0.0"
    },
    "8": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "9": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "10": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "11": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "12": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "13": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "14": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    },
    "15": {
        "host_name:": "node_1",
        "host_ip": "127.0.0.1"
    }
}
phoenixdong commented 3 months ago

Example rank_to_parallelism_to_group_id.json


{
    "0": {
        "dp": 0,
        "dp-cp": 0,
        "cp": 0,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 0,
        "pp": 0,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 0,
        "tp-ep": 0,
        "ep": 0
    },
    "4": {
        "dp": 0,
        "dp-cp": 0,
        "cp": 4,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 1,
        "pp": 4,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 1,
        "tp-ep": 1,
        "ep": 4
    },
    "1": {
        "dp": 1,
        "dp-cp": 1,
        "cp": 1,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 0,
        "pp": 1,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 0,
        "tp-ep": 0,
        "ep": 1
    },
    "5": {
        "dp": 1,
        "dp-cp": 1,
        "cp": 5,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 1,
        "pp": 5,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 1,
        "tp-ep": 1,
        "ep": 5
    },
    "2": {
        "dp": 2,
        "dp-cp": 2,
        "cp": 2,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 0,
        "pp": 2,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 0,
        "tp-ep": 0,
        "ep": 2
    },
    "6": {
        "dp": 2,
        "dp-cp": 2,
        "cp": 6,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 1,
        "pp": 6,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 1,
        "tp-ep": 1,
        "ep": 6
    },
    "3": {
        "dp": 3,
        "dp-cp": 3,
        "cp": 3,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 0,
        "pp": 3,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 0,
        "tp-ep": 0,
        "ep": 3
    },
    "7": {
        "dp": 3,
        "dp-cp": 3,
        "cp": 7,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 1,
        "pp": 7,
        "tp-dp-cp": 0,
        "tp-dp": 0,
        "tp-cp": 1,
        "tp-ep": 1,
        "ep": 7
    },
    "8": {
        "dp": 4,
        "dp-cp": 4,
        "cp": 8,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 2,
        "pp": 0,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 2,
        "tp-ep": 2,
        "ep": 8
    },
    "12": {
        "dp": 4,
        "dp-cp": 4,
        "cp": 12,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 3,
        "pp": 4,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 3,
        "tp-ep": 3,
        "ep": 12
    },
    "9": {
        "dp": 5,
        "dp-cp": 5,
        "cp": 9,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 2,
        "pp": 1,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 2,
        "tp-ep": 2,
        "ep": 9
    },
    "13": {
        "dp": 5,
        "dp-cp": 5,
        "cp": 13,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 3,
        "pp": 5,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 3,
        "tp-ep": 3,
        "ep": 13
    },
    "10": {
        "dp": 6,
        "dp-cp": 6,
        "cp": 10,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 2,
        "pp": 2,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 2,
        "tp-ep": 2,
        "ep": 10
    },
    "14": {
        "dp": 6,
        "dp-cp": 6,
        "cp": 14,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 3,
        "pp": 6,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 3,
        "tp-ep": 3,
        "ep": 14
    },
    "11": {
        "dp": 7,
        "dp-cp": 7,
        "cp": 11,
        "tp-pp": 0,
        "tp-ep-pp": 0,
        "tp": 2,
        "pp": 3,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 2,
        "tp-ep": 2,
        "ep": 11
    },
    "15": {
        "dp": 7,
        "dp-cp": 7,
        "cp": 15,
        "tp-pp": 1,
        "tp-ep-pp": 1,
        "tp": 3,
        "pp": 7,
        "tp-dp-cp": 1,
        "tp-dp": 1,
        "tp-cp": 3,
        "tp-ep": 3,
        "ep": 15
    }
}