facebookresearch / HolisticTraceAnalysis

A library to analyze PyTorch traces.
http://hta.readthedocs.io
MIT License
293 stars 40 forks source link

Self-Dependencies in External Event IDs in SYNC_DEPENDENCY #156

Open TaekyungHeo opened 4 months ago

TaekyungHeo commented 4 months ago

What is your question?

I have written a simple Python script to load sync dependencies from a Kineto trace and then print out event IDs and external event IDs. When I print out event IDs, I don't find any self-dependencies. However, when I print out external event IDs of the start event and end event, I see self-dependencies. Is this expected behavior?

How to reproduce

$ git clone git@github.com:facebookresearch/HolisticTraceAnalysis.git
$ cd HolisticTraceAnalysis
$ pip install .
$ python sync_dep.py --input ~/Downloads/cuda-sync/kineto_0.json --rank 0 | grep THEO > /tmp/out
$ cat /tmp/out

sync_dep.py

import argparse
import logging
import os
from typing import Dict, List

from hta.analyzers.critical_path_analysis import CPEdgeType
from hta.trace_analysis import TraceAnalysis

# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_sync_dependencies(
    rank: int, kineto_file: str, annotation: str = "ProfilerStep", instance_id: int = 0
) -> Dict[int, List[int]]:
    """
    Load synchronization dependencies using Holistic Trace Analysis (HTA).

    Runs HTA's critical path analysis over the directory that contains
    ``kineto_file``. For every SYNC_DEPENDENCY edge on the critical path whose
    two events both carry an "External id", a "THEO:" line with the event IDs
    and external IDs is printed and the dependency is recorded in the result.
    Edges where either event lacks an external ID are skipped with a warning.

    Args:
        rank (int): Rank for the input Kineto trace.
        kineto_file (str): Path to the Kineto trace file.
        annotation (str): Annotation to use for the analysis. Defaults to "ProfilerStep".
        instance_id (int): Instance ID for the analysis. Defaults to 0.

    Returns:
        Dict[int, List[int]]: A dictionary mapping end event's external ID to a list of start event's external IDs
            that have synchronization dependencies. Empty if the critical path analysis fails.
    """
    sync_dependencies: Dict[int, List[int]] = {}

    # dirname() yields "" for a bare filename; fall back to the current directory.
    trace_dir = os.path.dirname(kineto_file) or "."
    trace_analysis = TraceAnalysis(trace_dir=trace_dir)
    cp_graph, success = trace_analysis.critical_path_analysis(rank=rank, annotation=annotation, instance_id=instance_id)
    if not success:
        logger.error("Failed to load Critical Path Graph")
        return sync_dependencies

    raw_events = trace_analysis.t.get_raw_trace_for_one_rank(rank=rank)["traceEvents"]
    for edge in cp_graph.critical_path_edges_set:
        if edge.type != CPEdgeType.SYNC_DEPENDENCY:
            continue

        start_event_id, end_event_id = cp_graph.get_events_for_edge(edge)
        # NOTE(review): this assumes the event IDs returned by HTA are valid
        # positions in the raw "traceEvents" list -- confirm against HTA, since a
        # mismatch would pair the edge with unrelated raw events.
        start_event, end_event = raw_events[start_event_id], raw_events[end_event_id]

        # Events without an "args" entry are treated as having no external ID.
        start_args = start_event.get("args", {})
        end_args = end_event.get("args", {})
        if "External id" not in end_args or "External id" not in start_args:
            logger.warning(
                f"Synchronization dependency from event {start_event_id} to event {end_event_id} will "
                "not be considered due to missing external IDs."
            )
            continue

        start_event_external_id = start_args["External id"]
        end_event_external_id = end_args["External id"]
        print(
            f"THEO: start_event_id {start_event_id}, end_event_id {end_event_id}, "
            f"start_event_external_id {start_event_external_id}, end_event_external_id {end_event_external_id}"
        )
        # Record the dependency so the returned mapping matches the documented contract.
        sync_dependencies.setdefault(end_event_external_id, []).append(start_event_external_id)
    return sync_dependencies

def main() -> None:
    """
    Main function to parse arguments and load synchronization dependencies.

    Both ``--input`` and ``--rank`` are required; argparse exits with a usage
    error if either is missing, instead of passing ``None`` on to the loader.
    """
    parser = argparse.ArgumentParser(description="Load and print critical paths from Kineto traces.")
    parser.add_argument("--input", type=str, required=True, help="Path to the Kineto trace file.")
    parser.add_argument("--rank", type=int, required=True, help="Rank for the input traces.")
    args = parser.parse_args()

    load_sync_dependencies(args.rank, args.input)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

kineto_0.json

/tmp/out

THEO: start_event_id 24920, end_event_id 24926, start_event_external_id 16410, end_event_external_id 16410
THEO: start_event_id 24536, end_event_id 24650, start_event_external_id 13847, end_event_external_id 91874
THEO: start_event_id 24876, end_event_id 24882, start_event_external_id 16396, end_event_external_id 16396
THEO: start_event_id 25036, end_event_id 25042, start_event_external_id 16459, end_event_external_id 16459
THEO: start_event_id 25100, end_event_id 25106, start_event_external_id 5, end_event_external_id 5
THEO: start_event_id 25108, end_event_id 25114, start_event_external_id 11, end_event_external_id 11
THEO: start_event_id 24868, end_event_id 24874, start_event_external_id 94785, end_event_external_id 94792
THEO: start_event_id 25116, end_event_id 25122, start_event_external_id 15, end_event_external_id 15
THEO: start_event_id 24928, end_event_id 24934, start_event_external_id 16416, end_event_external_id 16416
THEO: start_event_id 25124, end_event_id 25130, start_event_external_id 23, end_event_external_id 23
THEO: start_event_id 24944, end_event_id 24950, start_event_external_id 16428, end_event_external_id 16428
THEO: start_event_id 25076, end_event_id 25082, start_event_external_id 16468, end_event_external_id 16468
THEO: start_event_id 24960, end_event_id 24966, start_event_external_id 16442, end_event_external_id 16442
THEO: start_event_id 24656, end_event_id 24662, start_event_external_id 10097, end_event_external_id 10097
THEO: start_event_id 24936, end_event_id 24942, start_event_external_id 16420, end_event_external_id 16420
THEO: start_event_id 24952, end_event_id 24958, start_event_external_id 16435, end_event_external_id 16435

Environment