Open terryysun opened 3 days ago
Thanks for your pull request! It looks like this may be your first contribution to a Google open source project. Before we can look at your pull request, you'll need to sign a Contributor License Agreement (CLA).
View this failed invocation of the CLA check for more information.
For the most up to date status, view the checks section at the bottom of the pull request.
Posting the dumped profile CSV here:
before (on main branch):
NVTX Range,Style,PID,TID,NVTX Inst,Kern Inst,Total Time (ns),Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Kernel Name
NCCL:ncclGroupEnd,PushPop,109047,109596,1,1,7052334,7052334.0,7052334.0,7052334,7052334,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109598,1,1,979806,979806.0,979806.0,979806,979806,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109600,1,1,1448988,1448988.0,1448988.0,1448988,1448988,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109602,1,1,1286679,1286679.0,1286679.0,1286679,1286679,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109604,1,1,866814,866814.0,866814.0,866814,866814,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109606,1,1,13187909,13187909.0,13187909.0,13187909,13187909,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109608,1,1,407870,407870.0,407870.0,407870,407870,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,109047,109610,1,1,126625,126625.0,126625.0,126625,126625,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109596,1,1,1921,1921.0,1921.0,1921,1921,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109598,1,1,1856,1856.0,1856.0,1856,1856,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109600,1,1,1920,1920.0,1920.0,1920,1920,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109602,1,1,1824,1824.0,1824.0,1824,1824,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109604,1,1,2016,2016.0,2016.0,2016,2016,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109606,1,1,1887,1887.0,1887.0,1887,1887,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109608,1,1,1920,1920.0,1920.0,1920,1920,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,109047,109610,1,1,2209,2209.0,2209.0,2209,2209,0.0,wrapped_transpose
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109596,1,1,7052334,7052334.0,7052334.0,7052334,7052334,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109598,1,1,979806,979806.0,979806.0,979806,979806,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109600,1,1,1448988,1448988.0,1448988.0,1448988,1448988,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109602,1,1,1286679,1286679.0,1286679.0,1286679,1286679,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109604,1,1,866814,866814.0,866814.0,866814,866814,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109606,1,1,13187909,13187909.0,13187909.0,13187909,13187909,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109608,1,1,407870,407870.0,407870.0,407870,407870,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all.3.1#",PushPop,109047,109610,1,1,126625,126625.0,126625.0,126625,126625,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109596,1,1,7052334,7052334.0,7052334.0,7052334,7052334,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109596,1,1,1921,1921.0,1921.0,1921,1921,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109598,1,1,979806,979806.0,979806.0,979806,979806,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109598,1,1,1856,1856.0,1856.0,1856,1856,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109600,1,1,1448988,1448988.0,1448988.0,1448988,1448988,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109600,1,1,1920,1920.0,1920.0,1920,1920,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109602,1,1,1286679,1286679.0,1286679.0,1286679,1286679,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109602,1,1,1824,1824.0,1824.0,1824,1824,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109604,1,1,866814,866814.0,866814.0,866814,866814,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109604,1,1,2016,2016.0,2016.0,2016,2016,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109606,1,1,13187909,13187909.0,13187909.0,13187909,13187909,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109606,1,1,1887,1887.0,1887.0,1887,1887,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109608,1,1,407870,407870.0,407870.0,407870,407870,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109608,1,1,1920,1920.0,1920.0,1920,1920,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109610,1,1,126625,126625.0,126625.0,126625,126625,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,109047,109610,1,1,2209,2209.0,2209.0,2209,2209,0.0,wrapped_transpose
after (on this branch):
NCCL:ncclGroupEnd,PushPop,106159,106708,1,1,131551,131551.0,131551.0,131551,131551,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106710,1,1,1852954,1852954.0,1852954.0,1852954,1852954,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106712,1,1,19590467,19590467.0,19590467.0,19590467,19590467,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106714,1,1,542492,542492.0,542492.0,542492,542492,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106716,1,1,701502,701502.0,701502.0,701502,701502,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106718,1,1,12339304,12339304.0,12339304.0,12339304,12339304,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106720,1,1,352223,352223.0,352223.0,352223,352223,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
NCCL:ncclGroupEnd,PushPop,106159,106722,1,1,2437383,2437383.0,2437383.0,2437383,2437383,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106708,1,1,1888,1888.0,1888.0,1888,1888,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106710,1,1,1921,1921.0,1921.0,1921,1921,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106712,1,1,1921,1921.0,1921.0,1921,1921,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106714,1,1,1888,1888.0,1888.0,1888,1888,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106716,1,1,2016,2016.0,2016.0,2016,2016,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106718,1,1,1920,1920.0,1920.0,1920,1920,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106720,1,1,1888,1888.0,1888.0,1888,1888,0.0,wrapped_transpose
TSL:Thunk:#hlo_op=wrapped_transpose#,PushPop,106159,106722,1,1,2336,2336.0,2336.0,2336,2336,0.0,wrapped_transpose
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106708,1,1,131551,131551.0,131551.0,131551,131551,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106710,1,1,1852954,1852954.0,1852954.0,1852954,1852954,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106712,1,1,19590467,19590467.0,19590467.0,19590467,19590467,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106714,1,1,542492,542492.0,542492.0,542492,542492,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106716,1,1,701502,701502.0,701502.0,701502,701502,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106718,1,1,12339304,12339304.0,12339304.0,12339304,12339304,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106720,1,1,352223,352223.0,352223.0,352223,352223,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:Thunk:#name=,hlo_op=all-to-all-start#",PushPop,106159,106722,1,1,2437383,2437383.0,2437383.0,2437383,2437383,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106708,1,1,131551,131551.0,131551.0,131551,131551,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106708,1,1,1888,1888.0,1888.0,1888,1888,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106710,1,1,1852954,1852954.0,1852954.0,1852954,1852954,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106710,1,1,1921,1921.0,1921.0,1921,1921,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106712,1,1,19590467,19590467.0,19590467.0,19590467,19590467,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106712,1,1,1921,1921.0,1921.0,1921,1921,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106714,1,1,542492,542492.0,542492.0,542492,542492,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106714,1,1,1888,1888.0,1888.0,1888,1888,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106716,1,1,701502,701502.0,701502.0,701502,701502,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106716,1,1,2016,2016.0,2016.0,2016,2016,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106718,1,1,12339304,12339304.0,12339304.0,12339304,12339304,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106718,1,1,1920,1920.0,1920.0,1920,1920,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106720,1,1,352223,352223.0,352223.0,352223,352223,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106720,1,1,1888,1888.0,1888.0,1888,1888,0.0,wrapped_transpose
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106722,1,1,2437383,2437383.0,2437383.0,2437383,2437383,0.0,"ncclDevKernel_SendRecv(ncclDevComm *, unsigned long, ncclWork *)"
"TSL:XlaModule:#prefix=pmap(<lambda>)/jit(main)/all_to_all[split_axis=2 concat_axis=0 axis_name=i axis_index_groups=None tiled=False],hlo_module=pmap__lambda_,program_id=7#",PushPop,106159,106722,1,1,2336,2336.0,2336.0,2336,2336,0.0,wrapped_transpose
I don't see any additional annotations in the `after` output you show above. Did I miss something? What are you trying to achieve with this change?
On second look, I see that some `all-to-all.3.1` entries changed to `all-to-all-start`, which is the intended change.
Problem: Currently, HLO dumps have the syntactic sugar for async ops turned on by default, while the nsys profile shows the actual op names, which makes it inefficient to match ops in the profile to ops in the HLO dump.
Solution: Annotate the syntactic-sugar op name in the nsys profile.