NVIDIA / nccl

Optimized primitives for collective multi-GPU communication
Other
3.08k stars 776 forks source link

Some questions about selecting NET when searching channels. #1313

Closed shanleo2024 closed 2 months ago

shanleo2024 commented 2 months ago

Hi, I have some questions about how the NET is selected when searching channels. Here is my environment: there are 8 GPUs and 2 NETs on one node. c92008835e6b9627473020f3cc2092f

The final channel searched by NCCL is as follows: 801f7ec4c7728d6f5043e330072f758

The topo.xml is like this:

<system version="1">
  <cpu numaid="0" affinity="0000,00000000,00000000,00000000,00000fff,ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
    <pci busid="0000:16:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:18:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="0" sm="90" rank="0" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
      <pci busid="0000:19:00.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0023" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_0" dev="0" speed="400000" port="1" latency="0.000000" guid="0x4afbd30003ae6d94" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
    <pci busid="0000:38:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:3a:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="1" sm="90" rank="1" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:49:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:4c:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="2" sm="90" rank="2" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:5a:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:5d:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="3" sm="90" rank="3" gdr="1">
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="0000,00000000,00000000,00ffffff,fffff000,00000000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
    <pci busid="0000:98:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:9a:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="4" sm="90" rank="4" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:b8:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:ba:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="5" sm="90" rank="5" gdr="1">
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:c8:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:cb:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="6" sm="90" rank="6" gdr="1">
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:d8:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:da:00.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0023" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_5" dev="1" speed="400000" port="1" latency="0.000000" guid="0x20a1d70003ae6d94" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
      <pci busid="0000:db:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="7" sm="90" rank="7" gdr="1">
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
  </cpu>
</system>

What confuses me is why the output NET is 0 and NET1 is not selected, since NET1 is closer to GPU7 than NET0. What are the principles for choosing a network card? Thank you so much!

sjeaugey commented 2 months ago

The drawing you posted does not correspond to the topology detected by NCCL. It looks like all GPUs are connected directly to the CPU, except for GPUs 0 and 7 which have a small PCI switch to connect the NIC and the GPU on the same PCI port from the CPU.

As for locality, I'm not sure I understand the problem. Data to/from NET 0 will go through GPU 0 and data to/from NET 1 will go through GPU 7. I don't see where GPU 7 sends to net 0. Even if that was the case, GPU 7 would move data to GPU 0 first, then data would go from GPU 0 to NET 0.

shanleo2024 commented 2 months ago

Hi @sjeaugey, thank you for your response. Sorry for confusing you; the drawing is just a draft topology I drew, so there may be some mismatch. Here is the topo file dumped by NCCL, please help to check it.

<system version="1">
  <cpu numaid="0" affinity="0000,00000000,00000000,00000000,00000fff,ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
    <pci busid="0000:16:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:18:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="0" sm="90" rank="0" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
      <pci busid="0000:19:00.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0023" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_0" dev="0" speed="400000" port="1" latency="0.000000" guid="0x4afbd30003ae6d94" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
    </pci>
    <pci busid="0000:38:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:3a:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="1" sm="90" rank="1" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:49:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:4c:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="2" sm="90" rank="2" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:5a:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:5d:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="3" sm="90" rank="3" gdr="1">
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="0000,00000000,00000000,00ffffff,fffff000,00000000" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="143">
    <pci busid="0000:98:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:9a:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="4" sm="90" rank="4" gdr="1">
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:b8:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:ba:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="5" sm="90" rank="5" gdr="1">
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:c8:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:cb:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="6" sm="90" rank="6" gdr="1">
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
    <pci busid="0000:d8:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
      <pci busid="0000:da:00.0" class="0x020700" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0023" link_speed="32.0 GT/s PCIe" link_width="16">
        <nic>
          <net name="mlx5_5" dev="1" speed="400000" port="1" latency="0.000000" guid="0x20a1d70003ae6d94" maxconn="131072" gdr="1"/>
        </nic>
      </pci>
      <pci busid="0000:db:00.0" class="0x030200" vendor="0x10de" device="0x2324" subsystem_vendor="0x10de" subsystem_device="0x17a6" link_speed="32.0 GT/s PCIe" link_width="16">
        <gpu dev="7" sm="90" rank="7" gdr="1">
          <nvlink target="0000:07:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:06:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:05:00.0" count="2" tclass="0x068000"/>
          <nvlink target="0000:08:00.0" count="2" tclass="0x068000"/>
        </gpu>
      </pci>
    </pci>
  </cpu>
</system>

The following graph makes sense, as GPU0 will send the data to NET0, which is closer to GPU0.

    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>

What confuses me is the other graph, where GPU0 will send the data to NET1. GPU0 can use PXN to transport the data to GPU7 (if I understand correctly), which then sends it out to NET1. But I think it would be better if GPU0 chose NET0 as the output NET, as NET0 is closer to GPU0 than NET1.

    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>

The logic for selecting NETs is too complex, with too many iterations, so please help to describe the principles for choosing an entrance NET and an exit NET. Thank you very much.

sjeaugey commented 2 months ago

But I think it is better if the GPU0 choose NET0 as the output NET, as the NET0 is closer to GPU0 than the NET1.

But NET0 is already used by the first channel; so we don't want to use it twice, which is why we exit from NET1 even if it means adding a step through GPU7.

Is that causing any performance problem?

shanleo2024 commented 2 months ago

Hi sjeaugey, there is no performance problem so far. I am just wondering whether it would be better if the two channels were as follows:

NET0---GPU0-GPU1-GPU2--GPU3-GPU4-GPU5-GPU6-GPU7-NET1
NET1---GPU7-GPU6-GPU5--GPU4-GPU3-GPU2-GPU1-GPU0-NET0

or

NET0---GPU0-GPU1-GPU2--GPU3-GPU4-GPU5-GPU6-GPU7-NET1
NET1---GPU0-GPU1-GPU2--GPU3-GPU4-GPU5-GPU6-GPU7-NET0

That means NCCL can fully use all of the NETs in both directions. If this is a low-level question, please forgive me, as there are still many parts of NCCL that I haven't touched yet.

jiangzhuti commented 2 months ago

Hi sjeaugey, There is no any performance problem till now. I am just wondering whether it is better if the two channels as follows:

NET0---GPU0-GPU1-GPU2--GPU3-GPU4-GPU5-GPU6-GPU7-NET1
NET1---GPU7-GPU6-GPU5--GPU4-GPU3-GPU2-GPU1-GPU0-NET0

or

NET0---GPU0-GPU1-GPU2--GPU3-GPU4-GPU5-GPU6-GPU7-NET1
NET1---GPU0-GPU1-GPU2--GPU3-GPU4-GPU5-GPU6-GPU7-NET0

That means NCCL can fullly use all of the NETs both in the two directions. If I ask the low level question, please forgive me as NCCL still has many places for me that I haven't touched yet。

you can try this https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-cross-nic

sjeaugey commented 2 months ago

NCCL tries to not cross rails by default [1], so indeed that's why it doesn't go with your proposed rings. The extra PXN step to go back to the first GPU isn't a problem, especially if you don't have many NICs; the NVL bandwidth can handle that easily.

[1] To get best performance on rail-optimized systems, and not break on RoCE systems which don't allow inter-rail communication as they use a different IP subnet per NIC with no routing between rails. As mentioned, setting NCCL_CROSS_NIC=1 would change that default behavior.

shanleo2024 commented 2 months ago

Thank you @sjeaugey , more clear for me now.

I tested the cases with PXN enabled and PXN disabled, and the graphs are different. Maybe this is another topic, but I think it is related to the channel search. Does it mean that if we enable PXN (the default), the input and output NET will be the same one? And if we disable PXN, the channels will be more like the rings I proposed?

NCCL_PXN_DISABLE=0

root@h800-125:shanxs # cat graph1.xml
<graphs version="1">
  <graph id="0" pattern="4" crossnic="0" nchannels="4" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXN" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
  </graph>
  <graph id="1" pattern="3" crossnic="0" nchannels="4" speedintra="40" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
  </graph>
  <graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
  <graph id="3" pattern="5" crossnic="0" nchannels="2" speedintra="48" speedinter="48" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
  </graph>
</graphs>
root@h800-125:shanxs #

NCCL_PXN_DISABLE=1

root@h800-125:shanxs # cat graph1.xml
<graphs version="1">
  <graph id="0" pattern="4" crossnic="1" nchannels="4" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <gpu dev="0"/>
      <net dev="0"/>
    </channel>
  </graph>
  <graph id="1" pattern="3" crossnic="0" nchannels="4" speedintra="40" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
  </graph>
  <graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
  <graph id="3" pattern="5" crossnic="0" nchannels="2" speedintra="48" speedinter="48" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
  </graph>
</graphs>
root@h800-125:shanxs #
shanleo2024 commented 2 months ago

Hi @sjeaugey, I tested PXN enabled and PXN disabled after modifying the channels manually so that the output GPU is close to the output NET. I found that PXN disabled performs better than PXN enabled. I think that when PXN is enabled, the GPU copies the data to the intermediateRank through NVLINK, while with PXN disabled, the GPU just copies the data within the same GPU. Is copying the data within the same GPU faster than copying it to another GPU through NVLINK?

The following is the graph I have modified manually. GPU0 and GPU1 are closer to NET0 than other GPUs, GPU6 and GPU7 are closer to NET1 than other GPUs.

And the picture shows the performance comparison between PXN enabled and PXN disabled. 图片

Thank you.

<graphs version="1">
  <graph id="0" pattern="4" crossnic="0" nchannels="4" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="SYS" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="7"/>
      <gpu dev="6"/>
      <gpu dev="5"/>
      <gpu dev="4"/>
      <gpu dev="3"/>
      <gpu dev="2"/>
      <gpu dev="1"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
  </graph>
  <graph id="1" pattern="3" crossnic="0" nchannels="4" speedintra="40" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <gpu dev="7"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="1"/>
      <gpu dev="2"/>
      <gpu dev="3"/>
      <gpu dev="4"/>
      <gpu dev="5"/>
      <gpu dev="6"/>
      <net dev="1"/>
    </channel>
  </graph>
  <graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
  <graph id="3" pattern="5" crossnic="0" nchannels="2" speedintra="48" speedinter="48" latencyinter="0" typeintra="NVL" typeinter="PIX" samechannels="0">
    <channel>
      <net dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="0"/>
    </channel>
    <channel>
      <net dev="1"/>
      <gpu dev="7"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <gpu dev="0"/>
      <net dev="1"/>
    </channel>
  </graph>
</graphs>
sjeaugey commented 2 months ago

Indeed it seems disabling PXN improves the latency a bit (the bandwidth is unchanged).

If you don't have a rail-optimized fabric, you can set NCCL_CROSS_NIC=1 which will tell NCCL it should not try to avoid crossing rails. That should be equivalent to your manually-modified rings.

shanleo2024 commented 2 months ago

Hi @sjeaugey, I have tested with NCCL_CROSS_NIC=1 and NCCL_PXN_DISABLE=1, but the graph still crosses the NET. 图片

NCCL_CROSS_NIC=0 and NCCL_PXN_DISABLE=1 will not cross the NET. 图片

Seems like NCCL_CROSS_NIC=0 which will tell NCCL it should not try to avoid crossing rails. Is there any mismatch?

And the performance of the modified graph4.xml is better than with NCCL_CROSS_NIC=0: 图片

sjeaugey commented 2 months ago

Sorry I think you misunderstood. NCCL_CROSS_NIC=1 will tell NCCL that crossing rails is ok. NCCL_CROSS_NIC=0 will tell NCCL to avoid crossing rails.

My advice was to just set NCCL_CROSS_NIC=1. That will cross rails, which is fine in your case, and should be equivalent to writing a graph XML manually (which is not something you can use in production).

shanleo2024 commented 2 months ago

OK, I understand, thank you for your kind suggestion!