Eevan-zq opened this issue 2 weeks ago
Sorry for the late response. Could you show me your Python file?
Of course. I guess the ThreadblockPolicy I'm using is ThreadblockPolicy.auto. Does it have any impact? Here is my script:
```python
import argparse

from msccl.language import *
from msccl.topologies import *
from msccl.language.collectives import AllReduce

# author: wzq

def reduce_scatter(ranks, offset, count):
    # Ring reduce-scatter over `ranks` on count-sized chunks starting at `offset`.
    len1 = len(ranks)
    for l in range(0, len1):
        index = offset + l * count  # [0 2 4 6]
        # cur_rank = ranks[(l+1) % len1]
        # cur_chunk = chunk(cur_rank, Buffer.input, index, count)
        for step in range(1, len1):
            cur_rank = ranks[(step + l) % len1]
            cur_chunk = chunk(cur_rank, Buffer.input, index, count)
            next_rank = ranks[(step + l + 1) % len1]
            next_chunk = chunk(next_rank, Buffer.input, index, count)
            next_chunk.reduce(cur_chunk, ch=ranks[l])

def all_gather(ranks, offset, count):
    # Ring all-gather over `ranks` on count-sized chunks starting at `offset`.
    len1 = len(ranks)
    for l in range(0, len1):
        index = offset + l * count
        cur_rank = ranks[l]
        c = chunk(cur_rank, Buffer.input, index, count)
        for step in range(1, len1):
            next_rank = ranks[(step + l) % len1]
            # c = c.copy(next_rank, Buffer.input, index, ch=ranks[l], recvtb=ranks[l], sendtb=ranks[l])
            c = c.copy(next_rank, Buffer.input, index, ch=ranks[l])

def allreduce(num_gpus, instances, protocol):
    num_nodes = 2                          # N
    gpus_per_node = num_gpus // num_nodes  # G (= 4 for 8 GPUs)
    topology = fully_connected(num_gpus)
    collective = AllReduce(num_gpus, num_gpus, True)
    with MSCCLProgram("allreduce_a800_GC3", topology, collective, instances, protocol=protocol,
                      interleaved_replication=False, threadblock_policy=ThreadblockPolicy.auto,
                      dependence_nop=True):
        # - Intra-node reduce-scatter -
        for n in range(num_nodes):  # n = 0 1
            gpuIds = [i + n * gpus_per_node for i in range(gpus_per_node)]
            reduce_scatter(gpuIds, 0, num_nodes)
        # - Inter-node reduce-scatter && inter-node all-gather -
        for g in range(gpus_per_node):  # g = 0 1 2 3
            cross_gpuIds = [i * gpus_per_node + g for i in range(num_nodes)]  # i = 0 1
            reduce_scatter(cross_gpuIds, g * num_nodes, 1)
            all_gather(cross_gpuIds, g * num_nodes, 1)
        # - Intra-node all-gather -
        for n in range(num_nodes):
            gpuIds = [i + n * gpus_per_node for i in range(gpus_per_node)]
            all_gather(gpuIds, 0, num_nodes)
        XML()
        Check()

parser = argparse.ArgumentParser()
parser.add_argument('num_gpus', type=int, help='number of gpus')
parser.add_argument('instances', type=int, help='number of instances')
parser.add_argument('--protocol', type=str, default='LL', choices=['Simple', 'LL128', 'LL'], help='Protocol')
args = parser.parse_args()
allreduce(args.num_gpus, args.instances, args.protocol)
```
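(For reference: assuming the script above is saved as, say, `allreduce_a800_gc3.py`, the XML would be generated with `python allreduce_a800_gc3.py 8 1 --protocol=Simple > allreduce.xml`; the filename and argument values here are placeholders, not taken from this thread.)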
Yeah, please try manual first. The auto policy has not been maintained for a while and may be deprecated in the future.
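For reference, a minimal sketch of what switching to manual scheduling looks like, based on the `sendtb`/`recvtb` keyword arguments already hinted at in the commented-out `copy` line in the script above (the specific choice of `ranks[l]` for the channel and both threadblocks is illustrative, not a recommendation):

```python
# Sketch: switch the program to manual threadblock scheduling.
# Assumes copy()/reduce() in msccl-tools accept sendtb/recvtb/ch kwargs,
# as the commented-out line in the script above suggests.
with MSCCLProgram("allreduce_a800_GC3", topology, collective, instances,
                  protocol=protocol, interleaved_replication=False,
                  threadblock_policy=ThreadblockPolicy.manual,  # was .auto
                  dependence_nop=True):
    ...
    # Under the manual policy every op must pin its threadblocks explicitly,
    # e.g. inside all_gather:
    c = c.copy(next_rank, Buffer.input, index,
               ch=ranks[l], sendtb=ranks[l], recvtb=ranks[l])
```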
When I use manual mode, I encounter the following error. Why is this happening?
When I tested the hierarchical_allreduce.py file that ships with msccl-tools, the command I used was `python ./hierarchical_allreduce.py --protocol=Simple --schedule=manual 4 2 1 > hierarch_Simple_4_2_1.xml`, but after running the mpirun command the following error occurred. Why is that? @Binyang2014
hierarch_Simple_4_2_1.xml: xmlFile.txt
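(For context, a typical msccl launch of a generated XML with nccl-tests looks roughly like `mpirun -np 8 -x MSCCL_XML_FILES=hierarch_Simple_4_2_1.xml -x NCCL_ALGO=MSCCL,RING,TREE ./build/all_reduce_perf -b 8 -e 128M -f 2`, if I recall the msccl README correctly; the binary path and size-sweep arguments are assumptions, and the actual mpirun command used here was not posted.)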
@Binyang2014 Excuse me, do you have time to answer my question?
Sorry, I haven't had time to go through your case recently. One thing I suggest is using the Simple protocol, not LL. LL doubles the buffer, which may cause some issues. Maybe I can find time to check the error next week.
Of course, thank you for your suggestions and response.
What could be the possible reasons for the following issue?
And my xml file: xml.txt