blacksheep-Aristotle opened 6 months ago
Paddle 2.6 dynamic-to-static (`paddle.jit.to_static`) on ResNet50 does not reach the expected performance. Without AMP, why is dynamic-to-static training slightly slower than the dynamic graph?
Below is the test code:

```python
import os
import time

import numpy as np
import paddle
import paddle.nn as nn
import paddle.optimizer as opt
from paddle.vision.models import resnet50
# import paddle.profiler as profiler  # only needed if the profiling code below is re-enabled

# Hyperparameters are not shown in the original snippet; these values are placeholders.
EPOCH_NUM = 2
BATCH_NUM = 100
BATCH_SIZE = 64
IMAGE_SIZE = 224
CLASS_NUM = 1000

# place = paddle.CustomPlace('sdaa', 0)
paddle.set_device('gpu')

# Define a dataset of random samples
class RandomDataset(paddle.io.Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([3, IMAGE_SIZE, IMAGE_SIZE]).astype('float32')
        label = np.random.randint(0, CLASS_NUM, (1,)).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples

# Define the training loop
def train(layer, loader, loss_fn, opt, prof_file='./prof'):
    # def my_on_trace_ready(prof):  # callback invoked when the profiler finishes collecting
    #     callback = profiler.export_chrome_tracing(prof_file)  # export the trace
    #     callback(prof)
    #     # prof.summary(sorted_by=profiler.SortedKeys.GPUTotal)  # table sorted by GPUTotal
    # p = profiler.Profiler(
    #     # targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
    #     on_trace_ready=my_on_trace_ready,
    #     scheduler=(18, 20),
    #     timer_only=False,
    #     record_shapes=True)
    # p.start()
    loss_list = []
    backward_time = []
    for epoch_id in range(EPOCH_NUM):
        if epoch_id <= 1:
            start_time = time.time()
        for batch_id, (images, labels) in enumerate(loader()):
            # with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O1'):
            out = layer(images)
            loss = loss_fn(out, labels)
            loss_list.append(loss.numpy())
            before_bp = time.time()
            loss.backward()
            after_bp = time.time()
            backward_time.append(after_bp - before_bp)
            opt.step()
            opt.clear_grad()
            # p.step()
            # print("Epoch {} batch {}: loss = {}".format(
            #     epoch_id, batch_id, np.mean(loss.numpy())))
    # p.stop()
    end_time = time.time()
    print('train_time : ', end_time - start_time)
    return loss_list, backward_time

def get_build_strategy(seq_exe=False):
    build_strategy = paddle.static.BuildStrategy()
    # The addto strategy is usually paired with the FLAGS_max_inplace_grad_add flag
    build_strategy.enable_addto = True
    build_strategy.enable_inplace = True
    os.environ['FLAGS_max_inplace_grad_add'] = "100"
    build_strategy.memory_optimize = False
    build_strategy.fuse_broadcast_ops = False
    build_strategy.fuse_relu_depthwise_conv = False
    build_strategy.enable_sequential_execution = seq_exe
    build_strategy.fuse_elewise_add_act_ops = False
    # print(build_strategy)
    return build_strategy

# Build the DataLoader
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = paddle.io.DataLoader(dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              drop_last=True,
                              num_workers=2)

def start_train(d2s_flag=False, seq_exe=False, prof_file='./prof'):
    model = resnet50()
    # print('dynamic model', model)
    # Convert to static graph and set the graph-optimization (build) strategy
    if d2s_flag:
        paddle.jit.set_code_level()
        paddle.jit.set_verbosity(level=3, also_to_stdout=True)
        model = paddle.jit.to_static(model, build_strategy=get_build_strategy(seq_exe))
        # print('static model', model)
    # Set the loss function
    loss_fn = nn.CrossEntropyLoss()
    # Set the optimizer
    adam = opt.Adam(learning_rate=0.001, parameters=model.parameters())
    # Start training
    return train(model, loader, loss_fn, adam, prof_file)

# loss_list_1 = start_train(False)
# loss_list_2 = start_train(True, prof_file='./d2s_prof')
_, bp1 = start_train(False)
_, bp2 = start_train(True, prof_file='./d2s_prof')
```
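For reference, the snippet above leaves the `auto_cast` line commented out. A minimal sketch of the same inner loop with AMP O1 actually enabled would look like the following; the `GradScaler` usage is my addition for illustration, not part of the original report (`layer`, `loader`, `loss_fn`, and `opt` are the names from the snippet):

```python
# Hypothetical AMP-O1 variant of the inner training loop above.
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

for batch_id, (images, labels) in enumerate(loader()):
    # Run the forward pass in float16 where Paddle's O1 op lists allow it
    with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O1'):
        out = layer(images)
        loss = loss_fn(out, labels)
    # Scale the loss so small float16 gradients do not underflow;
    # step() unscales before applying the parameter update
    scaler.scale(loss).backward()
    scaler.step(opt)
    scaler.update()
    opt.clear_grad()
```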
Hi, dynamic-to-static training does not bring a performance gain on every model. Generally, on compute-bound training workloads (such as ResNet50 here, without AMP) the gain is small or absent; on scheduling-bound models (such as transformer base4096 trained with AMP) the gain is much more noticeable.
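To make the compute-bound vs. scheduling-bound distinction concrete, here is a small self-contained timing sketch (not from this issue; the model sizes and step counts are arbitrary assumptions). A stack of tiny `Linear` layers spends most of its time on Python dispatch and kernel launches, which is the regime where `to_static` tends to win; ResNet50 at batch 64 spends most of its time inside large kernels, so removing dispatch overhead barely moves the total.

```python
import time
import paddle
import paddle.nn as nn

paddle.set_device('gpu')

# Many tiny layers: per-op compute is small, so per-op launch/dispatch
# overhead dominates -- the "scheduling-bound" regime.
def make_model():
    return nn.Sequential(*[nn.Linear(128, 128) for _ in range(40)])

def bench(model, steps=100):
    x = paddle.randn([32, 128])
    for _ in range(10):   # warm-up (to_static compiles on the first calls)
        out = model(x)
    out.numpy()           # block until queued GPU work finishes
    t0 = time.time()
    for _ in range(steps):
        out = model(x)
    out.numpy()           # sync again so the timing covers the GPU work
    return time.time() - t0

eager = make_model()
static = paddle.jit.to_static(make_model())
print('eager  :', bench(eager))
print('static :', bench(static))
```

This forward-only comparison only illustrates the scheduling overhead; in a compute-bound model the two timings would be nearly identical, matching the behavior reported above for ResNet50 without AMP.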