Open lxk1990727 opened 5 years ago
你这个错误,多半原因是:训练是在GPU上做的,所以参数都是保存在GPU上,预测却在CPU上做,所以报错。请贴一下相关的代码吧。
def train(args):
    """Resume training from ``args.resume`` and save checkpoints as it goes.

    Builds the training program for ``args.model_name``, runs the startup
    program, restores every program variable that has a same-named file under
    ``args.resume``, and then trains for epochs 1 through 99 using the
    data-parallel compiled program.  Every 10 batches the accumulated loss and
    the auxiliary fetched outputs are printed and a checkpoint is written; a
    further checkpoint is written at the end of each epoch.

    Args:
        args: Parsed command-line namespace.  This function reads ``config``,
            ``model_name``, ``no_use_pyreader``, ``use_gpu`` and ``resume``;
            the whole namespace is also forwarded to ``merge_configs``.

    Raises:
        AssertionError: If the directory ``args.resume`` does not exist.
    """
    # Number of batches between progress reports / intermediate checkpoints.
    report_interval = 10
    # Common prefix of every checkpoint directory written by this function.
    save_prefix = "/ssd3/lixiaokang04/repr/cm_paddle_"

    # parse config
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    train_model = models.get_model(args.model_name, train_config, 0.1, mode='train')

    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(not args.no_use_pyreader)
            train_model.build_model()
            # Not referenced again in this function; call kept as in the original.
            train_feeds = train_model.feeds()
            train_outputs = train_model.outputs()
            train_pyreader = train_model.pyreader()

    # The first fetched output is used as the loss for data-parallel training.
    compiler_prog = fluid.compiler.CompiledProgram(train_prog).with_data_parallel(
        loss_name=train_outputs[0].name)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    assert os.path.exists(args.resume), \
        "model dir {} not exist.".format(args.resume)

    def if_exist(var):
        # Only load variables that actually have a file in the checkpoint dir.
        return os.path.exists(os.path.join(args.resume, var.name))

    fluid.io.load_vars(exe, args.resume, predicate=if_exist, main_program=train_prog)

    train_reader = get_reader(args.model_name, 'train', train_config, place)
    fetch_list = [x.name for x in train_outputs]
    train_pyreader.decorate_tensor_provider(train_reader)

    for epoch_id in range(1, 100):
        train_pyreader.start()
        train_iter = 0
        try:
            loss_step = []
            while True:
                infer_outs = exe.run(compiler_prog, fetch_list=fetch_list)
                loss = np.array(infer_outs[0])
                pos_dis_sum = np.array(infer_outs[1])
                neg_dis_sum = np.array(infer_outs[2])
                loss_p = np.array(infer_outs[3])
                loss_step.append(loss[0])
                train_iter += 1
                if train_iter % report_interval == 0:
                    cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    print(cur_time + " epoch %d, Batch %d, loss %f" % (epoch_id, train_iter, sum(loss_step)))
                    print(pos_dis_sum)
                    print(neg_dis_sum)
                    print(loss_p)
                    # Start a fresh accumulation window for the next report.
                    loss_step = []
                    fluid.io.save_persistables(
                        exe,
                        save_prefix + str(epoch_id) + '_' + str(train_iter),
                        main_program=train_prog)
        except fluid.core.EOFException:
            # The reader signals the end of the epoch's data with EOFException.
            pass
        finally:
            train_pyreader.reset()
        # NOTE(review): the pasted source lost its indentation; this per-epoch
        # save is assumed to follow the try/finally inside the epoch loop.
        fluid.io.save_persistables(exe, save_prefix + str(epoch_id), main_program=train_prog)
compiler_prog = fluid.compiler.CompiledProgram(train_prog).with_data_parallel(loss_name=train_outputs[0].name)
这句话放在
fluid.io.load_vars(exe, args.resume, predicate=if_exist, main_program=train_prog)
后面试试?
还是同样的问题,如果我不load checkpoint是可以跑通的。
继续训练ckpt,用下 fluid.io.load_persistables 接口试试?
这样都load不进去参数的
fluid.io.load_persistables(exe, args.resume, main_program=train_prog)
Cannot open file checkpoints/VideoText_epoch4_steps40000/fc_1.w_0 for load op at [/paddle/paddle/fluid/operators/load_op.h:37]
是对的预训练模型么?跑下eval看下
我单卡跑着一点问题都没有
多卡训练过程中报错paddle.fluid.core_avx.EnforceNotMet: Invoke operator mul_grad error