PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (『飞桨』核心框架,深度学习&机器学习高性能单机、分布式训练和跨平台部署)
http://www.paddlepaddle.org/
Apache License 2.0
22.13k stars 5.55k forks source link

Cannot find fetch variable in scope #19114

Closed wzgwzg closed 1 year ago

wzgwzg commented 5 years ago

paddle 1.5.0, cuda 8.0, cudnn v7, k40 单卡训练

报错:Cannot find fetch variable in scope, fetch_var_name is tmp_12 at xxx fetch_op.cc:37

# NOTE(review): quoted from the bug report — the formatting was lost in the
# paste (the whole function body is collapsed onto one line) and part of the
# code is elided with “。。。。。。”, so this snippet is not runnable as shown.
# NOTE(review): total_cost here sums the un-reduced per-sample cost_x tensors
# instead of the avg_cost_x means computed just above — the author later
# confirms further down the thread that this wrong-variable use caused the
# cross_entropy rank-mismatch error.
部分代码: def net_config(image, label, model, args): model_list = [m for m in dir(models) if "__" not in m] assert args.model in model_list, "{} is not lists: {}".format(args.model, model_list) model_name = args.model if "Ft_Net" in model_name: x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc, x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc, x3_g_max_fc, x4_g_max_fc, x4_p_max_fc = model.net(input=image) cost_1, pred_1 = calc_loss(x3_g_pool_fc, label) avg_cost_1 = fluid.layers.mean(x=cost_1) 。。。。。。 total_cost = (cost_1 + cost_2 + cost_3 + cost_4 + cost_5 + cost_6 + cost_7 + cost_8 + cost_9) / 9.0 acc_1 = fluid.layers.accuracy(input=pred_1, label=label, k=1) acc_2 = fluid.layers.accuracy(input=pred_2, label=label, k=1) acc_3 = fluid.layers.accuracy(input=pred_3, label=label, k=1) acc_4 = fluid.layers.accuracy(input=pred_4, label=label, k=1) acc_5 = fluid.layers.accuracy(input=pred_5, label=label, k=1) acc_6 = fluid.layers.accuracy(input=pred_6, label=label, k=1) acc_7 = fluid.layers.accuracy(input=pred_7, label=label, k=1) acc_8 = fluid.layers.accuracy(input=pred_8, label=label, k=1) acc_9 = fluid.layers.accuracy(input=pred_9, label=label, k=1)

    return total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9

# NOTE(review): quoted from the bug report — the def line is collapsed, the
# body lost one indent level in the paste, code is elided at “。。。。。。”, and
# the snippet is truncated (the try: has no matching except). Not runnable
# as shown; documented in place only.
def train(args): model_name = args.model checkpoint = args.checkpoint pretrained_model = args.pretrained_model with_memory_optimization = args.with_mem_opt model_save_dir = args.model_save_dir num_instances = args.num_instances

startup_prog = fluid.Program()
train_prog = fluid.Program()

train_py_reader, total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr = \
    build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)

train_fetch_vars = [total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr]
train_fetch_list = []
# Fetch targets are marked persistable so that memory passes (e.g. the
# memory_optimize call below) do not reuse their storage before fetching.
for var in train_fetch_vars:
    var.persistable=True
    train_fetch_list.append(var.name)     

if with_memory_optimization:
    fluid.memory_optimize(train_prog)

place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
。。。。。。
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = args.with_inplace
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_num
# NOTE(review): num_trainers and device_num are not defined in the visible
# part of this snippet — presumably module-level globals; verify.
if num_trainers > 1 and args.use_gpu:
    dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
    exec_strategy.num_threads = 1

train_exe = fluid.ParallelExecutor(
    main_program=train_prog,
    use_cuda=bool(args.use_gpu),
    loss_name=total_cost.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

for pass_id in range(args.num_epochs):
    train_py_reader.start()
    train_info = [[], [], [], [], [], [], [], [], [], []]
    train_time = []
    batch_id = 0
    try:
        while True:
            t1 = time.time()
            # NOTE(review): exe.run() is called with no program argument, so
            # it executes fluid.default_main_program() — NOT train_prog where
            # the fetch variables live — and the ParallelExecutor train_exe
            # built above is never used. This is the likely cause of the
            # reported "Cannot find fetch variable in scope" error; it should
            # be train_exe.run(fetch_list=train_fetch_list) or
            # exe.run(program=train_prog, fetch_list=train_fetch_list).
            total_loss, tmp_acc_1, tmp_acc_2, tmp_acc_3, tmp_acc_4, tmp_acc_5, tmp_acc_6, tmp_acc_7, tmp_acc_8, tmp_acc_9, lr = exe.run(
                fetch_list=train_fetch_list)
wopeizl commented 5 years ago

看起来是部分fetch的值不在网络中,可以检查一下total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr这些项

wzgwzg commented 5 years ago

我的代码基本仿照PaddleCv/ImageClassification中的写法

def calc_loss(logit, label, class_dim=751, use_label_smoothing=True, epsilon=0.1):
    """Compute the (optionally label-smoothed) cross-entropy loss.

    Args:
        logit: pre-softmax FC output of one classification head.
        label: integer class labels.
        class_dim: number of classes, used for the one-hot expansion.
        use_label_smoothing: smooth the one-hot target when True.
        epsilon: label-smoothing factor.

    Returns:
        (loss, softmax_out) — the per-sample loss tensor (deliberately NOT
        reduced here; the caller applies fluid.layers.mean) and the softmax
        probabilities used for accuracy.
    """
    softmax_out = fluid.layers.softmax(logit)
    if not use_label_smoothing:
        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
    else:
        label_one_hot = fluid.layers.one_hot(input=label, depth=class_dim)
        smooth_label = fluid.layers.label_smooth(
            label=label_one_hot, epsilon=epsilon, dtype="float32")
        loss = fluid.layers.cross_entropy(
            input=softmax_out, label=smooth_label, soft_label=True)
    #loss = fluid.layers.reduce_mean(loss)
    return loss, softmax_out

会报如下错误

Traceback (most recent call last): File "train.py", line 366, in main() File "train.py", line 362, in main train(args) File "train.py", line 227, in train build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args) File "train.py", line 199, in build_program optimizer.minimize(total_cost) File "</home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/decorator.pyc:decorator-gen-20>", line 2, in minimize File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in impl return wrapped_func(*args, **kwargs) File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/dygraph/base.py", line 87, in impl return func(*args, **kwargs) File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 594, in minimize no_grad_set=no_grad_set) File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/optimizer.py", line 493, in backward no_grad_set, callbacks) File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/backward.py", line 578, in append_backward _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) File "/home/vis/wangjian33/env2/python_build/lib/python2.7/site-packages/paddle/fluid/backward.py", line 392, in _append_backward_vars_ op_desc.infer_shape(block.desc) paddle.fluid.core_avx.EnforceNotMet: Enforce failed. Expected dy_dims.size() == rank, but received dy_dims.size():1 != rank:2. Input(Y@Grad) and Input(X) should have the same rank. at [/home/vis/wangjian33/code/Paddle/paddle/fluid/operators/cross_entropy_op.cc:96]

如果在返回loss之前先做reduce_mean: loss = fluid.layers.reduce_mean(loss) 则上述错误能避免,但是会报Cannot find fetch variable in scope错误

wopeizl commented 5 years ago

看起来是shape没对上,用一下label=smooth_label.reshape(label.shape) 看看

wzgwzg commented 5 years ago

之前的shape问题解决了,是计算总loss的时候分支loss的变量名用错了。但是Cannot find fetch variable in scope的问题依然存在,而且应该就是loss处的问题,fetch_list中只写loss,也还是有错。 现在关键代码如下:

def net_config(image, label, model, args):
    """Build the Ft_Net multi-head objective.

    Runs the backbone to obtain nine FC heads, attaches a
    softmax_with_cross_entropy loss and a top-1 accuracy to each head, and
    averages the nine branch losses into one total cost.

    Returns:
        (total_cost, acc_1, ..., acc_9) for Ft_Net variants; falls through
        (implicitly returns None) for any other model name, matching the
        original control flow.
    """
    model_list = [m for m in dir(models) if "__" not in m]
    assert args.model in model_list, "{} is not lists: {}".format(args.model, model_list)
    model_name = args.model

    if "Ft_Net" in model_name:
        # Keep the explicit 9-way unpack so a backbone that does not return
        # exactly nine heads still fails loudly here.
        (x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc,
         x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc,
         x3_g_max_fc, x4_g_max_fc, x4_p_max_fc) = model.net(input=image)
        heads = [x3_g_pool_fc, x4_g_pool_fc, x4_p_pool_fc,
                 x3_g_avg_fc, x4_g_avg_fc, x4_p_avg_fc,
                 x3_g_max_fc, x4_g_max_fc, x4_p_max_fc]

        costs = []
        preds = []
        for head in heads:
            cost, pred = fluid.layers.softmax_with_cross_entropy(
                head, label, return_softmax=True)
            costs.append(cost)
            preds.append(pred)

        # Reduce each per-sample branch loss to a scalar, then average the
        # nine branches (left-to-right sum, then /= 9, as in the original).
        avg_costs = [fluid.layers.mean(x=cost) for cost in costs]
        total_cost = avg_costs[0]
        for branch_cost in avg_costs[1:]:
            total_cost = total_cost + branch_cost
        total_cost /= 9

        accuracies = [fluid.layers.accuracy(input=pred, label=label, k=1)
                      for pred in preds]
        return (total_cost,) + tuple(accuracies)

def build_program(is_train, main_prog, startup_prog, args):
    """Assemble the training program inside main_prog/startup_prog.

    Creates the py_reader input pipeline, builds the Ft_Net losses and
    accuracies via net_config, marks every fetch target persistable, and —
    when is_train — attaches the optimizer and learning rate.

    NOTE(review): reconstructed from a one-line paste; the exact nesting of
    the trailing `if is_train:` optimizer section under the two guards could
    not be read from the flattened text — confirm against the real file.
    """
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model, model_list)
    # NOTE(review): the paste reads "models.dict[model_name]" — almost
    # certainly "models.__dict__[model_name]" with the underscores eaten by
    # markdown (see the "__" filter in the comprehension above); confirm.
    model = models.__dict__[model_name](
        layers=args.layers,
        class_num=args.class_dim,
        num_bottleneck=args.num_features,
        is_train=True)

    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=64,
            shapes=[[-1] + image_shape, [-1, 1]],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, label = fluid.layers.read_file(py_reader)
            if "Ft_Net" in model_name:
                print('This is Ft_Net')
                (total_cost, acc_1, acc_2, acc_3, acc_4,
                 acc_5, acc_6, acc_7, acc_8, acc_9) = net_config(
                    image, label, model, args)
                # Fetch targets must be persistable so memory passes do not
                # reuse their storage before they are fetched.
                for fetch_var in (total_cost, acc_1, acc_2, acc_3, acc_4,
                                  acc_5, acc_6, acc_7, acc_8, acc_9):
                    fetch_var.persistable = True
            else:
                print('model error!')
            if is_train:
                params = {
                    "total_images": args.total_images,
                    "lr": args.lr,
                    "num_epochs": args.num_epochs,
                    "learning_strategy": {
                        "batch_size": args.batch_size,
                        "name": args.lr_strategy,
                    },
                    "l2_decay": args.l2_decay,
                    "momentum_rate": args.momentum_rate,
                }
                optimizer = optimizer_setting(params)
                optimizer.minimize(total_cost)
                global_lr = optimizer._global_learning_rate()
    if is_train:
        return (py_reader, total_cost, acc_1, acc_2, acc_3, acc_4,
                acc_5, acc_6, acc_7, acc_8, acc_9, global_lr)

# NOTE(review): quoted from the bug report — the def line is collapsed across
# two lines, the body lost one indent level in the paste, and the snippet is
# truncated (the try: has no matching except). Documented in place only.
def train(args): model_name = args.model checkpoint = args.checkpoint pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt model_save_dir = args.model_save_dir num_instances = args.num_instances

startup_prog = fluid.Program()
train_prog = fluid.Program()

train_py_reader, total_cost, acc_1, acc_2, acc_3, acc_4, acc_5, acc_6, acc_7, acc_8, acc_9, global_lr = \
    build_program(is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args)

# memory_optimize may reuse variable storage; fetch targets were marked
# persistable inside build_program, which should protect them from reuse.
if with_memory_optimization:
    fluid.memory_optimize(train_prog)

place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)

if checkpoint is not None:
    print('load from checkpoint')
    fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

if pretrained_model and checkpoint is None:
    print('load from pretrained_model')
    # Predicate for load_vars: skip the final FC weights ("fc_0") and any
    # variable without a matching file in the pretrained directory.
    def if_exist(var):
        #print(var.name)
        #print(os.path.exists(os.path.join(pretrained_model, var.name)))
        if "fc_0" in var.name:
            return False
        return os.path.exists(os.path.join(pretrained_model, var.name))

    fluid.io.load_vars(
        exe, pretrained_model, main_program=train_prog, predicate=if_exist)

visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
if visible_device:
    device_num = len(visible_device.split(','))
else:
    device_num = subprocess.check_output(
        ['nvidia-smi', '-L']).decode().count('\n')

# NOTE(review): the traceback shows Python 2.7, where / is integer division;
# on Python 3 this would yield a float batch size — use // there.
train_batch_size = args.batch_size / device_num
settings = {'train_batch_size':train_batch_size, 'samples_each_class':num_instances,
                    'shuffle':False, 'mode':'train'}
train_reader = paddle.batch(
    reader.train(settings), batch_size=train_batch_size, drop_last=True)

train_py_reader.decorate_paddle_reader(train_reader)

build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = args.with_inplace
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_num
# NOTE(review): num_trainers is not defined in the visible part of this
# snippet — presumably a module-level global; verify.
if num_trainers > 1 and args.use_gpu:
    dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
    exec_strategy.num_threads = 1

train_exe = fluid.ParallelExecutor(
    main_program=train_prog,
    use_cuda=bool(args.use_gpu),
    loss_name=total_cost.name,
    build_strategy=build_strategy,
    exec_strategy=exec_strategy)

train_fetch_list = [total_cost.name]
for pass_id in range(args.num_epochs):
    train_py_reader.start()
    train_info = [[], [], [], [], [], [], [], [], [], []]
    train_time = []
    batch_id = 0
    try:
        while True:
            t1 = time.time()
            # NOTE(review): exe.run() has no program argument, so it executes
            # fluid.default_main_program() — NOT train_prog where total_cost
            # lives — and the ParallelExecutor train_exe built above is never
            # used. This is the likely root cause of the reported
            # "Cannot find fetch variable in scope" error; it should be
            # train_exe.run(fetch_list=train_fetch_list) or
            # exe.run(program=train_prog, fetch_list=train_fetch_list).
            total_loss = exe.run(
            #total_loss, tmp_acc_1, tmp_acc_2, tmp_acc_3, tmp_acc_4, tmp_acc_5, tmp_acc_6, tmp_acc_7, tmp_acc_8, tmp_acc_9, lr = exe.run(
                fetch_list=train_fetch_list)