PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (the core framework of PaddlePaddle (飞桨): high-performance single-machine and distributed training and cross-platform deployment for deep learning & machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

[ValueError] function decorated by @declarative ... #36621

Closed LoveNingBo closed 2 years ago

LoveNingBo commented 3 years ago

Error message

Traceback (most recent call last):
  File "model.py", line 464, in <module>
    output = model(a_t,b_t,mask_a_t,mask_b_t)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/layers.py", line 891, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 357, in __call__
    raise e
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 335, in __call__
    *args, **kwargs)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 402, in get_concrete_program
    concrete_program, partial_program_layer = self._program_cache[cache_key]
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 711, in __getitem__
    self._caches[item] = self._build_once(item)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 703, in _build_once
    return concrete_program, partial_program_from(concrete_program)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 408, in partial_program_from
    concrete_program.parameters)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 135, in __init__
    self._origin_main_program = self._verify_program(main_program)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 165, in _verify_program
    self._check_params_all_inited(main_program)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 382, in _check_params_all_inited
    % name)
ValueError: 
        We don't support to define layer with parameters in the function decorated by `@declarative`.
        Because that will re-defined parameters every time when you run the function.
        But we found parameter(conv1d_1.b_0) was created in the decorated function.
        Please define the layer with parameters in `__init__` function.

Environment

paddle: 2.0.0

Source code

'''
@Author: your name
@Date: 2021-10-20 16:35:31
@LastEditTime: 2021-10-21 19:14:24
@LastEditors: Please set LastEditors
@Description: In User Settings Edit
@FilePath: /huiyanfei/work/re2_paddle_all/model.py
'''
import numpy
from typing import Collection
import math
import paddle
paddle.set_device("cpu")
import paddle.nn as nn
from paddle import fluid 
import paddle.nn.functional as f
from functools import partial 

def register(name=None, registry=None):
    def decorator(fn, registration_name=None):
        module_name = registration_name or _default_name(fn)
        if module_name in registry:
            raise LookupError(f"module {module_name} already registered.")
        registry[module_name] = fn
        return fn
    return lambda fn: decorator(fn, name)

def _default_name(obj_class):
    return obj_class.__name__

registry = {}
register = partial(register, registry=registry)

class Module(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.summary = {}

    def add_summary(self, name, val):
        if self.training:
            self.summary[name] = val.clone().numpy()

    def get_summary(self, base_name=''):
        summary = {}
        if base_name:
            base_name += '/'
        if self.summary:
            summary.update({base_name + name: val for name, val in self.summary.items()})
        for name, child in self.named_children():
            if hasattr(child, 'get_summary'):
                name = base_name + name
                summary.update(child.get_summary(name))
        return summary

class GeLU(paddle.nn.Layer):
    def forward(self, x):
        return 0.5 * x * (1. + paddle.tanh(x * 0.7978845608 * (1. + 0.044715 * x * x)))

class Linear(paddle.nn.Layer):
    def __init__(self, in_features, out_features, activations=False):
        super().__init__()
        weight_attr = paddle.framework.ParamAttr( 
            initializer=paddle.nn.initializer.Normal(mean=0.0, std=math.sqrt((2. if activations else 1.) / in_features)))
        bias_attr = paddle.framework.ParamAttr( 
            initializer=paddle.nn.initializer.Constant(value=0.0))
        linear = nn.Linear(in_features, out_features, weight_attr=weight_attr, bias_attr=bias_attr)
        # nn.init.normal_(linear.weight, std=math.sqrt((2. if activations else 1.) / in_features))
        # nn.init.zeros_(linear.bias)
        # linear.weight.normal_(0, math.sqrt((2. if activations else 1.) / in_features))
        # linear.bias.zero_()

        modules = [nn.utils.weight_norm(linear)]
        if activations:
            modules.append(GeLU())
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        return self.model(x)

class Conv1d(paddle.nn.Layer):
    def __init__(self, in_channels, out_channels, kernel_sizes: Collection[int]):
        super().__init__()
        assert all(k % 2 == 1 for k in kernel_sizes), 'only support odd kernel sizes'
        assert out_channels % len(kernel_sizes) == 0, 'out channels must be dividable by kernels'
        out_channels = out_channels // len(kernel_sizes)
        convs = []
        for kernel_size in kernel_sizes:
            conv = nn.Conv1D(in_channels, out_channels, kernel_size,
                             padding=(kernel_size - 1) // 2)
            # nn.init.normal_(conv.weight, std=math.sqrt(2. / (in_channels * kernel_size)))
            # nn.init.zeros_(conv.bias)
            # n = kernel_size * out_channels
            # conv.weight.set_value(numpy.array(math.sqrt(2. / n)))
            # conv.bias.set_value(0)
            convs.append(nn.Sequential(nn.utils.weight_norm(conv), GeLU()))
        self.model = convs

    def forward(self, x):
        return paddle.concat([encoder(x) for encoder in self.model], axis=-1)

# @register('identity')
class Alignment(Module):
    def __init__(self, args, __):
        super().__init__()
        # self.temperature = nn.Parameter(paddle.tensor(1 / math.sqrt(args.hidden_size)))
        x = paddle.to_tensor(1 / math.sqrt(args.hidden_size))
        self.temperature = paddle.create_parameter(shape=x.shape,
                        dtype=str(x.numpy().dtype),
                        default_initializer=paddle.nn.initializer.Assign(x))

    def _attention(self, a, b):

        # return paddle.matmul(a, b.transpose(1, 2)) * self.temperature
        b = paddle.transpose(b, perm=[0, 2, 1])
        return paddle.matmul(a, b) * self.temperature

    def forward(self, a, b, mask_a, mask_b):
        attn = self._attention(a, b)

        # mask = paddle.matmul(mask_a.float(), mask_b.transpose(1, 2).float()).byte()
        mask_b = paddle.transpose(mask_b.astype("float32"), perm=[0, 2, 1])
        mask = paddle.matmul(mask_a.astype("float32"), mask_b)#.astype("bool")
        # attn.masked_fill_(~mask, -1e7)
        mask_ = paddle.full_like(mask, -1e7)
        attn = paddle.where(mask>0, attn, mask_)
        attn_a = f.softmax(attn, axis=1)
        attn_b = f.softmax(attn, axis=2)

        # feature_b = paddle.matmul(attn_a.transpose(1, 2), a)
        attn_a_trans = paddle.transpose(attn_a, perm=[0, 2, 1])
        feature_b = paddle.matmul(attn_a_trans, a)
        feature_a = paddle.matmul(attn_b, b)
        self.add_summary('temperature', self.temperature)
        self.add_summary('attention_a', attn_a)
        self.add_summary('attention_b', attn_b)
        return feature_a, feature_b

# @register('linear')
# class MappedAlignment(Alignment):
#     def __init__(self, args, input_size):
#         super().__init__(args, input_size)
#         self.projection = nn.Sequential(
#             nn.Dropout(args.dropout),
#             Linear(input_size, args.hidden_size, activations=True),
#         )

#     def _attention(self, a, b):
#         a = self.projection(a)
#         b = self.projection(b)
#         return super()._attention(a, b)

# @register('none')
# class NullConnection(paddle.nn.Layer):
#     def forward(self, x, _, __):
#         return x

# @register('residual')
# class Residual(paddle.nn.Layer):
#     def __init__(self, args):
#         super().__init__()
#         self.linear = Linear(args.embedding_dim, args.hidden_size)

#     def forward(self, x, res, i):
#         if i == 1:
#             res = self.linear(res)
#         return (x + res) * math.sqrt(0.5)

# @register('aug')
class AugmentedResidual(paddle.nn.Layer):
    def forward(self, x, res, i):
        if i == 1:
            return paddle.concat([x, res], axis=-1)  # res is embedding
        hidden_size = x.size(-1)
        x = (res[:, :, :hidden_size] + x) * math.sqrt(0.5)
        return paddle.concat([x, res[:, :, hidden_size:]], axis=-1)  # latter half of res is embedding

class Embedding(paddle.nn.Layer):
    def __init__(self, args):
        super().__init__()
        self.fix_embeddings = args.fix_embeddings
        self.embedding = nn.Embedding(args.num_vocab, args.embedding_dim, padding_idx=0)
        self.dropout = args.dropout

    def set_(self, value):
        # self.embedding.weight.requires_grad = not self.fix_embeddings
        pretrained_attr = paddle.ParamAttr(name='embedding',
                                   initializer=paddle.nn.initializer.Assign(value),
                                   trainable=not self.fix_embeddings)
        self.embedding = paddle.nn.Embedding(num_embeddings=len(value),
                                      embedding_dim=300,
                                      padding_idx= 0 ,
                                      weight_attr=pretrained_attr)
        # self.embedding.load_state_dict({'weight': paddle.tensor(value)})

    def forward(self, x):
        x = self.embedding(x)
        x = f.dropout(x, self.dropout, self.training)
        return x

class Encoder(paddle.nn.Layer):
    def __init__(self, args, input_size):
        super().__init__()
        self.dropout = args.dropout
        self.encoders = nn.LayerList([Conv1d(
                in_channels=input_size if i == 0 else args.hidden_size,
                out_channels=args.hidden_size,
                kernel_sizes=args.kernel_sizes) for i in range(args.enc_layers)])

    def forward(self, x, mask):
        # x = x.transpose(1, 2)  # B x C x L
        x = paddle.transpose(x, perm=[0, 2, 1])
        # mask = mask.transpose(1, 2)
        mask = paddle.transpose(mask.astype("float32"), perm=[0, 2, 1])#.astype("bool")
        for i, encoder in enumerate(self.encoders):
            # x.masked_fill_(~mask, 0.)
            mask_ = paddle.full_like(mask, 0.)
            x = paddle.where(mask>0, x, mask_) 
            if i > 0:
                x = f.dropout(x, self.dropout, self.training)
            x = encoder(x)
        x = f.dropout(x, self.dropout, self.training)
        # return x.transpose(1, 2)  # B x L x C
        return paddle.transpose(x, perm=[0, 2, 1])

# @register('simple')
# class Fusion(paddle.nn.Layer):
#     def __init__(self, args, input_size):
#         super().__init__()
#         self.fusion = Linear(input_size * 2, args.hidden_size, activations=True)

#     def forward(self, x, align):
#         return self.fusion(paddle.concat([x, align], axis=-1))

# @register('full_fusion')
class FullFusion(paddle.nn.Layer):
    def __init__(self, args, input_size):
        super().__init__()
        self.dropout = args.dropout
        self.fusion1 = Linear(input_size * 2, args.hidden_size, activations=True)
        self.fusion2 = Linear(input_size * 2, args.hidden_size, activations=True)
        self.fusion3 = Linear(input_size * 2, args.hidden_size, activations=True)
        self.fusion = Linear(args.hidden_size * 3, args.hidden_size, activations=True)

    def forward(self, x, align):
        x1 = self.fusion1(paddle.concat([x, align], axis=-1))
        x2 = self.fusion2(paddle.concat([x, x - align], axis=-1))
        x3 = self.fusion3(paddle.concat([x, x * align], axis=-1))
        x = paddle.concat([x1, x2, x3], axis=-1)
        x = f.dropout(x, self.dropout, self.training)
        return self.fusion(x)

class Pooling(paddle.nn.Layer):
    def forward(self, x, mask):
        # print(x.shape,mask.shape)
        # return x.masked_fill_(~mask, -float('inf')).max(dim=1)[0]
        mask = mask.astype("float32")
        # mask_ = paddle.full_like(x, 0.0)
        # mask_ += float('-inf')
        # return paddle.where(mask>0, x, mask_).max(axis=1)[0]
        # out = paddle.max(paddle.where(mask == 0, paddle.to_tensor(0.), x),axis=1)
        out = paddle.max(paddle.where(mask == paddle.zeros([1]), paddle.zeros([1]), x),axis=1)
        # print(out.shape)
        return out#[0]

# @register('simple')
class Prediction(paddle.nn.Layer):
    def __init__(self, args, inp_features=2):
        super().__init__()
        self.dense = nn.Sequential(
            nn.Dropout(args.dropout),
            Linear(args.hidden_size * inp_features, args.hidden_size, activations=True),
            nn.Dropout(args.dropout),
            Linear(args.hidden_size, args.num_classes),
        )

    def forward(self, a, b):
        return self.dense(paddle.concat([a, b], axis=-1))

# @register('full')
class AdvancedPrediction(Prediction):
    def __init__(self, args):
        super().__init__(args, inp_features=4)

    def forward(self, a, b):
        return self.dense(paddle.concat([a, b, a - b, a * b], axis=-1))

################################## Main model network #################################

class Network(Module):
    def __init__(self, args):
        super().__init__()
        # self.dropout = args.dropout
        self.embedding = Embedding(args)
        # self.blocks = [,
        # ) for i in range(args.blocks)]
        self.blocks = []
        for i in range(args.blocks):
            block = nn.Sequential(("encoder",Encoder(args, args.embedding_dim if i == 0 else args.embedding_dim + args.hidden_size)),
            ("alignment",Alignment(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)),
            ("fusion",FullFusion(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)))
            self.blocks.append(block)

        self.connection = AugmentedResidual()
        self.pooling = Pooling()
        self.prediction = AdvancedPrediction(args)

    def forward(self, a,b,mask_a,mask_b):
        # print(type(a),type(b))
        a = self.embedding(a)
        b = self.embedding(b)
        # print(type(a),type(b))
        res_a, res_b = a, b

        for i, block in enumerate(self.blocks):
            if i > 0:
                a = self.connection(a, res_a, i)
                b = self.connection(b, res_b, i)
                res_a, res_b = a, b 
            a_enc = block["encoder"](a, mask_a)
            b_enc = block["encoder"](b, mask_b)
            a = paddle.concat([a, a_enc], axis=-1)
            b = paddle.concat([b, b_enc], axis=-1)
            align_a, align_b = block["alignment"](a, b, mask_a, mask_b)
            a = block["fusion"](a, align_a)
            b = block["fusion"](b, align_b)
            # a,b = block(a,b,mask_a,mask_b)
            print(type(a),type(b))
        # a,b = self.blocks[0](a,b,mask_a,mask_b)
        a = self.pooling(a, mask_a)
        b = self.pooling(b, mask_b)
        print(type(a),type(b))
        return self.prediction(a, b)

############################## Loss function ##########################

def get_loss(logits, target, soft_target):
    softlabel = False
    def SoftCrossEntropy(inputs, target,soft_target,reduction='sum'):  
        if reduction == 'average': 
            log_likelihood = inputs[:,1] 
            loss = paddle.sum(paddle.mul(log_likelihood, target)) / batch
        else:
            # log_likelihood = inputs[:,1]   
            soft_loss =  5 * paddle.nn.functional.mse_loss(inputs, soft_target)
            # target = paddle.tensor(target, dtype=paddle.long).to(DEVICE)
            hard_loss =  f.cross_entropy(logits, target)
        return hard_loss + soft_loss 
    if softlabel: 
        return SoftCrossEntropy(logits,target,soft_target)
    else: 
        return f.cross_entropy(logits, target)

##############################################   Training   ##########################
args_dict = {'alignment': 'identity',
  'batch_size': 512,
  'beta1': 0.9,
  'beta2': 0.999,
  'blocks': 2,
  'connection': 'aug',
  'cuda': True,
  'data_dir': 'data/ddqa',
  'deterministic': True,
  'dropout': 0.2,
  'early_stopping': 1954,
  'embedding_dim': 300,
  'embedding_mode': 'freq',
  'enc_layers': 2,
  'encoder': 'cnn',
  'epochs': 30,
  'eval_epoch': True,
  'eval_file': 'dev',
  'eval_per_samples': 64000,
  'eval_per_samples_warmup': 40000,
  'eval_per_updates': 125,
  'eval_per_updates_warmup': 79,
  'eval_subset': None,
  'eval_warmup_samples': 0,
  'eval_warmup_steps': 0,
  'fix_embeddings': False,
  'fusion': 'full_fusion',
  'grad_clipping': 5,
  'hidden_size': 150,
  'kernel_sizes': [3],
  'log_file': 'log.txt',
  'log_per_samples': 32000,
  'log_per_updates': 63,
  'lower_case': True,
  'lr': 0.001,
  'lr_decay_rate': 1.0,
  'lr_decay_samples': 128000,
  'lr_decay_steps': 250,
  'lr_warmup_samples': 0,
  'lr_warmup_steps': 0,
  'max_len': 150,
  'max_loss': 999.0,
  'max_vocab': 999999,
  'metric': 'acc',
  'min_df': 5,
  'min_len': 1,
  'min_lr': 6e-05,
  'min_samples': 0,
  'min_steps': 0,
  'multi_gpu': False,
  'name': 'benchmark-0',
  'num_classes': 2,
  'num_vocab': 248827,
  'output_dir': 'models/ddqa_合并训练集_softlabel',
  'padding': 0,
  'prediction': 'full',
  'pretrained_embeddings': 'resources/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
  'resume': None,
  'save': True,
  'save_all': False,
  'seed': None,
  'sort_by_len': False,
  'summary_dir': 'models/ddqa_合并训练集_softlabel/benchmark-0',
  'summary_per_logs': 20,
  'summary_per_updates': 1260,
  'tensorboard': True,
  'tolerance_samples': 1000000,
  'watch_metrics': ['acc'],
  'weight_decay': 0
  }

Args = type("Args",(),{})
for  key,value in args_dict.items():
    setattr(Args,key,value)
args = Args()
model = Network(args)

optimizer = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate=args.lr, beta1=args.beta1,beta2= args.beta2,
                                    weight_decay=float(args.weight_decay))

a= [[200]*150]

b= [[100]*150]

mask_a = [[[1]]*len(a[0])]
mask_b = [[[1]]*len(b[0])]

a_t = paddle.to_tensor(a)
mask_a_t = paddle.to_tensor(mask_a)
b_t = paddle.to_tensor(b)
mask_b_t = paddle.to_tensor(mask_b)

output = model(a_t,b_t,mask_a_t,mask_b_t)
target = paddle.to_tensor([1],dtype="int64")
soft_target = paddle.to_tensor([0.84127417])
loss =  get_loss(output, target, soft_target)
loss.backward()
optimizer.step()

from paddle.static import InputSpec
from paddle.jit import to_static
net = to_static(model, input_spec=[InputSpec(shape=[None, 150],dtype = 'int32', name='a'),InputSpec(shape=[None,150],dtype= 'int32', name='b'),InputSpec(shape=[None,150,1],dtype= 'int32', name='mask_a'),InputSpec(shape=[None,150,1],dtype= 'int32', name='mask_b')])
print("\nto_static done!\n")
paddle.jit.save(net, 'inference_model_new/qanet')
paddle-bot-old[bot] commented 3 years ago

Hi! We've received your issue and please be patient while it gets answered. We will arrange technicians to answer your question as soon as possible. Please double-check that you have provided a clear problem description, reproduction code, environment & version info, and the error message. You may also check the official API docs, the FAQ, historical GitHub issues, and the AI community to find an answer. Have a nice day!

wenming2014 commented 3 years ago

Hi, layer parameters are not supported inside a function decorated with `@declarative`; please define them in the `__init__` function instead.
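
For context, a minimal sketch of the two patterns (illustrative class names, assuming only the public paddle.nn API; not code from this issue):

import paddle.nn as nn

# Pattern the error rejects: a sublayer (and therefore its parameters) is
# created inside forward(), so dygraph-to-static would re-create parameters
# on every call of the decorated function.
class BadBlock(nn.Layer):
    def forward(self, x):
        conv = nn.Conv1D(4, 4, 3, padding=1)  # parameters created at call time
        return conv(x)

# Recommended pattern: every sublayer that owns parameters is created once
# in __init__, and forward() only uses it.
class GoodBlock(nn.Layer):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1D(4, 4, 3, padding=1)

    def forward(self, x):
        return self.conv(x)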

LoveNingBo commented 3 years ago

Hi, layer parameters are not supported inside a function decorated with `@declarative`; please define them in the `__init__` function instead.

Hi, could you give an example of which modules are defined incorrectly? Would a module like GeLU, which only has a `forward` method and no `__init__` function, also trigger this error?

wenming2014 commented 3 years ago

Hi, layer parameters are not supported inside a function decorated with `@declarative`; please define them in the `__init__` function instead.

Hi, could you give an example of which modules are defined incorrectly? Would a module like GeLU, which only has a `forward` method and no `__init__` function, also trigger this error?

Yes, it is best for every module to have an `__init__` function that defines its parameters.

But we found parameter(conv1d_1.b_0) was created in the decorated function.

Judging from the error message, this should be where your error comes from.
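
In the first version of the posted code, conv1d_1.b_0 appears to be the bias of one of the nn.Conv1D layers that are only held in plain Python lists (Conv1d.model and Network.blocks). Sublayers kept in a plain list are not registered with their parent layer, so their parameters are only created when the traced function runs. A hedged sketch of how Conv1d could be restructured with paddle.nn.LayerList (illustrative adaptation, not the exact code from this thread):

import paddle
import paddle.nn as nn

class Conv1d(nn.Layer):
    def __init__(self, in_channels, out_channels, kernel_sizes):
        super().__init__()
        out_channels = out_channels // len(kernel_sizes)
        # nn.LayerList registers every conv as a sublayer, so its parameters
        # are created here in __init__ and are visible to model.parameters(),
        # to_static and paddle.jit.save.
        self.model = nn.LayerList([
            nn.Sequential(
                nn.Conv1D(in_channels, out_channels, k, padding=(k - 1) // 2),
                nn.GELU(),
            )
            for k in kernel_sizes
        ])

    def forward(self, x):
        return paddle.concat([conv(x) for conv in self.model], axis=-1)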

LoveNingBo commented 3 years ago

Hi, layer parameters are not supported inside a function decorated with `@declarative`; please define them in the `__init__` function instead.

Hi, could you give an example of which modules are defined incorrectly? Would a module like GeLU, which only has a `forward` method and no `__init__` function, also trigger this error?

Yes, it is best for every module to have an `__init__` function that defines its parameters.

But we found parameter(conv1d_1.b_0) was created in the decorated function.

Judging from the error message, this should be where your error comes from.

Hi, I rewrote everything following your suggestions, but I still get a similar error. Could you please take a look?

Error message

Traceback (most recent call last):
  File "model.py", line 486, in <module>
    paddle.jit.save(net, 'inference_model_new/qanet')
  File "<decorator-gen-60>", line 2, in save
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/wrapped_decorator.py", line 25, in __impl__
    return wrapped_func(*args, **kwargs)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/base.py", line 39, in __impl__
    return func(*args, **kwargs)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/jit.py", line 681, in save
    inner_input_spec)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 488, in concrete_program_specify_input_spec
    *desired_input_spec)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 402, in get_concrete_program
    concrete_program, partial_program_layer = self._program_cache[cache_key]
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 711, in __getitem__
    self._caches[item] = self._build_once(item)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/program_translator.py", line 703, in _build_once
    return concrete_program, partial_program_from(concrete_program)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 408, in partial_program_from
    concrete_program.parameters)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 135, in __init__
    self._origin_main_program = self._verify_program(main_program)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 165, in _verify_program
    self._check_params_all_inited(main_program)
  File "/mnt/huiyanfei/conda_env/hyf_py3/lib/python3.6/site-packages/paddle/fluid/dygraph/dygraph_to_static/partial_program.py", line 382, in _check_params_all_inited
    % name)
ValueError: 
        We don't support to define layer with parameters in the function decorated by `@declarative`.
        Because that will re-defined parameters every time when you run the function.
        But we found parameter(conv1d_1.w_0) was created in the decorated function.
        Please define the layer with parameters in `__init__` function. 

Source code

'''
@Author: your name
@Date: 2021-10-20 16:35:31
@LastEditTime: 2021-10-25 14:43:35
@LastEditors: Please set LastEditors
@Description: In User Settings Edit
@FilePath: /huiyanfei/work/re2_paddle_all/model.py
'''
import numpy
from typing import Collection
import math
import paddle
paddle.set_device("cpu")
import paddle.nn as nn
from paddle import fluid 
import paddle.nn.functional as f
from functools import partial 

def register(name=None, registry=None):
    def decorator(fn, registration_name=None):
        module_name = registration_name or _default_name(fn)
        if module_name in registry:
            raise LookupError(f"module {module_name} already registered.")
        registry[module_name] = fn
        return fn
    return lambda fn: decorator(fn, name)

def _default_name(obj_class):
    return obj_class.__name__

registry = {}
register = partial(register, registry=registry)

class Module(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.summary = {}

    def add_summary(self, name, val):
        if self.training:
            self.summary[name] = val.clone().numpy()

    def get_summary(self, base_name=''):
        summary = {}
        if base_name:
            base_name += '/'
        if self.summary:
            summary.update({base_name + name: val for name, val in self.summary.items()})
        for name, child in self.named_children():
            if hasattr(child, 'get_summary'):
                name = base_name + name
                summary.update(child.get_summary(name))
        return summary

class GeLU(paddle.nn.Layer):
    def __init__(self,):
        super().__init__()
    def forward(self, x):
        return 0.5 * x * (1. + paddle.tanh(x * 0.7978845608 * (1. + 0.044715 * x * x)))

class Linear(paddle.nn.Layer):
    def __init__(self, in_features, out_features, activations=False):
        super().__init__()
        weight_attr = paddle.framework.ParamAttr( 
            initializer=paddle.nn.initializer.Normal(mean=0.0, std=math.sqrt((2. if activations else 1.) / in_features)))
        bias_attr = paddle.framework.ParamAttr( 
            initializer=paddle.nn.initializer.Constant(value=0.0))
        linear = nn.Linear(in_features, out_features, weight_attr=weight_attr, bias_attr=bias_attr)
        # nn.init.normal_(linear.weight, std=math.sqrt((2. if activations else 1.) / in_features))
        # nn.init.zeros_(linear.bias)
        # linear.weight.normal_(0, math.sqrt((2. if activations else 1.) / in_features))
        # linear.bias.zero_()

        modules = [nn.utils.weight_norm(linear)]
        if activations:
            modules.append(GeLU())
        self.model = nn.Sequential(*modules)

    def forward(self, x):
        return self.model(x)

class Conv1d(paddle.nn.Layer):
    def __init__(self, in_channels, out_channels, kernel_sizes: Collection[int]):
        super(Conv1d,self).__init__()
        print(in_channels, out_channels, kernel_sizes)
        assert all(k % 2 == 1 for k in kernel_sizes), 'only support odd kernel sizes'
        assert out_channels % len(kernel_sizes) == 0, 'out channels must be dividable by kernels'
        out_channels = out_channels // len(kernel_sizes)
        # self.conv = nn.Sequential(nn.Conv1D(in_channels, out_channels,kernel_sizes[0],padding = (kernel_sizes[0] - 1) // 2),GeLU())
        self.conv = paddle.nn.Conv1D(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_sizes[0],padding = (kernel_sizes[0] - 1) // 2)
        # self.convs = []
        # self.weight_attr = paddle.ParamAttr(learning_rate=0.5,
        #                        regularizer=paddle.regularizer.L2Decay(1.0),
        #                        trainable=True)
        # for kernel_size in kernel_sizes: 
        #     self.conv = nn.Conv1D(in_channels, out_channels, kernel_size,
        #                      padding=(kernel_size - 1) // 2,weight_attr=self.weight_attr)
        self.gelu = GeLU() 
        #     self.convs.append(nn.Sequential(self.conv,self.gelu))

    def forward(self, x):
        return self.gelu(self.conv(x))
        # return paddle.concat([encoder(x) for encoder in self.convs], axis=-1)

# @register('identity')
class Alignment(Module):
    def __init__(self, args, __):
        super().__init__()
        # self.temperature = nn.Parameter(paddle.tensor(1 / math.sqrt(args.hidden_size)))
        x = paddle.to_tensor(1 / math.sqrt(args.hidden_size))
        self.temperature = paddle.create_parameter(shape=x.shape,
                        dtype=str(x.numpy().dtype),
                        default_initializer=paddle.nn.initializer.Assign(x))

    def _attention(self, a, b):

        # return paddle.matmul(a, b.transpose(1, 2)) * self.temperature
        b = paddle.transpose(b, perm=[0, 2, 1])
        return paddle.matmul(a, b) * self.temperature

    def forward(self, a, b, mask_a, mask_b):
        attn = self._attention(a, b)

        # mask = paddle.matmul(mask_a.float(), mask_b.transpose(1, 2).float()).byte()
        mask_b = paddle.transpose(mask_b.astype("float32"), perm=[0, 2, 1])
        mask = paddle.matmul(mask_a.astype("float32"), mask_b)#.astype("bool")
        # attn.masked_fill_(~mask, -1e7)
        mask_ = paddle.full_like(mask, -1e7)
        attn = paddle.where(mask>0, attn, mask_)
        attn_a = f.softmax(attn, axis=1)
        attn_b = f.softmax(attn, axis=2)

        # feature_b = paddle.matmul(attn_a.transpose(1, 2), a)
        attn_a_trans = paddle.transpose(attn_a, perm=[0, 2, 1])
        feature_b = paddle.matmul(attn_a_trans, a)
        feature_a = paddle.matmul(attn_b, b)
        self.add_summary('temperature', self.temperature)
        self.add_summary('attention_a', attn_a)
        self.add_summary('attention_b', attn_b)
        return feature_a, feature_b

# @register('linear')
# class MappedAlignment(Alignment):
#     def __init__(self, args, input_size):
#         super().__init__(args, input_size)
#         self.projection = nn.Sequential(
#             nn.Dropout(args.dropout),
#             Linear(input_size, args.hidden_size, activations=True),
#         )

#     def _attention(self, a, b):
#         a = self.projection(a)
#         b = self.projection(b)
#         return super()._attention(a, b)

# @register('none')
# class NullConnection(paddle.nn.Layer):
#     def forward(self, x, _, __):
#         return x

# @register('residual')
# class Residual(paddle.nn.Layer):
#     def __init__(self, args):
#         super().__init__()
#         self.linear = Linear(args.embedding_dim, args.hidden_size)

#     def forward(self, x, res, i):
#         if i == 1:
#             res = self.linear(res)
#         return (x + res) * math.sqrt(0.5)

# @register('aug')
class AugmentedResidual(paddle.nn.Layer):
    def __init__(self,):
        super().__init__()
    def forward(self, x, res, i):
        if i == 1:
            return paddle.concat([x, res], axis=-1)  # res is embedding
        hidden_size = x.size(-1)
        x = (res[:, :, :hidden_size] + x) * math.sqrt(0.5)
        return paddle.concat([x, res[:, :, hidden_size:]], axis=-1)  # latter half of res is embedding

class Embedding(paddle.nn.Layer):
    def __init__(self, args):
        super().__init__()
        self.fix_embeddings = args.fix_embeddings
        self.embedding = nn.Embedding(args.num_vocab, args.embedding_dim, padding_idx=0)
        self.dropout = args.dropout

    def set_(self, value):
        # self.embedding.weight.requires_grad = not self.fix_embeddings
        pretrained_attr = paddle.ParamAttr(name='embedding',
                                   initializer=paddle.nn.initializer.Assign(value),
                                   trainable=not self.fix_embeddings)
        self.embedding = paddle.nn.Embedding(num_embeddings=len(value),
                                      embedding_dim=300,
                                      padding_idx= 0 ,
                                      weight_attr=pretrained_attr)
        # self.embedding.load_state_dict({'weight': paddle.tensor(value)})

    def forward(self, x):
        x = self.embedding(x)
        x = f.dropout(x, self.dropout, self.training)
        return x

class Encoder(paddle.nn.Layer):
    def __init__(self, args, input_size):
        super().__init__()
        self.dropout = args.dropout
        print(args.enc_layers)
        self.conv1 = Conv1d( in_channels=input_size,
                out_channels=args.hidden_size,
                kernel_sizes=args.kernel_sizes)
        self.conv2 = Conv1d( in_channels=args.hidden_size,
                out_channels=args.hidden_size,
                kernel_sizes=args.kernel_sizes)
        self.encoders = nn.LayerList([self.conv1,self.conv2])

    def forward(self, x, mask):
        # x = x.transpose(1, 2)  # B x C x L
        x = paddle.transpose(x, perm=[0, 2, 1])
        # mask = mask.transpose(1, 2)
        mask = paddle.transpose(mask.astype("float32"), perm=[0, 2, 1])#.astype("bool")
        for i, encoder in enumerate(self.encoders):
            # x.masked_fill_(~mask, 0.)
            mask_ = paddle.full_like(mask, 0.)
            x = paddle.where(mask>0, x, mask_) 
            if i > 0:
                x = f.dropout(x, self.dropout, self.training)
            x = encoder(x)
        x = f.dropout(x, self.dropout, self.training)
        # return x.transpose(1, 2)  # B x L x C
        return paddle.transpose(x, perm=[0, 2, 1])

# @register('simple')
# class Fusion(paddle.nn.Layer):
#     def __init__(self, args, input_size):
#         super().__init__()
#         self.fusion = Linear(input_size * 2, args.hidden_size, activations=True)

#     def forward(self, x, align):
#         return self.fusion(paddle.concat([x, align], axis=-1))

# @register('full_fusion')
class FullFusion(paddle.nn.Layer):
    def __init__(self, args, input_size):
        super().__init__()
        self.dropout = args.dropout
        self.fusion1 = Linear(input_size * 2, args.hidden_size, activations=True)
        self.fusion2 = Linear(input_size * 2, args.hidden_size, activations=True)
        self.fusion3 = Linear(input_size * 2, args.hidden_size, activations=True)
        self.fusion = Linear(args.hidden_size * 3, args.hidden_size, activations=True)

    def forward(self, x, align):
        x1 = self.fusion1(paddle.concat([x, align], axis=-1))
        x2 = self.fusion2(paddle.concat([x, x - align], axis=-1))
        x3 = self.fusion3(paddle.concat([x, x * align], axis=-1))
        x = paddle.concat([x1, x2, x3], axis=-1)
        x = f.dropout(x, self.dropout, self.training)
        return self.fusion(x)

class Pooling(paddle.nn.Layer):
    def forward(self, x, mask):
        # print(x.shape,mask.shape)
        # return x.masked_fill_(~mask, -float('inf')).max(dim=1)[0]
        mask = mask.astype("float32")
        # mask_ = paddle.full_like(x, 0.0)
        # mask_ += float('-inf')
        # return paddle.where(mask>0, x, mask_).max(axis=1)[0]
        # out = paddle.max(paddle.where(mask == 0, paddle.to_tensor(0.), x),axis=1)
        out = paddle.max(paddle.where(mask == paddle.zeros([1]), paddle.zeros([1]), x),axis=1)
        # print(out.shape)
        return out#[0]

# @register('simple')
class Prediction(paddle.nn.Layer):
    def __init__(self, args, inp_features=2):
        super().__init__()
        self.dense = nn.Sequential(
            nn.Dropout(args.dropout),
            Linear(args.hidden_size * inp_features, args.hidden_size, activations=True),
            nn.Dropout(args.dropout),
            Linear(args.hidden_size, args.num_classes),
        )

    def forward(self, a, b):
        return self.dense(paddle.concat([a, b], axis=-1))

# @register('full')
class AdvancedPrediction(Prediction):
    def __init__(self, args):
        super().__init__(args, inp_features=4)

    def forward(self, a, b):
        return self.dense(paddle.concat([a, b, a - b, a * b], axis=-1))

################################## Main model network #################################

class Network(Module):
    def __init__(self, args):
        super().__init__()
        # self.dropout = args.dropout
        self.embedding = Embedding(args)
        # self.blocks = [,
        # ) for i in range(args.blocks)]
        self.blocks = []
        for i in range(args.blocks):
            block = nn.Sequential(("encoder",Encoder(args, args.embedding_dim if i == 0 else args.embedding_dim + args.hidden_size)),
            ("alignment",Alignment(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)),
            ("fusion",FullFusion(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)))
            self.blocks.append(block)

        self.connection = AugmentedResidual()
        self.pooling = Pooling()
        self.prediction = AdvancedPrediction(args)

    def forward(self, a,b,mask_a,mask_b):
        # print(type(a),type(b))
        a = self.embedding(a)
        b = self.embedding(b)
        # print(type(a),type(b))
        res_a, res_b = a, b

        for i, block in enumerate(self.blocks):
            if i > 0:
                a = self.connection(a, res_a, i)
                b = self.connection(b, res_b, i)
                res_a, res_b = a, b 
            a_enc = block["encoder"](a, mask_a)
            b_enc = block["encoder"](b, mask_b)
            a = paddle.concat([a, a_enc], axis=-1)
            b = paddle.concat([b, b_enc], axis=-1)
            align_a, align_b = block["alignment"](a, b, mask_a, mask_b)
            a = block["fusion"](a, align_a)
            b = block["fusion"](b, align_b)
            # a,b = block(a,b,mask_a,mask_b)
            print(type(a),type(b))
        # a,b = self.blocks[0](a,b,mask_a,mask_b)
        a = self.pooling(a, mask_a)
        b = self.pooling(b, mask_b)
        print(type(a),type(b))
        return self.prediction(a, b)

############################## Loss function ##########################

def get_loss(logits, target, soft_target):
    softlabel = False
    def SoftCrossEntropy(inputs, target,soft_target,reduction='sum'):  
        if reduction == 'average': 
            log_likelihood = inputs[:,1] 
            loss = paddle.sum(paddle.mul(log_likelihood, target)) / batch
        else:
            # log_likelihood = inputs[:,1]   
            soft_loss =  5 * paddle.nn.functional.mse_loss(inputs, soft_target)
            # target = paddle.tensor(target, dtype=paddle.long).to(DEVICE)
            hard_loss =  f.cross_entropy(logits, target)
        return hard_loss + soft_loss 
    if softlabel: 
        return SoftCrossEntropy(logits,target,soft_target)
    else: 
        return f.cross_entropy(logits, target)

##############################################   Training   ##########################
args_dict = {'alignment': 'identity',
  'batch_size': 512,
  'beta1': 0.9,
  'beta2': 0.999,
  'blocks': 2,
  'connection': 'aug',
  'cuda': True,
  'data_dir': 'data/ddqa',
  'deterministic': True,
  'dropout': 0.2,
  'early_stopping': 1954,
  'embedding_dim': 300,
  'embedding_mode': 'freq',
  'enc_layers': 2,
  'encoder': 'cnn',
  'epochs': 30,
  'eval_epoch': True,
  'eval_file': 'dev',
  'eval_per_samples': 64000,
  'eval_per_samples_warmup': 40000,
  'eval_per_updates': 125,
  'eval_per_updates_warmup': 79,
  'eval_subset': None,
  'eval_warmup_samples': 0,
  'eval_warmup_steps': 0,
  'fix_embeddings': False,
  'fusion': 'full_fusion',
  'grad_clipping': 5,
  'hidden_size': 150,
  'kernel_sizes': [3],
  'log_file': 'log.txt',
  'log_per_samples': 32000,
  'log_per_updates': 63,
  'lower_case': True,
  'lr': 0.001,
  'lr_decay_rate': 1.0,
  'lr_decay_samples': 128000,
  'lr_decay_steps': 250,
  'lr_warmup_samples': 0,
  'lr_warmup_steps': 0,
  'max_len': 150,
  'max_loss': 999.0,
  'max_vocab': 999999,
  'metric': 'acc',
  'min_df': 5,
  'min_len': 1,
  'min_lr': 6e-05,
  'min_samples': 0,
  'min_steps': 0,
  'multi_gpu': False,
  'name': 'benchmark-0',
  'num_classes': 2,
  'num_vocab': 248827,
  'output_dir': 'models/ddqa_合并训练集_softlabel',
  'padding': 0,
  'prediction': 'full',
  'pretrained_embeddings': 'resources/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5',
  'resume': None,
  'save': True,
  'save_all': False,
  'seed': None,
  'sort_by_len': False,
  'summary_dir': 'models/ddqa_合并训练集_softlabel/benchmark-0',
  'summary_per_logs': 20,
  'summary_per_updates': 1260,
  'tensorboard': True,
  'tolerance_samples': 1000000,
  'watch_metrics': ['acc'],
  'weight_decay': 0
  }

Args = type("Args",(),{})
for  key,value in args_dict.items():
    setattr(Args,key,value)
args = Args()
model = Network(args)

optimizer = paddle.optimizer.Adam(parameters = model.parameters(), learning_rate=args.lr, beta1=args.beta1,beta2= args.beta2,
                                    weight_decay=float(args.weight_decay))

a= [[200]*150]

b= [[100]*150]

mask_a = [[[1]]*len(a[0])]
mask_b = [[[1]]*len(b[0])]

a_t = paddle.to_tensor(a)
mask_a_t = paddle.to_tensor(mask_a)
b_t = paddle.to_tensor(b)
mask_b_t = paddle.to_tensor(mask_b)

output = model(a_t,b_t,mask_a_t,mask_b_t)
target = paddle.to_tensor([1],dtype="int64")
soft_target = paddle.to_tensor([0.84127417])
loss =  get_loss(output, target, soft_target)
loss.backward()
optimizer.step()

from paddle.static import InputSpec
from paddle.jit import to_static
net = to_static(model, input_spec=[InputSpec(shape=[None, 150],dtype = 'int32', name='a'),InputSpec(shape=[None,150],dtype= 'int32', name='b'),InputSpec(shape=[None,150,1],dtype= 'int32', name='mask_a'),InputSpec(shape=[None,150,1],dtype= 'int32', name='mask_b')])
print("\nto_static done!\n")
paddle.jit.save(net, 'inference_model_new/qanet')
LoveNingBo commented 2 years ago

Thanks to the Paddle team for the help. The error was mainly caused by the non-standard way the network was assembled. The fix is to replace:

        self.blocks = []
        for i in range(args.blocks):
            block = nn.Sequential(("encoder",Encoder(args, args.embedding_dim if i == 0 else args.embedding_dim + args.hidden_size)),
            ("alignment",Alignment(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)),
            ("fusion",FullFusion(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)))
            self.blocks.append(block)

with:

        blocks = []
        for i in range(args.blocks):
            block = nn.Sequential(("encoder",Encoder(args, args.embedding_dim if i == 0 else args.embedding_dim + args.hidden_size)),
            ("alignment",Alignment(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)),
            ("fusion",FullFusion(args, args.embedding_dim + args.hidden_size if i == 0 else args.embedding_dim + args.hidden_size * 2)))
            blocks.append(block)
        self.blocks = nn.LayerList(blocks)
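
For anyone hitting the same error: nn.LayerList (like nn.Sequential or assigning a layer directly to an attribute) registers each block as a sublayer, so all of its parameters are created during __init__ and show up in model.parameters(). A plain Python list keeps them invisible to the framework, so the to_static check cannot find them and reports them as created inside the decorated function. A quick, illustrative sanity check (assuming the Network/args from the code above):

model = Network(args)
# With nn.LayerList the encoder/alignment/fusion parameters are registered,
# so they are included here and can be found by paddle.jit.save.
print(len(model.parameters()))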