microsoft / torchscale

Foundation Architecture for (M)LLMs
https://aka.ms/GeneralAI
MIT License

AttributeError: 'EncoderDecoderConfig' object has no attribute 'normalize_output' #73

Closed Yuki2L0ve closed 9 months ago

Yuki2L0ve commented 9 months ago

I got two errors:

(1)

from torchscale.architecture.config import EncoderDecoderConfig
from torchscale.architecture.encoder_decoder import EncoderDecoder

config = EncoderDecoderConfig(vocab_size=64000)
encdec = EncoderDecoder(config)
print(encdec)

This raises:

Traceback (most recent call last):
AttributeError: 'EncoderDecoderConfig' object has no attribute 'normalize_output'

(2)

import torch
from torchscale.architecture.config import RetNetConfig
from torchscale.architecture.retnet import RetNetDecoder

config = RetNetConfig(vocab_size=64000)
retnet = RetNetDecoder(config)
print(retnet)

Here my IDE reports "Cannot find reference 'RetNetConfig' in 'config.py'", "Cannot find reference 'retnet' in '__init__.py'", and "Unresolved reference 'RetNetDecoder'".

So how do I fix them?

donglixp commented 9 months ago

@shumingma Is this the same issue as https://github.com/microsoft/torchscale/issues/67 ?

Yuki2L0ve commented 9 months ago

> @shumingma Is this the same issue as #67?

That is to say, the first problem hasn't been fixed yet? And how do I solve problem (2)?

shumingma commented 9 months ago

The latest release, torchscale 0.3.0 (https://pypi.org/project/torchscale/), has fixed these problems. Please give it a try. Thanks!
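To pick up 0.3.0, a standard upgrade of the installed package should be enough:

pip install --upgrade torchscale

After upgrading, both snippets run cleanly: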

>>> import torchscale
>>> from torchscale.architecture.config import EncoderDecoderConfig
>>> from torchscale.architecture.encoder_decoder import EncoderDecoder
>>>
>>> config = EncoderDecoderConfig(vocab_size=64000)
>>> encdec = EncoderDecoder(config)
>>> print(encdec)
EncoderDecoder(
  (encoder): Encoder(
    (dropout_module): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-11): 12 x EncoderLayer(
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
          (inner_attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout_module): Dropout(p=0.0, inplace=False)
        )
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout_module): Dropout(p=0.0, inplace=False)
        (ffn): FeedForwardNetwork(
          (activation_dropout_module): Dropout(p=0.0, inplace=False)
          (dropout_module): Dropout(p=0.0, inplace=False)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (ffn_layernorm): LayerNorm((3072,), eps=1e-05, elementwise_affine=True)
        )
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): Decoder(
    (dropout_module): Dropout(p=0.0, inplace=False)
    (output_projection): Linear(in_features=768, out_features=64000, bias=False)
    (layers): ModuleList(
      (0-11): 12 x DecoderLayer(
        (dropout_module): Dropout(p=0.0, inplace=False)
        (self_attn): MultiheadAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
          (inner_attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout_module): Dropout(p=0.0, inplace=False)
        )
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (encoder_attn): MultiheadAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
          (dropout_module): Dropout(p=0.0, inplace=False)
        )
        (encoder_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (ffn): FeedForwardNetwork(
          (activation_dropout_module): Dropout(p=0.0, inplace=False)
          (dropout_module): Dropout(p=0.0, inplace=False)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (ffn_layernorm): LayerNorm((3072,), eps=1e-05, elementwise_affine=True)
        )
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
)
>>> import torch
>>> from torchscale.architecture.config import RetNetConfig
>>> from torchscale.architecture.retnet import RetNetDecoder
>>>
>>> config = RetNetConfig(vocab_size=64000)
>>> retnet = RetNetDecoder(config)
>>> print(retnet)
RetNetDecoder(
  (dropout_module): Dropout(p=0.0, inplace=False)
  (output_projection): Linear(in_features=768, out_features=64000, bias=False)
  (layers): ModuleList(
    (0-11): 12 x DecoderLayer(
      (dropout_module): Dropout(p=0.0, inplace=False)
      (retention): MultiScaleRetention(
        (q_proj): Linear(in_features=768, out_features=768, bias=False)
        (k_proj): Linear(in_features=768, out_features=768, bias=False)
        (v_proj): Linear(in_features=768, out_features=1280, bias=False)
        (g_proj): Linear(in_features=768, out_features=1280, bias=False)
        (out_proj): Linear(in_features=1280, out_features=768, bias=False)
        (group_norm): RMSNorm()
      )
      (retention_layer_norm): RMSNorm()
      (ffn): GLU(
        (activation_dropout_module): Dropout(p=0.0, inplace=False)
        (dropout_module): Dropout(p=0.0, inplace=False)
        (fc1): Linear(in_features=768, out_features=1280, bias=False)
        (fc2): Linear(in_features=1280, out_features=768, bias=False)
        (gate): Linear(in_features=768, out_features=1280, bias=False)
      )
      (final_layer_norm): RMSNorm()
    )
  )
  (layer_norm): RMSNorm()
  (retnet_rel_pos): RetNetRelPos()
)
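For completeness: the constructors above were called without an embedding table, which is why neither printout shows an embed_tokens module, so a forward pass needs one supplied explicitly. A minimal sketch, assuming the 0.3.0 signatures (the embed_tokens keyword and the (output, aux) return value are taken from my reading of the torchscale source and may differ in other versions):

import torch
import torch.nn as nn
from torchscale.architecture.config import RetNetConfig
from torchscale.architecture.retnet import RetNetDecoder

config = RetNetConfig(vocab_size=64000)

# RetNetDecoder does not build its own embedding table; pass one in
# (embed_tokens as an optional constructor argument is an assumption
# based on the 0.3.0 source).
embed = nn.Embedding(config.vocab_size, config.decoder_embed_dim)
retnet = RetNetDecoder(config, embed_tokens=embed)

tokens = torch.randint(0, config.vocab_size, (2, 16))  # (batch, seq_len)
logits, aux = retnet(tokens)  # assumed to return (output, aux-dict)
print(logits.shape)  # expected: torch.Size([2, 16, 64000])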