[BUG] Cannot initialize inference engine with Hugging Face Transformers GPT2

Describe the bug: I cannot initialize a DeepSpeed inference engine from Hugging Face's GPT2 implementation; the call to `deepspeed.init_inference` raises a TypeError (full traceback below).

To Reproduce

from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('gpt2')

model = deepspeed.init_inference(model,
                                 mp_size=1,
                                 dtype=torch.float,
                                 replace_method='auto',
                                 replace_with_kernel_inject=True)

Expected behavior: An inference engine should be returned.

ds_report output

--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
sparse_attn ............ [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-devel package with yum
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch']
torch version .................... 1.8.1+cu111
torch cuda version ............... 11.1
nvcc version ..................... 11.1
deepspeed install path ........... ['/home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed']
deepspeed info ................... 0.5.10, unknown, unknown
deepspeed wheel compiled w. ...... torch 1.8, cuda 11.1

System info (please complete the following information):

OS: Amazon Linux
GPU count and types: 4 T4s
Python version: 3.7

Launcher context: Notebook, single GPU

Docker context: Just `pip install deepspeed`

Additional context: Here's the output from `init_inference`:

[2022-02-14 20:23:12,459] [INFO] [logging.py:69:log_dist] [Rank -1] DeepSpeed info: version=0.5.10, git-hash=unknown, git-branch=unknown
[2022-02-14 20:23:12,460] [INFO] [engine.py:127:_init_quantization_setting] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
DeepSpeed Transformer Inference config is  {'layer_id': 1, 'hidden_size': 768, 'intermediate_size': 3072, 'heads': 12, 'num_hidden_layers': -1, 'fp16': False, 'pre_layer_norm': True, 'local_rank': -1, 'stochastic_mode': False, 'epsilon': 1e-05, 'mp_size': 1, 'q_int8': False, 'scale_attention': True, 'specialized_mode': False, 'triangular_masking': True, 'local_attention': False, 'window_size': 1, 'rotary_dim': -1, 'return_tuple': True, 'mlp_after_attn': True}

TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_37998/3974492404.py in <module>
      5                                  dtype=torch.float,
      6                                  replace_method='auto',
----> 7                                  replace_with_kernel_inject=True)

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/__init__.py in init_inference(model, mp_size, mpu, checkpoint, module_key, dtype, injection_policy, replace_method, quantization_setting, replace_with_kernel_inject, return_tuple)
    281                                  replace_method,
    282                                  quantization_setting,
--> 283                                  replace_with_kernel_inject)
    284 
    285     return engine

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/inference/engine.py in __init__(self, model, mp_size, mpu, checkpoint, dtype, injection_dict, return_tuple, replace_method, quantization_setting, replace_with_kernel_inject)
     86             self._apply_injection_policy(
     87                 return_tuple=return_tuple,
---> 88                 replace_with_kernel_inject=replace_with_kernel_inject)
     89 
     90         device = torch.cuda.current_device()

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/inference/engine.py in _apply_injection_policy(self, client_module, injection_policy, return_tuple, replace_with_kernel_inject)
    173                                                      self.mlp_extra_grouping,
    174                                                      self.quantize_groups),
--> 175                                   replace_with_kernel_inject=replace_with_kernel_inject)
    176 
    177     def _load_checkpoint(self, load_dir, load_module_strict=True):

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in replace_transformer_layer(orig_layer_impl, model, policy, micro_batch_size, config, seed, hidden_size, num_attention_heads, mp_size, mp_group, preln, fp16, local_rank, stochastic_mode, training, quantize, quantize_settings, return_tuple, replace_with_kernel_inject, linear_layer_setting)
    465                           orig_class=orig_layer_impl,
    466                           replace_fn=replace_fn,
--> 467                           _replace_policy=policy)
    468 
    469 

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in replace_module(model, orig_class, replace_fn, _replace_policy)
    559         "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py"
    560 
--> 561     replaced_module, _ = _replace_module(model, policy)
    562     return replaced_module
    563 

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in _replace_module(model, policies, layer_id)
    581             layer_id += 1
    582         else:
--> 583             _, layer_id = _replace_module(child, policies, layer_id=layer_id)
    584 
    585     return model, layer_id

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in _replace_module(model, policies, layer_id)
    581             layer_id += 1
    582         else:
--> 583             _, layer_id = _replace_module(child, policies, layer_id=layer_id)
    584 
    585     return model, layer_id

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in _replace_module(model, policies, layer_id)
    578                 policies[child.__class__][0](child,
    579                                              policies[child.__class__][-1],
--> 580                                              layer_id))
    581             layer_id += 1
    582         else:

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in replace_fn(child, _policy, layer_id)
    456                                                  preln=(_policy
    457                                                         is not HFBertLayerPolicy),
--> 458                                                  layer_id=layer_id)
    459             else:
    460                 new_module = replace_wo_policy(child, _policy)

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/deepspeed/module_inject/replace_module.py in replace_with_policy(child, policy_cls, inference, preln, layer_id)
    285             attn_block.attn_qkvw.data = mp_replace.qkv_copy(attn_block.attn_qkvw.data,
    286                                                             qkvw)
--> 287             attn_block.attn_qkvb = mp_replace.qkv_copy(attn_block.attn_qkvb.data, qkvb)
    288 
    289             attn_block.attn_ow.data = mp_replace.copy(attn_block.attn_ow.data, dense_w)

~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/nn/modules/module.py in __setattr__(self, name, value)
    968                 raise TypeError("cannot assign '{}' as parameter '{}' "
    969                                 "(torch.nn.Parameter or None expected)"
--> 970                                 .format(torch.typename(value), name))
    971             self.register_parameter(name, value)
    972         else:

TypeError: cannot assign 'torch.cuda.FloatTensor' as parameter 'attn_qkvb' (torch.nn.Parameter or None expected)

microsoft / DeepSpeed

[BUG] Cannot initialize inference engine with Hugging Face Transformers GPT2 #1770