[BUG] ValueError: `.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.

Describe the bug Loading the Llama 2 70B model in 4-bit (bitsandbytes) and then distributing the model by calling deepspeed.initialize raises the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[3], line 39
      1 deepspeed_config = {
      2     "optimizer": {
      3           "type": "AdamW",
   (...)
     37     "contiguous_gradients": True,
     38 }
---> 39 model, optimizer, _, lr_scheduler = deepspeed.initialize(
     40 model=model,
     41 model_parameters=model.parameters(),
     42 config=deepspeed_config,
     43 )

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed/__init__.py:177, in initialize(args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_params)
    165         engine = DeepSpeedHybridEngine(args=args,
    166                                        model=model,
    167                                        optimizer=optimizer,
   (...)
    174                                        config=config,
    175                                        config_class=config_class)
    176     else:
--> 177         engine = DeepSpeedEngine(args=args,
    178                                  model=model,
    179                                  optimizer=optimizer,
    180                                  model_parameters=model_parameters,
    181                                  training_data=training_data,
    182                                  lr_scheduler=lr_scheduler,
    183                                  mpu=mpu,
    184                                  dist_init_required=dist_init_required,
    185                                  collate_fn=collate_fn,
    186                                  config=config,
    187                                  config_class=config_class)
    188 else:
    189     assert mpu is None, "mpu must be None with pipeline parallelism"

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed/runtime/engine.py:262, in DeepSpeedEngine.__init__(self, args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_class, dont_change_device)
    259 self.pipeline_parallelism = isinstance(model, PipelineModule)
    261 # Configure distributed model
--> 262 self._configure_distributed_model(model)
    264 # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict
    265 self.param_names = {param: name for name, param in model.named_parameters()}

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed/runtime/engine.py:1113, in DeepSpeedEngine._configure_distributed_model(self, model)
   1111 # zero.Init() handles device placement of model
   1112 if not (self.dont_change_device or is_zero_init_model):
-> 1113     self.module.to(self.device)
   1115 # MoE related initialization
   1116 for _, module in self.module.named_modules():

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/modeling_utils.py:1895, in PreTrainedModel.to(self, *args, **kwargs)
   1892 def to(self, *args, **kwargs):
   1893     # Checks if the model has been loaded in 8-bit
   1894     if getattr(self, "is_quantized", False):
-> 1895         raise ValueError(
   1896             "`.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the"
   1897             " model has already been set to the correct devices and casted to the correct `dtype`."
   1898         )
   1899     else:
   1900         return super().to(*args, **kwargs)

ValueError: `.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.

To Reproduce Steps to reproduce the behavior: run the following code.

# Minimal reproduction: build a 4-bit quantization config, load a model with
# it, then pass the model to deepspeed.initialize().
# Imports are omitted in the report; the script needs `torch`, `deepspeed`,
# `BitsAndBytesConfig` and `AutoModelForSequenceClassification` in scope
# (the latter two presumably from `transformers` -- see the traceback).
# NOTE(review): this path names the 7B chat checkpoint, while the report text
# mentions the 70B model -- confirm which one was actually used.
model_name = 'llm-models/Llama-2-7b-chat-hf'
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

# Load the model with the quantization config. Per the traceback,
# PreTrainedModel.to() raises ValueError when the model's `is_quantized`
# attribute is truthy, so the resulting model cannot be moved with `.to()`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    return_dict=True, 
    quantization_config=bnb_config
)

# DeepSpeed configuration passed as a plain dict (ZeRO stage 3 with the
# optimizer offloaded to CPU, bf16 enabled, fp16 disabled).
deepspeed_config = {
    "optimizer": {
          "type": "AdamW",
            "params": {
                # NOTE(review): "auto" placeholders are presumably meant to be
                # filled in by an integration layer before DeepSpeed sees
                # them -- verify they are valid for a direct
                # deepspeed.initialize() call.
                "lr": "auto",
                "betas": "auto",
                "eps": "auto",
            },
    },
    "scheduler": {"type": "WarmupLR", "params": {"warmup_num_steps": 100}},
    "fp16": {"enabled": False},
    "bf16": {"enabled": True},  # Turn this on if using AMPERE GPUs.
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "stage3_gather_16bit_weights_on_model_save": True
    },
    "gradient_accumulation_steps": 1,
    # NOTE(review): a boolean is given here; the DeepSpeed config docs
    # presumably expect a numeric clipping threshold -- confirm.
    "gradient_clipping": True,
    "steps_per_print": 10,
    "train_micro_batch_size_per_gpu": 16,
    "wall_clock_breakdown": False,
    "overlap_comm": True,
    "contiguous_gradients": True,
}
# This call raises the reported ValueError: per the traceback,
# DeepSpeedEngine._configure_distributed_model() calls
# self.module.to(self.device) unless `dont_change_device` or
# `is_zero_init_model` is set, and the quantized model rejects `.to()`.
model, optimizer, _, lr_scheduler = deepspeed.initialize(
          model=model,
         model_parameters=model.parameters(),
        config=deepspeed_config,
)

Expected behavior deepspeed.initialize should return the distributed model, the optimizer, and the LR scheduler without raising an error.

ds_report output Please run ds_report to give us details about your setup.

[2024-02-26 15:34:02,963] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
 [WARNING]  async_io requires the dev libaio .so object and headers but these were not found.
 [WARNING]  async_io: please install the libaio-devel package with yum
 [WARNING]  If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
async_io ............... [NO] ....... [NO]
fused_adam ............. [YES] ...... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_lion ............... [NO] ....... [OKAY]
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
evoformer_attn ......... [NO] ....... [NO]
fused_lamb ............. [NO] ....... [OKAY]
fused_lion ............. [NO] ....... [OKAY]
inference_core_ops ..... [NO] ....... [OKAY]
cutlass_ops ............ [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
ragged_device_ops ...... [NO] ....... [OKAY]
ragged_ops ............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.1
 [WARNING]  using untested triton version (2.1.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch']
torch version .................... 2.1.0+cu121
deepspeed install path ........... ['/opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.13.2, unknown, unknown
torch cuda version ............... 12.1
torch hip version ................ None
nvcc version ..................... 12.1
deepspeed wheel compiled w. ...... torch 2.1, cuda 12.1
shared memory (/dev/shm) size .... 64.00 MB
  [WARNING] /dev/shm size might be too small, if running in docker increase to at least --shm-size='1gb' 
  [WARNING] see more details about NCCL requirements: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#sharing-data

Screenshots Detailed stack trace:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[3], line 39
      1 deepspeed_config = {
      2     "optimizer": {
      3           "type": "AdamW",
   (...)
     37     "contiguous_gradients": True,
     38 }
---> 39 model, optimizer, _, lr_scheduler = deepspeed.initialize(
     40 model=model,
     41 model_parameters=model.parameters(),
     42 config=deepspeed_config,
     43 )

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed/__init__.py:177, in initialize(args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_params)
    165         engine = DeepSpeedHybridEngine(args=args,
    166                                        model=model,
    167                                        optimizer=optimizer,
   (...)
    174                                        config=config,
    175                                        config_class=config_class)
    176     else:
--> 177         engine = DeepSpeedEngine(args=args,
    178                                  model=model,
    179                                  optimizer=optimizer,
    180                                  model_parameters=model_parameters,
    181                                  training_data=training_data,
    182                                  lr_scheduler=lr_scheduler,
    183                                  mpu=mpu,
    184                                  dist_init_required=dist_init_required,
    185                                  collate_fn=collate_fn,
    186                                  config=config,
    187                                  config_class=config_class)
    188 else:
    189     assert mpu is None, "mpu must be None with pipeline parallelism"

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed/runtime/engine.py:262, in DeepSpeedEngine.__init__(self, args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config, config_class, dont_change_device)
    259 self.pipeline_parallelism = isinstance(model, PipelineModule)
    261 # Configure distributed model
--> 262 self._configure_distributed_model(model)
    264 # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict
    265 self.param_names = {param: name for name, param in model.named_parameters()}

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/deepspeed/runtime/engine.py:1113, in DeepSpeedEngine._configure_distributed_model(self, model)
   1111 # zero.Init() handles device placement of model
   1112 if not (self.dont_change_device or is_zero_init_model):
-> 1113     self.module.to(self.device)
   1115 # MoE related initialization
   1116 for _, module in self.module.named_modules():

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/modeling_utils.py:1895, in PreTrainedModel.to(self, *args, **kwargs)
   1892 def to(self, *args, **kwargs):
   1893     # Checks if the model has been loaded in 8-bit
   1894     if getattr(self, "is_quantized", False):
-> 1895         raise ValueError(
   1896             "`.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the"
   1897             " model has already been set to the correct devices and casted to the correct `dtype`."
   1898         )
   1899     else:
   1900         return super().to(*args, **kwargs)

ValueError: `.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.

Additional context Add any other context about the problem here.

microsoft / DeepSpeed

[BUG] ValueError: `.to` is not supported for `4-bit` or `8-bit` models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`. #5195