huggingface / accelerate

🚀 A simple way to launch, train, and use PyTorch models on almost any device and distributed configuration, automatic mixed precision (including fp8), and easy-to-configure FSDP and DeepSpeed support
https://huggingface.co/docs/accelerate
Apache License 2.0

'str' object has no attribute 'device' and other issues when using quantized model #2372

Closed leszkolukasz closed 4 months ago

leszkolukasz commented 8 months ago

System Info

- `Accelerate` version: 0.26.1
- Platform: Linux-6.1.58+-x86_64-with-glibc2.35
- Python version: 3.10.12
- Numpy version: 1.26.3
- PyTorch version (GPU?): 2.1.2+cu121 (True)
- PyTorch XPU available: False
- PyTorch NPU available: False
- System RAM: 83.48 GB
- GPU type: NVIDIA A100-SXM4-40GB
- `Accelerate` default config:
    Not found

Reproduction

import logging
from pathlib import Path
import psutil
import torch
from accelerate import (
    Accelerator,
    init_empty_weights,
)
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    PreTrainedTokenizerBase,
)
from huggingface_hub import snapshot_download
from ts.torch_handler.base_handler import BaseHandler
from ts.utils.util import PredictionException

logger = logging.getLogger(__name__)

FETCH_QUANTIZED = (
    False  # Fetch quantized weights from the repo instead of quantizing on initialization
)
LOAD_IN_8BIT = True
LOAD_IN_4BIT = False

if LOAD_IN_4BIT and LOAD_IN_8BIT:
    raise ValueError("LOAD_IN_4BIT and LOAD_IN_8BIT cannot both be True")

class MixtralHandler(BaseHandler):
    MODEL_NAME = "cognitivecomputations/dolphin-2.6-mixtral-8x7b"
    QUANTIZED_WEIGHTS_URL = "whistleroosh/dolphin-2.6-mixtral-8x7b-8bit"
    model: torch.nn.Module
    tokenizer: PreTrainedTokenizerBase
    accelerator: Accelerator
    model_dir: Path

    def initialize(self, context):
        properties = context.system_properties
        manifest = context.manifest
        self.model_dir = Path(properties.get("model_dir"))

        logger.info(f"Properties: {properties}")
        logger.info(f"Manifest: {manifest}")

        self.accelerator = Accelerator()

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_NAME, trust_remote_code=True, use_fast=False
        )

        config = AutoConfig.from_pretrained(self.MODEL_NAME, trust_remote_code=True)
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(config)

        logger.info("Loading model")
        weights_location, is_quantized = self._download_weights()
        if FETCH_QUANTIZED or is_quantized:
            self._load_model(weights_location)
        else:
            self._quantize_and_load_model(weights_location)
        logger.info("Model loading completed")

        logger.info(f"Device map: {self.model.hf_device_map}")
        logger.info(f"Memory footprint: {self.model.get_memory_footprint()}")

        self.model.eval()

    def preprocess(self, data):
        if "input" in data:
            data = data["input"]

        if isinstance(data, list):
            data = data[0]

        if data is None:
            raise PredictionException("Input is None")

        logger.info(f'Received: "{data}". Begin tokenizing')

        message = [{"role": "user", "content": data}]
        tokenized_input = self.tokenizer.apply_chat_template(
            message, add_generation_prompt=True, return_tensors="pt"
        ).to(self.accelerator.device)

        logger.info("Tokenization process completed")

        return tokenized_input

    def inference(self, data, *args, **kwargs):
        logger.info("Begin inference")

        with torch.inference_mode():
            output = self.model.generate(
                data,
                max_new_tokens=2048,
                do_sample=True,
                temperature=0.2,
                repetition_penalty=1.1,
                top_k=50,
                top_p=0.95,
                num_return_sequences=1,
                eos_token_id=32021,
            )

        logger.info("Inference completed")

        return output

    def postprocess(self, data, input_len):
        output = self.tokenizer.decode(data[0][input_len:], skip_special_tokens=True)

        logger.info(f"Postprocessing completed. Output: {output}")

        return output

    def handle(self, data, context):
        input = self.preprocess(data)
        output = self.inference(input)
        output = self.postprocess(output, input.shape[1])

        return output

    def _download_weights(self):
        if not FETCH_QUANTIZED and self.quantized_model_weights_dir.exists():
            logger.info("Quantized weights already exist")
            return self.quantized_model_weights_dir, True

        logger.info("Downloading weights")
        weights_location = snapshot_download(
            repo_id=self.QUANTIZED_WEIGHTS_URL if FETCH_QUANTIZED else self.MODEL_NAME,
            allow_patterns=["*.json", "*.safetensors"]
            if FETCH_QUANTIZED
            else ["*.json", "*.bin"],
            ignore_patterns=["*.bin.index.json"]
            if FETCH_QUANTIZED
            else ["*.safetensors.index.json"],
        )
        logger.info(f"Weights location: {weights_location}")

        return weights_location, FETCH_QUANTIZED

    def _quantize_and_load_model(self, weights_location):
        logger.info("Weights will be quantized and saved to the model directory")

        self._load_model(weights_location)

        logger.info("Saving quantized weights")
        self.accelerator.save_model(
            self.model,
            self.quantized_model_weights_dir,
            safe_serialization=False,
            max_shard_size="5GB",
        )

    def _load_model(self, weights_location):
        bnb_quantization_config = BnbQuantizationConfig(
            load_in_8bit=LOAD_IN_8BIT
        )

        VRAM = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) - 30
        RAM = psutil.virtual_memory().total / (1024 ** 3) - 20

        self.model = load_and_quantize_model(
            self.model,
            weights_location=weights_location,
            device_map="auto",
            bnb_quantization_config=bnb_quantization_config,
            offload_folder=self.model_dir / "offload",
            offload_state_dict=True,
            no_split_module_classes=self.model._no_split_modules,
            max_memory={"cpu": f"{RAM:.2f}GiB", 0: f"{VRAM:.2f}GiB"}
        )

    @property
    def quantized_model_weights_dir(self):
        return self.model_dir / "drive" / "MyDrive" / "quantized_weights"

model = MixtralHandler()
class Context:
    pass

context = Context()
context.system_properties = {"gpu_id": 0, "model_dir": "."}
context.manifest = {}
model.initialize(context)

Expected behavior

I am trying to quantize and save a large model; however, there are a few issues. While it does manage to load the model, it fails when trying to save it, with the following error:

AttributeError                            Traceback (most recent call last)

<ipython-input-4-b60b17f2815a> in <cell line: 8>()
      6 context.system_properties = {"gpu_id": 0, "model_dir": "."}
      7 context.manifest = {}
----> 8 model.initialize(context)

3 frames

/usr/local/lib/python3.10/dist-packages/accelerate/utils/modeling.py in get_state_dict_offloaded_model(model)
   1351         for key in module_state_dict:
   1352             # ignore placeholder parameters that are still on the meta device
-> 1353             if module_state_dict[key].device == torch.device("meta"):
   1354                 placeholders.add(name + f".{key}")
   1355                 continue

AttributeError: 'str' object has no attribute 'device'

I remember having a similar issue before with another model, and I think I solved it by changing device_map from a custom dict to "auto". That does not help in this case, though.
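
For illustration, the two device_map styles mentioned here look roughly like this; the module names below are hypothetical and would have to match the actual model:

# Illustrative only: a hand-written device map pins named submodules to devices,
# while "auto" lets accelerate infer placement from available memory (or max_memory).
custom_device_map = {
    "model.embed_tokens": 0,   # GPU 0
    "model.layers": 0,
    "model.norm": "cpu",
    "lm_head": "cpu",
}

# load_and_quantize_model(..., device_map=custom_device_map)   # custom dict
# load_and_quantize_model(..., device_map="auto")              # inferred placement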

When I try to run inference on the model, two issues arise. If I don't set max_memory to very low values, I get a CUDA out-of-memory error. If I heavily limit max_memory, I get:

ValueError                                Traceback (most recent call last)

<timed eval> in <module>

<ipython-input-1-ae9ebbc85d19> in handle(self, data, context)
    119     def handle(self, data, context):
    120         input = self.preprocess(data)
--> 121         output = self.inference(input)
    122         output = self.postprocess(output, input.shape[1])
    123 

23 frames

/usr/local/lib/python3.10/dist-packages/accelerate/utils/modeling.py in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics)
    308     if value is not None:
    309         if old_value.shape != value.shape:
--> 310             raise ValueError(
    311                 f'Trying to set a tensor of shape {value.shape} in "{tensor_name}" (which has shape {old_value.shape}), this look incorrect.'
    312             )

ValueError: Trying to set a tensor of shape torch.Size([8, 4096]) in "weight" (which has shape torch.Size([32, 4096])), this look incorrect.

I have almost identical code that I used to quantize DeepSeek Coder 33B, and it worked there. However, when I tried to run the quantized version of DeepSeek Coder 33B on a V100 (I am now running on an A100), I had problems with CUDA out of memory. Note that I had no memory problems when loading the unquantized version of DeepSeek Coder 33B on the V100 (loaded with load_checkpoint_and_dispatch), and inference worked as well, though it was very slow.

I am not sure whether I simply don't have enough memory to run this or whether it is caused by something else.
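
For context, the unquantized load mentioned above used accelerate's load_checkpoint_and_dispatch. A rough sketch of that pattern (the repo id, offload folder, and dtype are assumptions, not the exact code that was run):

import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM

REPO_ID = "deepseek-ai/deepseek-coder-33b-instruct"  # assumed repo id

# Build the model skeleton on the meta device, then stream in the weights.
config = AutoConfig.from_pretrained(REPO_ID)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

checkpoint_dir = snapshot_download(REPO_ID, allow_patterns=["*.json", "*.safetensors"])
model = load_checkpoint_and_dispatch(
    model,
    checkpoint=checkpoint_dir,
    device_map="auto",
    no_split_module_classes=model._no_split_modules,
    offload_folder="offload",   # spill layers that do not fit to disk
    dtype=torch.float16,
)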

SunMarc commented 8 months ago

I'll try to check what is going on. In the meantime, I recommend using the bitsandbytes integration in transformers directly: AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto").
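
For reference, a minimal sketch of that suggestion (assumes bitsandbytes is installed and a CUDA GPU is available; the model name is taken from the reproduction above, and passing a BitsAndBytesConfig is the explicit form of the load_in_8bit=True shorthand):

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "cognitivecomputations/dolphin-2.6-mixtral-8x7b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",  # let accelerate place layers across GPU/CPU
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    trust_remote_code=True,
)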

github-actions[bot] commented 7 months ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

pinchedsquare commented 7 months ago

@SunMarc Just curious: fundamentally, why does load_and_quantize_model() use more memory than AutoModelForCausalLM.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto")?

In the following two examples, the first one succeeds and the second one fails with an OOM on the GPU.

Example 1:

quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map = "auto",
    trust_remote_code = True,
    quantization_config=quantization_config,
)

Example 2:

checkpoint = MODEL_PATH
config = AutoConfig.from_pretrained(checkpoint)

model = None
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

bnb_quantization_config = BnbQuantizationConfig(
    load_in_4bit=True, 
    torch_dtype=torch.bfloat16,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4",
)

quantized_model = load_and_quantize_model(
    model,
    weights_location=MODEL_PATH,
    bnb_quantization_config=bnb_quantization_config,
    device_map="auto",
)
quantized_model

Any insight is appreciated.
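
Not an answer, but for comparison: the reproduction at the top of this issue also passes explicit memory caps and an offload folder to load_and_quantize_model. Reusing the names from Example 2, that looks roughly like this (the GiB values are illustrative, not recommendations):

# Sketch of capping memory for load_and_quantize_model, mirroring the reproduction above.
quantized_model = load_and_quantize_model(
    model,
    weights_location=MODEL_PATH,
    bnb_quantization_config=bnb_quantization_config,
    device_map="auto",
    max_memory={0: "35GiB", "cpu": "60GiB"},  # cap GPU 0 and CPU usage
    offload_folder="offload",                 # spill the remainder to disk
    offload_state_dict=True,
)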

github-actions[bot] commented 6 months ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

SunMarc commented 6 months ago

Hi @pinchedsquare, there shouldn't be any big difference. The goal of load_and_quantize_model is to enable any PyTorch model to be quantized with bnb, not just models from the transformers library. I'll have a look at this strange OOM ASAP.
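
A minimal sketch of that generality for a plain torch.nn.Module, independent of transformers (the module, its sizes, and the checkpoint path are hypothetical; 8-bit quantization still requires bitsandbytes and a CUDA GPU):

import torch
from accelerate import init_empty_weights
from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model

class TinyNet(torch.nn.Module):
    def __init__(self, hidden=4096):
        super().__init__()
        self.fc1 = torch.nn.Linear(hidden, hidden)
        self.fc2 = torch.nn.Linear(hidden, hidden)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

# Save a reference state dict once (hypothetical path), then reload it onto an
# empty (meta) model and let accelerate replace the linear layers with bnb ones.
torch.save(TinyNet().state_dict(), "tiny_net.pt")

with init_empty_weights():
    model = TinyNet()

model = load_and_quantize_model(
    model,
    weights_location="tiny_net.pt",
    bnb_quantization_config=BnbQuantizationConfig(load_in_8bit=True),
    device_map="auto",
)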

pinchedsquare commented 5 months ago

Will await your response. Thanks.

github-actions[bot] commented 4 months ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.