unslothai / unsloth

Finetune Llama 3.2, Mistral, Phi, Qwen 2.5 & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

Load And Unload Model Error: OSError: could not get source code #1220

Open DaddyCodesAlot opened 3 weeks ago

DaddyCodesAlot commented 3 weeks ago

Hi there, I wrote two methods that allow unsloth models to be loaded into and unloaded from memory. To my knowledge, this is the only way to swap unsloth models at runtime:

from unsloth import FastLanguageModel
import gc
import torch

llm_model = None
tokenizer = None

def loadModel(model_name):
    global EOS_TOKEN
    global llm_model, tokenizer
    print(f'Load model {model_name}')
    llm_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = f"model_name}",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = "TOKEN HERE", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def unloadModel():
    global llm_model, tokenizer
    # Delete the model and tokenizer
    llm_model.disable_adapter_layers()
    del llm_model
    # del tokenizer

    # Run garbage collection
    gc.collect()

    # Optionally, clear the CUDA cache if using GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache() 

However, a recent update to Unsloth causes errors when using this method.

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[20], line 3
      1 loadModel("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")
      2 unloadModel()
----> 3 loadModel("unsloth/Qwen2.5-7B-Instruct-bnb-4bit")

Cell In[17], line 29, in loadModel(model_name)
     27 global llm_model, tokenizer
     28 print(f'Load model {model_name}')
---> 29 llm_model, tokenizer = FastLanguageModel.from_pretrained(
     30     model_name = f"{model_name}",
     31     max_seq_length = max_seq_length,
     32     dtype = dtype,
     33     load_in_4bit = load_in_4bit,
     34     token = "token here", # use one if using gated models like meta-llama/Llama-2-7b-hf
     35 )
     36 EOS_TOKEN = tokenizer.eos_token

File /usr/local/lib/python3.10/dist-packages/unsloth/models/loader.py:332, in FastLanguageModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, trust_remote_code, use_gradient_checkpointing, resize_model_vocab, revision, *args, **kwargs)
    329     tokenizer_name = None
    330 pass
--> 332 model, tokenizer = dispatch_model.from_pretrained(
    333     model_name        = model_name,
    334     max_seq_length    = max_seq_length,
    335     dtype             = dtype,
    336     load_in_4bit      = load_in_4bit,
    337     token             = token,
    338     device_map        = device_map,
    339     rope_scaling      = rope_scaling,
    340     fix_tokenizer     = fix_tokenizer,
    341     model_patcher     = dispatch_model,
    342     tokenizer_name    = tokenizer_name,
    343     trust_remote_code = trust_remote_code,
    344     revision          = revision if not is_peft else None,
    345     *args, **kwargs,
    346 )
    348 if resize_model_vocab is not None:
    349     model.resize_token_embeddings(resize_model_vocab)

File /usr/local/lib/python3.10/dist-packages/unsloth/models/qwen2.py:87, in FastQwen2Model.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, model_patcher, tokenizer_name, trust_remote_code, **kwargs)
     72 @staticmethod
     73 def from_pretrained(
     74     model_name        = "Qwen/Qwen2-7B",
   (...)
     85     **kwargs,
     86 ):
---> 87     return FastLlamaModel.from_pretrained(
     88         model_name        = model_name,
     89         max_seq_length    = max_seq_length,
     90         dtype             = dtype,
     91         load_in_4bit      = load_in_4bit,
     92         token             = token,
     93         device_map        = device_map,
     94         rope_scaling      = rope_scaling,
     95         fix_tokenizer     = fix_tokenizer,
     96         model_patcher     = FastQwen2Model,
     97         tokenizer_name    = tokenizer_name,
     98         trust_remote_code = trust_remote_code,
     99         **kwargs,
    100     )

File /usr/local/lib/python3.10/dist-packages/unsloth/models/llama.py:1790, in FastLlamaModel.from_pretrained(model_name, max_seq_length, dtype, load_in_4bit, token, device_map, rope_scaling, fix_tokenizer, model_patcher, tokenizer_name, trust_remote_code, **kwargs)
   1787 Trainer._inner_training_loop = _fast_inner_training_loop
   1789 # Fix gradient accumulation
-> 1790 patch_gradient_accumulation_fix(Trainer)
   1792 # Save tokenizer for inference purposes
   1793 tokenizer.padding_side = "left" # Force inference

File /usr/local/lib/python3.10/dist-packages/unsloth/models/_utils.py:1220, in patch_gradient_accumulation_fix(Trainer)
   1217 # Also fix up loss scaling ie negate loss *= self.args.gradient_accumulation_steps
   1218 if "num_items_in_batch" not in inspect.signature(Trainer.training_step).parameters: return
-> 1220 function = inspect.getsource(Trainer.training_step)
   1221 where = function.find("def")
   1222 function = function.split("\n")

File /usr/lib/python3.10/inspect.py:1139, in getsource(object)
   1133 def getsource(object):
   1134     """Return the text of the source code for an object.
   1135 
   1136     The argument may be a module, class, method, function, traceback, frame,
   1137     or code object.  The source code is returned as a single string.  An
   1138     OSError is raised if the source code cannot be retrieved."""
-> 1139     lines, lnum = getsourcelines(object)
   1140     return ''.join(lines)

File /usr/lib/python3.10/inspect.py:1121, in getsourcelines(object)
   1113 """Return a list of source lines and starting line number for an object.
   1114 
   1115 The argument may be a module, class, method, function, traceback, frame,
   (...)
   1118 original source file the first line of code was found.  An OSError is
   1119 raised if the source code cannot be retrieved."""
   1120 object = unwrap(object)
-> 1121 lines, lnum = findsource(object)
   1123 if istraceback(object):
   1124     object = object.tb_frame

File /usr/lib/python3.10/inspect.py:958, in findsource(object)
    956     lines = linecache.getlines(file)
    957 if not lines:
--> 958     raise OSError('could not get source code')
    960 if ismodule(object):
    961     return lines, 0

OSError: could not get source code

Other than that, reverting to this commit solved the bug for me:

pip install "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git@1e7e0e23683c5ec1c1e3a5df0f586d4c433fee44"

I'm unclear on what is causing this bug, to be honest, but this line of code seems to be a bit finicky:

function = inspect.getsource(Trainer.training_step)
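
For reference, here is a minimal standalone sketch (my own assumption of what an on-the-fly patch does, not Unsloth's actual code) showing why inspect.getsource raises this error once the function it is given was built with exec() from a string:

import inspect

# Hypothetical stand-in for an on-the-fly patch: the replacement method is
# compiled from a string, so its code object points at "<string>" instead of
# a real source file on disk.
namespace = {}
exec("def training_step(self, *args, **kwargs):\n    return 0\n", namespace)

class Trainer:
    training_step = namespace["training_step"]

# linecache has no lines for "<string>", so inspect cannot recover the source.
try:
    inspect.getsource(Trainer.training_step)
except OSError as exc:
    print(exc)  # could not get source code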

GennVa commented 3 weeks ago

Same issue here. I also tried unsloth[cu118+torch230] @ git+https://github.com/unslothai/unsloth.git@1e7e0e23683c5ec1c1e3a5df0f586d4c433fee44 and got the same error.

Erland366 commented 3 weeks ago

The error is "normal" since Unsloth needs to modify trl or transformers code on the fly. Which means inspect.getsource will error after modification.

As for your problem, I can't reproduce it (although I slightly modified the code, it shouldn't really make a difference):

from unsloth import FastLanguageModel
import gc
import torch

model_name =  "unsloth/Llama-3.2-1B-bnb-4bit"
llm_model = None
tokenizer = None
max_seq_length = 2048
dtype = None
load_in_4bit = True

def loadModel(model_name):
    global EOS_TOKEN
    global llm_model, tokenizer
    print(f'Load model {model_name}')
    llm_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = f"{model_name}",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "TOKEN HERE", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def unloadModel():
    global llm_model, tokenizer
    # Delete the model and tokenizer
    try:
        llm_model.disable_adapter_layers()
    except:
        pass
    del llm_model
    del tokenizer

    for _ in range(5):
        # Run garbage collection
        gc.collect()

        # Optionally, clear the CUDA cache if using GPU
        if torch.cuda.is_available():
            torch.cuda.empty_cache() 

Is there any other step that you did?

danielhanchen commented 3 weeks ago

Oh wait I can bypass double patching by checking the function name - can fix this!
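
For context, a guard along these lines is one way such a name check could look. This is only a rough sketch of the idea; the _unsloth_patched_ prefix and the placeholder body are my own illustration, not the actual fix:

import inspect

def patch_training_step(Trainer):
    # If the method was already replaced once, its name carries the marker,
    # so skip the inspect.getsource call (which would fail the second time).
    if Trainer.training_step.__name__.startswith("_unsloth_patched_"):
        return

    source = inspect.getsource(Trainer.training_step)  # works on the first pass
    # ... the real patch rewrites `source`, exec()s it, and installs the result ...

    original = Trainer.training_step
    def _unsloth_patched_training_step(self, *args, **kwargs):
        # Placeholder body; the real patch installs the rewritten function here.
        return original(self, *args, **kwargs)

    Trainer.training_step = _unsloth_patched_training_step

Calling such a patcher a second time would then be a no-op instead of raising OSError: could not get source code.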

DaddyCodesAlot commented 3 weeks ago

The error is "normal" since Unsloth needs to modify trl or transformers code on the fly. Which means inspect.getsource will error after modification.

For you problem, I can't reproduce it (although I slightly modify the code, but it shouldn't really has a difference)

from unsloth import FastLanguageModel
import gc
import torch

model_name =  "unsloth/Llama-3.2-1B-bnb-4bit"
llm_model = None
tokenizer = None
max_seq_length = 2048
dtype = None
load_in_4bit = True

def loadModel(model_name):
    global EOS_TOKEN
    global llm_model, tokenizer
    print(f'Load model {model_name}')
    llm_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = f"{model_name}",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "TOKEN HERE", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def unloadModel():
    global llm_model, tokenizer
    # Delete the model and tokenizer
    try:
        llm_model.disable_adapter_layers()
    except:
        pass
    del llm_model
    del tokenizer

    for _ in range(5):
        # Run garbage collection
        gc.collect()

        # Optionally, clear the CUDA cache if using GPU
        if torch.cuda.is_available():
            torch.cuda.empty_cache() 

Is there any other step that you did?

Running your code gives me an error on a RunPod instance, but not on a Google Colab instance.