Closed: Kuchiriel closed this issue 2 months ago
Hello! Unfortunately, we do not have the bandwidth to directly write this code, but we can give you some pointers.
If you mean to ask "how to load a pre-quantized AQLM model" (with or without PV-tuning), please refer to this guide: https://huggingface.co/docs/transformers/main/en/quantization/aqlm. You can find a list of pre-quantized models in our README or on the Hugging Face Hub.
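To illustrate, here is a minimal sketch of what loading a pre-quantized AQLM checkpoint looks like, assuming `aqlm` is installed; the repo id below is only an example, swap in whichever model from that list you actually need:

```python
# pip install aqlm[gpu,cpu]   # kernels required to run AQLM layers
from transformers import AutoModelForCausalLM, AutoTokenizer

# Example repo id -- pick any pre-quantized AQLM checkpoint from the README / the Hub.
repo_id = "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf"

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype="auto",   # keep the dtype stored in the checkpoint
    device_map="auto",    # let accelerate place the layers
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

inputs = tokenizer("Hello, AQLM!", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```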
If you want to quantize ("train") an AQLM representation of an arbitrary model from inside your own code, we would not recommend that: our calibration code needs to be run in a very particular way (see the instructions in the README, and scroll down for fine-tuning). If you absolutely must launch it from your script, use subprocess to invoke the calibration script as per the instructions above, as sketched below. In principle, you can also manually merge the codebases, but that will require a lot of careful work, where any bug can silently ruin model quality.
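For completeness, the subprocess route looks roughly like the sketch below. The entry point and arguments are placeholders, not the exact command; copy the real command line from the README verbatim:

```python
import subprocess

# Placeholders: take the real entry point and flags from the AQLM README
# ("Quantization" section) and paste them here verbatim.
cmd = [
    "python", "main.py",
    "<path_or_name_of_the_fp16_model>",
    "<path_or_name_of_the_calibration_dataset>",
    # ...remaining flags exactly as documented in the README...
]
subprocess.run(cmd, check=True)  # raises CalledProcessError if calibration fails
```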
Unfortunately, that is all we can tell you atm.
p.s. If you need to share a large snippet of code on GitHub, you may find it more convenient to (a) surround it with triple backticks or (b) publish it as a GitHub gist and link it in your issue. This is a general recommendation; not formatting the code for this particular issue does not inconvenience anyone, since we do not have the bandwidth to help anyway. But if you ask around on other discussion boards, others will be better able to help if you provide them with a minimal Python-formatted snippet and an explanation of what it does, how you tried to add AQLM, and what errors you encountered.
This issue is stale because it has been open for 30 days with no activity.
This issue was closed because it has been inactive for 14 days since being marked as stale.
This is the code; I achieved 4-bit quantization with the usual libraries:
```python
import gc
import os
import re
import torch
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import fitz  # PyMuPDF
import numpy as np
import scipy
import tiktoken
from sentence_transformers import SentenceTransformer
from scipy.stats import fisher_exact
from torch import nn
from transformers import (
    GenerationConfig,
    TextStreamer,
    BitsAndBytesConfig,
    GPT2TokenizerFast,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from IPython.display import display
import ipywidgets as widgets
from accelerate import (
    Accelerator,
    init_empty_weights,
    load_checkpoint_and_dispatch,
    infer_auto_device_map,
)
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download("stopwords")

# Display library versions
print(f"NumPy version: {np.__version__}")
print(f"SciPy version: {scipy.__version__}")

# Perform a Fisher's exact test as an example
result = fisher_exact([[10, 10], [5, 20]])
print("Fisher's exact test result:", result)

# Model and embedding configurations
CONTENT = "You are a friendly chatbot who always responds to user instructions precisely and remembers the previous prompts"
conversation_history = []
MAX_TOKENS_PER_MB = 20

model_names = [
    "TheBloke/zephyr-7B-beta-GPTQ",
    "TheBloke/zephyr-7B-beta-GGUF",
    "HuggingFaceH4/zephyr-7b-beta",
]
CPU_MODEL = "MaziyarPanahi/Phi-3-mini-4k-instruct-v0.3"
EMBEDDING_MODEL_NAME = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Open and read PDF
doc = fitz.open("/kaggle/input/attention-is-all-you-need/attention_is_all_you_need.pdf")
text = ""
for page in doc:
    text += page.get_text()

# Tokenize the text
enc = tiktoken.encoding_for_model("gpt2")


def count_tokens(text: str) -> int:
    return len(enc.encode(text))


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=24,
    length_function=count_tokens,
)

chunks = text_splitter.create_documents([text])

# Plot token counts
token_counts = [count_tokens(chunk.page_content) for chunk in chunks]
df = pd.DataFrame({"Token Count": token_counts})
df.hist(bins=40)
plt.show()


# Sentence transformer wrapper class
class SentenceTransformerWrapper:
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    # FAISS.from_documents expects the LangChain embeddings interface
    # (embed_documents / embed_query); minimal implementations added here.
    def embed_documents(self, texts):
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        return self.model.encode(text).tolist()


embedding_model = SentenceTransformerWrapper(EMBEDDING_MODEL_NAME)

# Create FAISS database
db = FAISS.from_documents(chunks, embedding_model)

# Initialize accelerator
accelerator = Accelerator()


def get_available_memory():
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
        total_memory = torch.cuda.get_device_properties(0).total_memory
        available_memory = total_memory - torch.cuda.memory_allocated()
        return available_memory
    return None


def set_dynamic_memory_allocation(max_size_mb=None):
    if DEVICE != "cuda":  # guard: this is called at module scope even on CPU/TPU
        return
    total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**2)
    print("Total memory:", total_memory)
    available_memory = total_memory


def test_is_tpu_available():
    devices = tf.config.list_logical_devices()
    for device in devices:
        if device.device_type == "TPU":
            return True
    return False


# Check for available devices
if torch.cuda.is_available():
    DEVICE = "cuda"
    print("GPU Available:", torch.cuda.get_device_name(torch.cuda.current_device()))
elif test_is_tpu_available():
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        print(f"Running on TPU: {tpu.cluster_spec().as_dict()['worker']}")
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        DEVICE = "tpu"
    except tf.errors.AlreadyExistsError:
        print("TPU already initialized")
        DEVICE = "tpu"  # the TPU system is already up, so it is still usable
    except tf.errors.FailedPreconditionError as e:
        print(f"Failed to initialize TPU: {e}")
        DEVICE = "cpu"
else:
    DEVICE = "cpu"
print("Device:", DEVICE)

# Set data types based on device
if DEVICE == "cuda" and torch.cuda.is_bf16_supported():
    DTYPE = torch.bfloat16
elif DEVICE == "cuda" and not torch.cuda.is_bf16_supported():
    DTYPE = torch.float16
else:
    DTYPE = torch.float32
print("DTYPE:", DTYPE)


def flush():
    if DEVICE == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    accelerator.free_memory()
    gc.collect()


def clear_all():
    locals_to_remove = [var for var in locals() if var[0] != "_"]
    for var in locals_to_remove:
        del locals()[var]
    globals_to_remove = [var for var in globals() if var[0] != "_"]
    for var in globals_to_remove:
        del globals()[var]
    flush()


def print_memory_usage():
    if DEVICE == "cuda":
        allocated = torch.cuda.memory_allocated() / 1e9
        max_allocated = torch.cuda.max_memory_allocated() / 1e9
        print(f"Memory Allocated: {allocated} GB, Max Allocated: {max_allocated} GB")
    flush()


def inspect_model_bits(model):
    for name, param in model.named_parameters():
        print(f"Parameter: {name}, dtype: {param.dtype}")


def load_gptq_model(model_name):
    from auto_gptq import AutoGPTQForCausalLM

    config = BitsAndBytesConfig(
        load_in_4bit=True,
        disable_exllama=True,
    )
    model = AutoGPTQForCausalLM.from_pretrained(model_name, quantization_config=config)
    return model


def load_gguf_model(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name, model_type="mistral")
    return model


def determine_model_type(model):
    try:
        if hasattr(model, "gptq_config"):
            return "gptq"
        elif hasattr(model.config, "quantization_approach") and model.config.quantization_approach == "GGUF":
            return "gguf"
        else:
            return "unknown"
    except Exception as e:
        print(f"Error determining model type: {e}")
        return "unknown"


def load_model_by_type(model_type, model_name):
    try:
        if model_type == "gptq":
            return load_gptq_model(model_name)
        elif model_type == "gguf":
            return load_gguf_model(model_name)
        else:
            raise ValueError(f"Unknown model type: {model_type}")
    except Exception as e:
        print(f"Failed to load {model_name}: {e}")
        return None


def setup_model_and_tokenizer(model_name):
    torch.set_grad_enabled(False)
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_fast=True,
            legacy=True,
            trust_remote_code=True if DEVICE == "cpu" else False,
        )
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model_type = determine_model_type(model)
        print(f"Detected model type: {model_type}")
        if model_type == "unknown":
            raise ValueError(f"Unknown model type for {model_name}")
        model = load_model_by_type(model_type, model_name)
        if model is None:
            raise ValueError(f"Failed to load model {model_name}")
        # The original snippet was truncated here; minimal (assumed) completion
        # so that the call site below receives its three values.
        generation_config = GenerationConfig.from_pretrained(model_name)
        return model, tokenizer, generation_config
    except Exception as e:
        print(f"Failed to set up {model_name}: {e}")
        return None, None, None


if DEVICE in ["cuda", "tpu"]:
    for model_name in model_names:
        print(f"\nTesting model: {model_name}")
        try:
            model, tokenizer, generation_config = setup_model_and_tokenizer(model_name)
            if model is not None:
                print(f"Successfully loaded {model_name}")
            else:
                print(f"Failed to load {model_name}")
        except Exception as e:
            print(f"Failed to load {model_name}: {e}")
else:
    MODEL = CPU_MODEL


def truncate_history_based_on_memory():
    available_memory = get_available_memory()
    if available_memory is not None:
        max_tokens = int(available_memory / (1024**2)) * MAX_TOKENS_PER_MB
        current_tokens = sum(
            len(tokenizer.encode(exchange["content"]))
            for exchange in conversation_history
        )


def determine_max_new_tokens(input_length, max_model_input_size, prompt):
    complexity_factor = 1.0
    if is_complex(prompt):
        complexity_factor = 1.5


def is_complex(prompt):
    word_count_threshold = 12
    unique_word_threshold = 10
    long_word_threshold = 7
    complex_sentence_threshold = 2


def estimate_memory_per_token(model):
    return 5


def calculate_dynamic_max_length(model, buffer_factor=0.8):
    memory_per_token = estimate_memory_per_token(model)
    available_memory = get_available_memory()


def segment_input(user_prompt, max_length):
    words = user_prompt.split()
    segments = []
    current_segment = []


def validate_tensor(tensor, name):
    if torch.is_tensor(tensor):
        if torch.isnan(tensor).any():
            raise ValueError(f"{name} contains NaN values")
        if torch.isinf(tensor).any():
            raise ValueError(f"{name} contains inf values")
        if (tensor < 0).any():
            raise ValueError(f"{name} contains values less than 0")
    else:
        raise TypeError(f"{name} is not a tensor")


def stream_text(segment, model, tokenizer, generation_config, context):
    global conversation_history
    global CONTENT


def call_llm():
    try:
        user_prompt = input("Enter your prompt (or type 'exit' to quit): ")
        if user_prompt.lower() == "exit":
            return "exit"
        if DEVICE == "cuda":
            with torch.inference_mode():
                max_length = calculate_dynamic_max_length(model)
                segments = segment_input(user_prompt, max_length)
                responses = []
                for segment in segments:
                    docs = db.similarity_search(segment)
                    context = " ".join([doc.page_content for doc in docs])
                    response = stream_text(segment, model, tokenizer, generation_config, context)
                    responses.append(response)
                for response in responses:
                    display(widgets.HTML(f"Chatbot: {response}"))
                flush()
        else:
            print("CPU inference not implemented yet")
    except Exception as ex:
        print(f"An error occurred during text generation: {ex}")
    finally:
        if DEVICE == "cuda":
            print_memory_usage()
        else:
            accelerator.free_memory()
    return user_prompt


def main():
    try:
        while True:
            if call_llm() == "exit":
                break
    except KeyboardInterrupt:
        print("\nExiting the program.")
    print_memory_usage()
    clear_all()


# Set memory allocation settings
set_dynamic_memory_allocation()

if __name__ == "__main__":
    main()
```