Vahe1994 / AQLM

Official PyTorch repository for "Extreme Compression of Large Language Models via Additive Quantization" (https://arxiv.org/pdf/2401.06118.pdf) and "PV-Tuning: Beyond Straight-Through Estimation for Extreme LLM Compression" (https://arxiv.org/abs/2405.14852)
Apache License 2.0

It runs #149

Open werruww opened 3 days ago

werruww commented 3 days ago

https://huggingface.co/ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8
https://github.com/yeyu2/Youtube_demos/blob/main/Mixtral_of_aqlm_transformers.ipynb
https://huggingface.co/docs/transformers/main/en/quantization/aqlm
https://pytorch.org/get-started/previous-versions/
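For reference, the Llama-3.2-1B-Instruct AQLM-PV checkpoint linked above can be loaded with the same standard transformers recipe shown in the docs link; a minimal sketch (only the repo id comes from the link above, the rest is the usual AutoModel API):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8"  # repo linked above
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# quick smoke test: encode a prompt, generate a short continuation, decode it
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))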

%%capture
# quotes keep the shell from treating ">=" as output redirection
!pip install "aqlm[gpu]>=1.0.1"
!pip install "accelerate>=0.27.0"
!pip install "transformers>=4.38.0"

!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu118

!pip install "aqlm[gpu]==1.0.1"
!pip install git+https://github.com/huggingface/accelerate.git@main
!pip install git+https://github.com/BlackSamorez/transformers.git@aqlm
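Before loading the model it can help to confirm that the expected versions actually got installed and that a CUDA GPU is visible in the Colab runtime; a small sanity-check sketch (nothing here is specific to this issue):

import torch
from importlib.metadata import version

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
for pkg in ("aqlm", "accelerate", "transformers"):
    print(pkg, version(pkg))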

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf")

%%time
output = quantized_model.generate(
    tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda(),
    min_new_tokens=128,
    max_new_tokens=128,
)
print(tokenizer.decode(output[0]))
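To print only the continuation rather than the prompt plus continuation, the prompt tokens can be sliced off before decoding; a minimal sketch reusing the `tokenizer` and `quantized_model` objects from the cell above:

input_ids = tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda()
output = quantized_model.generate(input_ids, min_new_tokens=128, max_new_tokens=128)
# keep only the newly generated tokens
new_tokens = output[0][input_ids.shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))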

import json
import textwrap

system_prompt = "A chat between a curious user and an blog writing assistant. "

def get_prompt(human_prompt):
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    return text.split('USER:', 1)[0]

def parse_text(data):
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
    return assistant_text

from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=1200,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
)

%%time
prompt = '''Write a short and engaging blog post about travelling in Bohol Island. '''
raw_output = pipe(get_prompt(prompt))

parse_text(raw_output)
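For context, the text-generation pipeline returns a list of dicts with a 'generated_text' key, which is what parse_text iterates over; a tiny illustration with hypothetical output:

sample_output = [{
    "generated_text": "A chat between a curious user and a blog writing assistant. "
                      "\n\nUSER: Write a short blog post. \nASSISTANT: Bohol is a gem of the Philippines..."
}]
# prints the wrapped assistant text and returns it
print(parse_text(sample_output))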

werruww commented 3 days ago

on colab h4

werruww commented 3 days ago

from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf")

output = quantized_model.generate(
    tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda(),
    min_new_tokens=128,
    max_new_tokens=128,
)
print(tokenizer.decode(output[0]))

import json
import textwrap

system_prompt = "You are a helpful assistant. "

def get_prompt(human_prompt):
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    return text.split('USER:', 1)[0]

def parse_text(data):
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
    return assistant_text

from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=1200,
    temperature=0.2,
    top_p=0.8,
    do_sample=True,
)

prompt = '''who is python?'''

raw_output = pipe(get_prompt(prompt))

parse_text(raw_output)

werruww commented 3 days ago

from https://github.com/yeyu2/Youtube_demos/blob/main/Mixtral_of_aqlm_transformers.ipynb