werruww opened this issue 3 days ago
On Colab h4:
```python
from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf")

output = quantized_model.generate(
    tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda(),
    min_new_tokens=128,
    max_new_tokens=128,
)
print(tokenizer.decode(output[0]))
```
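Not in the original post, but a quick way to confirm the 2-bit checkpoint actually fits on the Colab GPU after loading (a minimal sketch, assuming a single CUDA device):

```python
import torch

# Rough footprint of the loaded model plus the first generation
print(torch.cuda.get_device_name(0))
print(f"allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GiB")
print(f"reserved:  {torch.cuda.memory_reserved(0) / 1024**3:.2f} GiB")
```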
```python
import json
import textwrap

system_prompt = "You are a helpful assistant. "

def get_prompt(human_prompt):
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    return text.split('USER:', 1)[0]

def parse_text(data):
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)

from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=1200,
    temperature=0.2,
    top_p=0.8,
    do_sample=True,
)

prompt = '''who is python?'''

raw_output = pipe(get_prompt(prompt))
parse_text(raw_output)
```
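As an aside (not from the original notebook): the text-generation pipeline can also return only the completion via `return_full_text=False`, which makes the ASSISTANT-parsing helper optional. A hedged sketch:

```python
# Sketch only: ask the pipeline for just the continuation, not prompt + continuation
short_output = pipe(get_prompt("who is python?"), return_full_text=False)
print(short_output[0]["generated_text"])
```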
https://huggingface.co/ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8
https://github.com/yeyu2/Youtube_demos/blob/main/Mixtral_of_aqlm_transformers.ipynb
https://huggingface.co/docs/transformers/main/en/quantization/aqlm
https://pytorch.org/get-started/previous-versions/
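The first link above is a much smaller AQLM-PV checkpoint; if Mixtral does not fit on the Colab GPU, the same loading pattern should apply (a sketch, with the model id taken from the URL, untested here):

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

small_model_id = "ISTA-DASLab/Llama-3.2-1B-Instruct-AQLM-PV-2Bit-2x8"
small_model = AutoModelForCausalLM.from_pretrained(
    small_model_id,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
small_tokenizer = AutoTokenizer.from_pretrained(small_model_id)
```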
```
%%capture
!pip install "aqlm[gpu]>=1.0.1"
!pip install "accelerate>=0.27.0"
!pip install "transformers>=4.38.0"
```

```
!pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu118
```

```
!pip install aqlm[gpu]==1.0.1
!pip install git+https://github.com/huggingface/accelerate.git@main
!pip install git+https://github.com/BlackSamorez/transformers.git@aqlm
```
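After the installs (and the runtime restart Colab usually asks for after reinstalling torch), a quick way to confirm what actually ended up in the environment (a sketch, not part of the original post):

```python
import importlib.metadata as md
import torch

for pkg in ("torch", "transformers", "accelerate", "aqlm"):
    print(pkg, md.version(pkg))
print("cuda available:", torch.cuda.is_available())
```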
```python
from transformers import AutoTokenizer, AutoModelForCausalLM

quantized_model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf")
```

```python
%%time
output = quantized_model.generate(
    tokenizer("The relationship between humans and AI ", return_tensors="pt")["input_ids"].cuda(),
    min_new_tokens=128,
    max_new_tokens=128,
)
print(tokenizer.decode(output[0]))
```
```python
import json
import textwrap

system_prompt = "A chat between a curious user and a blog writing assistant. "

def get_prompt(human_prompt):
    prompt_template = f"{system_prompt}\n\nUSER: {human_prompt} \nASSISTANT: "
    return prompt_template

def remove_human_text(text):
    return text.split('USER:', 1)[0]

def parse_text(data):
    for item in data:
        text = item['generated_text']
        assistant_text_index = text.find('ASSISTANT:')
        if assistant_text_index != -1:
            assistant_text = text[assistant_text_index + len('ASSISTANT:'):].strip()
            assistant_text = remove_human_text(assistant_text)
            wrapped_text = textwrap.fill(assistant_text, width=100)
            print("#####", wrapped_text)
            return assistant_text
```
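To sanity-check the two helpers without loading the model, they can be exercised on a fake pipeline output (illustrative values only, not from the original notebook):

```python
# Hypothetical pipeline output, just to exercise get_prompt / parse_text
fake_output = [{
    "generated_text": get_prompt("What is AQLM?")
    + "AQLM is a 2-bit additive quantization scheme for LLM weights. \nUSER: thanks"
}]
parse_text(fake_output)  # prints the wrapped ASSISTANT reply and returns it
```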
```python
from transformers import GenerationConfig, pipeline

pipe = pipeline(
    "text-generation",
    model=quantized_model,
    tokenizer=tokenizer,
    max_length=1200,
    temperature=0.7,
    top_p=0.95,
    do_sample=True,
)
```

```python
%%time
prompt = '''Write a short and engaging blog post about travelling in Bohol Island. '''

raw_output = pipe(get_prompt(prompt))
```

```python
parse_text(raw_output)
```
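`GenerationConfig` is imported above but never used; if finer control over sampling is wanted without rebuilding the pipeline, it can be passed to `generate` directly (a sketch, not from the original notebook):

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
)
inputs = tokenizer(get_prompt(prompt), return_tensors="pt").to(quantized_model.device)
output = quantized_model.generate(**inputs, generation_config=gen_config)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```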