OpenNMT / CTranslate2

Fast inference engine for Transformer models
https://opennmt.net/CTranslate2
MIT License

Running Llama3 like a genius (that I am not) [SOLVED]: #1669

Closed by BBC-Esq 5 months ago

BBC-Esq commented 5 months ago

Deleting my initial message because it contained paths on my computer, and my follow-up post below addresses the error I was getting anyway!

BBC-Esq commented 5 months ago

I was finally able to solve the issue. Here are the steps to run the new Llama 3 model.

Step 1 - GET FILES

Get the model from here: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
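If you prefer to script the download, here is a rough sketch using the huggingface_hub library. It assumes you've already been granted access to the gated repo and are logged in via `huggingface-cli login`; the local path is just a placeholder:

```
from huggingface_hub import snapshot_download

# Placeholder destination folder -- point this wherever you keep your models.
snapshot_download(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    local_dir="Meta-Llama-3-8B-Instruct",
)
```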

Step 2 - PREPARE FILES

Step 3 - CONVERT TO CTRANSLATE2 FORMAT

Convert the model to the ctranslate2 format. You can do this one of two ways:

  1. Use the ctranslate2 converter directly from the command line...NO INSTRUCTIONS are here for that. You'll have to figure that out yourself (though a rough sketch of the command appears after the script below).
  2. Use my script that creates the command for you. It requires installing PySide6, but here is the entire script:
CLICK HERE FOR MY VERY EXCELLENT SCRIPT

```
from PySide6.QtWidgets import (QApplication, QWidget, QPushButton, QLabel, QFileDialog,
                               QMessageBox, QVBoxLayout, QHBoxLayout, QCheckBox, QTextEdit,
                               QStyleFactory)
from PySide6.QtCore import QThread, Signal
import subprocess
import os


class ConversionThread(QThread):
    started = Signal(str, str)
    finished = Signal(str, str)
    error = Signal(str)

    def __init__(self, command, quantization):
        super().__init__()
        self.command = command
        self.quantization = quantization

    def run(self):
        self.started.emit(self.quantization, self.command)
        result = subprocess.run(self.command, shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, text=True)
        if result.returncode == 0:
            self.finished.emit(self.quantization, result.stdout)
        else:
            self.error.emit(f"Command failed with return code {result.returncode}: {result.stderr}")


class App(QWidget):
    def __init__(self):
        super().__init__()
        self.setWindowTitle("chintellalaw.com - for non-commercial use")
        self.model_path = ""
        self.resize(800, 500)

        layout = QVBoxLayout(self)

        browse_layout = QHBoxLayout()
        self.browse_btn = QPushButton("Browse")
        self.browse_btn.clicked.connect(self.browse)
        browse_layout.addWidget(self.browse_btn)
        self.path_label = QLabel("")
        browse_layout.addWidget(self.path_label)
        layout.addLayout(browse_layout)

        self.quantization_options = ["float32", "float16", "bfloat16", "int8_float32",
                                     "int8_float16", "int8_bfloat16", "int8"]
        self.quant_vars = {option: QCheckBox(option) for option in self.quantization_options}
        quant_layout = QHBoxLayout()
        for option, chk in self.quant_vars.items():
            chk.setChecked(False)
            quant_layout.addWidget(chk)
        layout.addLayout(quant_layout)

        self.run_btn = QPushButton("Run")
        self.run_btn.clicked.connect(self.run_conversion)
        layout.addWidget(self.run_btn)

        self.output_text = QTextEdit()
        self.output_text.setReadOnly(True)
        layout.addWidget(self.output_text)

        self.conversion_queue = []
        self.current_conversion = None

    def browse(self):
        path = QFileDialog.getExistingDirectory(self, "Select Model Directory")
        if path:
            self.model_path = path
            self.path_label.setText(path)

    def append_to_text_widget(self, content):
        self.output_text.append(content)

    def run_conversion(self):
        self.output_text.clear()
        self.conversion_queue.clear()
        if not self.model_path:
            QMessageBox.critical(self, "Error", "Please select a model directory.")
            return
        for option, chk in self.quant_vars.items():
            if chk.isChecked():
                self.conversion_queue.append(option)
        self.process_next_conversion()

    def process_next_conversion(self):
        if self.conversion_queue:
            option = self.conversion_queue.pop(0)
            model_dir = self.model_path
            output_dir = os.path.join(os.path.dirname(model_dir),
                                      f'{os.path.basename(model_dir)}-ct2-{option}')
            copy_files = [filename for filename in os.listdir(model_dir)
                          if not filename.endswith(('.bin', '.safetensors'))
                          and filename not in ["config.json", ".git", "coreml", "configs"]]
            if copy_files:
                copy_files_option = ' '.join([f'"{filename}"' for filename in copy_files])
                copy_files_cmd_part = f'--copy_files {copy_files_option} '
            else:
                copy_files_cmd_part = ''
            cmd = (f'ct2-transformers-converter --model "{model_dir}" '
                   f'--output_dir "{output_dir}" '
                   f'--quantization {option} '
                   f'--low_cpu_mem_usage --trust_remote_code '
                   f'{copy_files_cmd_part.strip()}')
            self.current_conversion = ConversionThread(cmd, option)
            self.current_conversion.started.connect(self.on_conversion_started)
            self.current_conversion.finished.connect(self.on_conversion_finished)
            self.current_conversion.error.connect(self.on_conversion_error)
            self.current_conversion.start()
        else:
            self.append_to_text_widget("All selected conversions are complete.")

    def on_conversion_started(self, quantization, command):
        self.append_to_text_widget(f"Starting conversion for {quantization} with command:\n{command}")

    def on_conversion_finished(self, quantization, output):
        completion_message = f"Conversion completed for {quantization}.\n{output}"
        self.append_to_text_widget(completion_message)
        self.process_next_conversion()

    def on_conversion_error(self, error_message):
        self.append_to_text_widget(error_message)


if __name__ == "__main__":
    app = QApplication([])
    app.setStyle(QStyleFactory.create("Fusion"))
    widget = App()
    widget.show()
    app.exec()
```
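For those who go with option 1, the script above ultimately just builds and runs a `ct2-transformers-converter` command along these lines. The paths and quantization are placeholders you'd swap for your own, and `--copy_files` takes whatever non-weight files you want carried over to the output folder:

```
ct2-transformers-converter --model "path/to/Meta-Llama-3-8B-Instruct" --output_dir "path/to/Meta-Llama-3-8B-Instruct-ct2-int8" --quantization int8 --low_cpu_mem_usage --trust_remote_code --copy_files "tokenizer.json" "tokenizer_config.json" "special_tokens_map.json"
```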

If you correctly install PySide6 and run the script, you should see an easy-to-understand GUI that will convert any compatible model. You only need to select the folder containing the model files:

[screenshot of the conversion GUI]

Step 4 - FIX THE STUPID VALUE IN CONFIG.JSON

[screenshot of the config.json value to change]

Step 5 - PROMPT FORMAT

prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"

Note, this does NOT entail a multi-turn conversation with memory. For that you'll need to consult the Llama 3 GitHub repository for how to continue the prompt format across turns. More importantly, you'll have to construct a ctranslate2 script that can use that prompt format as well as manage the "memory..." @guillikam created a basic script with "memory," albeit for Llama 2, which is in the docs. To reiterate, MY EXAMPLE IS ONLY for a single-turn question - i.e. for RAG-based applications. (A very rough sketch of a multi-turn prompt builder follows below.)
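For what it's worth, here is an untested sketch of how one might extend the same template to multiple turns by replaying prior turns in the header format above. The function name and arguments are my own placeholders, and the official Llama 3 repo is the authority on the exact template (including whether single or double newlines follow each header):

```
# Untested sketch: accumulate (user, assistant) turns into the Llama 3 chat
# template used above. Adjust to match the official template if it differs.
def build_multi_turn_prompt(system_message, history, new_user_message):
    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_message}<|eot_id|>"
    for user_msg, assistant_msg in history:
        prompt += f"<|start_header_id|>user<|end_header_id|>\n{user_msg}<|eot_id|>"
        prompt += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant_msg}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n{new_user_message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n"
    return prompt
```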

Step 6 - PROPER TOKENIZATION and PROMPTING

I've complained before about the dearth of examples on how to use stereotypical "chat" models. The only helpful example in the "Docs" was for Falcon, which I adapted. Anyhow, I'll provide my full script below for the benefit of the community, but before that, it helps to understand the main gotcha: ctranslate2's generate_batch works on token strings rather than token ids, so you encode the prompt with the Hugging Face tokenizer and convert the ids to tokens before generating, then decode the returned sequences_ids back to text afterward.
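Here's the bare-bones round trip distilled from my full script below. The model path is a placeholder and the generation options are trimmed to the essentials:

```
import ctranslate2
from transformers import AutoTokenizer

model_dir = "PATH/TO/CTRANSLATE2/MODEL"  # placeholder
generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

prompt = "..."  # built with the template from Step 5

# generate_batch expects token *strings*, not ids:
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))
results = generator.generate_batch([tokens],
                                   max_length=512,
                                   include_prompt_in_result=False,
                                   end_token="<|eot_id|>")

# ...and it returns ids, which you decode back to text:
text = tokenizer.decode(results[0].sequences_ids[0])
print(text)
```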

Step 7 - SAMPLE SCRIPT

Without further ado, here is a sample script. I have used all-caps placeholders for personal information as well as things that depend on your use case:

MY VERY COOL AWESOME VERY DECENT SCRIPT USING LLAMA3

```
import os
import ctranslate2
import gc
import torch
from time import perf_counter
from transformers import AutoTokenizer
import pynvml
import threading
import time

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

user_prompt = """INSERT HERE YOUR HARDCODED PROMPT...E.G. HERE ARE 5 CONTEXTS AND I ONLY WANT YOU TO ANSWER MY QUESTION BASED OFF OF THEM...AND HERE'S MY QUESTION...AND IF YOU DON'T KNOW SAY YOU DON'T KNOW"""

system_prompt = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."


def build_prompt_llama_3_8b_instruct():
    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    return prompt


def poll_vram_usage(stop_event, vram_readings):
    # Samples GPU memory usage every 100 ms until told to stop.
    while not stop_event.is_set():
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        vram_usage = memory_info.used / 1024**2
        vram_readings.append(vram_usage)
        time.sleep(0.1)


def main(num_runs=1):
    start_time = perf_counter()

    model_dir = r"PATH TO THE FOLDER CONTAINING THE CTRANSLATE2 CONVERTED FILES"
    model_name = os.path.basename(model_dir)
    print(f"\033[32mLoading the model: {model_name}...\033[0m")

    load_start_time = perf_counter()
    intra_threads = max(os.cpu_count() - 4, 1)  # reserve a few cores, but never drop below 1
    generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    load_end_time = perf_counter()
    load_time = load_end_time - load_start_time

    prompt = build_prompt_llama_3_8b_instruct()
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

    total_generation_time = 0
    total_tokens_generated = 0
    total_vram_usage = 0

    for i in range(num_runs):
        print(f"\nRun {i+1}:")
        stop_event = threading.Event()
        vram_readings = []
        poll_thread = threading.Thread(target=poll_vram_usage, args=(stop_event, vram_readings))
        poll_thread.start()

        generation_start_time = perf_counter()

        results = generator.generate_batch(
            [tokens],
            include_prompt_in_result=False,  # bool: Include start tokens in the result, default=True
            end_token="<|eot_id|>",
            return_end_token=False,
            max_batch_size=4095,  # int: Maximum batch size, default=0
            batch_type="tokens",  # str: 'examples' or 'tokens', default='examples'
            beam_size=1,  # int: Beam size, 1 for greedy search, default=1
            num_hypotheses=1,  # int: Number of hypotheses to return, default=1
            max_length=512,  # int: Maximum generation length, default=512
            sampling_temperature=1,  # float: Sampling temperature, default=1, not used if not sampling
            sampling_topk=1,  # int: Top K candidates to sample from, default=1, not used if not sampling
            sampling_topp=1,  # float: Cumulative probability cutoff, default=1, not used if not sampling
            # Optional additional parameters with their details:
            # asynchronous=False,  # bool: Run generation asynchronously, default=False
            # patience=1,  # float: Beam search patience factor, default=1
            # length_penalty=1,  # float: Exponential penalty for length in beam search, default=1
            # repetition_penalty=1,  # float: Penalty for repeating tokens, default=1
            # no_repeat_ngram_size=0,  # int: Size of ngram that cannot repeat, set 0 to disable, default=0
            # disable_unk=False,  # bool: Disable generation of unknown token, default=False
            # suppress_sequences=None,  # Optional[List[List[str]]]: Sequences to suppress, default=None
            # end_token=None,  # Optional[Union[str, List[str], List[int]]]: End token for decoding, default=None
            # return_end_token=False,  # bool: Include end token in results, default=False
            # min_length=0,  # int: Minimum generation length, default=0
            # static_prompt=None,  # Optional[List[str]]: Static prompt for the model, default=None
            # cache_static_prompt=True,  # bool: Cache static prompt's model state, default=True
            # return_scores=False,  # bool: Include scores in the output, default=False
            # return_alternatives=False,  # bool: Return alternatives at first unconstrained position, default=False
            # min_alternative_expansion_prob=0,  # float: Min prob to expand an alternative, default=0
            # callback=None,  # Callable: Function called for each generated token when beam_size is 1, default=None
        )

        generation_end_time = perf_counter()
        generation_time = generation_end_time - generation_start_time

        stop_event.set()
        poll_thread.join()

        output = tokenizer.decode(results[0].sequences_ids[0])
        generated_tokens = len(results[0].sequences_ids[0])
        max_vram_usage = max(vram_readings) if vram_readings else 0

        print("\nGenerated response:")
        print(output)
        print(f"\nResponse generation time: {generation_time:.4f} seconds")
        print(f"Generated tokens: {generated_tokens}")
        print(f"Max VRAM Usage: {max_vram_usage:.2f} MB")

        total_generation_time += generation_time
        total_tokens_generated += generated_tokens
        total_vram_usage += max_vram_usage

    average_generation_time = total_generation_time / num_runs
    tokens_per_second = total_tokens_generated / total_generation_time
    average_vram_usage = total_vram_usage / num_runs

    print(f"{model_name}")
    #print(f"\nModel loading time: {load_time:.4f} seconds")
    print(f"Beam size = 1")
    print(f"Average tokens per second: {tokens_per_second:.2f}")
    print(f"Average VRAM Usage: {average_vram_usage:.2f} MB")

    end_time = perf_counter()
    total_time = end_time - start_time
    #print(f"Total execution time: {total_time:.4f} seconds")

    del generator
    gc.collect()
    torch.cuda.empty_cache()


if __name__ == "__main__":
    num_runs = 3
    main(num_runs)
```
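As an aside, if you want tokens streamed to the console as they're generated, my understanding is that the callback parameter commented out above can do it when beam_size is 1. A rough, untested sketch (the handler name is my own; check the ctranslate2 docs for the exact fields on the step result):

```
# Untested sketch: print tokens as they are produced (beam_size must be 1).
# The callback receives a step result for each new token; returning True
# would cancel generation early. Note that raw token strings may contain
# byte-level markers (e.g. "Ġ" for spaces).
def on_token(step_result):
    print(step_result.token, end="", flush=True)
    return False

results = generator.generate_batch(
    [tokens],
    include_prompt_in_result=False,
    end_token="<|eot_id|>",
    beam_size=1,
    max_length=512,
    callback=on_token,
)
```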

I'd love to hear from true experts on ctranslate2 about the proper way to convert Llama 3 or otherwise use it...I'm sure my novice understanding of Python leads to an excessive amount of time spent on running something basic. Thanks!

minhthuc2502 commented 5 months ago

I pushed MR #1671 to fix Llama 3. The only problem was with the unk_token, because it does not exist in the Llama 3 config. I also added a chat.py script, like the one for Llama 2, as an example of using the template with Llama 3.