OpenNMT / CTranslate2

Fast inference engine for Transformer models
https://opennmt.net/CTranslate2
MIT License

Running Llama3 like a genius (that I am not) [SOLVED]: #1669

Closed by BBC-Esq 5 months ago

BBC-Esq commented 5 months ago

Deleting my initial message because it contained paths on my computer, and my follow-up post below addresses the error I was getting anyway!

BBC-Esq commented 5 months ago

I was finally able to solve the issue. Here are the steps to run the new Llama 3 model.

Step 1 - GET FILES

Get the model from here: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct
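If you prefer to script the download, here is a rough sketch using the huggingface_hub library. It assumes you've already been granted access to the gated repo and are logged in via `huggingface-cli login`; the local path is just a placeholder:

```
from huggingface_hub import snapshot_download

# Placeholder destination folder -- point this wherever you keep your models.
snapshot_download(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    local_dir="Meta-Llama-3-8B-Instruct",
)
```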

Step 2 - PREPARE FILES

Step 3 - CONVERT TO CTRANSLATE2 FORMAT

Convert the model to the ctranslate2 format. You can do this one of two ways:

  1. Use the ctranslate2 converter directly from the command line...NO INSTRUCTIONS are here for that. You'll have to figure that out yourself (though a rough sketch of the command appears after the script below).
  2. Use my script that creates the command for you. It requires installing PySide6, but here is the entire script:
CLICK HERE FOR MY VERY EXCELLENT SCRIPT

```
from PySide6.QtWidgets import (QApplication, QWidget, QPushButton, QLabel, QFileDialog,
                               QMessageBox, QVBoxLayout, QHBoxLayout, QCheckBox, QTextEdit,
                               QStyleFactory)
from PySide6.QtCore import QThread, Signal
import subprocess
import os


class ConversionThread(QThread):
    started = Signal(str, str)
    finished = Signal(str, str)
    error = Signal(str)

    def __init__(self, command, quantization):
        super().__init__()
        self.command = command
        self.quantization = quantization

    def run(self):
        self.started.emit(self.quantization, self.command)
        result = subprocess.run(self.command, shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, text=True)
        if result.returncode == 0:
            self.finished.emit(self.quantization, result.stdout)
        else:
            self.error.emit(f"Command failed with return code {result.returncode}: {result.stderr}")


class App(QWidget):
    def __init__(self):
        super().__init__()
        self.setWindowTitle("chintellalaw.com - for non-commercial use")
        self.model_path = ""
        self.resize(800, 500)

        layout = QVBoxLayout(self)

        browse_layout = QHBoxLayout()
        self.browse_btn = QPushButton("Browse")
        self.browse_btn.clicked.connect(self.browse)
        browse_layout.addWidget(self.browse_btn)
        self.path_label = QLabel("")
        browse_layout.addWidget(self.path_label)
        layout.addLayout(browse_layout)

        self.quantization_options = ["float32", "float16", "bfloat16", "int8_float32",
                                     "int8_float16", "int8_bfloat16", "int8"]
        self.quant_vars = {option: QCheckBox(option) for option in self.quantization_options}
        quant_layout = QHBoxLayout()
        for option, chk in self.quant_vars.items():
            chk.setChecked(False)
            quant_layout.addWidget(chk)
        layout.addLayout(quant_layout)

        self.run_btn = QPushButton("Run")
        self.run_btn.clicked.connect(self.run_conversion)
        layout.addWidget(self.run_btn)

        self.output_text = QTextEdit()
        self.output_text.setReadOnly(True)
        layout.addWidget(self.output_text)

        self.conversion_queue = []
        self.current_conversion = None

    def browse(self):
        path = QFileDialog.getExistingDirectory(self, "Select Model Directory")
        if path:
            self.model_path = path
            self.path_label.setText(path)

    def append_to_text_widget(self, content):
        self.output_text.append(content)

    def run_conversion(self):
        self.output_text.clear()
        self.conversion_queue.clear()
        if not self.model_path:
            QMessageBox.critical(self, "Error", "Please select a model directory.")
            return
        for option, chk in self.quant_vars.items():
            if chk.isChecked():
                self.conversion_queue.append(option)
        self.process_next_conversion()

    def process_next_conversion(self):
        if self.conversion_queue:
            option = self.conversion_queue.pop(0)
            model_dir = self.model_path
            output_dir = os.path.join(os.path.dirname(model_dir),
                                      f'{os.path.basename(model_dir)}-ct2-{option}')
            copy_files = [filename for filename in os.listdir(model_dir)
                          if not filename.endswith(('.bin', '.safetensors'))
                          and filename not in ["config.json", ".git", "coreml", "configs"]]
            if copy_files:
                copy_files_option = ' '.join([f'"{filename}"' for filename in copy_files])
                copy_files_cmd_part = f'--copy_files {copy_files_option} '
            else:
                copy_files_cmd_part = ''
            cmd = (f'ct2-transformers-converter --model "{model_dir}" '
                   f'--output_dir "{output_dir}" '
                   f'--quantization {option} '
                   f'--low_cpu_mem_usage --trust_remote_code '
                   f'{copy_files_cmd_part.strip()}')
            self.current_conversion = ConversionThread(cmd, option)
            self.current_conversion.started.connect(self.on_conversion_started)
            self.current_conversion.finished.connect(self.on_conversion_finished)
            self.current_conversion.error.connect(self.on_conversion_error)
            self.current_conversion.start()
        else:
            self.append_to_text_widget("All selected conversions are complete.")

    def on_conversion_started(self, quantization, command):
        self.append_to_text_widget(f"Starting conversion for {quantization} with command:\n{command}")

    def on_conversion_finished(self, quantization, output):
        completion_message = f"Conversion completed for {quantization}.\n{output}"
        self.append_to_text_widget(completion_message)
        self.process_next_conversion()

    def on_conversion_error(self, error_message):
        self.append_to_text_widget(error_message)


if __name__ == "__main__":
    app = QApplication([])
    app.setStyle(QStyleFactory.create("Fusion"))
    widget = App()
    widget.show()
    app.exec()
```
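For those who go with option 1, the script above ultimately just builds and runs a `ct2-transformers-converter` command along these lines. The paths and quantization are placeholders you'd swap for your own, and `--copy_files` takes whatever non-weight files you want carried over to the output folder:

```
ct2-transformers-converter --model "path/to/Meta-Llama-3-8B-Instruct" --output_dir "path/to/Meta-Llama-3-8B-Instruct-ct2-int8" --quantization int8 --low_cpu_mem_usage --trust_remote_code --copy_files "tokenizer.json" "tokenizer_config.json" "special_tokens_map.json"
```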

If you correctly install PySide6 and run the script, you should see an easy-to-understand GUI that will convert any compatible model. You only need to select the folder containing the model files:

[screenshot of the conversion GUI]

Step 4 - FIX THE STUPID VALUE IN CONFIG.JSON

[screenshot of the config.json value to change]

Step 5 - PROMPT FORMAT

prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_message}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"

Note, this does NOT entail a multi-turn conversation with memory. For that you'll need to consult the Llama 3 GitHub repository for how to continue the prompt format across turns. More importantly, you'll have to construct a ctranslate2 script that can use that prompt format as well as manage the "memory..." @guillikam created a basic script with "memory," albeit for Llama 2, which is in the docs. To reiterate, MY EXAMPLE IS ONLY for a single-turn question - i.e. for RAG-based applications. (A very rough sketch of a multi-turn prompt builder follows below.)
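For what it's worth, here is an untested sketch of how one might extend the same template to multiple turns by replaying prior turns in the header format above. The function name and arguments are my own placeholders, and the official Llama 3 repo is the authority on the exact template (including whether single or double newlines follow each header):

```
# Untested sketch: accumulate (user, assistant) turns into the Llama 3 chat
# template used above. Adjust to match the official template if it differs.
def build_multi_turn_prompt(system_message, history, new_user_message):
    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_message}<|eot_id|>"
    for user_msg, assistant_msg in history:
        prompt += f"<|start_header_id|>user<|end_header_id|>\n{user_msg}<|eot_id|>"
        prompt += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant_msg}<|eot_id|>"
    prompt += f"<|start_header_id|>user<|end_header_id|>\n{new_user_message}<|eot_id|>"
    prompt += "<|start_header_id|>assistant<|end_header_id|>\n"
    return prompt
```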

Step 6 - PROPER TOKENIZATION and PROMPTING

I've complained before about the dearth of examples on how to use stereotypical "chat" models. The only helpful example in the "Docs" was for Falcon, which I adapted. Anyhow, I'll provide my full script below for the benefit of the community, but before that, it helps to understand the main gotcha: ctranslate2's generate_batch works on token strings rather than token ids, so you encode the prompt with the Hugging Face tokenizer and convert the ids to tokens before generating, then decode the returned sequences_ids back to text afterward.
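Here's the bare-bones round trip distilled from my full script below. The model path is a placeholder and the generation options are trimmed to the essentials:

```
import ctranslate2
from transformers import AutoTokenizer

model_dir = "PATH/TO/CTRANSLATE2/MODEL"  # placeholder
generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

prompt = "..."  # built with the template from Step 5

# generate_batch expects token *strings*, not ids:
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))
results = generator.generate_batch([tokens],
                                   max_length=512,
                                   include_prompt_in_result=False,
                                   end_token="<|eot_id|>")

# ...and it returns ids, which you decode back to text:
text = tokenizer.decode(results[0].sequences_ids[0])
print(text)
```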

Step 7 - SAMPLE SCRIPT

Without further ado, here is a sample script. I have used all-caps placeholders for personal information as well as things that depend on your use case:

MY VERY COOL AWESOME VERY DECENT SCRIPT USING LLAMA3

```
import os
import ctranslate2
import gc
import torch
from time import perf_counter
from transformers import AutoTokenizer
import pynvml
import threading
import time

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)

user_prompt = """INSERT HERE YOUR HARDCODED PROMPT...E.G. HERE ARE 5 CONTEXTS AND I ONLY WANT YOU TO ANSWER MY QUESTION BASED OFF OF THEM...AND HERE'S MY QUESTION...AND IF YOU DON'T KNOW SAY YOU DON'T KNOW"""

system_prompt = "You are a helpful assistant who answers questions in a succinct fashion based on the contexts given to you."


def build_prompt_llama_3_8b_instruct():
    prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    return prompt


def poll_vram_usage(stop_event, vram_readings):
    # Samples GPU memory usage every 100 ms until told to stop.
    while not stop_event.is_set():
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        vram_usage = memory_info.used / 1024**2
        vram_readings.append(vram_usage)
        time.sleep(0.1)


def main(num_runs=1):
    start_time = perf_counter()

    model_dir = r"PATH TO THE FOLDER CONTAINING THE CTRANSLATE2 CONVERTED FILES"
    model_name = os.path.basename(model_dir)
    print(f"\033[32mLoading the model: {model_name}...\033[0m")

    load_start_time = perf_counter()
    intra_threads = max(os.cpu_count() - 4, 1)  # reserve a few cores, but never drop below 1
    generator = ctranslate2.Generator(model_dir, device="cuda", compute_type="int8", intra_threads=intra_threads)
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
    load_end_time = perf_counter()
    load_time = load_end_time - load_start_time

    prompt = build_prompt_llama_3_8b_instruct()
    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))

    total_generation_time = 0
    total_tokens_generated = 0
    total_vram_usage = 0

    for i in range(num_runs):
        print(f"\nRun {i+1}:")
        stop_event = threading.Event()
        vram_readings = []
        poll_thread = threading.Thread(target=poll_vram_usage, args=(stop_event, vram_readings))
        poll_thread.start()

        generation_start_time = perf_counter()

        results = generator.generate_batch(
            [tokens],
            include_prompt_in_result=False,  # bool: Include start tokens in the result, default=True
            end_token="<|eot_id|>",
            return_end_token=False,
            max_batch_size=4095,  # int: Maximum batch size, default=0
            batch_type="tokens",  # str: 'examples' or 'tokens', default='examples'
            beam_size=1,  # int: Beam size, 1 for greedy search, default=1
            num_hypotheses=1,  # int: Number of hypotheses to return, default=1
            max_length=512,  # int: Maximum generation length, default=512
            sampling_temperature=1,  # float: Sampling temperature, default=1, not used if not sampling
            sampling_topk=1,  # int: Top K candidates to sample from, default=1, not used if not sampling
            sampling_topp=1,  # float: Cumulative probability cutoff, default=1, not used if not sampling
            # Optional additional parameters with their details:
            # asynchronous=False,  # bool: Run generation asynchronously, default=False
            # patience=1,  # float: Beam search patience factor, default=1
            # length_penalty=1,  # float: Exponential penalty for length in beam search, default=1
            # repetition_penalty=1,  # float: Penalty for repeating tokens, default=1
            # no_repeat_ngram_size=0,  # int: Size of ngram that cannot repeat, set 0 to disable, default=0
            # disable_unk=False,  # bool: Disable generation of unknown token, default=False
            # suppress_sequences=None,  # Optional[List[List[str]]]: Sequences to suppress, default=None
            # end_token=None,  # Optional[Union[str, List[str], List[int]]]: End token for decoding, default=None
            # return_end_token=False,  # bool: Include end token in results, default=False
            # min_length=0,  # int: Minimum generation length, default=0
            # static_prompt=None,  # Optional[List[str]]: Static prompt for the model, default=None
            # cache_static_prompt=True,  # bool: Cache static prompt's model state, default=True
            # return_scores=False,  # bool: Include scores in the output, default=False
            # return_alternatives=False,  # bool: Return alternatives at first unconstrained position, default=False
            # min_alternative_expansion_prob=0,  # float: Min prob to expand an alternative, default=0
            # callback=None,  # Callable: Function called for each generated token when beam_size is 1, default=None
        )

        generation_end_time = perf_counter()
        generation_time = generation_end_time - generation_start_time

        stop_event.set()
        poll_thread.join()

        output = tokenizer.decode(results[0].sequences_ids[0])
        generated_tokens = len(results[0].sequences_ids[0])
        max_vram_usage = max(vram_readings) if vram_readings else 0

        print("\nGenerated response:")
        print(output)
        print(f"\nResponse generation time: {generation_time:.4f} seconds")
        print(f"Generated tokens: {generated_tokens}")
        print(f"Max VRAM Usage: {max_vram_usage:.2f} MB")

        total_generation_time += generation_time
        total_tokens_generated += generated_tokens
        total_vram_usage += max_vram_usage

    average_generation_time = total_generation_time / num_runs
    tokens_per_second = total_tokens_generated / total_generation_time
    average_vram_usage = total_vram_usage / num_runs

    print(f"{model_name}")
    #print(f"\nModel loading time: {load_time:.4f} seconds")
    print(f"Beam size = 1")
    print(f"Average tokens per second: {tokens_per_second:.2f}")
    print(f"Average VRAM Usage: {average_vram_usage:.2f} MB")

    end_time = perf_counter()
    total_time = end_time - start_time
    #print(f"Total execution time: {total_time:.4f} seconds")

    del generator
    gc.collect()
    torch.cuda.empty_cache()


if __name__ == "__main__":
    num_runs = 3
    main(num_runs)
```
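As an aside, if you want tokens streamed to the console as they're generated, my understanding is that the callback parameter commented out above can do it when beam_size is 1. A rough, untested sketch (the handler name is my own; check the ctranslate2 docs for the exact fields on the step result):

```
# Untested sketch: print tokens as they are produced (beam_size must be 1).
# The callback receives a step result for each new token; returning True
# would cancel generation early. Note that raw token strings may contain
# byte-level markers (e.g. "Ġ" for spaces).
def on_token(step_result):
    print(step_result.token, end="", flush=True)
    return False

results = generator.generate_batch(
    [tokens],
    include_prompt_in_result=False,
    end_token="<|eot_id|>",
    beam_size=1,
    max_length=512,
    callback=on_token,
)
```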

I'd love to hear from true experts on ctranslate2 about the proper way to convert Llama 3 or otherwise use it...I'm sure my novice understanding of Python leads to an excessive amount of time spent on running something basic. Thanks!

minhthuc2502 commented 5 months ago

I pushed MR #1671 to fix Llama 3. The only problem was with the unk_token, because it does not exist in the Llama 3 config. I also added a chat.py script, like the one for Llama 2, as an example of using the template with Llama 3.