The answers from model with batching return different answer even with the same input

Here is my HF format of exllamav2 model

import torch, os
from contextlib import contextmanager
from pathlib import Path
from typing import Optional, List, Union, Dict
from transformers import AutoConfig, PretrainedConfig
from transformers.generation.utils import GenerationMixin, GenerationConfig
from transformers.modeling_outputs import CausalLMOutputWithPast

from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Config, ExLlamaV2Lora

class ExLlamaV2ForCausalLM(GenerationMixin):

    def __init__(
        self,
        config: PretrainedConfig,
        generation_config: GenerationConfig,
        exllama_config: ExLlamaV2Config,
        model: ExLlamaV2,
        loras: Dict[str, ExLlamaV2Lora] = {'': None},
        active_adapter: str = '',
        **kwargs
    ):
        self.config = config
        self.generation_config = generation_config
        self.exllama_config = exllama_config
        self.model = model
        self.loras = loras
        if '' not in self.loras:
            self.loras[''] = None
        self._active_adapter = active_adapter
        self._adapter_enabled = True
        if active_adapter == '':
            self.disable_adapter_layers()

    def can_generate(self):
        return True

    @property
    def device(self) -> torch.device:
        return torch.device(0)

    @property
    def main_input_name(self) -> str:
        return 'input_ids'

    @property
    def active_adapters(self) -> List[str]:
        return [self._active_adapter] if self._adapter_enabled else []

    @property
    def active_adapter(self) -> List[str]:
        return self._active_adapter if self._adapter_enabled else ''

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {'input_ids': input_ids, **kwargs}

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[ExLlamaV2Cache] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_size: int = -1,
        **kwargs
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        loras = self.loras.get(self.active_adapter, None)
        loras = [loras] if loras else loras

        if labels is None:
            if past_key_values is None:
                past_key_values = ExLlamaV2Cache(self.model, input_ids.shape[0], cache_size)
                self.model.forward(input_ids[...,:-1], past_key_values, preprocess_only=True, loras=loras, input_mask=attention_mask)

            logits = self.model.forward(input_ids[...,-1:], past_key_values, loras=loras, input_mask=attention_mask).to(input_ids.device)
        else:
            if past_key_values is None:
                past_key_values = ExLlamaV2Cache(self.model, input_ids.shape[0], cache_size)

            logits = self.model.forward(input_ids, past_key_values, loras=loras, input_mask=attention_mask)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = torch.nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, logits.shape[-1])
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits, past_key_values if use_cache else None)
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values if use_cache else None, loss=loss)

    def load_adapter(self, lora_path: Union[str, os.PathLike], adapter_name: str):

        if adapter_name in self.loras:
            raise ValueError('This adapter is already existed')

        if isinstance(lora_path, str):
            lora_path = Path(lora_path)

        lora_model = ExLlamaV2Lora.from_directory(self.model, lora_path)

        self.loras[adapter_name] = lora_model

    def set_adapter(self, adapter_name: str):

        if adapter_name not in self.loras:
            raise ValueError('The adapter is not existed')

        self._active_adapter = adapter_name

    def enable_adapter_layers(self):

        self._adapter_enabled = True

    def disable_adapter_layers(self):

        self._adapter_enabled = False

    @contextmanager
    def disable_adapter(self):

        try:
            self.disable_adapter_layers()
            yield
        finally:
            self.enable_adapter_layers()

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        gpu_split: Optional[str] = None,
        lora_path: Optional[Union[str, os.PathLike]] = None,
        adapter_name: str = 'default',
        trust_remote_code: bool = False,
        use_flash_attention_2: bool = False
    ):
        if isinstance(pretrained_model_name_or_path, str):
            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)

        if isinstance(lora_path, str):
            lora_path = Path(lora_path)

        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)

        try:
            generation_config = GenerationConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
        except:
            generation_config = GenerationConfig()

        exllama_config = ExLlamaV2Config()
        exllama_config.model_dir = pretrained_model_name_or_path
        exllama_config.no_flash_attn = not use_flash_attention_2
        if getattr(config, 'rope_scaling', None) is not None:
            if config.rope_scaling['type'] == 'linear':
                exllama_config.scale_pos_emb = config.rope_scaling['factor']
            elif config.rope_scaling['type'] == 'dynamic':
                exllama_config.scale_alpha_value = config.rope_scaling['factor']
        exllama_config.prepare()

        model = ExLlamaV2(exllama_config)
        if gpu_split is not None:
            gpu_split = [float(d) for d in gpu_split.split(' ')]
        model.load(gpu_split=gpu_split)

        lora_model = None
        if lora_path is not None:
            lora_model = ExLlamaV2Lora.from_directory(model, lora_path)

        if lora_model is None:
            adapter_name = ''

        return cls(config, generation_config, exllama_config, model, {adapter_name: lora_model}, adapter_name)

    @staticmethod
    def _reorder_cache(past_key_values: ExLlamaV2Cache, beam_idx):

        for i in range(len(past_key_values.key_states)):
            past_key_values.key_states[i] = past_key_values.key_states[i].index_select(0, beam_idx.to(past_key_values.key_states[i].device))
            past_key_values.value_states[i] = past_key_values.value_states[i].index_select(0, beam_idx.to(past_key_values.value_states[i].device))

        return past_key_values

I test it like this:

import torch
from transformers import AutoTokenizer

# load the model and tokenizer
model = ExLlamaV2ForCausalLM.from_pretrained('Llama-2-7b-chat', use_flash_attention_2=True)
tokenizer = AutoTokenizer.from_pretrained('Llama-2-7b-chat')

# make batch of text with same input
text = ['[INST] What is AI? [/INST]', '[INST] What is AI? [/INST]']

inputs = tokenizer(text, return_tensors='pt')

# make generation deterministic
with torch.inference_mode():
    outputs = model.generate(**inputs, do_sample=False)

print(tokenizer.batch_decode(outputs))

Here is the output:

['<s> [INST] What is AI? [/INST]  Artificial intelligence (AI) is the field of study and development of intelligent machines, such as computers, robots, and software, that can perform tasks that typically require human intelligence, such as learning, problem-solving, decision-making, and perception.\n\nAI is a multidisciplinary field that combines techniques from computer science, mathematics, engineering, and cognitive science to create intelligent systems that can interact with the world in a way that is similar to how humans do.\n\nThere are several types of AI, including:\n\n1. Narrow or weak AI: This type of AI is designed to perform a specific task, such as playing chess or recognizing faces. Narrow AI is the most common type of AI and is used in many applications, such as virtual assistants, language translation, and image recognition.\n2. General or strong AI: This type of AI is designed to perform any intellectual task that a human can do. General AI is still a topic of ongoing research and development, but it has the potential to revolutionize many industries, including healthcare, finance, and education.\n3. Superintelligence: This type of AI is significantly more intelligent than the best human minds. Superintelligence could potentially solve complex problems that are currently unsolvable, but it also raises concerns about safety and control.\n4. Artificial general intelligence (AGI): This type of AI is designed to perform any intellectual task that a human can do, without being specifically designed for a particular task. AGI is still a topic of ongoing research and development.\n\nAI has many applications, including:\n\n1. Natural language processing (NLP): AI can analyze and generate human language, allowing for applications such as language translation, sentiment analysis, and chatbots.\n2. Computer vision: AI can analyze and understand visual data from images and videos, allowing for applications such as object recognition, facial recognition, and autonomous vehicles.\n3. Robotics: AI can control and manipulate physical devices, allowing for applications such as robotic manufacturing, robotic surgery, and autonomous robots.\n4. Predictive analytics: AI can analyze and make predictions about future events, allowing for applications such as fraud detection, risk management, and demand forecasting.\n5. Machine learning: AI can learn from data and improve its performance over time, allowing for applications such as recommendation systems, personalized advertising, and autonomous vehicles.\n\nThe potential benefits of AI are numerous, including:\n\n1. Increased productivity: AI can automate routine tasks and free up time for more creative and strategic work.\n2. Improved decision-making: AI can analyze large amounts of data and make decisions based on patterns and trends, leading to better decision-making.\n3. Enhanced customer experience: AI can personalize experiences for customers, leading to increased satisfaction and loyalty.\n4. New business models: AI can enable new business models, such as autonomous vehicles and personalized healthcare.\n5. Scientific breakthroughs: AI can accelerate scientific discovery and lead to new breakthroughs in fields such as medicine, physics, and astronomy.\n\nHowever, AI also raises ethical and societal concerns, including:\n\n1. Job displacement: AI could displace human workers in many industries, leading to job loss and economic disruption.\n2. Bias and discrimination: AI can perpetuate and amplify existing biases and discrimination, leading to unfair outcomes.\n3. Privacy and security: AI can collect and analyze large amounts of personal data, leading to privacy and security concerns.\n4. Autonomous weapons: AI could be used to create autonomous weapons, leading to ethical and legal concerns.\n5. Unintended consequences: AI could have unintended consequences, such as autonomous vehicles causing accidents or AI-generated fake news leading to social unrest.\n\nTo address these concerns, it is important to develop ethical and regulatory frameworks for AI, as well as to invest in education and retraining programs to help workers adapt to the changing job market. Additionally, it is important to ensure that AI is developed and deployed in a responsible and transparent manner, with consideration for the potential consequences and ethical implications.</s>', '<s> [INST] What is AI? [/INST]  Artificial intelligence (AI) is a field of computer science focused on creating intelligent machines that can perform tasks that typically require human intelligence, such as learning, problem-solving, and decision-making.\n\nAI involves the development of algorithms and models that enable machines to learn from data, adapt to new situations, and improve their performance over time. This is achieved through various techniques, including:\n\n1. Machine learning: A subset of AI that involves training machines to learn from data without being explicitly programmed.\n2. Deep learning: A subset of machine learning that uses neural networks to analyze and interpret data.\n3. Natural language processing: A subset of AI that focuses on enabling machines to understand, interpret, and generate human language.\n4. Computer vision: A subset of AI that focuses on enabling machines to interpret and understand visual data from the world around them.\n\nSome examples of AI applications include:\n\n1. Virtual assistants: AI-powered systems that can perform tasks such as scheduling appointments, sending messages, and making recommendations.\n2. Fraud detection: AI-powered systems that can analyze financial transactions to detect and prevent fraudulent activity.\n3. Self-driving cars: AI-powered systems that can analyze sensory data from cameras, radar, and other sensors to navigate and make decisions in real-time.\n4. Personalized recommendations: AI-powered systems that can analyze user behavior and preferences to provide personalized recommendations for products, services, and content.\n5. Predictive maintenance: AI-powered systems that can analyze sensor data from machines to predict when maintenance is required, reducing downtime and improving overall efficiency.\n6. Chatbots: AI-powered systems that can simulate human conversation, either through text or voice interactions, to provide customer service, answer questions, or perform other tasks.\n7. Healthcare: AI can be used to analyze medical images, diagnose diseases, and develop personalized treatment plans.\n8. Finance: AI can be used to analyze financial data, detect fraud, and make investment predictions.\n9. Education: AI can be used to personalize learning, grade assignments, and develop virtual teaching assistants.\n10. Manufacturing: AI can be used to optimize production processes, predict maintenance needs, and improve product quality.\n\nThese are just a few examples of the many ways that AI is transforming industries and improving the way we live and work. As the field of AI continues to evolve, we can expect to see even more innovative applications and advancements in the years to come.</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>']

The answer is quite different for the same question. If I set the temperature or top_k, it's natural the answer will be different. But, setting do_sample=False meaning token chosen based on the maximum value in logits and it's deterministic. But, here the answer is completely different.

After I test to only forward and getting the logits. It seems that the logits is quite different.

Here is how I test it

import torch
from transformers import AutoTokenizer

# load the model and tokenizer
model = ExLlamaV2ForCausalLM.from_pretrained('Llama-2-7b-chat', use_flash_attention_2=True)
tokenizer = AutoTokenizer.from_pretrained('Llama-2-7b-chat')

# make batch of text with same input
text = ['[INST] What is AI? [/INST]', '[INST] What is AI? [/INST]']

inputs = tokenizer(text, return_tensors='pt').to('cuda')

with torch.inference_mode():
    outputs = model(**inputs)

print(outputs.logits)

Here is the result of logits

tensor([[[-5.5977, -7.8750,  5.0234,  ..., -3.3066, -4.6328, -0.5127]],

        [[-5.5938, -7.8125,  5.0352,  ..., -3.2676, -4.6250, -0.5039]]],
       device='cuda:0', dtype=torch.float16)

Even after doing like #200 which set use_flash_attention_2=False, the logits still differ. Here is the logits after disable flash attention 2

tensor([[[-5.6289, -7.8828,  4.9375,  ..., -3.2871, -4.6719, -0.4893]],

        [[-5.5898, -7.8242,  4.9727,  ..., -3.2773, -4.6758, -0.5430]]],
       device='cuda:0', dtype=torch.float16)

Possible same problem with #135

Short answer: The results you're getting are expected because the implementation isn't 100% deterministic.

Now, please forgive the following rant, but since this question comes up often I may as well explain it in detail so I have something to refer to next time.

What it comes down to is the use of atomic operations in the GEMV-oriented matmul kernels, the fact that CUDA's thread launch order is nondeterministic and the fact that floating-point addition is non-associative. Essentially (a+b)+c != a+(b+c) for floating point types, and if you compute a+b+c across threads in CUDA using atomic addition, you can't know if the result is going to be (a+b)+c or a+(b+c). The two results are often the same but they can sometimes differ by a tiny amount due to rounding of the intermediate result.

Now, after accumulating millions and millions of those tiny errors, one forward pass might produce a distribution of [0.700, 0.104, 0.102...] while the same forward pass next time around could give you [0.702, 0.101, 0.100...]. Both would be correct to the precision of FP16, but they wouldn't be identical bit-for-bit.

Then, all you need is one binomial sample that's right on the boundary between two tokens, say, sampling to a cutoff of 0.701 in that example, and your output sequences are now different and diverging. Or you could end up with distributions where the top two tokens are practically the same probability, and the random jitter flips their order, so even greedy sampling can give diverging results. This seems likely in your example since the divergence starts at the point of Artificial intelligence (AI) is followed by either a or the, both of which seem like they would be highly preferred tokens, quite possibly scoring roughly the same and becoming a "tipping point" in the autoregressive sequence.

As for what to do about it, well, you can either do inference in FP32 while rounding to FP16 at deterministic points along the way, hoping that the FP32 error never accumulates to the point that it would flip a bit in the nearest FP16 value, or you can use deterministic (reduction-based) matmul approaches. Both of those hurt performance, so there's no immediate support for them in ExLlama.

The third option is to question the need for determinism in the first place. Consider this:

If you're already using quantized models it's clearly not perfect precision you're after. The problem with nondeterminism can't be that it doesn't give you "the precisely correct result" because that went out the window anyway when you distilled the model down to the most important 25% of its bits.
If you get two slightly different outputs from the same input, there's no sense in which either of the two outputs is the correct one, unless you arbitrarily define an order of additions to be canonical and then write/select CUDA kernels to always follow that order. But then the resulting output is still only correct by an arbitrary definition which depends on specifics you're probably not concerned with, such as which kernel (or set of kernels) you end up launching by calling torch.matmul() in Python or cublasHgemm in C++. It can even change with your CUDA version or your hardware architecture. From the cuBLAS API docs:

By design, all cuBLAS API routines from a given toolkit version, generate the same bit-wise results at every run when executed on GPUs with the same architecture and the same number of SMs. However, bit-wise reproducibility is not guaranteed across toolkit versions because the implementation might differ due to some implementation changes.

What is it that you actually achieve by passing the same input twice instead of just duplicating the output? Consider if the implementation secretly did this to "fake" being deterministic, i.e., imagine it cached the result of each forward pass using a database of hashed inputs and whatever was the first output non-deterministically computed for each of them. Is there any conceivable way you could detect that the determinism was "faked" in this way, if you just had the outputs to go by?

Personally, I would argue that good testing methodologies in this context need to be robust to noise anyway. Yes, bitwise-identical outputs are a useful proxy for functionally identical implementations, as a way to quickly verify that you didn't make any subtle mistakes by having a huge, chaotic-dynamic computation arrive at the exact same result as a reference implementation.

But it doesn't tell you anything more than that. As soon as you switch to a different GPU, or you split your tensors across multiple GPUs, or you update to a new version of PyTorch, your could get any other output from the set of possible outputs that are all correct to within the precision of FP16. If you're getting Artificial intelligence (AI) is the field of study... on your system, a user of whatever application you're building might still see Artificial intelligence (AI) is a field of computer science... with the exact same prompt, only because they're two weeks behind on nightly builds of PyTorch.

And even within a single, deterministic context, causal LM is still causal which makes it is chaotic and sensitive to minute changes in initial conditions: Try the same sequence with an extra space at the beginning and watch it take a completely different turn regardless of precision and determinism in the underlying framework.

All in all, while it would be nice to add determinism as an option, it would either degrade performance just for the sake of test cases like yours, or it would be a switchable option that doesn't actually prove anything about the correctness of the other, non-deterministic code path.

Again, sorry for the rant. But since your observation is correct and your concern is entirely valid I feel it deserves a full explanation. I'd happily entertain discussion if anyone should disagree.

I ask this because this thing never occurred in exllama version 1. Kinda puzzled me this "problem" occurred in version 2.

Greedy search from the name itself is a deterministic which is when the result is not deterministic feels weird. Then what is the purporse of sampling method if greedy is also kind of sampling? One of the usages of deterministic greedy search in my case is to check if lora is applied correctly or not. Because if lora applied correctly, the answer with lora and without lora would be slightly different for question that is not in dataset for lora finetuning. Deterministic greedy search is also for debugging my LLM API for checking if the model is loaded and generate "correctly" for my purpose.

I'm sorry if my question might offend you.

First of all, I adore your implementation and it's clear that certain optimizations come with a cost.

However, I believe there actually might be some issues or the extent of optimization deserves an option for reduction (We can try to make a PR if you point me in the right direction). llama.cpp fully offloaded to GPU has much less variance and an additional determinism issue can be pretty much completely resolved by changing one line from using 8 CUDE threads to 1. (i.e. pointing to the code you need to rebuild it yourself).

Example of first 3 tokens for the same prompt: Top 5 values: tensor([17.3906, 13.8125, 13.0938, 12.4141, 12.4141], device='cuda:0') Top 5 probs: tensor([0.9370, 0.0262, 0.0128, 0.0065, 0.0065], device='cuda:0')

Top 5 values: tensor([19.0156, 17.9531, 16.3125, 11.7891, 10.9922], device='cuda:0') Top 5 probs: tensor([7.0688e-01, 2.4429e-01, 4.7358e-02, 5.1391e-04, 2.3164e-04], device='cuda:0')

Top 5 values: tensor([21.6875, 21.6719, 19.8906, 17.4375, 16.9844], device='cuda:0') Top 5 probs: tensor([0.4503, 0.4433, 0.0747, 0.0064, 0.0041], device='cuda:0')

Top 5 values: tensor([17.5625, 13.7031, 13.0859, 12.6094, 12.4375], device='cuda:0') Top 5 probs: tensor([0.9470, 0.0200, 0.0108, 0.0067, 0.0056], device='cuda:0')

Top 5 values: tensor([19.1094, 18.1094, 16.3281, 11.8438, 10.8984], device='cuda:0') Top 5 probs: tensor([6.9844e-01, 2.5694e-01, 4.3276e-02, 4.8833e-04, 1.8974e-04], device='cuda:0')

Top 5 values: tensor([21.6875, 21.6719, 19.9531, 17.5312, 17.0625], device='cuda:0') Top 5 probs: tensor([0.4473, 0.4404, 0.0790, 0.0070, 0.0044], device='cuda:0')

Top 5 values: tensor([17.5156, 13.9297, 13.2422, 12.4688, 12.4219], device='cuda:0') Top 5 probs: tensor([0.9396, 0.0260, 0.0131, 0.0060, 0.0058], device='cuda:0')

Top 5 values: tensor([19.0938, 17.8750, 16.3281, 11.8516, 10.8984], device='cuda:0') Top 5 probs: tensor([7.3504e-01, 2.1728e-01, 4.6261e-02, 5.2610e-04, 2.0283e-04], device='cuda:0')

Top 5 values: tensor([21.7344, 21.7188, 19.9375, 17.5312, 17.0312], device='cuda:0') Top 5 probs: tensor([0.4504, 0.4434, 0.0747, 0.0067, 0.0041], device='cuda:0')

Top 5 values: tensor([17.4531, 13.9141, 13.0938, 12.4531, 12.3281], device='cuda:0') Top 5 probs: tensor([0.9388, 0.0273, 0.0120, 0.0063, 0.0056], device='cuda:0')

Top 5 values: tensor([19.0781, 17.9688, 16.3750, 11.8594, 10.9375], device='cuda:0') Top 5 probs: tensor([7.1493e-01, 2.3576e-01, 4.7897e-02, 5.2384e-04, 2.0837e-04], device='cuda:0')

Top 5 values: tensor([21.7656, 21.6875, 20.0000, 17.5000, 16.9844], device='cuda:0') Top 5 probs: tensor([0.4622, 0.4275, 0.0791, 0.0065, 0.0039], device='cuda:0')

Top 5 values: tensor([17.4688, 13.7656, 13.3125, 12.4609, 12.3828], device='cuda:0') Top 5 probs: tensor([0.9400, 0.0232, 0.0147, 0.0063, 0.0058], device='cuda:0')

Top 5 values: tensor([19.0625, 18.0156, 16.2656, 11.8516, 10.8359], device='cuda:0') Top 5 probs: tensor([7.0720e-01, 2.4825e-01, 4.3140e-02, 5.2224e-04, 1.8914e-04], device='cuda:0')

Top 5 values: tensor([21.7031, 21.6562, 19.9688, 17.4062, 17.0469], device='cuda:0') Top 5 probs: tensor([0.4542, 0.4334, 0.0802, 0.0062, 0.0043], device='cuda:0')

It seems like there is some factor that will make logits more off, than I noticed when quantizing from 8bit to 4bit.

I created a variance of the HumanEval benchmark using HumanEvalFix from the OctoCoder paper, where the prompt has buggy code, and test cases and asks to fix the code. I have further changed the tests so that all variables and function names are renamed to reduce recall of the certainly leaked benchmark.

Running this test in llama.cpp with Phind v2 CodeLlama 34B Q5_K_M you will always get 26.8% (full GPU offload, no change to CUDA threads)

In the latest Exllama commit: {'pass@1': 0.2926829268292683} {'pass@1': 0.2621951219512195} {'pass@1': 0.2621951219512195} {'pass@1': 0.2621951219512195} {'pass@1': 0.2804878048780488} {'pass@1': 0.2621951219512195} {'pass@1': 0.27439024390243905} {'pass@1': 0.2804878048780488} {'pass@1': 0.2804878048780488} {'pass@1': 0.2865853658536585} {'pass@1': 0.2621951219512195} {'pass@1': 0.25} {'pass@1': 0.27439024390243905} {'pass@1': 0.2926829268292683} {'pass@1': 0.2804878048780488} {'pass@1': 0.25609756097560976} {'pass@1': 0.2682926829268293} {'pass@1': 0.2621951219512195}

Simplifying: 9 x 25/26% 9 x 27/28/29%

While working on a finetune I hope to get 2% more in this benchmark (GPT4 has around 44%), while this benchmark might not be great I think it shows a certain degree of variance that might impact for example function calling.

@fahadh4ilyas I am certainly not offended. :) But V1 was definitely not deterministic either. Bunch of threads about the issue too.

As for sampling, it still changes how the output distribution is used. Without sampling, the top-scoring logit entirely dictates what the next token will be, even if the jitter can cause what the top token is to change in certain situations (specifically when there are two tokens at the top with near-equal probability.) Nondeterminism will not introduce a 10% chance of selecting a logit that has a 10% score after the softmax, but sampling will. It may cause the model to be nudged onto a different autoregressive path if it reaches a tipping point where it was a 50.1%/49.9% choice to begin with.

I think there's a bit of a misconception that people have about determinism. I see it here, for instance:

Because if lora applied correctly, the answer with lora and without lora would be slightly different for question that is not in dataset for lora finetuning.

This is not correct. Even with determinism, anything that affects the hidden state at any point, even slightly, will give you a different answer some number of tokens down the line, since the output always feeds back on the input via the keys/value carried over from token to token, and then ultimately through the embeddings once you reach a tipping point and end up selecting a different token than you otherwise would have.

The way to tell if the LoRA is being applied is with some instrumentation of the forward pass (or a debugger), and to tell if it's working as intended, and this is the key point, you need a testing method that's robust to the chaotic dynamics of causal language modeling, even in a deterministic framework. That same methodology will also be robust to nondeterminism.

@krzysiekpodk Suppose I added an option like that. You can now run ExLlama in a slower mode where you get a guaranteed result of 27.4% on your test, instead of the fast mode where results differ by +/- 2%. Now what? Are you going to use it in the slow mode going forward? If not, what does it help you to know that you could have run it in a mode where the test always gave a score of 27.4%, while you are actually using it in a mode where result is the 27.4% +/- 2%?

I would even say there's a case to be made for determinism giving you a false sense of stability here. Cause if you do get 27.4% on your test in deterministic mode, that doesn't mean other people will be able to reproduce that result. They may be using a different CUDA version, for instance, or a different GPU, or a different driver or whatever. And as that variance shows, their results could be anywhere from 25.4% to 29.4%.

Just out of interest, what happens if you change some variable names slightly and run the test again in llama.cpp? Maybe insert an extra linefeed or something, a bit of indentation, or something else that shouldn't matter to the substance of the test problems--do you still get the exact same score? My guess is you wouldn't, because of the same butterfly effect.

Now, I wouldn't mind suggestions on how to improve reproducibility overall. Just remember that for test purposes you can already do more deterministic but slower inference in Transformers, for instance. You can also set MAX_Q_GEMM_ROWS to 0 in exllamav2/exllamav2_ext/config.h, which will disable the quantized matmul and give you tighter but 75% slower results. Or you can play around with GPTQ_BLOCK_KN_SIZE and EXL2_BLOCK_KN_SIZE which directly influence the number of partial sums produced by the quantized matmul (larger blocks -> fewer atomicAdds -> less variance, presumably). Feel free to experiment and/or contribute. Just know that I don't want two different code paths to maintain as long as the only use case for one of them is already covered by other frameworks. It needs to be a reasonable tradeoff between precision and speed.

@turboderp The weird thing is, I already load my API to many kind of GPU (A5000, 4080, 4090, A6000, A100) and using different cuda version (11 and 12), different pytorch version (2.0 and 2.1), and even different model. But, everytime I set do_sample=False, the answer is never change when I use exllama v1.

Why this is an issue for me is I can not recreate something when one of my user has some problem with their generation. When I'm using exllama v1, whenever the user said that the result of generation is "bad", I ask for their prompt and test it myself first with do_sample=False and then with sampling. There I can know what is wrong. Maybe their prompt or maybe the way I make the API.

Also because my API is using queue which mean two or more prompt from different user can be generated in one model simultaneously in batch, I don't want my user will get unexpected answer because of batching with another prompt (which I never had the problem using exllama v1).

Just out of interest, what happens if you change some variable names slightly and run the test again in llama.cpp? Maybe insert an extra linefeed or something, a bit of indentation, or something else that shouldn't matter to the substance of the test problems--do you still get the exact same score? My guess is you wouldn't, because of the same butterfly effect.

This is not an apple to apple comparation. Changing variable or insert different input is of course will got you a different answer. But, it's something that you do consciously, so you do expect the different score. But, not changing anything and giving an exact input but getting a completely different answer is not something to be expected.

@turboderp I think this topic will always come back unless you decide/document what is the level of variance you allow for the sake of better performance.

If you state that results are much more random, but performance is SOTA - it's a valid use case and then different frameworks will be for different use cases. I love ExllamaV2 and wanted t use it for everything I do instead of changing backends.

In coding, agent, function calling/tools use cases, the current level of variance is simply a no-go. It's not a matter of one or two logits that are off where "both are valid" as a comma in the wrong place makes a whole generation worthless. Let's say, I'm asking a model to write me a snake game or how to update docker configuration - if model fails to do so more than 9 out of 10 times I simply throw it out.

In the above use cases it takes a considerable amount of time to test prompts, generations, settings, quantization levels and models themselves to get it right in my case if I would need to test 4 presets, 3/4 quantizations, across 100 prompts AND run it just 10 times to get average is 16k generations for a simple check while I need to do dozens, in this case it is no longer possible using consumer compute.

I tested the block size up to 512 after which it crashes and I don't see any difference. Then I even did a topk on 3 top logits and if the difference between 2 (optionally 3) is less or equal to 0.5, I'm overwriting them with the same value and always taking the rightmost token (i.e. higher token id among those that are almost equal. )

Even with as severe flattening as above I still got some variance (there was break-point where variance was 0.7 between top logits) ran the tests again with that hacky token pick and the whole performance degraded by 33%.

You also mention that adding a space or small change in the prompt will make a similar variance - it's not the case: using CoT and huge prompt vs. zero-shot makes no more than 4% difference.

With yesterday's testing, I think this is not just about greedy decoding, but in general its an issue for models where the distribution of probability of very top tokens is detrimental to model performance. The current quality of coding models, unfortunately, is like that.

Another point, that I might not have made clear above is that the benefits of higher quants are completely diminishing - I think I mentioned it in another issue some time ago. It's also easy to reproduce - take any 4bit and 8 bit quant and run HumanEval few times (which might be more of a recall rather than coding capabilities, but still) You won't be able to tell them apart from scoring.

Perhaps I can better illustrate with an example. Take this bit of code:

torch.manual_seed(0)
a = torch.randn((100,), dtype = torch.half)

def element_sum(x, deterministic):
    if not deterministic:
        x = x[torch.randperm(x.shape[0])]
    accum = torch.zeros((1,), dtype = x.dtype)
    for i in range(x.shape[0]):
        accum += x[i]
    return accum.item()

print("Deterministic:")
for _ in range(5):
    print(element_sum(a, deterministic = True))

print()
print("Nondeterministic:")
for _ in range(5):
    print(element_sum(a, deterministic = False))

Here, we take an arbitrary list of half-precision values and compute their sum, but in two different ways. The first version adds the elements deterministically, while the second randomizes the order.

Deterministic:
6.2734375
6.2734375
6.2734375
6.2734375
6.2734375

Nondeterministic:
6.265625
6.24609375
6.2734375
6.27734375
6.2734375

The crucial thing to note is that we haven't given up any precision by shuffling the elements around in the nondeterministic version. Every sum here is still the sum of the same 100 elements. The order used when deterministic = True isn't more correct than the order used when deterministic = False: it's just one fixed choice among the 100! possible ways to sum up 100 numbers. And it's not more precise, either. While sticking to one (arbitrary) permutation causes you to lose precision in a deterministic way, you're still accumulating rounding errors with every step.

for i in range(5):
    d = i / 100
    print(element_sum(a + d, deterministic = True) - d * a.shape[0])

Here we're adding a small, varying amount to every element but then subtracting the total of what we added from the sum of the elements. Despite using the deterministic sum function, we get different results. This is because determinism doesn't ensure predictability or any kind of stability at all. All it implies is that when your inputs are exactly the same, so is the output. If that's a desirable property, we might even achieve it like so:

cache = []
def cached_sum(x):
    global cache
    for (k, v) in cache:
        if torch.eq(x, k).all(): return v
    v = element_sum(x, deterministic = False)
    cache.append((x, v))
    return v

for _ in range(5):
    print(cached_sum(a))

And now we get the same result every time. The takeaway should be that determinism is a red herring. Precision is the real issue, and all determinism does is mask any loss of precision.

Of course, I'm always working on ways to make the implementation both faster and more precise. I'd take any help I can get in that department. But to judge whether FP16 math is precise enough you need to also consider what problem we're trying to solve here. We've already given up the raw precision of the original model by quantizing it. We're not even storing quantized weights, technically, but quantized solutions to a reconstruction problem in which we assume some relatively tiny amount of calibration data is a suitable proxy for the entire trillion-token dataset the model was originally built from. The whole premise of quantization is that this is okay as long as the quantized model performs well enough in the end.

But well enough as determined by which benchmark? And compared to what, exactly? The original model already has chaotic behavior, even without quantization or noise induced by FP16 inference, and even with a fully deterministic forward pass. Try this random online demo of Mistral, and ask these two questions with top-K=1:

If I have four apples and two oranges and one more apple and then another apple, how many apples is that? If I have four apples and two oranges and one more apple and then another apple... how many apples is that?

You'll get two different answers (six or seven). And the problem isn't that one of the prompts was bad, it's that the hidden state is bouncing around the model's embedding space like a plinko ball, and the board just doesn't have the shape it needs to reliably guide the ball into the right hole. Which is another way of saying this model can't count. And more precision doesn't help, nor does determinism (although determinism will allow you to dial in the exact initial conditions to get any desired outcome, but that tells you nothing about how well the model performs generally.)

When it comes to HumanEval, I don't think I quite follow what you mean. The 4-bit model's completions aren't going to be directly comparable one-to-one with those of the 8-bit model. They're going to be different attempts, effectively running with different seeds no matter what you do. But it's not that much of a concern since the standard is (I believe?) to run every prompt 200 times. This still leaves some uncertainty, of course, but hey, even GPT4 doesn't score consistently on HumanEval to my knowledge, with results varying by something like +/- 1%. GPT4 is also nondeterministic, incidentally.

Of course, one other reason why you can't tell the 4-bit results from the 8-bit results might be that they're just very similar scores. I did some tests on Mistral-instruct earlier:

.	pass@1	pass@10
4.0 bpw	0.258537	0.524154
6.0 bpw	0.265122	0.537700
FP16	0.264634	0.548656

I only ran 25 samples per task since that already takes like an hour per run, and I didn't use any prompt formatting, which is likely why the scores don't line up with this other test I referenced. But they're in the ballpark, and most importantly the relationship between quantized and FP16 versions correlates nicely with the difference in perplexity, and the impact on MMLU scores.

It kinda just is what it is. I keep working to improve it, and greater precision may be somewhere on the roadmap, but I have to target the metrics that matter.

I did some tests on Mistral-instruct earlier:

Was that Mistral 7B instruct v0.2? If so, it's amazing how well 4.0 bpw performs (when compared to FP16) with such a small model. Until now, it's always been the case that 7B models suffer greatly from quantization below 6bpw

@turboderp Great explanation and I think it covers the topic, the only reason why I thought it worth a follow-up is because of llama.cpp fully offloaded is much more consistent.

I have not been testing Mistral or anything less than 33/34B as they are way too weak for actual coding daily use.

This topic has a few parts - technical feasibility - hard, and the benefits of slower inference might not provide any real benefits, you really covered this part very well.

The second part is use case feasibility - there is a need for true greedy decoding like in transformers, and I will try to explain on it:

It might not look like that here, but I'm one of the people who always debate that models should be run with high temp and low top p and top k, but for engineering new stuff, I cannot afford to use my regular settings - I only pick a really good candidate based on greedy results and check if they follow through in real life.

Our end goal is to get consistently good enough replies (i.e. that the Median of replies is good - I don't understand why average is being measured tbh.) - i.e. I'm working on a script, fixing code, generating boiler code that "always"(almost) compiles out of the box.

Or tap into the maximum, best output of the model for a given prompt even if it takes many generations (i.e. agents)

3rd category I'm interested in is assistant/workflow use - this requires reproducible results as if you got a todo item created instead of sending an email is a big deal, caching won't help as every prompt is different.

Not to even mention the current main production use case of all LLMs - RAG. Creating an RAG pipeline and configuring it for the best results takes literally hundreds of iterations, every change in dataset, chunking&ranking strategy, prompt etc. is driving completely different results

Anyway, you also highlighted one of the issues I have - running full test of 164 generations can already take an hour, doing it, even pass @ 10? No longer feasible for a user like me with 48GB VRAM. Testing RAG pipeline without greedy? Nearly impossible

Another thing - We are all in a very active experimental phase, while doing a finetune for a day I would like to be able to tell whether I continue after an hour of testing instead of another day,

Regarding your results in HE and what I meant by 4bit & 8bit - in the scenarios above, in all my testing, there are no benefits in using higher quant over 5bit, even more - using a tailored calibration dataset will almost certainly give you more benefits on average. However every so often on 8bit you will get much better replies. My theory is that while models are quantized recall is not hurt that much (and it could be seen in perplexity) as much are reasoning capabilities. (i.e. this could be visible while asking very vague, poorly written, moderately challenging questions)

The test that I'm currently running is hard even for GPT-4 (44%) I only ran it few times but I didn't see any variance yet.

Today I ran deepseek 33b:

{'pass@1': 0.3719512195121951} {'pass@1': 0.36585365853658536} {'pass@1': 0.34146341463414637} {'pass@1': 0.34146341463414637} {'pass@1': 0.3597560975609756} {'pass@1': 0.35365853658536583}

It is much more consistent than Phind v2, and I'm not surprised you will not see that much variance in Mixtral as I see in codellama derivates.

Currently, I'm working on Contrastive decoding of two fine-tuned models and I'm doing sweeps of two attributes for adjustments: in transformers, it's extremely slow as there is no way to get logits fast (I need to generate 1 token). Llama.cpp was consistent enough, but I was going OOM with batching, and a silly bug wasted 2 months of 24h/7 heating of my flat for various testing. I have to redo everything now and I really hoped to use Exllama but even if I need to run each configuration more than 2/3 times it's unfortunately faster with llama.cpp (which also has some very weird issues with caching in my case)

To summarize - IMO there is a need for greedy decoding for local lab experiments, the worse model is (or more challenging task) the bigger issue with inconsistency. In the past, my usual workflow was to test models in Transformers in greedy and then move with candidates to Exllama which still seems like the only option (on second thought if computation in f32 with doubled cache size, would help exllama would be still the fastest)

@krzysiekpodk From your explanation, It seems that exllamaV2 may not be the best fit for your testing phase. If you feel confident that llama.cpp is more consistent for your use case, then you should stick to it. Just my 2 cents.

At the same time, as @turboderp has explained - chasing "determinism" (when it cannot be guaranteed in any way due to CUDA's floating point quirks) is a red herring. Especially if it comes at the cost of sacrificing performance. I and others believe that @turboderp 's effort is better spent continuing to make exllamaV2 the fastest inference engine while consuming the least amount of VRAM (for the same quality) and so far he has done a great job at it. If that weren't the case, we wouldn't be here now discussing it. :D It's a project that has become so popular only due to the amazing results it gives and the way it currently works.

I've been waiting for this issue to be addressed for a long time, and it seems that there's finally a clear explanation. Can I understand it as trading accuracy for speed? https://github.com/turboderp/exllamav2/issues/135#issue-1967796294

Thanks for the explanations about output determinism @turboderp, very clear and insightful and answers all of the questions I had. As others have mentioned in the thread, if use-case requires reproducibility perhaps other engines might serve better. I think a lot of use-cases might actually be better-served with a degree of fuzziness, where aggregating the outcomes might be useful for the task (e.g. wider range of outputs, insight towards the model and how to prompt it better, perhaps to give clues on what to finetune, etc).

You can't really guarantee reproducibility with other frameworks, either. Not in any strict sense.

Even if you're only considering a single GPU architecture and you have perfectly defined input IDs, they won't evaluate the same at batch size 1 as they do at batch size 2. I.e. if the sequence "Hello, my name is" always produces the next token "Jim", because the model implementation is "perfectly deterministic", it does not follow that a batch of ["Hello, my name is", "Hello, my name is"] will produce the tokens ["Jim", "Jim"].

I'm not aware of any frameworks that would actually guarantee determinism. Some are more deterministic than others, yes, if you're creating an application or doing research that somehow relies on determinism, I think you took a wrong turn somewhere. And in any case you'll want to avoid CUDA.

turboderp / exllamav2

The answers from model with batching return different answer even with the same input #232