oobabooga / text-generation-webui

A Gradio web UI for Large Language Models.

TypeError: apply_rep_penalty() #3611

Closed krypterro closed 1 year ago

krypterro commented 1 year ago

Describe the bug

Running Llama-2 Chat 70B GPTQ with the ExLlama loader. It was working fine until I updated to the latest version; now I'm getting this error, but only when using the API. I'm using a trimmed-down version of the API example, and I'm not sure what could be causing it.

Traceback (most recent call last):
  File "/home/zino/oobabooga/text-generation-webui/modules/text_generation.py", line 344, in generate_reply_custom
    for reply in shared.model.generate_with_streaming(question, state):
  File "/home/zino/oobabooga/text-generation-webui/modules/exllama.py", line 126, in generate_with_streaming
    token = self.generator.gen_single_token()
  File "/home/zino/oobabooga/installer_files/env/lib/python3.10/site-packages/exllama/generator.py", line 353, in gen_single_token
    self.apply_rep_penalty(logits)
  File "/home/zino/oobabooga/installer_files/env/lib/python3.10/site-packages/exllama/generator.py", line 335, in apply_rep_penalty
    cuda_ext.ext_apply_rep_penalty_mask_cpu(self.sequence,
  File "/home/zino/oobabooga/installer_files/env/lib/python3.10/site-packages/exllama/cuda_ext.py", line 110, in ext_apply_rep_penalty_mask_cpu
    apply_rep_penalty(sequence, penalty_max, sustain, decay, logits)
TypeError: apply_rep_penalty(): incompatible function arguments. The following argument types are supported:
    1. (arg0: torch.Tensor, arg1: float, arg2: int, arg3: int, arg4: torch.Tensor) -> None

Invoked with: tensor([], size=(1, 0), dtype=torch.int64), 1.01, -1, 128, None
Output generated in 0.00 seconds (0.00 tokens/s, 0 tokens, context 650, seed 1529046018)
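
Reading the traceback, the compiled extension expects five concrete arguments (tensor, float, int, int, tensor), while the "Invoked with:" line shows the final logits argument arriving as None and the sequence tensor being empty. A minimal reconstruction of those arguments, based only on the traceback above and not on exllama internals:

import torch

# Arguments exactly as shown in the "Invoked with:" line.
sequence = torch.empty((1, 0), dtype=torch.int64)  # empty (1, 0) int64 tensor
penalty_max = 1.01
sustain = -1
decay = 128
logits = None  # the extension's fifth parameter must be a torch.Tensor

# The binding is declared as
#   (arg0: Tensor, arg1: float, arg2: int, arg3: int, arg4: Tensor) -> None
# so a None logits value is rejected with the "incompatible function
# arguments" TypeError seen above.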

Is there an existing issue for this?

Reproduction

Install, run upgrade_linux.sh, use the API.

API function:

def call_ai_chat(self, prompt, history):
        history_null = {'visible': [], 'internal': []}
        request = {
            'user_input': prompt,
            'max_new_tokens': 4096,
            'history': history_null,
            'mode': 'chat',  # Valid options: 'chat', 'chat-instruct', 'instruct'
            'character': 'Ava',
            'instruction_template': 'Llama-v2',
            'your_name': 'User',
            'regenerate': False,
            '_continue': False,
            'stop_at_newline': False,
            'chat_generation_attempts': 1,
            'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
            'preset': 'Ava',
            'do_sample': True,
            'temperature': 0.87,
            'no_repeat_ngram_size': 0,
            'seed': -1,
            'add_bos_token': True,
            'truncation_length': 2048,
            'ban_eos_token': False,
            'skip_special_tokens': True,
            'stopping_strings': [],
            'turn_template':  "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> </s><s>[INST] "
        }

        logger.debug("writing json")
        filename = "debug/request.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(request, f, ensure_ascii=False, indent=4)

        logger.info("engaging LLM")
        response = requests.post(self.uri, json=request)

        filename = "debug/response.txt"
        with open(filename, 'w') as f:
            f.write(str(response.text))

        if response.status_code == 200:
            result = response.json()['results'][0]['history']
            print(json.dumps(result, indent=4))
            print()
            reply = result['visible'][-1][1]
        else:
            reply = "error"

        return reply

Screenshot

No response

Logs

NA

System Info

Ubuntu 22.04 LTS
72 Cores
256GB DDR4 RAM
48GB VRAM
GPU Nvidia A40
krypterro commented 1 year ago

A new API request seems to have fixed the issue; something between the top function and this one is the problem (see the payload comparison after the function below):

def call_ai_chat(self, prompt, history):
        history_null = {'visible': [], 'internal': []} 
        request = {
            'user_input': prompt,
            'max_new_tokens': 512,
            'auto_max_new_tokens': False,
            'history': history_null,
            'mode': 'chat',  # Valid options: 'chat', 'chat-instruct', 'instruct'
            'character': 'Ava',
            'instruction_template': 'Llama-v2',  # Will get autodetected if unset
            'your_name': 'HUMAN',
            'preset': 'Ava',
            'temperature': 0.87,
            # 'name1': 'name of user', # Optional
            # 'name2': 'name of character', # Optional
            # 'context': 'character context', # Optional
            # 'greeting': 'greeting', # Optional
            # 'name1_instruct': 'You', # Optional
            # 'name2_instruct': 'Assistant', # Optional
            # 'context_instruct': 'context_instruct', # Optional
            # 'turn_template': 'turn_template', # Optional
            'regenerate': False,
            '_continue': False,
            'chat_instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
            'seed': -1,
            'add_bos_token': True,
            'truncation_length': 2048,
            'ban_eos_token': False,
            'skip_special_tokens': True,
            'stopping_strings': [],
        }
        logger.debug("writing json")
        filename = "debug/request.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(request, f, ensure_ascii=False, indent=4)
        logger.info("engaging LLM")
        response = requests.post(self.uri, json=request)
        filename = "debug/response.txt"
        with open(filename, 'w') as f:
            f.write(str(response.text))
        if response.status_code == 200:
            result = response.json()['results'][0]['history']
            print(json.dumps(result, indent=4))
            print()
            reply = result['visible'][-1][1]
        else:
            reply = "error"
        return reply
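
For anyone comparing the two payloads, the differences can be pulled directly from the two request dicts above. A small helper like this (hypothetical; not part of the webui or its API examples) prints the keys that were removed, added, or changed:

def diff_requests(old: dict, new: dict) -> None:
    # Keys present only in the failing request.
    for key in sorted(old.keys() - new.keys()):
        print(f"removed: {key!r} (was {old[key]!r})")
    # Keys present only in the working request.
    for key in sorted(new.keys() - old.keys()):
        print(f"added:   {key!r} = {new[key]!r}")
    # Keys present in both but with different values.
    for key in sorted(old.keys() & new.keys()):
        if old[key] != new[key]:
            print(f"changed: {key!r}: {old[key]!r} -> {new[key]!r}")

Applied to the two dicts above, the working request drops 'turn_template', 'stop_at_newline', 'chat_generation_attempts', 'do_sample', and 'no_repeat_ngram_size', replaces 'chat-instruct_command' with 'chat_instruct_command', adds 'auto_max_new_tokens', lowers 'max_new_tokens' from 4096 to 512, and changes 'your_name' from 'User' to 'HUMAN'.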