marella / ctransformers

Python bindings for the Transformer models implemented in C/C++ using GGML library.

Integrating with outlines #91

Open harryjulian opened 1 year ago

harryjulian commented 1 year ago

I'd love to use ctransformers with the outlines library for constrained generation. I opened this issue about it on their repo.

To hack away at this integration, the forward pass and raw logit output of the model would need to be exposed in the ctransformers API. Having looked through ctransformers/llm.py, it seems the .sample method, which wraps ctransformers_llm_sample from the C API, is currently the only way to sample tokens from ctransformers models.

Would there be any scope for adding a .forward or .sample_logits method? How would I go about implementing it if there were?

marella commented 1 year ago

You can get the logits using the llm.logits property. To implement forward, you can use the low-level llm.eval() method. I have done some work on this in the past to make ctransformers a drop-in replacement for 🤗 transformers models (see https://github.com/marella/ctransformers/issues/13#issuecomment-1597662836); here is the code for reference. Recently I created a better version of it but haven't pushed it yet. Here is a sample from the newer version:

import torch
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput

class Model(PreTrainedModel):
    def __init__(self, config: PretrainedConfig, llm):
        config.vocab_size = llm.vocab_size
        config.eos_token_id = llm.eos_token_id
        config.pad_token_id = llm.eos_token_id
        super().__init__(config)
        self._llm = llm
        self._past = []

    def prepare_inputs_for_generation(
        self,
        input_ids,
        attention_mask=None,
        **kwargs,
    ):
        return {"input_ids": input_ids}

    def forward(
        self,
        input_ids=None,
        return_dict=None,
        **kwargs,
    ):
        llm = self._llm
        tokens = input_ids.flatten().tolist()
        n_past = len(self._past)
        if tokens[:n_past] == self._past:
            # The new tokens extend the previously evaluated sequence,
            # so only the new suffix needs to be evaluated.
            self._past = tokens
            tokens = tokens[n_past:]
        else:
            # The prefix changed, so reset the model state and
            # re-evaluate the whole sequence.
            self._past = tokens
            llm.reset()
        llm.eval(tokens)
        logits = torch.tensor(llm.logits).reshape([1, 1, -1])
        if not return_dict:
            return (logits,)
        return CausalLMOutput(logits=logits)

    @property
    def device(self) -> torch.device:
        return torch.device("cpu")

It can be used as:

from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(...)
model = Model(PretrainedConfig(), llm)

But the API is not finalized and may change by the time I release it.
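
Since Model subclasses PreTrainedModel and defines prepare_inputs_for_generation, it should in principle also work with the regular 🤗 generate() API. A rough sketch (the tokenizer repo name below is just a placeholder for whichever original HF model matches your GGML file):

from transformers import AutoTokenizer

# Placeholder repo: use the original (non-GGML) HF model that matches your file.
tokenizer = AutoTokenizer.from_pretrained("original-model-repo")

input_ids = tokenizer("Hello", return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_new_tokens=16)
print(tokenizer.decode(output_ids[0]))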

TingTingin commented 1 year ago

I'm trying to use ctransformers with outlines. I used this code:

from ctransformers import AutoModelForCausalLM
from ctransformers.llm import LLM, get
import torch
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput
import outlines.text.generate as generate
import outlines.models as models
from typing import List, Optional, Union

from transformers import GenerationConfig, LogitsProcessorList, StoppingCriteriaList
from transformers.generation import SampleDecoderOnlyOutput
from transformers.generation.streamers import BaseStreamer

class Model(PreTrainedModel):
    def __init__(self, config: PretrainedConfig, llm):
        config.vocab_size = llm.vocab_size
        config.eos_token_id = llm.eos_token_id
        config.pad_token_id = llm.eos_token_id
        super().__init__(config)
        self._llm = llm
        self._past = []

    def prepare_inputs_for_generation(
        self,
        input_ids,
        attention_mask=None,
        **kwargs,
    ):
        return {"input_ids": input_ids}

    def forward(
        self,
        input_ids=None,
        return_dict=None,
        **kwargs,
    ):
        llm = self._llm
        tokens = input_ids.flatten().tolist()
        n_past = len(self._past)
        if tokens[:n_past] == self._past:
            self._past = tokens
            tokens = tokens[n_past:]
        else:
            self._past = tokens
            llm.reset()
        llm.eval(tokens)
        logits = torch.tensor(llm.logits).reshape([1, 1, -1])
        if not return_dict:
            return (logits,)
        return CausalLMOutput(logits=logits)

    @property
    def device(self) -> torch.device:
        return torch.device("cpu")

llm = AutoModelForCausalLM.from_pretrained(r"openorca-platypus2-13b.ggmlv3.q4_K_S.bin",
                                           model_type='llama',
                                           gpu_layers=30,
                                           context_length=4096)
model = Model(PretrainedConfig(), llm)
answer = generate.choice(model, ["Positive", "Negative"])(prompt)

I tried that and got this error: AttributeError: 'Model' object has no attribute 'tokenizer'

So I added this:

class Tokenizer:
    def __init__(self, llm: LLM) -> None:
        self._llm = llm
        self.vocab_size = llm.vocab_size
        self.eos_token_id = llm.eos_token_id
        self.eos_token = llm.detokenize(self.eos_token_id) or "</s>"  # TODO
        self.max_sequence_length = llm.context_length
        # self.vocabulary = llm.get_vocabulary()

    def encode(self, text: str) -> List[int]:
        return self._llm.tokenize(text)

    def decode(
        self,
        token_ids: Union[int, List[int], torch.Tensor],
    ) -> str:
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()
        return self._llm.detokenize(token_ids)

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]]
    ) -> Union[str, List[str]]:
        if isinstance(ids, int):
            return self.decode(ids)
        else:
            return [self.decode(id) for id in ids]

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def convert_tokens_to_ids(
        self, tokens: Union[str, List[str]]
    ) -> Union[int, List[int]]:
        index = 1 if self._llm.model_type == "llama" else 0
        if tokens is None:
            return None
        elif isinstance(tokens, str):
            return self.encode(tokens)[index]
        else:
            return [self.encode(token)[index] for token in tokens]

model.tokenizer = Tokenizer(llm)

And got this error: AttributeError: 'Tokenizer' object has no attribute 'pad_token_id'. Did you mean: 'eos_token_id'?

So I added this:

model.tokenizer.pad_token_id = model.tokenizer.eos_token_id

And got this error: AttributeError: 'Tokenizer' object has no attribute 'vocabulary'

However, I can't seem to find a way to get the vocabulary of an LLM; the only method I found was vocab_size, so I'm not sure how to proceed.
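
The closest workaround I can think of (definitely not an official ctransformers API, and the detokenized pieces probably won't match the original HF tokenizer's byte/space handling exactly) is to rebuild an approximate vocabulary from vocab_size:

# Approximate vocabulary: map each detokenized piece back to its token id.
# Slow for large vocabularies and lossy for byte-level tokens, so treat it as a hack.
vocabulary = {llm.detokenize(i): i for i in range(llm.vocab_size)}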

marella commented 1 year ago

Looks like outlines' generate needs custom model and tokenizer objects, which can be created using its Transformers and TransformersTokenizer classes. I recommend using the original HF tokenizer to simplify things:

model = Model(PretrainedConfig(), llm) # ctransformers model

from outlines.models.transformers import Transformers, TransformersTokenizer

tokenizer = TransformersTokenizer("Open-Orca/OpenOrca-Platypus2-13B") # change based on the model you are using for ctransformers
model = Transformers(model=model, tokenizer=tokenizer) # outlines model

answer = generate.choice(model, ["Positive", "Negative"])(prompt)

TingTingin commented 1 year ago

Thanks, it seems to be working now. For anyone wondering, this is the full code:

from ctransformers import AutoModelForCausalLM
import torch
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutput
from outlines.models.transformers import Transformers, TransformersTokenizer
import outlines.text.generate as generate

class Model(PreTrainedModel):
    def __init__(self, config: PretrainedConfig, llm):
        config.vocab_size = llm.vocab_size
        config.eos_token_id = llm.eos_token_id
        config.pad_token_id = llm.eos_token_id
        super().__init__(config)
        self._llm = llm
        self._past = []

    def prepare_inputs_for_generation(
        self,
        input_ids,
        attention_mask=None,
        **kwargs,
    ):
        return {"input_ids": input_ids}

    def forward(
        self,
        input_ids=None,
        return_dict=None,
        **kwargs,
    ):
        llm = self._llm
        tokens = input_ids.flatten().tolist()
        n_past = len(self._past)
        if tokens[:n_past] == self._past:
            self._past = tokens
            tokens = tokens[n_past:]
        else:
            self._past = tokens
            llm.reset()
        llm.eval(tokens)
        logits = torch.tensor(llm.logits).reshape([1, 1, -1])
        if not return_dict:
            return (logits,)
        return CausalLMOutput(logits=logits)

    @property
    def device(self) -> torch.device:
        return torch.device("cpu")

llm = AutoModelForCausalLM.from_pretrained(r"C:\Users\Kaman\Downloads\openorca-platypus2-13b.ggmlv3.q4_K_S.bin",
                                           model_type='llama',
                                           gpu_layers=30,
                                           context_length=4096)
model = Model(PretrainedConfig(), llm)
tokenizer = TransformersTokenizer("Open-Orca/OpenOrca-Platypus2-13B")
model = Transformers(model=model, tokenizer=tokenizer)

prompt = """User: You are a sentiment-labeling assistant label this review 
Review: This restaurant was very bad! <|end_of_turn|>
Assistant: 
"""

answer = generate.choice(model, ["Positive", "Negative"])(prompt)
print(answer)

Ozennefr commented 10 months ago

For outlines 0.0.9, the forward pass should return a CausalLMOutputWithPast object instead of a CausalLMOutput. A drop-in replacement works, although more tweaks should allow for significant speedups.
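
A rough sketch of that change, reusing the forward from the snippets above (only the import and return type differ; past_key_values is left as None because the state lives inside the GGML model):

import torch
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast

class Model(PreTrainedModel):
    # __init__ and prepare_inputs_for_generation unchanged from the snippets above

    def forward(
        self,
        input_ids=None,
        return_dict=None,
        **kwargs,
    ):
        llm = self._llm
        tokens = input_ids.flatten().tolist()
        n_past = len(self._past)
        if tokens[:n_past] == self._past:
            self._past = tokens
            tokens = tokens[n_past:]
        else:
            self._past = tokens
            llm.reset()
        llm.eval(tokens)
        logits = torch.tensor(llm.logits).reshape([1, 1, -1])
        if not return_dict:
            return (logits,)
        # past_key_values stays None: the KV cache is managed inside the GGML model.
        return CausalLMOutputWithPast(logits=logits)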