MaartenGr / KeyBERT

Minimal keyword extraction with BERT
https://MaartenGr.github.io/KeyBERT/
MIT License

Does KeyBERT support openai ada embedding? #188

Closed · fortyfourforty closed this issue 10 months ago

fortyfourforty commented 10 months ago

Does KeyBERT support OpenAI's ada embeddings? I'd like to use OpenAI's ada model for embeddings instead of the free Hugging Face models because of ada's much larger input token limit.

MaartenGr commented 10 months ago

Currently not, but integrating that should be straightforward. Copying from BERTopic, this might just work:

```python
import time
import openai  # note: this uses the pre-1.0 `openai` API (`openai.Embedding.create`)
import numpy as np
from tqdm import tqdm
from typing import List, Mapping, Any
from keybert.backend import BaseEmbedder

class OpenAIBackend(BaseEmbedder):
    """ OpenAI Embedding Model

    Arguments:
        embedding_model: An OpenAI model. Default is "text-embedding-ada-002".
                         For an overview of models see:
                         https://platform.openai.com/docs/models/embeddings
        delay_in_seconds: If a `batch_size` is given, use this to set
                          the delay in seconds between batches.
        batch_size: The size of each batch.
        generator_kwargs: Kwargs passed to `openai.Embedding.create`.
                          Can be used to define custom engines or
                          deployment_ids.

    Examples:

    ```python
    import openai

    openai.api_key = "sk-..."
    openai_embedder = OpenAIBackend("text-embedding-ada-002")
    ```
    """
    def __init__(self,
                 embedding_model: str = "text-embedding-ada-002",
                 delay_in_seconds: float = None,
                 batch_size: int = None,
                 generator_kwargs: Mapping[str, Any] = None):
        super().__init__()
        self.embedding_model = embedding_model
        self.delay_in_seconds = delay_in_seconds
        self.batch_size = batch_size
        # Copy to avoid mutating a caller's (or a shared default) dict
        self.generator_kwargs = dict(generator_kwargs) if generator_kwargs else {}

        # An explicit "model" in `generator_kwargs` takes precedence;
        # otherwise pass `embedding_model` along, unless an "engine"
        # (e.g. an Azure deployment) is used instead
        if self.generator_kwargs.get("model"):
            self.embedding_model = self.generator_kwargs.get("model")
        elif not self.generator_kwargs.get("engine"):
            self.generator_kwargs["model"] = self.embedding_model

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an (n, m)
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/word embeddings with shape (n, m), with `n` documents/words
            that each have an embedding size of `m`
        """
        # Batch-wise embedding extraction
        if self.batch_size is not None:
            embeddings = []
            for batch in tqdm(self._chunks(documents), disable=not verbose):
                response = openai.Embedding.create(input=batch, **self.generator_kwargs)
                embeddings.extend([r["embedding"] for r in response["data"]])

                # Delay subsequent calls
                if self.delay_in_seconds:
                    time.sleep(self.delay_in_seconds)

        # Extract embeddings all at once
        else:
            response = openai.Embedding.create(input=documents, **self.generator_kwargs)
            embeddings = [r["embedding"] for r in response["data"]]
        return np.array(embeddings)

    def _chunks(self, documents):
        """ Split `documents` into consecutive batches of size `batch_size` """
        for i in range(0, len(documents), self.batch_size):
            yield documents[i:i + self.batch_size]
```
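
You should then be able to pass the custom backend to KeyBERT like any other embedding model. A quick, untested sketch (the API key and the example document are placeholders):

```python
import openai
from keybert import KeyBERT

openai.api_key = "sk-..."  # placeholder key

# Pass the custom backend as the embedding model
kw_model = KeyBERT(model=OpenAIBackend("text-embedding-ada-002"))
keywords = kw_model.extract_keywords(
    "KeyBERT is a minimal keyword extraction technique that leverages "
    "BERT embeddings to find the keywords most similar to a document."
)
```

If you run into rate limits on larger corpora, combining `batch_size` with `delay_in_seconds` should keep the requests spaced out.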