Closed simonw closed 5 months ago
files-to-prompt example.txt ../llm-nomic-api-embed/llm_nomic_api_embed.py | \
llm -m pro15 --system \
'write a llm_gemini_embed.py plugin based on llm_nomic_api_embed.py but using the API demonstrated in example.txt'
Gemini Pro 1.5 returned:
from llm import EmbeddingModel, Model, hookimpl
import llm
import httpx
@hookimpl
def register_embedding_models(register):
    """Register the Gemini ``text-embedding-004`` model, aliased as ``gemini``."""
    # The same string serves as both the registered model ID and the
    # model name sent to the Gemini API.
    gemini_model = GeminiEmbeddingModel("text-embedding-004", "text-embedding-004")
    register(gemini_model, aliases=("gemini",))
class GeminiEmbeddingModel(EmbeddingModel):
    """Embedding model backed by the Gemini ``batchEmbedContents`` REST endpoint."""

    needs_key = "gemini"
    key_env_var = "GEMINI_API_KEY"

    def __init__(self, model_id, gemini_model_id):
        """Store the registered model ID and the Gemini API model name.

        Args:
            model_id: Identifier this model is registered under.
            gemini_model_id: Model name used in the request URL and body.
        """
        self.model_id = model_id
        self.gemini_model_id = gemini_model_id

    def embed_batch(self, items):
        """Embed a batch of texts with a single ``batchEmbedContents`` request.

        Args:
            items: Iterable of texts to embed; each becomes one request entry.

        Returns:
            A list with one embedding vector (list of floats) per response
            entry; the API documents these as being in request order.

        Raises:
            httpx.HTTPStatusError: If the API responds with an error status.
        """
        headers = {
            "Content-Type": "application/json",
            # Send the key as a header rather than in the query string so it
            # does not appear in URLs echoed by logs or HTTP error messages.
            "x-goog-api-key": self.get_key(),
        }
        data = {
            "requests": [
                {
                    "model": "models/" + self.gemini_model_id,
                    "content": {"parts": [{"text": item}]},
                }
                for item in items
            ]
        }
        url = (
            "https://generativelanguage.googleapis.com/v1beta/models/"
            f"{self.gemini_model_id}:batchEmbedContents"
        )
        with httpx.Client() as client:
            response = client.post(url, headers=headers, json=data)
        response.raise_for_status()
        # Response shape: {"embeddings": [{"values": [float, ...]}, ...]}
        return [embedding["values"] for embedding in response.json()["embeddings"]]
I had to manually edit it quite a bit to get it to work, because I didn't give it enough information to guess the output format correctly.
llm embed-multi files --files . '*.py' -d embed.db --store -m text-embedding-004
That worked!
A more ambitious demo:
llm embed-multi readmes --files .. '*/README.md' -d embed.db --store -m text-embedding-004
Embedding [#######-----------------------------] 20% 00:00:30
`embed.db` is now 3.7MB.
sqlite-utils tables embed.db --counts
[{"table": "_sqlite_migrations", "count": 5},
{"table": "collections", "count": 2},
{"table": "embeddings", "count": 587}]
llm similar readmes -c 'upload csvs to stuff' -d embed.db | jq .id
"datasette-upload-csvs/README.md"
"csvs-to-sqlite/README.md"
"datasette-csv-url/README.md"
"dogsheep-photos/README.md"
"datasette-import/README.md"
"datasette-upload-dbs/README.md"
"datasette-app-support/README.md"
"datasette-files/README.md"
"sqlite-utils/README.md"
"datasette-open/README.md"
Looks good!
Thanks to https://github.com/google-gemini/cookbook/blob/main/quickstarts/rest/Embeddings_REST.ipynb via https://twitter.com/random_forests/status/1778168430153056740 I have the recipe now: