tensorchord / pgvecto.rs

Scalable, Low-latency and Hybrid-enabled Vector Search in Postgres. Revolutionize Vector Search, not Database.
https://docs.pgvecto.rs/getting-started/overview.html
Apache License 2.0
1.66k stars 62 forks source link

feat: Python native SDK (without SQL) #528

Open gaocegege opened 2 months ago

cutecutecat commented 1 month ago

Design of native SDK

Manage Databases

Abilities

from sdk import PGVectoClient 
client = PGVectoClient(host="127.0.0.1", port=5432, user_name="postgres", db_name="postgres", password="")

Manage Schema

Concept

Supported data types:

Column attributes:

Abilities

from sdk import Field, VectorField, Schema, DataType

id_field = Field(name="id", dtype=DataType.INT, is_primary=True, description="primary id")
age_field = Field(name="age", dtype=DataType.INT, description="age")
embedding_field = VectorField(name="embedding", dtype=DataType.VECTOR, dim=128, description="vector")
position_field = Field(name="position", dtype=DataType.TEXT)

schema = Schema(fields=[id_field, age_field, embedding_field, position_field], auto_id=False, description="desc of a collection", partition=None)

Manage Collections

Concept

Abilities

# Quick setup mode without schema, with columns: id(int), vector(Vector) and meta(jsonb)
client.create_basic_collection(
    collection_name="quick_setup",
    dimension=5,
)

# Custom mode: create columns by schema
client.create_collection(
    collection_name="customized_setup",
    schema=schema,
)

client.drop_collection(
    collection_name="customized_setup"
)

Data Insert

Abilities

data=[
    {"id": 0, "vector": [0.3580376395471989, -0.6023495712049978, 0.18414012509913835, -0.26286205330961354, 0.9029438446296592], "color": "pink_8682"},
    {"id": 1, "vector": [0.19886812562848388, 0.06023560599112088, 0.6976963061752597, 0.2614474506242501, 0.838729485096104], "color": "red_7025"},
]

client.insert(
    collection_name="quick_setup",
    data=data
)

Update and Delete

Abilities

# UPDATE table SET ... WHERE id=3;
# INSERT INTO table (id, ...)
#        SELECT ...
#        WHERE NOT EXISTS (SELECT 1 FROM table WHERE id=3);

# Insert if id doesn't exist, else update
res = client.upsert(
    collection_name='quick_setup',
    data=data
)

# UPDATE table SET ... WHERE color = 'pink_8682';
res = client.update(
    collection_name='quick_setup',
    data= {"vector": [0.3580376395471989, -0.6023495712049978, 0.18414012509913835, -0.26286205330961354, 0.9029438446296592], "color": "pink_8682"},
    filter="color = \"pink_8682\"",
)

# DELETE from quick_setup where id = ANY('{18, 19}'::int[])
res = client.delete(
    collection_name="quick_setup",
    ids=[18, 19],
)
res = client.delete(
    collection_name='quick_setup',
    filter='color like "blue%"'
)

Create Index

Concept

Abilities

client.create_vector_index(
    collection_name="customized_setup",
    field_name="my_vector",
    metric_type="IP",
    option=IndexOption(...)
)

client.drop_index(
    index_name="idx"
)

Search

Single-Vector Search

{
    "id": 0,
    "distance": 1.4093276262283325,
    "entity": {}
},
{
    "id": 4,
    "distance": 0.9902134537696838,
    "entity": {}
},
from sdk import ANNSearchRequest

req = ANNSearchRequest(
    data: Vector | SparseVector | ...,
    field: str,
    metric_type: str,
    limit: int | None,
    filter: str | None,
    range: float | None,
    group_by_field: str | None,
    outputs: List[str] | None,
    distance_alias: str = "distance",
)

# Single-vector search
# SELECT id, emb <-> [1, 1, 1] as distance from t ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=5)

# Search with extra output fields
# SELECT id, emb <-> [1, 1, 1] as distance, color from t ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=5, outputs=["color"])
# SELECT id, emb <-> [1, 1, 1] as dis, distance from t ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=5, outputs=["distance"], distance_alias="dis")

# Filtered search
# SELECT id, emb <-> [1, 1, 1] as distance from t WHERE age > 5 ORDER BY emb <-> [1, 1, 1]
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", filter="age > 5")

# Range search
# SELECT id, emb <-> [1, 1, 1] as distance from t WHERE emb <<->> sphere([1, 1, 1], 0.2) ORDER BY emb <-> [1, 1, 1] LIMIT 5
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", range=0.2, limit=5)

# Group search: https://milvus.io/docs/single-vector-search.md#Grouping-search
req = ANNSearchRequest(data=[1, 1, 1], field="emb", metric_type="L2", limit=10, group_by_field="doc_id",
    outputs=["doc_id", "passage_id"])

res = client.search(req)

Hybrid search

from sdk import RRFRanker

rerank = RRFRanker()
reqs = [request_1, request_2]

client.hybrid_search(
    reqs,
    rerank,
    limit=2
)

Iterative Search

# Create iterator
iterator = client.search_iterator(req, batch_size=10)

results = []

# Iter until end
while True:
    result = iterator.next()
    if not result:
        iterator.close()
        break

    results.extend(result)

Manage Partitions

Concept

Abilities

from sdk.partition import Partition, Hash, In, Range

# Hash partition - Random split inserted rows
# CREATE TABLE partitionA PARTITION OF documents FOR VALUES WITH (MODULUS 3, REMAINDER 0);
p = Partition(
    partition_name="partitionA",
    partition_field="id",
    partition_by=Hash(3, 0)
)

# Group partition - Split discrete data based on distribution
# CREATE TABLE partitionA PARTITION OF documents FOR VALUES IN ('A', 'B');
p = Partition(
    partition_name="partitionA",
    partition_field="alpha",
    partition_by=In(('A', 'B'))
)

# Range partition - Split continuous data based on distribution
# CREATE TABLE partitionA PARTITION OF documents FOR VALUES FROM ('2023-03-01') TO ('2023-04-01');
p = Partition(
    partition_name="partitionA",
    partition_field="day",
    partition_by=Range('2023-03-01', '2023-04-01')
)