create or replace function match_file_sections(project_id uuid, embedding vector(1536), match_threshold float, match_count int, min_content_length int)
returns table (path text, content text, token_count int, similarity float)
language plpgsql
as $$
#variable_conflict use_variable
begin
return query
select
files.path,
file_sections.content,
file_sections.token_count,
(file_sections.embedding <#> embedding) * -1 as similarity
from file_sections
join files
on file_sections.file_id = files.id
where files.project_id = project_id
-- We only care about sections that have a useful amount of content
and length(file_sections.content) >= min_content_length
-- The dot product is negative because of a Postgres limitation, so we negate it
and (file_sections.embedding <#> embedding) * -1 > match_threshold
-- OpenAI embeddings are normalized to length 1, so
-- cosine similarity and dot product will produce the same results.
-- Using dot product which can be computed slightly faster.
--
-- For the different syntaxes, see https://github.com/pgvector/pgvector
order by file_sections.embedding <#> embedding
limit match_count;
end;
$$;
First: Thanks for this program!
Bug that bit me:
This function (from https://github.com/motifland/markprompt/blob/main/config/schema.sql) has an extra space added to the
<#>
operator, so it looks like:< #>
. That's an operator from thepgvector
extension. It should be left as<#>
.