Open yarikoptic opened 4 months ago
Before tackling this issue. I want to give an attempt in resolving https://github.com/datalad/datalad-registry/issues/361. Without resolving https://github.com/datalad/datalad-registry/issues/361, no changes in DataLad-Registry can be made public, through the read-only public instance.
I will add the support to filter repos based on the existence an element in a JSON array as well.
The following excerpt contains an example in filtering RepoURL
objects that have certain extracted metadata with the representation of the value of an arbitrary key containing a given substring.
from sqlalchemy import select, text, func, cast, Text
from sqlalchemy.dialects.postgresql import JSONB
from datalad_registry import create_app
from datalad_registry.models import db, RepoUrl, URLMetadata
import json
flask_app = create_app()
print("=== App is up ===")
# === Test for one level key down ===
# key = "a"
# expected_value = "3.1415"
#
# with flask_app.app_context():
# result = db.session.execute(
# select(RepoUrl).where(RepoUrl.branches[key].astext == expected_value)
# ).scalars().all()
#
# print(result)
#
# if len(result) == 0:
# print("No result")
# else:
# e0 = result[0]
# print(f"type(e0): {type(e0)}")
# print(f"e0: {e0}")
#
# print(f"type(e0.branches['{key}']): {type(e0.branches[key])}")
# === Test for elements in array ===
# key = "key"
# desired_element = "element"
#
# with flask_app.app_context():
# result = (
# db.session.execute(
# select(RepoUrl).where(
# RepoUrl.branches[key].contains([{"l2": {"l3_1": 44}}, "hello"])
# )
# )
# .scalars()
# .all()
# )
#
# print(result)
# === Test for elements in array with jsonb_path_exists ===
# key = "key"
# desired_element = 44
#
# jsonpath_expr = f'$.{key}[*].l2.l3_1 ? (@ == {desired_element})'
# # jsonpath_expr = "true"
#
# with flask_app.app_context():
# result = (
# db.session.execute(
# select(RepoUrl).where(
# text(
# "jsonb_path_exists(branches, :jsonpath_expr)"
# ).bindparams(jsonpath_expr=jsonpath_expr)
# )
# )
# .scalars()
# .all()
# )
# === Test for element as text contains substring with call to `jsonb_path_query` ===
# === in SQL ===
# key = "key"
# substring = "Pot"
#
# # Escape substrings
#
#
# jsonpath_expr = f"$.{key}[*].l2.l3_2"
#
# with flask_app.app_context():
# stmt = select(RepoUrl).where(
# text(
# """
# EXISTS (
# SELECT 1
# FROM jsonb_path_query(branches, :jsonpath_expr) AS value
# WHERE value::text ILIKE '%' || :substring || '%'
# )
# """
# ).bindparams(jsonpath_expr=jsonpath_expr, substring=substring)
# )
#
# results = db.session.execute(stmt).scalars().all()
# === Test for element as text contains substring with call to `jsonb_path_query` ===
# === in SQL ===
# === with column in an external table ===
substring = r"t\\t"
# Escape substrings
substring = substring.replace("\\", r"\\").replace("%", r"\%").replace("_", r"\_")
jsonpath_expr = f"$.l0[*].l1_0"
with flask_app.app_context():
stmt = select(RepoUrl).where(
RepoUrl.metadata_.any(
text(
"""
EXISTS (
SELECT 1
FROM jsonb_path_query(extracted_metadata, :jsonpath_expr) AS value
WHERE value::text ILIKE '%' || :substring || '%'
)
"""
).bindparams(jsonpath_expr=jsonpath_expr, substring=substring)
)
)
print("=== Compiled statement ===")
print(stmt.compile(), "\n")
results = db.session.execute(stmt).scalars().all()
md = results[0].metadata_[0].extracted_metadata
md_in_json = json.dumps(md, indent=2)
# === testing ===
# stmt = select(RepoUrl).join(URLMetadata)
#
# with flask_app.app_context():
# print("compiled statement")
# print(stmt.compile(), "\n")
#
#
# results = db.session.execute(stmt).scalars().all()
# print(results)
# === testing ends ===
# === Test for element as text contains substring with call to `jsonb_path_query`
# === through SQLAlchemy ===
# === This is not working ===
#
# key = "key"
# substring = "hello"
#
# jsonpath_expr = f"$.{key}[*]"
#
# with flask_app.app_context():
# json_subquery = select(
# func.jsonb_path_query(RepoUrl.branches, jsonpath_expr).label("value")
# ).subquery()
#
# stmt = select(RepoUrl).where(
# func.exists(
# select(1)
# .select_from(json_subquery)
# .where(json_subquery.c.value.cast(Text).ilike(f"%{substring}%"))
# )
# )
#
# results = db.session.execute(stmt).scalars().all()
@yarikoptic Consider the following scenario.
url1
has the metadata of
{
"l0_k0": [{"l1_k0": "hello"}, {"l1_k0": "world"}],
"l0_k1": {"l1_k0": 1}
}
url2
has the metadata of
{
"l0_k0": [{"l1_k0": "hi"}, {"l1_k0": "world"}],
"l0_k1": {"l1_k0": 1}
}
The query in the previous post can be generalized to search for url1
by filtering urls that possess metadata which has any object in the array value of key "l0_k0"
that has a value of a particular key with a string representation containing a substring of "el"
.
The value under consideration in this scenario is referenced by "$.l0_k0[*].l1_k0"
using the SQL/JSON path language.
Since our proposed syntax in the search grammar uses [key]
to express a key in the metadata. We can't use [*]
to express "any object". Any suggestion for expressing "any object" in the search grammar?
@yarikoptic What about mimicking the SQL/JSON path language as much as possible as the following?
metadata[<extractor_name>, <extractor_name> ...].<key>.<key>.[*].<key>...
So your example can be expressed as metadata[bids_dataset,metalad_core].BIDSVersion:v
.
at postgres level it looks like https://www.postgresql.org/docs/current/functions-json.html#FUNCTIONS-SQLJSON-PATH
so we could point to specific field, e.g. instead of json serialization string matching we have as an example now:
metadata[bids_dataset,metalad_core]:"BIDSVersion\": \"v"
we could saymetadata[bids_dataset,metalad_core][BIDSVersion]:v
Will require
TODO 4 others:
dandi:files
based examples to assess performance etc