Open aclum opened 10 months ago
This work is part of FY24 roadmap milestone Milestone - Add support for all queries available in the data portal available via the public API (4.8)#496
after discussion with @PeopleMakeCulture , I'd like to generate a derived single collection that allows graph-like queries across all current collections' documents in a more streamlined (less complicated mongo aggregation queries) manner.
@jeffbaumes the approach here may benefit from the work you did to take the data portal's postgres tables to mongo. If you have any ideas here, please note them.
sample query from @brynnz22: https://github.com/microbiomedata/notebook_hackathons/blob/soil-contig-tax/taxonomic_dist_by_soil_layer/python/mongodb_query.txt.js
// Traverses the NMDC provenance chain from biosamples (filtered by soil horizon)
// through pooling -> processed samples -> extraction -> library preparation ->
// omics processing -> metagenome annotation, ending at the annotation activity's
// output data objects. Each $lookup joins the next collection in the chain; each
// $project trims the working document to only the linking fields, keeping
// intermediate documents small.
//
// NOTE(review): the chain assumes every biosample goes through pooling_set first;
// biosamples used directly as has_input of extraction/omics_processing would be
// dropped at the first $lookup — confirm against the data model if that matters.
db.getCollection("biosample_set").aggregate(
    [
        // Stage 1: restrict to biosamples from the soil horizons of interest.
        { $match: { 'soil_horizon': { '$in': ['O horizon', 'M horizon'] } } },
        { $project: { "id": 1, "soil_horizon": 1 } },

        // Stage 2: pooling processes that consumed this biosample.
        {
            $lookup: {
                from: "pooling_set",
                localField: "id",
                foreignField: "has_input",
                as: "pooling_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1
            }
        },

        // Stage 3: processed samples produced by those pooling processes.
        {
            $lookup: {
                from: "processed_sample_set",
                localField: "pooling_set.has_output",
                foreignField: "id",
                as: "processed_sample_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1
            }
        },

        // Stage 4: extractions that consumed those processed samples.
        {
            $lookup: {
                from: "extraction_set",
                localField: "processed_sample_set.id",
                foreignField: "has_input",
                as: "extraction_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1
            }
        },

        // Stage 5: processed samples produced by extraction
        // (processed_sample_set is re-joined under a new alias each round).
        {
            $lookup: {
                from: "processed_sample_set",
                localField: "extraction_set.has_output",
                foreignField: "id",
                as: "processed_sample_set2"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1,
                "processed_sample_set2.id": 1
            }
        },

        // Stage 6: library preparations that consumed the extraction outputs.
        {
            $lookup: {
                from: "library_preparation_set",
                localField: "processed_sample_set2.id",
                foreignField: "has_input",
                as: "library_preparation_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1,
                "processed_sample_set2.id": 1,
                "library_preparation_set.has_input": 1,
                "library_preparation_set.has_output": 1,
                "library_preparation_set.id": 1
            }
        },

        // Stage 7: processed samples produced by library preparation.
        {
            $lookup: {
                from: "processed_sample_set",
                localField: "library_preparation_set.has_output",
                foreignField: "id",
                as: "processed_sample_set3"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1,
                "processed_sample_set2.id": 1,
                "library_preparation_set.has_input": 1,
                "library_preparation_set.has_output": 1,
                "library_preparation_set.id": 1,
                "processed_sample_set3.id": 1
            }
        },

        // Stage 8: omics processing runs that consumed those processed samples.
        {
            $lookup: {
                from: "omics_processing_set",
                localField: "processed_sample_set3.id",
                foreignField: "has_input",
                as: "omics_processing_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1,
                "processed_sample_set2.id": 1,
                "library_preparation_set.has_input": 1,
                "library_preparation_set.has_output": 1,
                "library_preparation_set.id": 1,
                "processed_sample_set3.id": 1,
                "omics_processing_set.has_input": 1,
                "omics_processing_set.id": 1
            }
        },

        // Stage 9: metagenome annotation activities informed by those runs.
        {
            $lookup: {
                from: "metagenome_annotation_activity_set",
                localField: "omics_processing_set.id",
                foreignField: "was_informed_by",
                as: "metagenome_annotation_activity_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1,
                "processed_sample_set2.id": 1,
                "library_preparation_set.has_input": 1,
                "library_preparation_set.has_output": 1,
                "library_preparation_set.id": 1,
                "processed_sample_set3.id": 1,
                "omics_processing_set.has_input": 1,
                "omics_processing_set.id": 1,
                "metagenome_annotation_activity_set.was_informed_by": 1,
                "metagenome_annotation_activity_set.has_output": 1
            }
        },

        // Stage 10: the data objects produced by annotation.
        {
            $lookup: {
                from: "data_object_set",
                localField: "metagenome_annotation_activity_set.has_output",
                foreignField: "id",
                as: "data_object_set"
            }
        },
        {
            $project: {
                "id": 1,
                "soil_horizon": 1,
                "pooling_set.has_input": 1,
                "pooling_set.has_output": 1,
                "processed_sample_set.id": 1,
                "extraction_set.has_input": 1,
                "extraction_set.has_output": 1,
                "extraction_set.id": 1,
                "processed_sample_set2.id": 1,
                "library_preparation_set.has_input": 1,
                "library_preparation_set.has_output": 1,
                "library_preparation_set.id": 1,
                "processed_sample_set3.id": 1,
                "omics_processing_set.has_input": 1,
                "omics_processing_set.id": 1,
                "metagenome_annotation_activity_set.was_informed_by": 1,
                "metagenome_annotation_activity_set.has_output": 1,
                // BUG FIX: the original wrote
                //   "data_object_set.data_object_type": "Scaffold Lineage tsv"
                // inside $project. A non-"$"-prefixed string in $project is a
                // LITERAL: it overwrote data_object_type with that constant on
                // every data object instead of selecting matching ones. Use
                // $filter to keep only the "Scaffold Lineage tsv" data objects,
                // then $map to project just the fields of interest.
                "data_object_set": {
                    $map: {
                        input: {
                            $filter: {
                                input: "$data_object_set",
                                as: "dobj",
                                cond: {
                                    $eq: [
                                        "$$dobj.data_object_type",
                                        "Scaffold Lineage tsv"
                                    ]
                                }
                            }
                        },
                        as: "dobj",
                        in: {
                            id: "$$dobj.id",
                            data_object_type: "$$dobj.data_object_type",
                            url: "$$dobj.url"
                        }
                    }
                }
            }
        }
    ]
)
See work done by @brynnz22 and @kheal in this notebook to connect study to taxonomic information. https://github.com/microbiomedata/notebook_hackathons/tree/main/taxonomic_dist_by_soil_layer cc @cmungall @shreddd
Duplicates #401
The needs here have not been addressed. In order to address this we need 1) filtering options besides just providing a study and 2) to be able to return more complete intermediate records. cc @shreddd
Running list of difficult Mongo queries
https://docs.google.com/spreadsheets/d/1a9cN9ZDyjVOp6NtHiaUlpP_92sMInZtQV-Q-L5iWcOk/edit?usp=sharing
Please add to / edit the Google Sheet
#
In working on the jupyter notebooks and fielding user requests we need some endpoints that make it easier to combine study or biosample filter with workflow execution activities and/or data objects.
See related issue https://github.com/microbiomedata/nmdc-runtime/issues/246
Example requests from jupyter notebook work or user requests related to linking data objects
Example searches for supporting API search to match data portal queries: -Return studies and biosamples from study X that have a processing institution of Y (return study based on ({'id':'nmdc:styX'}), return biosamples based on ({'part_of':'nmdc:styX'}) AND tracing through PlannedProcess classes to determine which Biosamples or ProcessedSamples derived from Biosamples are the values for has_input for class OmicsProcessing where({'processing_institution':'Y'}) -Return studies and biosamples where the annotation results have a hit to 'KEGG.ORTHOLOGY:K00005'. Implementation sketch would search for 'KEGG.ORTHOLOGY:K00005' in functional_annotation_agg slot gene_function_id, get the metagenome_annotation_id, trace back from that to the WorkflowExecutionActivity -> OmicsProcessing-> PlannedProcess Classes -> Biosample/ProcessedSample -> Loop until getting back to a Biosample -> Study
@shreddd to determine if this can be worked on in the next month in advance of the webinar with NEON.
cc @cmungall @brynnz22 @kheal