iomega / paired-data-form

Linking mass spectra and genomic information to discover new chemistry
https://pairedomicsdata.bioinformatics.nl
Apache License 2.0
5 stars 4 forks source link

Compute stats with elastic search #136

Closed sverhoeven closed 4 years ago

sverhoeven commented 4 years ago

There is overlap in the functionality for calculating stats and for indexing projects in Elasticsearch. Both map ontology URLs to titles.

With Elasticsearch aggregations we could simplify the stats implementation.

sverhoeven commented 4 years ago

Some stats are more complex and might not be possible with es:

sverhoeven commented 4 years ago

ES returns document counts instead of nested counts. For example, the genome type aggregation returns 2 for `genome` when 2 documents each contain one or more genomes, regardless of how many genomes those documents hold in total.

sverhoeven commented 4 years ago
Example ES query:

```json
{
  "size": 0,
  "aggs": {
    "unique_pis": {
      "cardinality": {
        "field": "project.personal.PI_name.keyword"
      }
    },
    "top_pis": {
      "terms": {
        "field": "project.personal.PI_name.keyword"
      }
    },
    "top_submitters": {
      "terms": {
        "script": {
          "source": "if (doc['project.personal.submitter_name_secondary.keyword'].size() == 0) { doc['project.personal.submitter_name.keyword'].value} else {[doc['project.personal.submitter_name.keyword'].value, doc['project.personal.submitter_name_secondary.keyword'].value]}"
        }
      }
    },
    "genome_type": {
      "terms": {
        "field": "project.genomes.genome_ID.genome_type.keyword"
      }
    },
    "species": {
      "terms": {
        "field": "enrichments.genomes.species.scientific_name.keyword"
      }
    },
    "instrument_types": {
      "terms": {
        "field": "project.experimental.instrumentation_methods.instrumentation.instrument_title.keyword"
      }
    },
    "ionization_modes": {
      "terms": {
        "field": "project.experimental.instrumentation_methods.mode_title.keyword"
      }
    },
    "growth_media": {
      "terms": {
        "field": "project.experimental.sample_preparation.medium_details.medium_title.keyword"
      }
    },
    "metagenomic_environment": {
      "terms": {
        "field": "project.experimental.sample_preparation.medium_details.metagenomic_environment_title.keyword"
      }
    },
    "metabolome_samples": {
      "value_count": {
        "field": "project.genome_metabolome_links.metabolomics_file.keyword"
      }
    },
    "solvents": {
      "terms": {
        "field": "project.experimental.extraction_methods.solvents.solvent_title.keyword"
      }
    }
  }
}
```
Returns:

```json
{
  "took": 319,
  "timed_out": false,
  "_shards": { "total": 1, "successful": 1, "skipped": 0, "failed": 0 },
  "hits": {
    "total": { "value": 67, "relation": "eq" },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "metagenomic_environment": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        { "key": "Aquatic invertebrate", "doc_count": 3 },
        { "key": "Human", "doc_count": 2 },
        { "key": "Other mammal", "doc_count": 2 },
        { "key": "Ocean", "doc_count": 1 },
        { "key": "Sediment", "doc_count": 1 }
      ]
    },
    "top_pis": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 33,
      "buckets": [
        { "key": "Pieter C. Dorrestein", "doc_count": 9 },
        { "key": "Paul R. Jensen", "doc_count": 6 },
        { "key": "Gilles P. van Wezel", "doc_count": 4 },
        { "key": "Ellis O'Neill", "doc_count": 3 },
        { "key": "Bradley Moore", "doc_count": 2 },
        { "key": "Daniel Petras & Roderich Suessmuth", "doc_count": 2 },
        { "key": "David Fewer", "doc_count": 2 },
        { "key": "Harald Gross", "doc_count": 2 },
        { "key": "Jörn Piel", "doc_count": 2 },
        { "key": "Lena Gerwick", "doc_count": 2 }
      ]
    },
    "species": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 207,
      "buckets": [
        { "key": "Streptomyces sp. CNB091", "doc_count": 3 },
        { "key": "Salinispora pacifica CNR114", "doc_count": 2 },
        { "key": "Salinispora pacifica CNR942", "doc_count": 2 },
        { "key": "Salinispora pacifica CNS055", "doc_count": 2 },
        { "key": "Salinispora pacifica CNS237", "doc_count": 2 },
        { "key": "Salinispora pacifica CNS801", "doc_count": 2 },
        { "key": "Salinispora pacifica CNT029", "doc_count": 2 },
        { "key": "Salinispora pacifica CNT403", "doc_count": 2 },
        { "key": "Salinispora pacifica CNT569", "doc_count": 2 },
        { "key": "Salinispora pacifica CNT851", "doc_count": 2 }
      ]
    },
    "ionization_modes": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        { "key": "Positive", "doc_count": 62 },
        { "key": "Negative", "doc_count": 4 },
        { "key": "Both", "doc_count": 3 }
      ]
    },
    "growth_media": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 5,
      "buckets": [
        { "key": "Other", "doc_count": 30 },
        { "key": "A1 medium", "doc_count": 12 },
        { "key": "LB (Luria-Bertani) medium", "doc_count": 8 },
        { "key": "ISP2 medium", "doc_count": 5 },
        { "key": "Marine Broth (Difco 2216)", "doc_count": 5 },
        { "key": "Minimal medium (MM)", "doc_count": 5 },
        { "key": "R5 medium", "doc_count": 5 },
        { "key": "Mannitol soy flour medium (MS)", "doc_count": 4 },
        { "key": "Tryptic-Soy Broth (TSB)", "doc_count": 4 },
        { "key": "ISP4 medium", "doc_count": 2 }
      ]
    },
    "solvents": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 1,
      "buckets": [
        { "key": "Methanol", "doc_count": 36 },
        { "key": "Ethyl acetate", "doc_count": 26 },
        { "key": "Water", "doc_count": 11 },
        { "key": "Butanol", "doc_count": 10 },
        { "key": "Methylene Chloride / Dichloromethane", "doc_count": 6 },
        { "key": "Acetone", "doc_count": 5 },
        { "key": "Ethanol", "doc_count": 4 },
        { "key": "Other solvent", "doc_count": 4 },
        { "key": "Acetonitrile", "doc_count": 2 },
        { "key": "Chloroform", "doc_count": 1 }
      ]
    },
    "genome_type": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        { "key": "genome", "doc_count": 57 },
        { "key": "metagenome", "doc_count": 8 },
        { "key": "metagenome-assembled genome", "doc_count": 3 }
      ]
    },
    "unique_pis": { "value": 43 },
    "metabolome_samples": { "value": 4880 },
    "top_submitters": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 60,
      "buckets": [
        { "key": "Eric Helfrich", "doc_count": 4 },
        { "key": "Doug Sweeney", "doc_count": 3 },
        { "key": "Ellis O'Neill", "doc_count": 3 },
        { "key": "Michael Rust", "doc_count": 3 },
        { "key": "Alexander Aksenov", "doc_count": 2 },
        { "key": "Alyssa Demko", "doc_count": 2 },
        { "key": "Benjamin-Florian Hempel", "doc_count": 2 },
        { "key": "Carmen Saenz", "doc_count": 2 },
        { "key": "Chao Du", "doc_count": 2 },
        { "key": "David Fewer", "doc_count": 2 }
      ]
    },
    "instrument_types": {
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0,
      "buckets": [
        { "key": "Time-of-flight (TOF)", "doc_count": 29 },
        { "key": "Orbitrap (Q-Exactive, LTQ-Orbitrap, etc.)", "doc_count": 18 },
        { "key": "Other Mass Spectrometer", "doc_count": 9 },
        { "key": "Ion trap (IT)", "doc_count": 5 },
        { "key": "Quadrupole", "doc_count": 5 },
        { "key": "Fourier Transform Ion Cyclotron Resonance (FTICR)", "doc_count": 1 }
      ]
    }
  }
}
```
github-actions[bot] commented 4 years ago

This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 7 days