Closed ecwood closed 11 months ago
I think the problem is with this:
It is a good idea to check the other prefixes as well.
c683172 helped but now
{
"category": "biolink:Pathway",
"category_label": "pathway",
"creation_date": null,
"deprecated": false,
"description": null,
"full_name": "C",
"has_biological_sequence": null,
"id": "KEGG:00020",
"iri": "https://www.genome.jp/dbget-bin/www_bget?pathway:map00020",
"name": "C",
"provided_by": [
"KEGG_source:"
],
"publications": [],
"replaced_by": null,
"synonym": [
"i",
"t",
"r",
"a",
"t",
"e",
"",
"c",
"y",
"c",
"l",
"e",
"",
"(",
"T",
"C",
"A",
"",
"c",
"y",
"c",
"l",
"e",
")",
"",
"-",
"",
"H",
"o",
"m",
"o",
"",
"s",
"a",
"p",
"i",
"e",
"n",
"s",
"",
"(",
"h",
"u",
"m",
"a",
"n",
")"
],
"update_date": "2023-06-29 21:28:17"
},
is an issue.
I do not think the reactions are being brought in either.
Here's a diff between each version's report:
ubuntu@ip-172-31-59-112:~/kg2-build$ diff kg2.8.3-kegg-report.json kg2-kegg2-report.json
2,4c2,4
< "_number_of_nodes": 62643,
< "_number_of_edges": 195520,
< "_report_datetime": "2023-06-30 06:26:24",
---
> "_number_of_nodes": 32761,
> "_number_of_edges": 166526,
> "_report_datetime": "2023-06-30 06:12:07",
8,13c8,12
< "KEGG.COMPOUND": 19090,
< "KEGG.DRUG": 12079,
< "KEGG.ENZYME": 8056,
< "KEGG.GLYCAN": 11154,
< "KEGG": 352,
< "KEGG.REACTION": 11911,
---
> "KEGG.REACTION": 11942,
> "KEGG.COMPOUND": 15715,
> "KEGG.DRUG": 3998,
> "KEGG.GLYCAN": 752,
> "KEGG": 353,
18,22c17,20
< "small_molecule": 30244,
< "drug": 12079,
< "molecular_entity": 8056,
< "pathway": 352,
< "molecular_activity": 11911,
---
> "molecular_activity": 11942,
> "small_molecule": 16467,
> "drug": 3998,
> "pathway": 353,
26c24
< "KEGG_source:": 62643
---
> "KEGG_source:": 32761
29,43c27,39
< "biolink:same_as": 28744,
< "KEGG:compound_to_enzyme": 32487,
< "KEGG:compound_to_reaction": 49990,
< "KEGG:compound_to_pathway": 18740,
< "KEGG:enzyme_to_reaction": 6941,
< "KEGG:enzyme_to_pathway": 9887,
< "KEGG:glycan_to_reaction": 1441,
< "KEGG:glycan_to_pathway": 583,
< "KEGG:glycan_to_enzyme": 967,
< "biolink:in_taxon": 352,
< "KEGG:pathway_to_compound": 5822,
< "KEGG:pathway_to_drug": 10281,
< "KEGG:pathway_to_glycan": 286,
< "KEGG:reaction_to_enzyme": 10318,
< "KEGG:reaction_to_pathway": 18681
---
> "biolink:same_as": 28698,
> "KEGG:reaction_to_enzyme": 10321,
> "KEGG:reaction_to_pathway": 18832,
> "KEGG:compound_to_enzyme": 28709,
> "KEGG:compound_to_reaction": 44575,
> "KEGG:compound_to_pathway": 16414,
> "KEGG:glycan_to_reaction": 1104,
> "KEGG:glycan_to_pathway": 273,
> "KEGG:glycan_to_enzyme": 724,
> "biolink:in_taxon": 353,
> "KEGG:pathway_to_compound": 5864,
> "KEGG:pathway_to_drug": 10373,
> "KEGG:pathway_to_glycan": 286
46,60c42,54
< "same_as": 28744,
< "compound_to_enzyme": 32487,
< "compound_to_reaction": 49990,
< "compound_to_pathway": 18740,
< "enzyme_to_reaction": 6941,
< "enzyme_to_pathway": 9887,
< "glycan_to_reaction": 1441,
< "glycan_to_pathway": 583,
< "glycan_to_enzyme": 967,
< "in_taxon": 352,
< "pathway_to_compound": 5822,
< "pathway_to_drug": 10281,
< "pathway_to_glycan": 286,
< "reaction_to_enzyme": 10318,
< "reaction_to_pathway": 18681
---
> "same_as": 28698,
> "reaction_to_enzyme": 10321,
> "reaction_to_pathway": 18832,
> "compound_to_enzyme": 28709,
> "compound_to_reaction": 44575,
> "compound_to_pathway": 16414,
> "glycan_to_reaction": 1104,
> "glycan_to_pathway": 273,
> "glycan_to_enzyme": 724,
> "in_taxon": 353,
> "pathway_to_compound": 5864,
> "pathway_to_drug": 10373,
> "pathway_to_glycan": 286
63,64c57,58
< "biolink": 29096,
< "KEGG": 166424
---
> "biolink": 29051,
> "KEGG": 137475
67,68c61,62
< "KEGG": 13,
< "biolink": 2
---
> "biolink": 2,
> "KEGG": 11
71c65
< "KEGG_source:": 195520
---
> "KEGG_source:": 166526
75,76c69,71
< "KEGG.COMPOUND---CHEBI": 16739,
< "KEGG.DRUG---CHEBI": 4062,
---
> "KEGG.REACTION---RHEA": 6677,
> "KEGG.COMPOUND---CHEBI": 16750,
> "KEGG.DRUG---CHEBI": 4064,
78,81c73,75
< "KEGG.GLYCAN---KEGG.COMPOUND": 249,
< "KEGG.GLYCAN---KEGG.DRUG": 21,
< "KEGG---GO": 244,
< "KEGG.REACTION---RHEA": 6677
---
> "KEGG.GLYCAN---KEGG.COMPOUND": 193,
> "KEGG.GLYCAN---KEGG.DRUG": 18,
> "KEGG---GO": 244
85,89c79,82
< "small_molecule": 30244,
< "drug": 12079,
< "molecular_entity": 8056,
< "pathway": 352,
< "molecular_activity": 11911,
---
> "molecular_activity": 11942,
> "small_molecule": 16467,
> "drug": 3998,
> "pathway": 353,
94c87
< "Kyoto Encyclopedia of Genes and Genomes v105.0"
---
> "Kyoto Encyclopedia of Genes and Genomes v106.0"
98c91
< "KEGG_source:": 18308
---
> "KEGG_source:": 2
With bf93683, this is the new diff:
ubuntu@ip-172-31-59-112:~/kg2-build$ diff kg2.8.3-kegg-report.json kg2-kegg2-report.json
2,4c2,4
< "_number_of_nodes": 62643,
< "_number_of_edges": 195520,
< "_report_datetime": "2023-06-30 06:26:24",
---
> "_number_of_nodes": 40817,
> "_number_of_edges": 183424,
> "_report_datetime": "2023-06-30 06:41:21",
8,9d7
< "KEGG.COMPOUND": 19090,
< "KEGG.DRUG": 12079,
11,13c9,13
< "KEGG.GLYCAN": 11154,
< "KEGG": 352,
< "KEGG.REACTION": 11911,
---
> "KEGG.REACTION": 11942,
> "KEGG.COMPOUND": 15715,
> "KEGG.DRUG": 3998,
> "KEGG.GLYCAN": 752,
> "KEGG": 353,
18,19d17
< "small_molecule": 30244,
< "drug": 12079,
21,22c19,22
< "pathway": 352,
< "molecular_activity": 11911,
---
> "molecular_activity": 11942,
> "small_molecule": 16467,
> "drug": 3998,
> "pathway": 353,
26c26
< "KEGG_source:": 62643
---
> "KEGG_source:": 40817
29,43c29,43
< "biolink:same_as": 28744,
< "KEGG:compound_to_enzyme": 32487,
< "KEGG:compound_to_reaction": 49990,
< "KEGG:compound_to_pathway": 18740,
< "KEGG:enzyme_to_reaction": 6941,
< "KEGG:enzyme_to_pathway": 9887,
< "KEGG:glycan_to_reaction": 1441,
< "KEGG:glycan_to_pathway": 583,
< "KEGG:glycan_to_enzyme": 967,
< "biolink:in_taxon": 352,
< "KEGG:pathway_to_compound": 5822,
< "KEGG:pathway_to_drug": 10281,
< "KEGG:pathway_to_glycan": 286,
< "KEGG:reaction_to_enzyme": 10318,
< "KEGG:reaction_to_pathway": 18681
---
> "KEGG:enzyme_to_reaction": 6940,
> "KEGG:enzyme_to_pathway": 9958,
> "biolink:same_as": 28698,
> "KEGG:reaction_to_enzyme": 10321,
> "KEGG:reaction_to_pathway": 18832,
> "KEGG:compound_to_enzyme": 28709,
> "KEGG:compound_to_reaction": 44575,
> "KEGG:compound_to_pathway": 16414,
> "KEGG:glycan_to_reaction": 1104,
> "KEGG:glycan_to_pathway": 273,
> "KEGG:glycan_to_enzyme": 724,
> "biolink:in_taxon": 353,
> "KEGG:pathway_to_compound": 5864,
> "KEGG:pathway_to_drug": 10373,
> "KEGG:pathway_to_glycan": 286
46,60c46,60
< "same_as": 28744,
< "compound_to_enzyme": 32487,
< "compound_to_reaction": 49990,
< "compound_to_pathway": 18740,
< "enzyme_to_reaction": 6941,
< "enzyme_to_pathway": 9887,
< "glycan_to_reaction": 1441,
< "glycan_to_pathway": 583,
< "glycan_to_enzyme": 967,
< "in_taxon": 352,
< "pathway_to_compound": 5822,
< "pathway_to_drug": 10281,
< "pathway_to_glycan": 286,
< "reaction_to_enzyme": 10318,
< "reaction_to_pathway": 18681
---
> "enzyme_to_reaction": 6940,
> "enzyme_to_pathway": 9958,
> "same_as": 28698,
> "reaction_to_enzyme": 10321,
> "reaction_to_pathway": 18832,
> "compound_to_enzyme": 28709,
> "compound_to_reaction": 44575,
> "compound_to_pathway": 16414,
> "glycan_to_reaction": 1104,
> "glycan_to_pathway": 273,
> "glycan_to_enzyme": 724,
> "in_taxon": 353,
> "pathway_to_compound": 5864,
> "pathway_to_drug": 10373,
> "pathway_to_glycan": 286
63,64c63,64
< "biolink": 29096,
< "KEGG": 166424
---
> "KEGG": 154373,
> "biolink": 29051
71c71
< "KEGG_source:": 195520
---
> "KEGG_source:": 183424
75,76c75,77
< "KEGG.COMPOUND---CHEBI": 16739,
< "KEGG.DRUG---CHEBI": 4062,
---
> "KEGG.REACTION---RHEA": 6677,
> "KEGG.COMPOUND---CHEBI": 16750,
> "KEGG.DRUG---CHEBI": 4064,
78,81c79,81
< "KEGG.GLYCAN---KEGG.COMPOUND": 249,
< "KEGG.GLYCAN---KEGG.DRUG": 21,
< "KEGG---GO": 244,
< "KEGG.REACTION---RHEA": 6677
---
> "KEGG.GLYCAN---KEGG.COMPOUND": 193,
> "KEGG.GLYCAN---KEGG.DRUG": 18,
> "KEGG---GO": 244
85,86d84
< "small_molecule": 30244,
< "drug": 12079,
88,89c86,89
< "pathway": 352,
< "molecular_activity": 11911,
---
> "molecular_activity": 11942,
> "small_molecule": 16467,
> "drug": 3998,
> "pathway": 353,
94c94
< "Kyoto Encyclopedia of Genes and Genomes v105.0"
---
> "Kyoto Encyclopedia of Genes and Genomes v106.0"
98c98
< "KEGG_source:": 18308
---
> "KEGG_source:": 2226
It looks like there are substantially fewer compounds, glycans, and drugs in the new build of KEGG. I am curious whether this is due to a different method of storage within the dump.
Also, I believe the bottom difference is referring to orphan nodes. So, it might be a big positive that there are fewer orphan nodes, though there are also roughly 20,000 fewer nodes in general.
With the changes from dfaab9f, here is the new diff (both reports regenerated with `sort_keys=True` on):
ubuntu@ip-172-31-59-112:~/kg2-build$ diff kg2.8.3-kegg-report.json kg2-kegg2-report.json
4,6c4,6
< "_number_of_edges": 195520,
< "_number_of_nodes": 62643,
< "_report_datetime": "2023-06-30 07:42:46",
---
> "_number_of_edges": 308987,
> "_number_of_nodes": 83315,
> "_report_datetime": "2023-06-30 07:42:28",
9,18c9,18
< "KEGG:compound_to_enzyme": 32487,
< "KEGG:compound_to_pathway": 18740,
< "KEGG:compound_to_reaction": 49990,
< "KEGG:enzyme_to_pathway": 9887,
< "KEGG:enzyme_to_reaction": 6941,
< "KEGG:glycan_to_enzyme": 967,
< "KEGG:glycan_to_pathway": 583,
< "KEGG:glycan_to_reaction": 1441,
< "KEGG:pathway_to_compound": 5822,
< "KEGG:pathway_to_drug": 10281,
---
> "KEGG:compound_to_enzyme": 61183,
> "KEGG:compound_to_pathway": 35315,
> "KEGG:compound_to_reaction": 94650,
> "KEGG:enzyme_to_pathway": 9958,
> "KEGG:enzyme_to_reaction": 6940,
> "KEGG:glycan_to_enzyme": 1701,
> "KEGG:glycan_to_pathway": 856,
> "KEGG:glycan_to_reaction": 2573,
> "KEGG:pathway_to_compound": 5864,
> "KEGG:pathway_to_drug": 10373,
20,23c20,23
< "KEGG:reaction_to_enzyme": 10318,
< "KEGG:reaction_to_pathway": 18681,
< "biolink:in_taxon": 352,
< "biolink:same_as": 28744
---
> "KEGG:reaction_to_enzyme": 10321,
> "KEGG:reaction_to_pathway": 18832,
> "biolink:in_taxon": 353,
> "biolink:same_as": 49782
26,27c26,27
< "KEGG": 166424,
< "biolink": 29096
---
> "KEGG": 258852,
> "biolink": 50135
30,40c30,40
< "compound_to_enzyme": 32487,
< "compound_to_pathway": 18740,
< "compound_to_reaction": 49990,
< "enzyme_to_pathway": 9887,
< "enzyme_to_reaction": 6941,
< "glycan_to_enzyme": 967,
< "glycan_to_pathway": 583,
< "glycan_to_reaction": 1441,
< "in_taxon": 352,
< "pathway_to_compound": 5822,
< "pathway_to_drug": 10281,
---
> "compound_to_enzyme": 61183,
> "compound_to_pathway": 35315,
> "compound_to_reaction": 94650,
> "enzyme_to_pathway": 9958,
> "enzyme_to_reaction": 6940,
> "glycan_to_enzyme": 1701,
> "glycan_to_pathway": 856,
> "glycan_to_reaction": 2573,
> "in_taxon": 353,
> "pathway_to_compound": 5864,
> "pathway_to_drug": 10373,
42,44c42,44
< "reaction_to_enzyme": 10318,
< "reaction_to_pathway": 18681,
< "same_as": 28744
---
> "reaction_to_enzyme": 10321,
> "reaction_to_pathway": 18832,
> "same_as": 49782
47c47
< "KEGG_source:": 195520
---
> "KEGG_source:": 308987
50c50
< "drug": 12079,
---
> "drug": 16155,
52c52
< "molecular_activity": 11911,
---
> "molecular_activity": 11942,
54,55c54,55
< "pathway": 352,
< "small_molecule": 30244
---
> "pathway": 353,
> "small_molecule": 46808
58,60c58,60
< "KEGG": 352,
< "KEGG.COMPOUND": 19090,
< "KEGG.DRUG": 12079,
---
> "KEGG": 353,
> "KEGG.COMPOUND": 34834,
> "KEGG.DRUG": 16155,
62,63c62,63
< "KEGG.GLYCAN": 11154,
< "KEGG.REACTION": 11911,
---
> "KEGG.GLYCAN": 11974,
> "KEGG.REACTION": 11942,
67c67
< "KEGG_source:": 62643
---
> "KEGG_source:": 83315
71c71
< "drug": 12079,
---
> "drug": 16155,
73c73
< "molecular_activity": 11911,
---
> "molecular_activity": 11942,
75,76c75,76
< "pathway": 352,
< "small_molecule": 30244
---
> "pathway": 353,
> "small_molecule": 46808
81c81
< "KEGG_source:": 18308
---
> "KEGG_source:": 18413
88c88
< "Kyoto Encyclopedia of Genes and Genomes v105.0"
---
> "Kyoto Encyclopedia of Genes and Genomes v106.0"
92,93c92,93
< "KEGG.COMPOUND---CHEBI": 16739,
< "KEGG.DRUG---CHEBI": 4062,
---
> "KEGG.COMPOUND---CHEBI": 33500,
> "KEGG.DRUG---CHEBI": 8128,
95,96c95,96
< "KEGG.GLYCAN---KEGG.COMPOUND": 249,
< "KEGG.GLYCAN---KEGG.DRUG": 21,
---
> "KEGG.GLYCAN---KEGG.COMPOUND": 442,
> "KEGG.GLYCAN---KEGG.DRUG": 39,
That diff looks really good (none of the values go down with this newer version), so I am going to flag this issue for verification.
I am closing this issue because the code worked in the KG2.8.4pre build.
While investigating #210, I learned that there are no pathway nodes in `kg2-kegg.json` with the current conversion script. This is surprising, because there are pathways in the KEGG dump.