ctti-clinicaltrials / aact

Improving Public Access to Aggregate Content of ClinicalTrials.gov
http://aact.ctti-clinicaltrials.org
MIT License
123 stars 33 forks source link

Request: add foreign keys (includes SQL to do so) #755

Closed benjie closed 2 years ago

benjie commented 5 years ago

This is an incredible resource - thanks!

The database at the moment lacks a primary key on the studies table, and has no foreign keys defined which makes it more challenging to use certain tooling against (for example database visualisations). Also most of the columns are nullable, when many need not be.

I include below SQL required to convert the studies unique index into a primary key, set the referencing columns non-null, and create the missing foreign keys. I've tested this locally against the latest dump 20181111. The SQL was mostly auto-generated, so I've included the SQL I used to generate the statements to make it easy to update in future.

I would be happy to send a pull request creating the relevant migration file in ruby if this is something that's likely to be merged?

BEGIN;
SET search_path TO ctgov, public;

-- Set nct_id as the primary key on studies, using the existing unique index
ALTER TABLE studies ADD CONSTRAINT pk_studies PRIMARY KEY USING INDEX index_studies_on_nct_id;

-- Set referencing columns non-null. (Optional)
-- select 'ALTER TABLE ' || relname || ' ALTER COLUMN ' || attname || ' SET NOT NULL;' from pg_class inner join pg_attribute on (pg_class.oid = pg_attribute.attrelid) where relnamespace = (select oid from pg_namespace where nspname = 'ctgov') and relkind='r' and attname LIKE '%\_id' and not attnotnull;
ALTER TABLE browse_conditions ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE design_outcomes ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE id_information ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE interventions ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE sponsors ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE baseline_counts ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE baseline_counts ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE baseline_measurements ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE baseline_measurements ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE brief_summaries ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE browse_interventions ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE calculated_values ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE central_contacts ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE conditions ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE countries ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE design_group_interventions ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE design_group_interventions ALTER COLUMN design_group_id SET NOT NULL;
ALTER TABLE design_group_interventions ALTER COLUMN intervention_id SET NOT NULL;
ALTER TABLE design_groups ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE designs ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE detailed_descriptions ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE documents ALTER COLUMN nct_id SET NOT NULL;
-- ALTER TABLE documents ALTER COLUMN document_id SET NOT NULL; -- Disabled because isn't a FK reference
ALTER TABLE drop_withdrawals ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE drop_withdrawals ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE eligibilities ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE facilities ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE facility_contacts ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE facility_contacts ALTER COLUMN facility_id SET NOT NULL;
ALTER TABLE facility_investigators ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE facility_investigators ALTER COLUMN facility_id SET NOT NULL;
ALTER TABLE intervention_other_names ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE intervention_other_names ALTER COLUMN intervention_id SET NOT NULL;
ALTER TABLE ipd_information_types ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE keywords ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE links ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE milestones ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE milestones ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE outcome_analyses ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE outcome_analyses ALTER COLUMN outcome_id SET NOT NULL;
ALTER TABLE outcome_analysis_groups ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE outcome_analysis_groups ALTER COLUMN outcome_analysis_id SET NOT NULL;
ALTER TABLE outcome_analysis_groups ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE outcome_counts ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE outcome_counts ALTER COLUMN outcome_id SET NOT NULL;
ALTER TABLE outcome_counts ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE outcome_measurements ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE outcome_measurements ALTER COLUMN outcome_id SET NOT NULL;
ALTER TABLE outcome_measurements ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE outcomes ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE overall_officials ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE participant_flows ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE pending_results ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE reported_events ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE reported_events ALTER COLUMN result_group_id SET NOT NULL;
ALTER TABLE responsible_parties ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE result_agreements ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE result_contacts ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE result_groups ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE studies ALTER COLUMN nct_id SET NOT NULL;
ALTER TABLE study_references ALTER COLUMN nct_id SET NOT NULL;

-- Create foreign key references for all relevant relations so they can be auto-discovered by various tools. ("ON DELETE CASCADE" optional, perhaps "ON DELETE RESTRICT" more appropriate?)
-- select 'ALTER TABLE ' || local.relname || ' ADD FOREIGN KEY (' || attname || ') REFERENCES ' || (case attname when 'nct_id' then 'studies' when 'result_group_id' then 'result_groups' when 'design_group_id' then 'design_groups' when 'intervention_id' then 'interventions' when 'document_id' then 'documents' when 'facility_id' then 'facilities' when 'outcome_id'  then 'outcomes' when 'outcome_analysis_id' then 'outcome_analyses' else '???' end)  || ' ON DELETE CASCADE;' from pg_class local inner join pg_attribute on (pg_attribute.attrelid = local.oid and pg_attribute.attnum >= 1) where relnamespace = (select oid from pg_namespace where nspname = 'ctgov') and relkind='r' and attname LIKE '%\_id';
ALTER TABLE browse_conditions ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE design_outcomes ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE id_information ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE interventions ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE sponsors ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE baseline_counts ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE baseline_counts ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE baseline_measurements ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE baseline_measurements ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE brief_summaries ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE browse_interventions ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE calculated_values ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE central_contacts ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE conditions ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE countries ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE design_group_interventions ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE design_group_interventions ADD FOREIGN KEY (design_group_id) REFERENCES design_groups ON DELETE CASCADE;
ALTER TABLE design_group_interventions ADD FOREIGN KEY (intervention_id) REFERENCES interventions ON DELETE CASCADE;
ALTER TABLE design_groups ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE designs ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE detailed_descriptions ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE documents ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
-- ALTER TABLE documents ADD FOREIGN KEY (document_id) REFERENCES documents ON DELETE CASCADE; -- Disabled because isn't a FK reference
ALTER TABLE drop_withdrawals ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE drop_withdrawals ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE eligibilities ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE facilities ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE facility_contacts ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE facility_contacts ADD FOREIGN KEY (facility_id) REFERENCES facilities ON DELETE CASCADE;
ALTER TABLE facility_investigators ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE facility_investigators ADD FOREIGN KEY (facility_id) REFERENCES facilities ON DELETE CASCADE;
ALTER TABLE intervention_other_names ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE intervention_other_names ADD FOREIGN KEY (intervention_id) REFERENCES interventions ON DELETE CASCADE;
ALTER TABLE ipd_information_types ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE keywords ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE links ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE milestones ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE milestones ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE outcome_analyses ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE outcome_analyses ADD FOREIGN KEY (outcome_id) REFERENCES outcomes ON DELETE CASCADE;
ALTER TABLE outcome_analysis_groups ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE outcome_analysis_groups ADD FOREIGN KEY (outcome_analysis_id) REFERENCES outcome_analyses ON DELETE CASCADE;
ALTER TABLE outcome_analysis_groups ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE outcome_counts ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE outcome_counts ADD FOREIGN KEY (outcome_id) REFERENCES outcomes ON DELETE CASCADE;
ALTER TABLE outcome_counts ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE outcome_measurements ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE outcome_measurements ADD FOREIGN KEY (outcome_id) REFERENCES outcomes ON DELETE CASCADE;
ALTER TABLE outcome_measurements ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE outcomes ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE overall_officials ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE participant_flows ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE pending_results ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE reported_events ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE reported_events ADD FOREIGN KEY (result_group_id) REFERENCES result_groups ON DELETE CASCADE;
ALTER TABLE responsible_parties ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE result_agreements ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE result_contacts ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
ALTER TABLE result_groups ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;
-- ALTER TABLE studies ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE; -- Disabled: shouldn't reference itself!
ALTER TABLE study_references ADD FOREIGN KEY (nct_id) REFERENCES studies ON DELETE CASCADE;

COMMIT;

Thanks again!

benjie commented 5 years ago

I'm going to add some notes below on other ways the schema could be improved by adding constraints; I hope that's okay!

study --- participant_flows is a one-to-one relationship:

ALTER TABLE participant_flows ADD UNIQUE (nct_id);
benjie commented 5 years ago

Some of the foreign keys are missing indexes:

CREATE INDEX ON reported_events (result_group_id);
CREATE INDEX ON outcome_measurements (result_group_id);
CREATE INDEX ON outcome_measurements (outcome_id);
CREATE INDEX ON baseline_measurements (result_group_id);
CREATE INDEX ON design_group_interventions (intervention_id);
CREATE INDEX ON design_group_interventions (design_group_id);
CREATE INDEX ON outcome_counts (outcome_id);
CREATE INDEX ON outcome_counts (result_group_id);
CREATE INDEX ON milestones (result_group_id);
CREATE INDEX ON outcome_analysis_groups (result_group_id);
CREATE INDEX ON outcome_analysis_groups (outcome_analysis_id);
CREATE INDEX ON facility_contacts (facility_id);
CREATE INDEX ON drop_withdrawals (result_group_id);
CREATE INDEX ON intervention_other_names (intervention_id);
CREATE INDEX ON facility_investigators (facility_id);
CREATE INDEX ON outcome_analyses (outcome_id);
CREATE INDEX ON baseline_counts (result_group_id);
Generated with... ```sql WITH indexed_tables AS ( select ns.nspname, t.relname as table_name, i.relname as index_name, array_to_string(array_agg(a.attname), ', ') as column_names, ix.indrelid, string_to_array(ix.indkey::text, ' ')::smallint[] as indkey FROM pg_class i JOIN pg_index ix ON i.OID = ix.indrelid JOIN pg_class t ON ix.indrelid = t.oid JOIN pg_namespace ns ON ns.oid = t.relnamespace JOIN pg_attribute a ON a.attrelid = t.oid where a.attnum = ANY(ix.indkey) and t.relkind = 'r' and nspname not in ('pg_catalog') group by ns.nspname, t.relname, i.relname, ix.indrelid, ix.indkey order by ns.nspname, t.relname, i.relname, ix.indrelid, ix.indkey ) SELECT 'CREATE INDEX ON ' || conrelid::regclass || ' (' || regexp_replace( regexp_replace( pg_get_constraintdef(pgc.oid), '\).*$', '' ), '^[^(]+\(', '' ) || ');' FROM pg_constraint pgc JOIN pg_class ON (conrelid = pg_class.oid) WHERE contype = 'f' AND NOT EXISTS( SELECT 1 FROM indexed_tables WHERE indrelid = conrelid AND conkey = indkey OR (array_length(indkey, 1) > 1 AND indkey @> conkey) ) ORDER BY reltuples DESC; ```
tibbs001 commented 5 years ago

This is great! Thanks so much. You're absolutely right - this is something we need to do. We've added your notes to a jira ticket and intend to do this in the next release (after the one that's about to go out). Really appreciate your input. Thank you.