Closed NickCEBM closed 6 years ago
Some updates to the queries, now implemented on the full dataset:
SELECT
TRIM(json_EXTRACT(json,
"$.clinical_study.id_info.nct_id"), '"') AS nct_id,
TRIM(json_EXTRACT(json,
"$.clinical_study.study_type"), '"') AS study_type,
TRIM(json_EXTRACT(json,
"$.clinical_study.overall_status"), '"') AS study_status,
TRIM(TRIM(json_EXTRACT(json,
"$.clinical_study.phase"), '"')) AS phase,
json_EXTRACT(json,
"$.clinical_study.intervention") AS intervention_type,
IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.start_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.start_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.start_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)) AS start_date,
Case when json_extract(json,
"$.clinical_study.primary_completion_date.text") is null then (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY))) else
( IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY))) end as available_completion_date,
IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)) AS primary_completion_date,
IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)) AS completion_date,
TRIM(json_EXTRACT(json,
"$.clinical_study.study_design_info.primary_purpose"), '"') AS primary_purpose,
TRIM(json_EXTRACT(json,
"$.clinical_study.oversight_info.is_fda_regulated_drug"), '"') AS fda_reg_drug,
TRIM(json_EXTRACT(json,
"$.clinical_study.oversight_info.is_fda_regulated_device"), '"') AS fda_reg_device,
json_EXTRACT(json,
"$.clinical_study.is_us_export") AS exported,
json_EXTRACT(json,
"$.clinical_study.location") AS study_location,
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.results_first_submitted")) AS results_submitted_date,
CASE
WHEN json_EXTRACT(json, "$.clinical_study.clinical_results") IS NOT NULL THEN 1
ELSE 0
END AS has_results,
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.disposition_first_submitted")) AS certificate_date,
TRIM(json_EXTRACT(json,
"$.clinical_study.location_countries"), '"') AS location,
TRIM(json_EXTRACT(json,
"$.clinical_study.sponsors.lead_sponsor.agency"), '"') AS sponsor,
TRIM(json_EXTRACT(json,
"$.clinical_study.official_title"), '"') AS title,
TRIM(JSON_EXTRACT(json,
"$.clinical_study.required_header.url"), '"') AS url,
json_EXTRACT(json,
"$.clinical_study.condition") AS condition,
json_EXTRACT(json,
"$.clinical_study.condition_browse") AS condition_mesh,
json_EXTRACT(json,
"$.clinical_study.intervention") AS intervention,
json_EXTRACT(json,
"$.clinical_study.intervention_browse") AS intervention_mesh
FROM
clinicaltrials.clinicaltrials_json
SELECT
nct_id,
CASE
WHEN study_type = 'Interventional'
AND (fda_reg_drug = 'Yes' OR fda_reg_device = 'Yes')
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (start_date >= '2017-01-18')
AND study_status <> 'Withdrawn' THEN 1
ELSE 0
END AS act_flag,
CASE
WHEN study_type = 'Interventional'
AND (regexp_contains(intervention, '"Biological"') OR regexp_contains(intervention, '"Drug"')
OR regexp_contains(intervention, '"Device"') OR regexp_contains(intervention, '"Genetic"') OR regexp_contains(intervention, '"Radiation"') OR regexp_contains(intervention, '"Combination Product"') OR regexp_contains(intervention, '"Diagnostic Test"'))
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (available_completion_date >= '2017-01-18')
AND (start_date < '2017-01-18')
AND study_status <> 'Withdrawn'
AND (regexp_contains(location, concat("\\b", "United States", "\\b"))
OR regexp_contains(location, concat("\\b", "American Samoa", "\\b"))
OR regexp_contains(location, concat("\\b", "Guam", "\\b"))
OR regexp_contains(location, concat("\\b", "Northern Mariana Islands", "\\b"))
OR regexp_contains(location, concat("\\b", "Puerto Rico", "\\b"))
OR regexp_contains(location, concat("\\b", "U.S. Virgin Islands", "\\b")))
THEN 1
ELSE 0
END AS included_pact_flag,
location,
exported,
phase,
start_date,
available_completion_date,
case when primary_completion_date is not null then 1 else 0 end as primary_completion_date_used,
has_results,
results_submitted_date,
case when certificate_date is not null then 1 else 0 end as has_certificate,
certificate_date,
case when
#sets the deadline for results as 1 year + 30 days from completion date
(Date_Add(Date_Add(available_completion_date, INTERVAL 1 YEAR), INTERVAL 30 DAY) < current_date())
AND
#checks to see if it is an ACT
(
((study_type = 'Interventional')
AND (fda_reg_drug = 'Yes' OR fda_reg_device = 'Yes')
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (start_date >= '2017-01-18')
AND (study_status <> 'Withdrawn'))
OR
#checks to see if it's a pACT
((study_type = 'Interventional')
AND (regexp_contains(intervention, '"Biological"') OR regexp_contains(intervention, '"Drug"')
OR regexp_contains(intervention, '"Device"') OR regexp_contains(intervention, '"Genetic"') OR regexp_contains(intervention, '"Radiation"') OR regexp_contains(intervention, '"Combination Product"') OR regexp_contains(intervention, '"Diagnostic Test"'))
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (available_completion_date >= '2017-01-18')
AND (start_date < '2017-01-18')
AND (study_status <> 'Withdrawn')
AND (regexp_contains(location, concat("\\b", "United States", "\\b"))
OR regexp_contains(location, concat("\\b", "American Samoa", "\\b"))
OR regexp_contains(location, concat("\\b", "Guam", "\\b"))
OR regexp_contains(location, concat("\\b", "Northern Mariana Islands", "\\b"))
OR regexp_contains(location, concat("\\b", "Puerto Rico", "\\b"))
OR regexp_contains(location, concat("\\b", "U.S. Virgin Islands", "\\b"))))
)
#checks to see if it has a certificate of exemption or if it's 3 years + 30 days after the primary completion date in which it's due no matter what
AND (certificate_date is null OR (Date_ADD(Date_ADD(available_completion_date, Interval 3 YEAR), Interval 30 DAY) < current_date()))
then 1 else 0 end as results_due,
study_status,
study_type,
primary_purpose,
fda_reg_drug,
fda_reg_device,
sponsor,
sponsor_type,
url,
title,
condition,
condition_mesh,
intervention,
intervention_mesh
FROM
`clinicaltrials.full_data_extract`
Mainly have pulled in some info on condition and intervention that we may want to integrate into the website in the long-run. Will discuss at datalab day tomorrow.
First query to extract data from JSON:
SELECT
TRIM(json_EXTRACT(json,
"$.clinical_study.id_info.nct_id"), '"') AS nct_id,
TRIM(json_EXTRACT(json,
"$.clinical_study.study_type"), '"') AS study_type,
TRIM(json_EXTRACT(json,
"$.clinical_study.overall_status"), '"') AS study_status,
TRIM(TRIM(json_EXTRACT(json,
"$.clinical_study.phase"), '"')) AS phase,
json_EXTRACT(json,
"$.clinical_study.intervention") AS intervention_type,
CASE
WHEN json_EXTRACT(json, "$.clinical_study.start_date.text") IS NOT NULL THEN (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json, "$.clinical_study.start_date.text"), r"\d,"), PARSE_DATE("%B %e, %Y", JSON_EXTRACT_SCALAR(json, "$.clinical_study.start_date.text")), DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y", JSON_EXTRACT_SCALAR(json, "$.clinical_study.start_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)))
ELSE (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.start_date"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.start_date")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.start_date")), INTERVAL 1 MONTH), INTERVAL 1 DAY)))
END AS start_date,
CASE
WHEN json_EXTRACT(json, "$.clinical_study.primary_completion_date.text") IS NULL THEN
(CASE
WHEN json_EXTRACT(json, "$.clinical_study.completion_date.text") IS NOT NULL THEN (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json, "$.clinical_study.completion_date.text"), r"\d,"), PARSE_DATE("%B %e, %Y", JSON_EXTRACT_SCALAR(json, "$.clinical_study.completion_date.text")), DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y", JSON_EXTRACT_SCALAR(json, "$.clinical_study.completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)))
ELSE (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date")), INTERVAL 1 MONTH), INTERVAL 1 DAY))) END)
ELSE ( IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)))
END AS available_completion_date,
IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.primary_completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)) AS primary_completion_date,
CASE
WHEN json_EXTRACT(json, "$.clinical_study.completion_date.text") IS NOT NULL THEN (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json, "$.clinical_study.completion_date.text"), r"\d,"), PARSE_DATE("%B %e, %Y", JSON_EXTRACT_SCALAR(json, "$.clinical_study.completion_date.text")), DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y", JSON_EXTRACT_SCALAR(json, "$.clinical_study.completion_date.text")), INTERVAL 1 MONTH), INTERVAL 1 DAY)))
ELSE (IF(REGEXP_CONTAINS(JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date"), r"\d,"),
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date")),
DATE_SUB(DATE_ADD(PARSE_DATE("%B %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.completion_date")), INTERVAL 1 MONTH), INTERVAL 1 DAY))) END AS completion_date,
TRIM(json_EXTRACT(json,
"$.clinical_study.study_design_info.primary_purpose"), '"') AS primary_purpose,
TRIM(json_EXTRACT(json,
"$.clinical_study.oversight_info.is_fda_regulated_drug"), '"') AS fda_reg_drug,
TRIM(json_EXTRACT(json,
"$.clinical_study.oversight_info.is_fda_regulated_device"), '"') AS fda_reg_device,
TRIM(json_EXTRACT(json,
"$.clinical_study.oversight_info.is_us_export"), '"') AS exported,
json_EXTRACT(json,
"$.clinical_study.location") AS study_location,
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.results_first_submitted")) AS results_submitted_date,
CASE
WHEN json_EXTRACT(json, "$.clinical_study.clinical_results") IS NOT NULL THEN 1
ELSE 0
END AS has_results,
PARSE_DATE("%B %e, %Y",
JSON_EXTRACT_SCALAR(json,
"$.clinical_study.disposition_first_submitted")) AS certificate_date,
TRIM(json_EXTRACT(json,
"$.clinical_study.location_countries"), '"') AS location,
TRIM(json_EXTRACT(json,
"$.clinical_study.sponsors.lead_sponsor.agency"), '"') AS sponsor,
TRIM(json_EXTRACT(json,
"$.clinical_study.sponsors.lead_sponsor.agency_class"), '"') AS sponsor_type,
CASE WHEN json_EXTRACT(json,
"$.clinical_study.official_title") is null then
TRIM(json_EXTRACT(json,
"$.clinical_study.brief_title"), '"')
ELSE TRIM(json_EXTRACT(json,
"$.clinical_study.official_title"), '"') end AS title,
TRIM(JSON_EXTRACT(json,
"$.clinical_study.required_header.url"), '"') AS url,
json_EXTRACT(json,
"$.clinical_study.condition") AS condition,
json_EXTRACT(json,
"$.clinical_study.condition_browse") AS condition_mesh,
json_EXTRACT(json,
"$.clinical_study.intervention") AS intervention,
json_EXTRACT(json,
"$.clinical_study.intervention_browse") AS intervention_mesh
FROM
clinicaltrials.clinicaltrials_json
Second query to create dataset with appropriate flags
SELECT
nct_id,
CASE
WHEN study_type = 'Interventional'
AND (fda_reg_drug = 'Yes' OR fda_reg_device = 'Yes')
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (start_date >= '2017-01-18')
AND study_status <> 'Withdrawn' THEN 1
ELSE 0
END AS act_flag,
CASE
WHEN study_type = 'Interventional'
AND (fda_reg_drug <> 'No' AND fda_reg_device <> 'No')
AND (regexp_contains(intervention, '"Biological"') OR regexp_contains(intervention, '"Drug"')
OR regexp_contains(intervention, '"Device"') OR regexp_contains(intervention, '"Genetic"') OR regexp_contains(intervention, '"Radiation"'))
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (available_completion_date >= '2017-01-18')
AND (start_date < '2017-01-18')
AND study_status <> 'Withdrawn'
AND (regexp_contains(location, concat("\\b", "United States", "\\b"))
OR regexp_contains(location, concat("\\b", "American Samoa", "\\b"))
OR regexp_contains(location, concat("\\b", "Guam", "\\b"))
OR regexp_contains(location, concat("\\b", "Northern Mariana Islands", "\\b"))
OR regexp_contains(location, concat("\\b", "Puerto Rico", "\\b"))
OR regexp_contains(location, concat("\\b", "U.S. Virgin Islands", "\\b")))
THEN 1
ELSE 0
END AS included_pact_flag,
location,
exported,
phase,
start_date,
available_completion_date,
case when primary_completion_date is not null then 1 else 0 end as primary_completion_date_used,
has_results,
results_submitted_date,
case when certificate_date is not null then 1 else 0 end as has_certificate,
certificate_date,
case when
#sets the deadline for results as 1 year + 30 days from completion date
(Date_Add(Date_Add(available_completion_date, INTERVAL 1 YEAR), INTERVAL 30 DAY) < current_date())
AND
#checks to see if it is an ACT
(
((study_type = 'Interventional')
AND (fda_reg_drug = 'Yes' OR fda_reg_device = 'Yes')
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (start_date >= '2017-01-18')
AND (study_status <> 'Withdrawn'))
OR
#checks to see if it's a pACT
((study_type = 'Interventional')
AND (fda_reg_drug <> 'No' AND fda_reg_device <> 'No')
AND (regexp_contains(intervention, '"Biological"') OR regexp_contains(intervention, '"Drug"')
OR regexp_contains(intervention, '"Device"') OR regexp_contains(intervention, '"Genetic"') OR regexp_contains(intervention, '"Radiation"'))
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (available_completion_date >= '2017-01-18')
AND (start_date < '2017-01-18')
AND (study_status <> 'Withdrawn')
AND (regexp_contains(location, concat("\\b", "United States", "\\b"))
OR regexp_contains(location, concat("\\b", "American Samoa", "\\b"))
OR regexp_contains(location, concat("\\b", "Guam", "\\b"))
OR regexp_contains(location, concat("\\b", "Northern Mariana Islands", "\\b"))
OR regexp_contains(location, concat("\\b", "Puerto Rico", "\\b"))
OR regexp_contains(location, concat("\\b", "U.S. Virgin Islands", "\\b"))))
)
#checks to see if it has a certificate of exemption or if it's 3 years + 30 days after the primary completion date in which it's due no matter what
AND (certificate_date is null OR (Date_ADD(Date_ADD(available_completion_date, Interval 3 YEAR), Interval 30 DAY) < current_date()))
then 1 else 0 end as results_due,
study_status,
study_type,
primary_purpose,
fda_reg_drug,
fda_reg_device,
sponsor,
sponsor_type,
url,
title,
condition,
condition_mesh,
intervention,
intervention_mesh
FROM
`clinicaltrials.full_data_extract`
This is talking through the queries detailed in the above comment. The ultimate basis for all of this is the Final Rule, however the requirements of the final rule are noted in simplified form elsewhere.
The first query only contains the data extractions. The extraction for dates gets a little messy because sometimes they are nested and other times they are not (depending on which date field) so there's some logic to sort that out. Two notes on that:
primary_completion_date
but if that isn't available, we take completion_date
if there is one. This creates the field available_completion_date
. I've created a flag to note which one we use just as a sanity check/FYI. It's not that important.Other than that, the query should be pretty straight forward. I recommend this as a resource of you would like to visualize any of the raw JSON to see the structure of the data in our database and see how I arrived at certain extractions.
For the 2nd query, quite a bit of that is just pulling in fields from the JSON extract table but here are descriptions below for when that isn't the case:
CASE
WHEN study_type = 'Interventional'
AND (fda_reg_drug = 'Yes' OR fda_reg_device = 'Yes')
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (start_date >= '2017-01-18')
AND study_status <> 'Withdrawn' THEN 1
ELSE 0
END AS act_flag,
This creates the ACT flag. The rationale for this logic is from the Final Rule but described here and here.
As of now, the field which described whether a study involves a U.S. FDA IND or IDE is not made publicly available. We are asking ClinicalTrials.gov about this but we are not hopeful it will be made public. In discussions with some other folks, they have told us that the FDA Regulated Drug/Device field cannot be answered affirmatively unless one of the three criteria from question 2 (US location, IND/IDE, Exported) is also a yes. We also have a question out to ClinicalTrials.gov about this.
As a consequence, we are currently making an assumption on the ACT status of ~627 trials (ones that do not have a US location and are not exported but are FDA regulated) are ACTs and are assuming they are covered by an IND/IDE. We're looking into this a bit more but that's the current situation although that might change.
Update: Our ACT flag should be OK per this communication from ClincialTrials.gov -
If you enter no US locations, and answered NO to the question Product Exported from U.S and you answered YES to either U.S. FDA-regulated Drug or U.S. FDA-regulated Device, then you would get the following error. ERROR: U.S. FDA-regulated Drug cannot be 'Yes' unless this study is an IND study, has one or more U.S. Locations, or is a study of a drug that is exported from the U.S. ClinicalTrials.gov
CASE
WHEN study_type = 'Interventional'
AND (fda_reg_drug <> 'No' AND fda_reg_device <> 'No')
AND (regexp_contains(intervention, '"Biological"') OR regexp_contains(intervention, '"Drug"')
OR regexp_contains(intervention, '"Device"') OR regexp_contains(intervention, '"Genetic"') OR regexp_contains(intervention, '"Radiation"') OR regexp_contains(intervention, '"Combination Product"') OR regexp_contains(intervention, '"Diagnostic Test"'))
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (available_completion_date >= '2017-01-18')
AND (start_date < '2017-01-18')
AND study_status <> 'Withdrawn'
AND (regexp_contains(location, concat("\\b", "United States", "\\b"))
OR regexp_contains(location, concat("\\b", "American Samoa", "\\b"))
OR regexp_contains(location, concat("\\b", "Guam", "\\b"))
OR regexp_contains(location, concat("\\b", "Northern Mariana Islands", "\\b"))
OR regexp_contains(location, concat("\\b", "Puerto Rico", "\\b"))
OR regexp_contains(location, concat("\\b", "U.S. Virgin Islands", "\\b")))
THEN 1
ELSE 0
END AS included_pact_flag,
The pACT flag logic largely comes from here as well as the following passage from the Final Rule itself (page 65031):
The Agency used the term “probable applicable clinical trials” (pACTs) to refer to the estimated number of clinical trials subject to section 402(j) of the PHS Act prior to the effective date of the rule. This approach relied on the set of clinical trial registration data elements available prior to enactment of the final rule, but did not include all of the data elements necessary to determine which studies are applicable clinical trials as specified in § 11.22(b) of the final rule. The pACTs were defined as records listing an “interventional” Study Type; with at least one Intervention Type as “Biological,” “Drug,” “Device,” “Genetic,” or “Radiation;” a Study Phase other than “Phase 0” or “Phase 1;” a Primary Completion Date on or after January 2008 or, if the Primary Completion Date was missing, a Study Completion Date on or after January 2008, or any record for which both the Primary Completion Date and the Study Completion Date are missing; an Overall Recruitment Status other than “Withdrawn,” and at least one Facility Location Country in the “United States” or if none, indication that the study is conducted under an FDA IND or IDE.
However, we don't care about all pACTs. We only care about pACTs that have a completion date on or after the final rule went into effect (January 18, 2017). ~I also believe, based on my reading of the final rule, that we should not include trials with start dates prior to September 27, 2007 which I base on the table on page 65121 of the Final Rule below~ After consulting with some colleagues about this, we now believe that the this September 2007 criteria is not relevant to us.
~Note, we are calling something with a Null location that meets all the other criteria a pACT because we don't know about IND/IDE. Getting more info on this but we can decide that one way or another before launch.~ We have decided to exclude trials with a null location criteria from our pACT flag.
UPDATE: We are now including the FDA Regulated Drug/Device
field in our pACT flag to account for trials that started before the effective date but have since updated their record to include this data.
case when
#sets the deadline for results as 1 year + 30 days from completion date
(Date_Add(Date_Add(available_completion_date, INTERVAL 1 YEAR), INTERVAL 30 DAY) < current_date())
AND
#checks to see if it is an ACT
(
((study_type = 'Interventional')
AND (fda_reg_drug = 'Yes' OR fda_reg_device = 'Yes')
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (start_date >= '2017-01-18')
AND (study_status <> 'Withdrawn'))
OR
#checks to see if it's a pACT
((study_type = 'Interventional')
AND (fda_reg_drug <> 'No' AND fda_reg_device <> 'No')
AND (regexp_contains(intervention, '"Biological"') OR regexp_contains(intervention, '"Drug"')
OR regexp_contains(intervention, '"Device"') OR regexp_contains(intervention, '"Genetic"') OR regexp_contains(intervention, '"Radiation"') OR regexp_contains(intervention, '"Combination Product"') OR regexp_contains(intervention, '"Diagnostic Test"'))
AND (phase = 'Phase 1/Phase 2' OR phase = 'Phase 2' OR Phase = 'Phase 2/Phase 3' or phase = 'Phase 3' or phase = 'Phase 4' or phase = 'N/A')
AND (primary_purpose <> 'Device Feasibility')
AND (available_completion_date >= '2017-01-18')
AND (start_date < '2017-01-18')
AND (study_status <> 'Withdrawn')
AND (regexp_contains(location, concat("\\b", "United States", "\\b"))
OR regexp_contains(location, concat("\\b", "American Samoa", "\\b"))
OR regexp_contains(location, concat("\\b", "Guam", "\\b"))
OR regexp_contains(location, concat("\\b", "Northern Mariana Islands", "\\b"))
OR regexp_contains(location, concat("\\b", "Puerto Rico", "\\b"))
OR regexp_contains(location, concat("\\b", "U.S. Virgin Islands", "\\b"))))
)
#checks to see if it has a certificate of exemption or if it's 3 years + 30 days after the primary completion date in which it's due no matter what
AND (certificate_date is null OR (Date_ADD(Date_ADD(available_completion_date, Interval 3 YEAR), Interval 30 DAY) < current_date()))
then 1 else 0 end as results_due
I've added some comments in here to walk through the query already, as it's a little messy but basically the query is broken down into 3 parts.
Let me know if you have any questions.
Full version of SQL now stored (and deployed from) here: https://github.com/ebmdatalab/clinicaltrials-act-tracker/blob/master/view.sql
Draft Queries
The following code generates the data we would need to create a tracking website similar to what we have for EUCTR:
First we must extract the fields we need from the JSON. We could create our final dataset directly from the JSON but that would be extremely messy (and difficult to do cross-checking and testing on) so I've created a workable table first and then created what would be our final dataset from that. Note that as of now, these are all drawing from our sub-set of test data
clinicaltrials_json_extract
and not the full dataset. We can turn everything into 1 query at a later date if we want.The first query to get all the data we need into a table is as follows:
We can then pull or create all the fields we would need to create a website similar to EUCTR:
Notes and Next Steps
delay_notice_check
has the candidate fields). I'm fairly certain we have everything we would need to get the website up and running unless there's other fields we decide we want to include.