theislab / ehrapy

Electronic Health Record Analysis with Python.
https://ehrapy.readthedocs.io/
Apache License 2.0
237 stars 19 forks source link

Integrate Synthea or create own fake data generator #129

Open Zethson opened 3 years ago

Zethson commented 3 years ago

https://github.com/synthetichealth/synthea

We should try to wrap it in Python (while checking for JDK etc).

The output should default to CSV, but other formats should be supported as well.

CC #102

Zethson commented 3 years ago

java -jar synthea-with-dependencies.jar -p 5 -c synthea.properties

⋊> ~/Desktop cat synthea.properties                                                                                                                                           
exporter.ccda.export = false
exporter.fhir.export = true
exporter.csv.export = true

This generates CSV files.

Zethson commented 2 years ago

This could also be a cool option: https://github.com/ratschlab/HIRID-ICU-Benchmark/blob/master/icu_benchmarks/synthetic_data/generate_simple_fake_data.py

Zethson commented 1 year ago
import random
import uuid
from datetime import datetime, timedelta
from faker import Faker

# Shared Faker instance used by generate_patient() for dates, addresses and phone numbers.
fake = Faker()

# Define some example FHIR resources and their fields
# NOTE: these three *_fields lists are not referenced by any of the generator
# functions in this snippet; they only enumerate the keys each resource carries.
patient_fields = ['id', 'birthDate', 'gender', 'address', 'phone']
observation_fields = ['id', 'status', 'category', 'code', 'subject', 'issued', 'valueQuantity']
condition_fields = ['id', 'clinicalStatus', 'category', 'code', 'subject', 'onsetDateTime']

# Define some example values for FHIR resource fields
# The generators below pick from these pools with random.choice().
gender_values = ['male', 'female', 'other']
# Used for both Observation and Condition categories.
category_values = ['vital-signs', 'laboratory', 'imaging', 'procedure']
# Placeholder codes, emitted under both the LOINC and the SNOMED CT system URLs.
code_values = ['1000001', '1000002', '1000003', '1000004', '1000005']
clinical_status_values = ['active', 'resolved', 'remission', 'relapse']

def generate_patient():
    """
    Generate a fake FHIR Patient resource.

    Returns:
        dict: A Patient resource with a random UUID ``id``, an ISO-8601
        ``birthDate`` string, a ``gender`` drawn from ``gender_values`` and a
        Faker-generated ``address`` and ``phone``.
    """
    patient = {
        'resourceType': 'Patient',
        'id': str(uuid.uuid4()),
        # Faker returns a ``datetime.date``; serialise it so the resource is
        # JSON-serialisable and consistent with the other date fields in this
        # snippet ('issued', 'onsetDateTime'), which are ISO strings.
        'birthDate': fake.date_of_birth().isoformat(),
        'gender': random.choice(gender_values),
        'address': {
            'line': fake.street_address(),
            'city': fake.city(),
            'state': fake.state_abbr(),
            'postalCode': fake.zipcode()
        },
        'phone': fake.phone_number()
    }
    return patient

def generate_observation(patient_id):
    """
    Build a fake FHIR Observation resource for one patient.

    Args:
        patient_id: Identifier interpolated into the ``Patient/<id>`` subject
            reference.

    Returns:
        dict: An Observation with status ``final``, a random category and code
        drawn from the module-level pools, the current time as ``issued`` and
        an integer ``valueQuantity`` between 1 and 100 (inclusive) in mmHg.
    """
    resource_id = str(uuid.uuid4())

    # Random draws are kept in the same order as the literal they replace.
    category_coding = {
        'system': 'http://terminology.hl7.org/CodeSystem/observation-category',
        'code': random.choice(category_values)
    }
    loinc_coding = {
        'system': 'http://loinc.org',
        'code': random.choice(code_values),
        'display': 'Example Code'
    }
    issued_at = datetime.now().isoformat()
    quantity = {
        'value': random.randint(1, 100),
        'unit': 'mmHg',
        'system': 'http://unitsofmeasure.org',
        'code': 'mm[Hg]'
    }

    return {
        'resourceType': 'Observation',
        'id': resource_id,
        'status': 'final',
        'category': {'coding': [category_coding]},
        'code': {'coding': [loinc_coding]},
        'subject': {'reference': f'Patient/{patient_id}'},
        'issued': issued_at,
        'valueQuantity': quantity
    }

def generate_condition(patient_id):
    """
    Build a fake FHIR Condition resource for one patient.

    Args:
        patient_id: Identifier interpolated into the ``Patient/<id>`` subject
            reference.

    Returns:
        dict: A Condition with a random clinical status, category and code
        drawn from the module-level pools, and an ``onsetDateTime`` between
        1 and 3650 days before the current time.
    """
    resource_id = str(uuid.uuid4())

    # Random draws are kept in the same order as the literal they replace.
    status = random.choice(clinical_status_values)
    category_coding = {
        'system': 'http://terminology.hl7.org/CodeSystem/condition-category',
        'code': random.choice(category_values)
    }
    snomed_coding = {
        'system': 'http://snomed.info/sct',
        'code': random.choice(code_values),
        'display': 'Example Code'
    }
    days_ago = timedelta(days=random.randint(1, 3650))
    onset = (datetime.now() - days_ago).isoformat()

    return {
        'resourceType': 'Condition',
        'id': resource_id,
        'clinicalStatus': status,
        'category': {'coding': [category_coding]},
        'code': {'coding': [snomed_coding]},
        'subject': {'reference': f'Patient/{patient_id}'},
        'onsetDateTime': onset
    }

# Generate some example FHIR data: one patient plus one observation and one
# condition that both reference that patient's id.
patient = generate_patient()
observation = generate_observation(patient['id'])
condition = generate_condition(patient['id'])

print(patient)
print(observation)
# The condition was generated but never shown; print it like the other resources.
print(condition)
Zethson commented 1 year ago

More complex

import random
import uuid
from datetime import datetime, timedelta
from faker import Faker
from itertools import cycle

# Shared Faker instance used by the generator functions below.
fake = Faker()

# Define some example FHIR resources and their fields
# NOTE: none of the *_fields lists is referenced by the generator functions
# visible in this snippet; they only enumerate resource keys.
patient_fields = ['id', 'birthDate', 'gender', 'address', 'phone']
observation_fields = ['id', 'status', 'category', 'code', 'subject', 'issued', 'valueQuantity']
condition_fields = ['id', 'clinicalStatus', 'category', 'code', 'subject', 'onsetDateTime']
encounter_fields = ['id', 'status', 'class', 'type', 'subject', 'period', 'participant']
medication_request_fields = ['id', 'status', 'medication', 'subject', 'authoredOn', 'dosageInstruction']
medication_dispense_fields = ['id', 'status', 'medicationReference', 'subject', 'whenPrepared', 'dosageInstruction']
procedure_fields = ['id', 'status', 'code', 'subject', 'performedDateTime', 'performer']

# Define some example values for FHIR resource fields
# The generators pick from these pools with random.choice(). Several pools
# (e.g. race_values, ethnicity_values, procedure_*) are not referenced by any
# generator function visible in this snippet.
gender_values = ['male', 'female', 'other']
race_values = ['white', 'black', 'asian', 'hispanic', 'other']
ethnicity_values = ['nonhispanic', 'hispanic', 'unknown']
category_values = ['vital-signs', 'laboratory', 'imaging', 'procedure']
code_values = ['1000001', '1000002', '1000003', '1000004', '1000005']
clinical_status_values = ['active', 'resolved', 'remission', 'relapse']
encounter_status_values = ['planned', 'arrived', 'triaged', 'in-progress', 'on-leave', 'finished', 'cancelled']
encounter_class_values = ['ambulatory', 'emergency', 'inpatient', 'outpatient', 'urgentcare']
encounter_type_values = ['office-visit', 'emergency', 'inpatient', 'outpatient', 'urgentcare']
medication_status_values = ['active', 'completed', 'cancelled', 'on-hold', 'stopped', 'draft']
procedure_status_values = ['preparation', 'in-progress', 'not-done', 'on-hold', 'stopped', 'completed', 'entered-in-error', 'unknown']
procedure_code_values = ['1010001', '1010002', '1010003', '1010004', '1010005']
procedure_performer_values = ['primary', 'assistant', 'nurse', 'technician']

# Define some example medications and their dosages
# This is a dict keyed by medication name, not a list. The 'dosage' entries
# are not read by generate_medication_request(), which draws random timing
# values instead.
medication_values = {
    'atorvastatin': {
        'dosage': {
            'quantity': 1,
            'unit': 'tablet',
            'frequency': 1,
            'period': 'day'
        }
    },
    'metoprolol': {
        'dosage': {
            'quantity': 1,
            'unit': 'tablet',
            'frequency': 2,
            'period': 'day'
        }
    },
    'lisinopril': {
        'dosage': {
            'quantity': 1,
            'unit': 'tablet',
            'frequency': 1,
            'period': 'day'
        }
    }
}

def generate_patient():
    """
    Generate a fake FHIR Patient resource.

    Returns:
        dict: A Patient resource with a random UUID ``id``, ``meta`` and
        ``text`` stubs, an ISO-8601 ``birthDate``, a ``gender`` drawn from
        ``gender_values``, and Faker-generated ``address`` and ``phone``
        entries (each a one-element list).
    """
    patient = {
        'resourceType': 'Patient',
        'id': str(uuid.uuid4()),
        'meta': {
            'versionId': '1',
            # NOTE(review): the pasted snippet was cut off right after the
            # 'lastUpdated' key. The generation time is assumed here, in the
            # same format the snippet uses for Observation 'issued' - confirm.
            'lastUpdated': datetime.now().isoformat()
        },
        'text': {
            'status': 'generated',
            'div': '<div xmlns="http://www.w3.org/1999/xhtml"></div>'
        },
        'birthDate': fake.date_of_birth().isoformat(),
        'gender': random.choice(gender_values),
        'address': [{
            'use': 'home',
            'type': 'postal',
            'text': fake.address(),
            'line': [fake.street_address()],
            'city': fake.city(),
            'district': fake.state(),
            'postalCode': fake.postcode(),
            'country': fake.country()
        }],
        'phone': [{
            'system': 'phone',
            'value': fake.phone_number(),
            'use': 'home'
        }]
    }

    return patient

def generate_observation(patient_id, category=None, code=None, value=None):
    """
    Generate a fake FHIR Observation resource.

    Args:
        patient_id: Identifier interpolated into the ``Patient/<id>`` subject
            reference.
        category: Observation category code; a random entry of
            ``category_values`` when omitted or empty.
        code: Observation code; a random entry of ``code_values`` when
            omitted or empty.
        value: Numeric measurement; a random float in [1, 100] rounded to two
            decimals when ``None``.

    Returns:
        dict: The Observation resource.
    """
    if not category:
        category = random.choice(category_values)
    if not code:
        code = random.choice(code_values)
    # Compare against None so that an explicit measurement of 0 is kept
    # instead of being replaced by a random value.
    if value is None:
        value = round(random.uniform(1, 100), 2)
    # The pasted snippet bound this dict to the misspelt name ``bservation``,
    # which made the final ``return observation`` raise NameError.
    observation = {
        'resourceType': 'Observation',
        'id': str(uuid.uuid4()),
        'status': 'final',
        'category': {
            'coding': [{
                'system': 'http://terminology.hl7.org/CodeSystem/observation-category',
                'code': category,
                'display': category.capitalize()
            }]
        },
        'code': {
            'coding': [{
                'system': 'http://loinc.org',
                'code': code,
                'display': fake.word()
            }]
        },
        'subject': {
            'reference': f'Patient/{patient_id}'
        },
        'issued': datetime.now().isoformat(),
        'valueQuantity': {
            'value': value,
            'unit': random.choice(['mg/dL', 'mmol/L', 'kg', 'cm', 'mmHg', 'bpm'])
        }
    }

    return observation

def generate_condition(patient_id, category=None, code=None, onset=None):
    """
    Generate a fake FHIR Condition resource.

    Args:
        patient_id: Identifier interpolated into the ``Patient/<id>`` subject
            reference.
        category: Condition category code; a random entry of
            ``category_values`` when omitted or empty.
        code: Condition code; a random entry of ``code_values`` when omitted
            or empty.
        onset: ISO-8601 onset timestamp; a random moment within the last
            50 years when omitted or empty.

    Returns:
        dict: The Condition resource.
    """
    if not category:
        category = random.choice(category_values)
    if not code:
        code = random.choice(code_values)
    if not onset:
        onset = fake.date_time_between(start_date='-50y', end_date='now').isoformat()
    condition = {
        'resourceType': 'Condition',
        'id': str(uuid.uuid4()),
        'clinicalStatus': random.choice(clinical_status_values),
        'category': {
            'coding': [{
                'system': 'http://terminology.hl7.org/CodeSystem/condition-category',
                'code': category,
                'display': category.capitalize()
            }]
        },
        'code': {
            'coding': [{
                'system': 'http://snomed.info/sct',
                'code': code,
                'display': fake.word()
            }]
        },
        'subject': {
            'reference': f'Patient/{patient_id}'
        },
        'onsetDateTime': onset
    }

    return condition

def generate_encounter(patient_id, type=None, start=None, end=None):
    """
    Generate a fake FHIR Encounter resource.

    Args:
        patient_id: Identifier interpolated into the ``Patient/<id>``
            references.
        type: Encounter type code; a random entry of ``encounter_type_values``
            when omitted or empty. (The name shadows the builtin but is kept
            because it is part of the keyword interface.)
        start: ISO-8601 start timestamp; a random moment within the last
            50 years when omitted or empty.
        end: ISO-8601 end timestamp; ``start`` plus 10 to 360 minutes when
            omitted or empty.

    Returns:
        dict: The Encounter resource. ``status`` is always ``finished`` and
        ``class`` is always the ambulatory (``AMB``) coding.
    """
    if not type:
        type = random.choice(encounter_type_values)
    if not start:
        start = fake.date_time_between(start_date='-50y', end_date='now').isoformat()
    if not end:
        # Derive the end from the (possibly caller-supplied) start so the
        # period is never inverted.
        end = (datetime.fromisoformat(start) + timedelta(minutes=random.randint(10, 360))).isoformat()
    encounter = {
        'resourceType': 'Encounter',
        'id': str(uuid.uuid4()),
        'status': 'finished',
        'class': {
            'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode',
            'code': 'AMB',
            'display': 'ambulatory'
        },
        'type': [{
            'coding': [{
                'system': 'http://snomed.info/sct',
                'code': type,
                'display': type.capitalize()
            }]
        }],
        'subject': {
            'reference': f'Patient/{patient_id}'
        },
        'period': {
            'start': start,
            'end': end
        },
        'participant': [{
            'individual': {
                'reference': f'Patient/{patient_id}'
            }
        }]
    }

    return encounter

# MedicationRequest.intent codes from the FHIR value set. The pasted snippet
# referenced an ``intent_values`` name that is not defined anywhere in it.
_MEDICATION_INTENT_VALUES = (
    'proposal', 'plan', 'order', 'original-order',
    'reflex-order', 'filler-order', 'instance-order', 'option',
)

# (code, display) pairs for the administration route.
# NOTE(review): the pairing is assumed from the order of the two parallel lists
# in the original snippet - verify the codes against SNOMED CT.
_MEDICATION_ROUTES = (
    ('26643006', 'Oral'),
    ('255560000', 'Injection'),
    ('254790003', 'Topical'),
)


def generate_medication_request(patient_id, medication=None, status=None, intent=None):
    """
    Generate a fake FHIR MedicationRequest resource.

    Args:
        patient_id: Identifier interpolated into the ``Patient/<id>`` subject
            reference.
        medication: Medication code; a random key of ``medication_values``
            when omitted or empty.
        status: Request status; a random entry of
            ``medication_status_values`` when omitted or empty.
        intent: Request intent; a random FHIR intent code when omitted or
            empty.

    Returns:
        dict: The MedicationRequest resource with a single dosage instruction.
    """
    if not medication:
        # medication_values is a dict; random.choice() needs a sequence, so
        # draw from its keys (the medication names).
        medication = random.choice(list(medication_values))
    if not status:
        status = random.choice(medication_status_values)
    if not intent:
        intent = random.choice(_MEDICATION_INTENT_VALUES)
    # Draw code and display together so they always describe the same route.
    route_code, route_display = random.choice(_MEDICATION_ROUTES)
    medication_request = {
        'resourceType': 'MedicationRequest',
        'id': str(uuid.uuid4()),
        'status': status,
        'intent': intent,
        'subject': {
            'reference': f'Patient/{patient_id}'
        },
        'medicationCodeableConcept': {
            'coding': [{
                'system': 'http://www.nlm.nih.gov/research/umls/rxnorm',
                'code': medication,
                'display': fake.word()
            }]
        },
        'dosageInstruction': [{
            'sequence': 1,
            'text': fake.sentence(nb_words=6),
            'timing': {
                'repeat': {
                    'frequency': random.randint(1, 3),
                    'period': random.randint(1, 5),
                    'periodUnit': 'd'
                }
            },
            'route': {
                'coding': [{
                    'system': 'http://snomed.info/sct',
                    'code': route_code,
                    'display': route_display
                }]
            }
        }]
    }

    return medication_request

def generate_all_resources(num_patients):
    """
    Generate a list of all FHIR resources for the specified number of patients.

    For every patient this creates 5-20 observations, 1-5 conditions,
    1-3 encounters and 1-5 medication requests, all referencing that
    patient's generated id.

    Args:
        num_patients: Number of Patient resources to create.

    Returns:
        list[dict]: All patients, followed by all observations, conditions,
        encounters and medication requests.
    """
    patients = []
    observations = []
    conditions = []
    encounters = []
    medication_requests = []

    for _ in range(num_patients):
        patient = generate_patient()
        # Use the id the Patient resource actually carries. The loop counter
        # used previously never matched it, leaving every reference dangling.
        patient_id = patient['id']
        patients.append(patient)

        observations.extend(
            generate_observation(patient_id) for _ in range(random.randint(5, 20))
        )
        conditions.extend(
            generate_condition(patient_id) for _ in range(random.randint(1, 5))
        )
        encounters.extend(
            generate_encounter(patient_id) for _ in range(random.randint(1, 3))
        )
        medication_requests.extend(
            generate_medication_request(patient_id) for _ in range(random.randint(1, 5))
        )

    return patients + observations + conditions + encounters + medication_requests
Zethson commented 1 year ago
import pytest

def test_generate_all_resources():
    """Check resource counts and mandatory keys of generate_all_resources()."""
    from collections import Counter

    num_patients = 10
    resources = generate_all_resources(num_patients)

    # Per-patient counts are random, so only their documented bounds can be
    # asserted: 5-20 observations, 1-5 conditions, 1-3 encounters and
    # 1-5 medication requests, plus the patient itself.
    counts = Counter(resource['resourceType'] for resource in resources)
    assert set(counts) <= {'Patient', 'Observation', 'Condition', 'Encounter', 'MedicationRequest'}
    assert counts['Patient'] == num_patients
    assert 5 * num_patients <= counts['Observation'] <= 20 * num_patients
    assert 1 * num_patients <= counts['Condition'] <= 5 * num_patients
    assert 1 * num_patients <= counts['Encounter'] <= 3 * num_patients
    assert 1 * num_patients <= counts['MedicationRequest'] <= 5 * num_patients
    assert len(resources) == sum(counts.values())

    for resource in resources:
        if resource['resourceType'] == 'Patient':
            assert resource.get('birthDate') is not None
            assert resource.get('gender') is not None

        elif resource['resourceType'] == 'Observation':
            assert resource.get('valueQuantity') is not None
            assert resource.get('code') is not None

        elif resource['resourceType'] == 'Condition':
            assert resource.get('code') is not None
            assert resource.get('subject') is not None

        elif resource['resourceType'] == 'Encounter':
            assert resource.get('class') is not None
            assert resource.get('subject') is not None

        elif resource['resourceType'] == 'MedicationRequest':
            # The generator emits 'medicationCodeableConcept', not
            # 'medicationReference'.
            assert resource.get('medicationCodeableConcept') is not None
            assert resource.get('subject') is not None
Zethson commented 8 months ago

There is a pipeline that first uses Synthea to generate CSV files and then a Synthea R package to create OMOP files.