synthetichealth / synthea

Synthetic Patient Population Simulator
https://synthetichealth.github.io/synthea
Apache License 2.0
2.18k stars 653 forks source link

repeated translation tags #992

Closed sam-hoffman closed 2 years ago

sam-hoffman commented 2 years ago

config file:

# Starting with a properties file because it requires no additional dependencies

exporter.baseDirectory = ./output/
exporter.use_uuid_filenames = false
exporter.subfolders_by_id_substring = false
# number of years of history to keep in exported records, anything older than this may be filtered out
# set years_of_history = 0 to skip filtering altogether and keep the entire history
exporter.years_of_history = 10
# split records allows patients to have one record per provider organization
exporter.split_records = true
exporter.split_records.duplicate_data = false
exporter.ccda.export = true
exporter.fhir.export = true
exporter.fhir_stu3.export = false
exporter.fhir_dstu2.export = false
exporter.fhir.use_shr_extensions = false
exporter.fhir.use_us_core_ig = false
exporter.fhir.transaction_bundle = true
exporter.fhir.bulk_data = false
exporter.groups.fhir.export = false
exporter.hospital.fhir.export = true
exporter.hospital.fhir_stu3.export = false
exporter.hospital.fhir_dstu2.export = false
exporter.practitioner.fhir.export = true
exporter.practitioner.fhir_stu3.export = false
exporter.practitioner.fhir_dstu2.export = false
exporter.encoding = UTF-8
exporter.csv.export = true
# if exporter.csv.append_mode = true, then each run will add new data to any existing CSVs. if false, each run will clear out the files and start fresh
exporter.csv.append_mode = false
# if exporter.csv.folder_per_run = true, then each run will have CSVs placed into a unique subfolder. if false, each run will only use the top-level csv folder
exporter.csv.folder_per_run = true
exporter.cpcds.export = false
exporter.cpcds.append_mode = false
exporter.cpcds.folder_per_run = false
exporter.cpcds.single_payer = false
exporter.cdw.export = false
exporter.text.export = false
exporter.text.per_encounter_export = false
exporter.clinical_note.export = true
exporter.cost_access_outcomes_report = false
exporter.prevalence_report = true
exporter.custom_report = false
# note: prevalence and custom reports require a database (set below)
exporter.custom_report_queries_file = custom_queries.sql
# parameters for symptoms export
exporter.symptoms.csv.export = true
# selection mode of conditions or symptom export: 0 = conditions according to  exporter.years_of_history. other values = all conditions (entire history)
exporter.symptoms.mode = 0
# if exporter.symptoms.csv.append_mode = true, then each run will add new data to any existing CSVs. if false, each run will clear out the files and start fresh
exporter.symptoms.csv.append_mode = false
# if exporter.symptoms.csv.folder_per_run = true, then each run will have CSVs placed into a unique subfolder. if false, each run will only use the top-level csv folder
exporter.symptoms.csv.folder_per_run = false
exporter.symptoms.text.export = false

# the number of patients to generate, by default
# this can be overridden by passing a different value to the Generator constructor
generate.default_population = 1

generate.log_patients.detail = simple
# options are "none", "simple", or "detailed" (without quotes). defaults to simple if another value is used
# none = print nothing to the console during generation
# simple = print patient names once they are generated.
# detailed = print patient names, atributes, vital signs, etc..  May slow down processing

generate.timestep = 604800000
# time is in ms
# 1000 * 60 * 60 * 24 * 7 = 604800000

generate.database_type = file
# options are "file", "in-memory", or "none" (without quotes)
# file = database stored in a file at ./database.mv.db, and results are kept between runs
# in-memory = in-memory DB only, results not kept between runs
# none = no database, limits certain features but increases throughput

# default demographics is every city in the US
generate.demographics.default_file = geography/demographics.csv
generate.geography.zipcodes.default_file = geography/zipcodes.csv
generate.geography.country_code = US
generate.geography.timezones.default_file = geography/timezones.csv
generate.geography.foreign.birthplace.default_file = geography/foreign_birthplace.json

# Lookup Table Folder location
generate.lookup_tables = modules/lookup_tables/

# Set to true if you want every patient to be dead.
generate.only_dead_patients = false
# Set to true if you want every patient to be alive.
generate.only_alive_patients = True
# If both only_dead_patients and only_alive_patients are set to true,
# It they will both default back to false

# if true, tracks and prints out details of transition tables for each module upon completion
# note that this may significantly slow down processing, and is intended primarily for debugging
generate.track_detailed_transition_metrics = false

# If true, person names have numbers appended to them to make them more obviously fake
generate.append_numbers_to_person_names = true

# if true, the entire population will use veteran prevalence data
generate.veteran_population_override = false

# these should add up to 1.0
# weighting and categories are inspired by the following but there are no specific hard numbers to point to
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1694190/pdf/amjph00543-0042.pdf
# http://www.ncbi.nlm.nih.gov/pubmed/8122813
generate.demographics.socioeconomic.weights.income = 0.2
generate.demographics.socioeconomic.weights.education = 0.7
generate.demographics.socioeconomic.weights.occupation = 0.1

generate.demographics.socioeconomic.score.low = 0.0
generate.demographics.socioeconomic.score.middle = 0.25
generate.demographics.socioeconomic.score.high = 0.66

generate.demographics.socioeconomic.education.less_than_hs.min = 0.0
generate.demographics.socioeconomic.education.less_than_hs.max = 0.5
generate.demographics.socioeconomic.education.hs_degree.min = 0.1
generate.demographics.socioeconomic.education.hs_degree.max = 0.75
generate.demographics.socioeconomic.education.some_college.min = 0.3
generate.demographics.socioeconomic.education.some_college.max = 0.85
generate.demographics.socioeconomic.education.bs_degree.min = 0.5
generate.demographics.socioeconomic.education.bs_degree.max = 1.0

generate.demographics.socioeconomic.income.poverty = 11000
generate.demographics.socioeconomic.income.high = 75000

generate.birthweights.default_file = birthweights.csv
generate.birthweights.logging = false

# in Massachusetts, the individual insurance mandate became law in 2006
# in the US, the Affordable Care Act become law in 2010,
# and individual and employer mandates took effect in 2014.
# mandate.year will determine when individuals with an occupation score above mandate.occupation
# receive employer mandated insurance (aka "private" insurance).
# prior to mandate.year, anyone with income greater than the annual cost of an insurance plan
# will purchase the insurance.
generate.insurance.mandate.year = 2006
generate.insurance.mandate.occupation = 0.2

# Default Costs, to be used for pricing something that we don't have a specific price for
# -- $500 for procedures is completely invented
generate.costs.default_procedure_cost = 500.00
# -- $255 for medications - also invented
generate.costs.default_medication_cost = 255.00
# -- Encounters billed using avg prices from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3096340/
# -- Adjustments for initial or subsequent hospital visit and level/complexity/time of encounter
# -- not included. Assume initial, low complexity encounter (Tables 4 & 6)
generate.costs.default_encounter_cost = 125.00
# -- https://www.nytimes.com/2014/07/03/health/Vaccine-Costs-Soaring-Paying-Till-It-Hurts.html
# -- currently all vaccines cost $136.
generate.costs.default_immunization_cost = 136.00

# Providers
generate.providers.hospitals.default_file = providers/hospitals.csv
generate.providers.longterm.default_file = providers/longterm.csv
generate.providers.nursing.default_file = providers/nursing.csv
generate.providers.rehab.default_file = providers/rehab.csv
generate.providers.hospice.default_file = providers/hospice.csv
generate.providers.dialysis.default_file = providers/dialysis.csv
generate.providers.homehealth.default_file = providers/home_health_agencies.csv
generate.providers.veterans.default_file = providers/va_facilities.csv
generate.providers.urgentcare.default_file = providers/urgent_care_facilities.csv
generate.providers.primarycare.default_file = providers/primary_care_facilities.csv

# Provider selection behavior
# How patients select a provider organization:
#  nearest - select the closest provider. See generate.providers.maximum_search_distance
#  quality - select the best provider if quality is known. Otherwise nearest.
#  random  - select randomly.
#  network - select the nearest provider in your insurance network. same as random except it changes every time the patient switches insurance provider.
generate.providers.selection_behavior = nearest

# maximum distance to look for a provider for a given patient, in km
# set to 10 degrees lat/lon to support the model that veterans only seek care at VA facilities
generate.providers.maximum_search_distance = 32

# Payers
generate.payers.insurance_companies.default_file = payers/insurance_companies.csv
generate.payers.insurance_companies.medicare = Medicare
generate.payers.insurance_companies.medicaid = Medicaid
generate.payers.insurance_companies.dual_eligible = Dual Eligible

# Payer selection behavior
# How patients select a payer:
#  best_rates - select insurance with best rates for person's existing conditions and medical needs
#  random  - select randomly.
generate.payers.selection_behavior = random

# Experimental feature. Patients will miss care if true, but side-effects of missing that care
# are not handled. Additionally, the path the disease module might take may no longer make sense.
# It might assume things occurred that haven't actually happened it. Use with care.
generate.payers.loss_of_care = false

# Add a FHIR terminology service URL to enable the use of ValueSet URIs within code definitions.
# generate.terminology_service_url = https://r4.ontoserver.csiro.au/fhir

# Quit Smoking
lifecycle.quit_smoking.baseline = 0.01
lifecycle.quit_smoking.timestep_delta = -0.01
lifecycle.quit_smoking.smoking_duration_factor_per_year = 1.0

# Quit Alcoholism
lifecycle.quit_alcoholism.baseline = 0.001
lifecycle.quit_alcoholism.timestep_delta = -0.001
lifecycle.quit_alcoholism.alcoholism_duration_factor_per_year = 1.0

# Adherence
lifecycle.adherence.baseline = 0.05

# set this to true to enable randomized "death by natural causes"
# highly recommended if "only_dead_patients" is true
lifecycle.death_by_natural_causes = false

# set this to enable "death by loss of care" or missed care,
# e.g. not covered by insurance or otherwise unaffordable.
# only functional if "generate.payers.loss_of_care" is also true.
lifecycle.death_by_loss_of_care = false

# Use physiology simulations to generate some VitalSigns
physiology.generators.enabled = false

# Allow physiology module states to be executed
# If false, all Physiology state objects will immediately redirect to the state defined in
# the alt_direct_transition field
physiology.state.enabled = false

# set to true to introduce errors in height, weight and BMI observations for people
# under 20 years old
growtherrors = false

java version: 17 run with ./run_synthea -p 10 -s 111111 --exporter.fhir.export false --exporter.ccda.export true

Lots of translation tags are repeated :(

One example is in a file for a person named Jasper Hirthe, who has many repetitions of the line <translation code="49436004" codeSystem="2.16.840.1.113883.6.96" displayName="Atrial Fibrillation"/>

Is there a way to avoid having these repeated lines?

eedrummer commented 2 years ago

This appears to be a bug. I can reproduce it using the same arguments you provided when running Synthea. Thanks for including clear reproducibility instructions with this issue.

It appears to be related to the CardiovascularDiseaseModule. The code related to adding conditions and medications to the person's record isn't checking to see whether the code already exists. I doesn't have any clinical impact on the simulation, but the CCDA export dutifully lists out all of the codes, which is why you see all of those translations.

I'm going to start working on a fix for this.