FDA / openfda

openFDA is an FDA project to provide open APIs, raw data downloads, documentation and examples, and a developer community for an important collection of FDA public datasets.
https://open.fda.gov
Creative Commons Zero v1.0 Universal
569 stars 131 forks source link

JSON lines format #127

Open maksle opened 4 years ago

maksle commented 4 years ago

It would be very useful to have a version of the dataset download files provided in JSON Lines format (one self contained record per line) so that it is splittable for ingestion by a distributed cluster computing system like Spark. In the current format, each file has to be loaded into memory entirely before it can be ingested.

beckyconning commented 4 years ago

JSON lines would be preferable but btw https://github.com/precog/tectonic can be used to load data like this in a streaming manner.

apos37 commented 2 years ago

So, has there been any progress on this in the two years since this was requested? Or does anybody have suggestions on formatting the results? I'm using PHP to fetch the data, and would like to format it with HTML/CSS. Just a string of text is impossible to format when the results are vastly different from one drug to the next. At the very least, indicate line breaks.

cwdgservices commented 1 month ago

I made a simple Python script to modify the JSON documents. The only issue I have run into is broken JSON within the export files. The script tries to fix them if it can; otherwise, it stores them in a log file. It is currently made to run through two folders named device and drug. This version only works on Windows.

import os
import json
import subprocess
from datetime import datetime
from multiprocessing import Pool, cpu_count
from fix_busted_json import repair_json

def fix_json_file(file_path):
    """Repair a broken JSON file in place using fix_busted_json.

    Reads the file's contents, runs ``repair_json`` over them, and writes
    the repaired text back to the same path. Errors are reported but not
    re-raised, so one bad file does not abort a batch repair run.
    """
    print(f"Fixing JSON file: {file_path}")
    try:
        # JSON interchange is UTF-8 (RFC 8259); pin the encoding so the
        # read/write round-trip does not depend on the platform default
        # (e.g. cp1252 on Windows), which corrupts non-ASCII characters.
        with open(file_path, 'r', encoding='utf-8') as file:
            invalid_json = file.read()

        fixed_json = repair_json(invalid_json)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(fixed_json)
        print(f"Fixed JSON file: {file_path}")
    except Exception as e:
        # Best-effort: log the failure and let the caller move on.
        print(f"Error fixing JSON file {file_path}: {e}")

def process_file(file_path):
    """Convert one openFDA export file to JSON Lines.

    Loads the JSON document at *file_path*, extracts its ``results`` array,
    and writes each entry as one self-contained JSON object per line to a
    sibling ``<name>_cleaned.json`` file, so the output is splittable by
    distributed readers (e.g. Spark).

    Returns:
        list[str] | None: ``[file_path]`` when the file contains broken
        JSON and needs repair by the caller, otherwise ``None``.
    """
    print(f"Processing file: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except json.JSONDecodeError:
        # Broken JSON: hand the path back so the caller can repair it.
        print(f"JSONDecodeError in file {file_path}. Logging for repair.")
        return [file_path]
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Unexpected error reading file {file_path}: {e}")
        return None

    results_content = data.get('results', [])
    if not results_content:
        print(f"No 'results' key found or it's empty in file {file_path}.")

    # Derive the output name from the real extension instead of a blind
    # str.replace, which would also mangle '.json' occurring mid-path
    # (e.g. a directory named 'foo.json').
    base, ext = os.path.splitext(file_path)
    cleaned_file_path = base + '_cleaned' + ext
    try:
        with open(cleaned_file_path, 'w', encoding='utf-8') as file:
            for entry in results_content:
                try:
                    # Serialize once; if dumps() raises, nothing partial
                    # has been written for this entry.
                    file.write(json.dumps(entry))
                    file.write('\n')
                except (TypeError, ValueError) as e:
                    print(f"Skipping invalid entry: {e}")
    except Exception as e:
        print(f"Error writing to file {cleaned_file_path}: {e}")

    # Parse/serialization problems were handled above; nothing to repair.
    return None

def process_all_json_files():
    """Walk the 'device' and 'drug' folders, convert every JSON export to
    JSON Lines in parallel, and repair any files found to contain broken
    JSON, logging their paths to 'files_to_repair.log'."""
    log_file = 'files_to_repair.log'
    current_directory = os.getcwd()
    print(f"Current directory: {current_directory}")

    # Collect every *.json file under the two expected data folders.
    file_paths = []
    for directory in ('device', 'drug'):
        dir_path = os.path.join(current_directory, directory)
        if not os.path.isdir(dir_path):
            continue
        for root, _dirs, files in os.walk(dir_path):
            file_paths.extend(
                os.path.join(root, filename)
                for filename in files
                if filename.endswith('.json')
            )
            print(f"Found files in {root}")

    if not file_paths:
        print("No JSON files found in the specified directories.")
        return

    print(f"Found {len(file_paths)} JSON files. Processing...")

    # Fan the conversion out across all cores; each worker returns either
    # None or a list of paths that need repair.
    with Pool(cpu_count()) as pool:
        results = pool.map(process_file, file_paths)

    repair_files = [path for result in results if result for path in result]

    if not repair_files:
        print("No files needed repair.")
        return

    print(f"Files that need repair: {repair_files}")
    with open(log_file, 'w') as log:
        log.writelines(path + '\n' for path in repair_files)

    # Repair pass runs serially after the parallel conversion.
    print(f"Starting repair process for {len(repair_files)} files...")
    for path in repair_files:
        fix_json_file(path)

# Script entry point: only run when executed directly, not when imported
# (also required for multiprocessing on Windows, which re-imports this module).
if __name__ == "__main__":
    process_all_json_files()