TheChymera / neuralynx_nwb

Neuralynx to NWB conversion scripts (ideally to be upstreamed)

Files generated seem suspiciously small 🤔 #10

Closed: TheChymera closed this issue 1 year ago

TheChymera commented 1 year ago

So everything seems to “work” now, but the output file is only 13 MB for a 9.9 GB input directory, and I don't get any warnings from the interface about anything being missing (a quick content check is sketched at the end of this comment).

@CodyCBakerPhD any ideas? Just to keep in mind: this is no longer the multi-stream session. I decided to defer that so we can first make sure things work in a simpler case.

The code that was run is shown below. There's a bunch of additional stuff for custom metadata read-in and output directory organization; the key part is in the last few lines, which should still be analogous to the approach first recommended (this is all equivalent to the code in this repo as of 1ccd550be1804330975c8c4c7cd265d5d87a7f02). This is using the current master of neuroconv (924e41a917f38222f0b8c51ba5bdff13485b8a03).

[dark]~/src/neuralynx_nwb ❱ ./convert.sh ~/.local/share/datalad/vStr_phase_stim/raw/M322/M322-2022-07-22 /tmp/
INFO:Reading lab metadata from /home/chymera/.local/share/datalad/vStr_phase_stim/raw/M322/M322-2022-07-22/M322_2022_07_22_Expkeys.m
INFO:Recording the following metadata:
{'Ecephys': {'Device': [{'description': 'no description',
                         'name': 'DeviceEcephys'},
                        {'description': 'Cheetah 6.4.2.dev0',
                         'name': 'AcqSystem1 DigitalLynxSX'}],
             'ElectricalSeries': {'description': 'Acquisition traces for the '
                                                 'ElectricalSeries.',
                                  'name': 'ElectricalSeries'},
             'ElectrodeGroup': [{'description': 'no description',
                                 'device': 'DeviceEcephys',
                                 'location': 'unknown',
                                 'name': 'ElectrodeGroup'}]},
 'NWBFile': {'identifier': '7307e29e-19c2-4a58-9c49-3b18c259e975',
             'notes': '{"sampling_rate": "32000.0", "InputRange": "[6000]", '
                      '"FileVersion": "3.4", "TimeClosed": "2022/07/22 '
                      '19:24:16", "TimeCreated": "2022/07/22 17:07:07", '
                      '"RecordSize": "1044", "FileType": "NCS", '
                      '"bit_to_microVolt": "[0.18310546875]", "ADMaxValue": '
                      '"32767", "NumADChannels": "1", "recording_closed": '
                      '"2022-07-22 19:24:16", "input_inverted": "True"}',
             'session_description': 'Auto-generated by neuroconv',
             'session_id': '59e15b7c-7810-4c39-9890-efe596f6f951',
             'session_start_time': datetime.datetime(2022, 7, 22, 17, 7, 7, tzinfo=tzfile('/etc/localtime'))}}.
[dark]~/src/neuralynx_nwb ❱ du -sh /tmp/sub-M322/ses-20220722/ieeg/sub-M322_ses-20220722_ieeg.nwb
13M /tmp/sub-M322/ses-20220722/ieeg/sub-M322_ses-20220722_ieeg.nwb
[dark]~/src/neuralynx_nwb ❱ du -shL ~/.local/share/datalad/vStr_phase_stim/raw/M322/M322-2022-07-22/
9.9G    /home/chymera/.local/share/datalad/vStr_phase_stim/raw/M322/M322-2022-07-22/
[dark]~/src/neuralynx_nwb ❱ cat convert.sh
#!/usr/bin/env bash

data_selection="$1"
out_dir="$2"

python -c "from neuralynx_nwb import newconvert; newconvert.reposit_data(session_dir=\"${data_selection}\", out_dir=\"${out_dir}\", timezone='US/Boston')"
[dark]~/src/neuralynx_nwb ❱ cat neuralynx_nwb/newconvert.py
import os
import logging
import pprint

from datetime import datetime
from dateutil import tz

from neuroconv.datainterfaces import NeuralynxRecordingInterface
from neuroconv.tools.spikeinterface import write_recording
from spikeinterface.extractors import NeuralynxRecordingExtractor

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)

def lab_metadata(in_dir):
    subject_session = [i for i in in_dir.split("/") if i][-1]
    subject_session = subject_session.replace('-', "_")
    path_subject, raw_path_session = subject_session.split("_", 1)
    path_session = raw_path_session.replace('_','')
    exp_metadata = os.path.join(in_dir, subject_session+"_Expkeys.m")
    logging.info(f"Reading lab metadata from {exp_metadata}")
    # ideally we wouldn't have to do this regex magick and the metadata would be available as JSON.
    m={}
    #with open(exp_metadata, "r") as f:
    #   for line in f:
    #       print(line)
    if 'subject' not in m:
        m['subject'] = path_subject
    elif m['subject'] != path_subject:
        logging.info(
                f'The path-derived ({path_subject}) and operator-specified ({m["subject"]}) subject names differ. Using the latter.'
                )
    if 'session' not in m:
        m['session'] = path_session
    elif m['session'] != path_session:
        logging.info(
                f'The path-derived ({path_session}) and operator-specified ({m["session"]}) session names differ. Using the latter.'
                )
    return m

def _get_session_start_time(session_dir, timezone=""):
    """
    Get the session start time from `.ncs` file headers based on a simple heuristic.

    Notes
    -----
    * Using `subprocess.run()` since the encoding autodetection and limited read of `head` are tricky to emulate in Python.
    """
    import subprocess
    import re
    start_times = []
    for i in os.listdir(session_dir):
        if i.endswith(".ncs"):
            result = subprocess.run(
                    ["head","-10",os.path.join(session_dir,i)],
                    capture_output=True,
                    )
            try:
                file_head = result.stdout.decode(errors="replace")
                time_dict = re.search("-TimeCreated (?P<year>[0-9]+)/(?P<month>[0-9]+)/(?P<day>[0-9]+) (?P<hour>[0-9]+):(?P<minute>[0-9]+):(?P<second>[0-9]+)",file_head).groupdict()
            except AttributeError:
                # `re.search()` returned None, i.e. this header has no TimeCreated field.
                pass
            else:
                start_time = datetime(
                        int(time_dict["year"]),
                        int(time_dict["month"]),
                        int(time_dict["day"]),
                        int(time_dict["hour"]),
                        int(time_dict["minute"]),
                        int(time_dict["second"]),
                        tzinfo=tz.gettz(timezone),
                        )
                start_times.append(start_time)
    if not start_times:
        raise ValueError(f"No `.ncs` headers with a TimeCreated field found under {session_dir}.")
    if len(set(start_times)) > 1:
        logging.critical(f'Session data indicates conflicting start dates: {start_times}')
        raise ValueError("Need unambiguous start date.")
    return start_times[0]

def reposit_data(session_dir,
    lab_name='MVDMLab',
    institution='Dartmouth College',
    keywords=[
        'DANDI Pilot',
        ],
    experimenter='Manish Mohapatra',
    experiment_description='...',
    debug=True,
    session_description='Extracellular ephys recording in the ventral Striatum',
    keep_original_times=True,
    output_filename='neuralynx_nwb_testfile',
    timezone="",
    out_dir='~/.local/share/scratch',
    ):
    """
    Reposit data from neuralynx and lab-specific sidecar metadata to BIDS+NWB

    Parameters
    ----------

    session_dir : str
        A path to the session directory to reposit, e.g. `~/.local/share/datalad/vStr_phase_stim/raw/M235/M235-2021-07-16`.
    out_dir : str, optional
        A path to a directory in which to reposit the data.
    """

    session_dir = os.path.abspath(os.path.expanduser(session_dir))
    #now = datetime.today().strftime('%Y%m%d%H%M%S')
    m = lab_metadata(session_dir)

    # Interface approach
    interface = NeuralynxRecordingInterface(folder_path=session_dir, verbose=False)
    metadata = interface.get_metadata()
    session_start_time = _get_session_start_time(session_dir, timezone=timezone)
    metadata["NWBFile"].update(session_start_time=session_start_time)
    logging.info(f'Recording the following metadata:\n{pprint.pformat(metadata)}.')

    # Output paths, after all metadata is read in, in case we need esoteric fields for BIDS.
    out_dir = os.path.abspath(os.path.expanduser(out_dir))
    ieeg_filename = f'sub-{m["subject"]}_ses-{m["session"]}_ieeg.nwb'
    ieeg_dir = os.path.join(out_dir,
        f'sub-{m["subject"]}',
        f'ses-{m["session"]}',
        'ieeg',
        )
    ieeg_path = os.path.join(ieeg_dir,ieeg_filename)
    os.makedirs(ieeg_dir, exist_ok=True)

    # Run conversion
    interface.run_conversion(nwbfile_path=ieeg_path, metadata=metadata, stub_test=True)  # stub_test for fast testing
    #interface.run_conversion(nwbfile_path=f"/tmp/path-{now}.nwb", metadata=metadata)

    # Extractor approach
    #recording = NeuralynxRecordingExtractor(folder_path=session_dir, stream_id='0')
    #recording = NeuralynxRecordingExtractor(folder_path=session_dir)
    #recording["NWBFile"]["session_start_time"] = now
    #write_recording(recording=recording, nwbfile_path=out_file)
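
For reference, a quick way to check what actually landed in the output file would be something like the following (a minimal sketch, assuming pynwb is available and using the default ElectricalSeries name from the metadata dump above):

from pynwb import NWBHDF5IO

# Open the converted file read-only and report the shape of the raw traces.
with NWBHDF5IO("/tmp/sub-M322/ses-20220722/ieeg/sub-M322_ses-20220722_ieeg.nwb", "r") as io:
    nwbfile = io.read()
    series = nwbfile.acquisition["ElectricalSeries"]
    print(series.data.shape)  # suspiciously few samples for a 9.9G input directory
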
CodyCBakerPhD commented 1 year ago

The culprit is the last line: interface.run_conversion(nwbfile_path=ieeg_path, metadata=metadata, stub_test=True)

stub_test=True limits the total data written to a very small amount, so you can quickly test that the pipeline works.

Remove the keyword argument, or set it to False, to enable a full conversion.
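
i.e., the full conversion is the same call as above with the flag dropped (or explicitly disabled):

# Full conversion: writes all samples instead of a small stub.
interface.run_conversion(nwbfile_path=ieeg_path, metadata=metadata, stub_test=False)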

TheChymera commented 1 year ago

Ah, thank you, I thought the parameter ran a stub of tests :3