SainsburyWellcomeCentre / aeon_mecha

Project Aeon's main library for interfacing with acquired data. Contains modules for raw data file io, data querying, data processing, data qc, database ingestion, and building computational data pipelines.
BSD 3-Clause "New" or "Revised" License
4 stars 6 forks source link

KeyError instances from #340

Open JaerongA opened 6 months ago

JaerongA commented 6 months ago

This is related to #327. The following 9 chunks from social0.2 (from AEON3 and AEON4) errored out during `acquisition.Environment` ingestion.

[{'experiment_name': 'social0.2-aeon3',
  'chunk_start': datetime.datetime(2024, 2, 25, 17, 0)},
 {'experiment_name': 'social0.2-aeon3',
  'chunk_start': datetime.datetime(2024, 2, 28, 12, 0)},
 {'experiment_name': 'social0.2-aeon3',
  'chunk_start': datetime.datetime(2024, 3, 1, 15, 0)},
 {'experiment_name': 'social0.2-aeon4',
  'chunk_start': datetime.datetime(2024, 2, 5, 14, 0)},
 {'experiment_name': 'social0.2-aeon4',
  'chunk_start': datetime.datetime(2024, 2, 13, 13, 0)},
 {'experiment_name': 'social0.2-aeon4',
  'chunk_start': datetime.datetime(2024, 2, 13, 14, 0)},
 {'experiment_name': 'social0.2-aeon4',
  'chunk_start': datetime.datetime(2024, 2, 13, 14, 48, 13)},
 {'experiment_name': 'social0.2-aeon4',
  'chunk_start': datetime.datetime(2024, 2, 13, 14, 56, 28)},
 {'experiment_name': 'social0.2-aeon4',
  'chunk_start': datetime.datetime(2024, 2, 25, 17, 0)}]

For example, running the following code yields a KeyError.

from aeon.schema.schemas import social02
from aeon.io.api import load
from aeon.dj_pipeline import acquisition
import pandas as pd
import datetime

key = {'experiment_name': 'social0.2-aeon3',
  'chunk_start': datetime.datetime(2024, 2, 25, 17, 0)}

chunk_start, chunk_end = (acquisition.Chunk & key).fetch1("chunk_start", "chunk_end")
raw_data_dir = acquisition.Experiment.get_data_directory(key)
stream_reader = getattr(social02.Environment, "SubjectVisits")
stream_data = load(
    root=raw_data_dir.as_posix(),
    reader=stream_reader,
    start=pd.Timestamp(chunk_start),
    end=pd.Timestamp(chunk_end),
)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key, method, tolerance)
   3801 try:
-> 3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:516, in pandas._libs.index.DatetimeEngine.get_loc()

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:545, in pandas._libs.index.DatetimeEngine.get_loc()

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:203, in pandas._libs.index.IndexEngine._get_loc_duplicates()

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:211, in pandas._libs.index.IndexEngine._maybe_get_bool_indexer()

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:107, in pandas._libs.index._unpack_bool_indexer()

KeyError: 1708880400000000000

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/datetimes.py:736, in DatetimeIndex.get_loc(self, key, method, tolerance)
    735 try:
--> 736     return Index.get_loc(self, key, method, tolerance)
    737 except KeyError as err:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:3804, in Index.get_loc(self, key, method, tolerance)
   3803 except KeyError as err:
-> 3804     raise KeyError(key) from err
   3805 except TypeError:
   3806     # If we have a listlike key, _check_indexing_error will raise
   3807     #  InvalidIndexError. Otherwise we fall through and re-raise
   3808     #  the TypeError.

KeyError: Timestamp('2024-02-25 17:00:00')

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
File ~/ProjectAeon/aeon_mecha/aeon/io/api.py:141, in load(root, reader, start, end, time, tolerance, epoch)
    140 try:
--> 141     return data.loc[start:end]
    142 except KeyError:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexing.py:1073, in _LocationIndexer.__getitem__(self, key)
   1072 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1073 return self._getitem_axis(maybe_callable, axis=axis)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexing.py:1290, in _LocIndexer._getitem_axis(self, key, axis)
   1289     self._validate_key(key, axis)
-> 1290     return self._get_slice_axis(key, axis=axis)
   1291 elif com.is_bool_indexer(key):

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexing.py:1324, in _LocIndexer._get_slice_axis(self, slice_obj, axis)
   1323 labels = obj._get_axis(axis)
-> 1324 indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
   1326 if isinstance(indexer, slice):

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/datetimes.py:809, in DatetimeIndex.slice_indexer(self, start, end, step, kind)
    804 if (
    805     check_str_or_none(start)
    806     or check_str_or_none(end)
    807     or self.is_monotonic_increasing
    808 ):
--> 809     return Index.slice_indexer(self, start, end, step, kind=kind)
    811 mask = np.array(True)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6559, in Index.slice_indexer(self, start, end, step, kind)
   6557 self._deprecated_arg(kind, "kind", "slice_indexer")
-> 6559 start_slice, end_slice = self.slice_locs(start, end, step=step)
   6561 # return a slice

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6767, in Index.slice_locs(self, start, end, step, kind)
   6766 if start is not None:
-> 6767     start_slice = self.get_slice_bound(start, "left")
   6768 if start_slice is None:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6686, in Index.get_slice_bound(self, label, side, kind)
   6684     except ValueError:
   6685         # raise the original KeyError
-> 6686         raise err
   6688 if isinstance(slc, np.ndarray):
   6689     # get_loc may return a boolean array, which
   6690     # is OK as long as they are representable by a slice.

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6680, in Index.get_slice_bound(self, label, side, kind)
   6679 try:
-> 6680     slc = self.get_loc(label)
   6681 except KeyError as err:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/datetimes.py:738, in DatetimeIndex.get_loc(self, key, method, tolerance)
    737 except KeyError as err:
--> 738     raise KeyError(orig_key) from err

KeyError: Timestamp('2024-02-25 17:00:00')

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:548, in pandas._libs.index.DatetimeEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:2263, in pandas._libs.hashtable.Int64HashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:2273, in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 1708880400000000000

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:3802, in Index.get_loc(self, key, method, tolerance)
   3801 try:
-> 3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:516, in pandas._libs.index.DatetimeEngine.get_loc()

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/_libs/index.pyx:550, in pandas._libs.index.DatetimeEngine.get_loc()

KeyError: Timestamp('2024-02-25 17:00:00')

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/datetimes.py:736, in DatetimeIndex.get_loc(self, key, method, tolerance)
    735 try:
--> 736     return Index.get_loc(self, key, method, tolerance)
    737 except KeyError as err:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:3804, in Index.get_loc(self, key, method, tolerance)
   3803 except KeyError as err:
-> 3804     raise KeyError(key) from err
   3805 except TypeError:
   3806     # If we have a listlike key, _check_indexing_error will raise
   3807     #  InvalidIndexError. Otherwise we fall through and re-raise
   3808     #  the TypeError.

KeyError: Timestamp('2024-02-25 17:00:00')

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[5], line 13
     11 raw_data_dir = acquisition.Experiment.get_data_directory(key)
     12 stream_reader = getattr(social02.Environment, "SubjectVisits")
---> 13 stream_data = load(
     14     root=raw_data_dir.as_posix(),
     15     reader=stream_reader,
     16     start=pd.Timestamp(chunk_start),
     17     end=pd.Timestamp(chunk_end),
     18 )

File ~/ProjectAeon/aeon_mecha/aeon/io/api.py:151, in load(root, reader, start, end, time, tolerance, epoch)
    149             warnings.warn(f"data index for {reader.pattern} contains duplicate keys!")
    150             data = data[~data.index.duplicated(keep="first")]
--> 151         return data.loc[start:end]
    152 return data

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexing.py:1073, in _LocationIndexer.__getitem__(self, key)
   1070 axis = self.axis or 0
   1072 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1073 return self._getitem_axis(maybe_callable, axis=axis)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexing.py:1290, in _LocIndexer._getitem_axis(self, key, axis)
   1288 if isinstance(key, slice):
   1289     self._validate_key(key, axis)
-> 1290     return self._get_slice_axis(key, axis=axis)
   1291 elif com.is_bool_indexer(key):
   1292     return self._getbool_axis(key, axis=axis)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexing.py:1324, in _LocIndexer._get_slice_axis(self, slice_obj, axis)
   1321     return obj.copy(deep=False)
   1323 labels = obj._get_axis(axis)
-> 1324 indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
   1326 if isinstance(indexer, slice):
   1327     return self.obj._slice(indexer, axis=axis)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/datetimes.py:809, in DatetimeIndex.slice_indexer(self, start, end, step, kind)
    801 # GH#33146 if start and end are combinations of str and None and Index is not
    802 # monotonic, we can not use Index.slice_indexer because it does not honor the
    803 # actual elements, is only searching for start and end
    804 if (
    805     check_str_or_none(start)
    806     or check_str_or_none(end)
    807     or self.is_monotonic_increasing
    808 ):
--> 809     return Index.slice_indexer(self, start, end, step, kind=kind)
    811 mask = np.array(True)
    812 deprecation_mask = np.array(True)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6559, in Index.slice_indexer(self, start, end, step, kind)
   6516 """
   6517 Compute the slice indexer for input labels and step.
   6518 
   (...)
   6555 slice(1, 3, None)
   6556 """
   6557 self._deprecated_arg(kind, "kind", "slice_indexer")
-> 6559 start_slice, end_slice = self.slice_locs(start, end, step=step)
   6561 # return a slice
   6562 if not is_scalar(start_slice):

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6767, in Index.slice_locs(self, start, end, step, kind)
   6765 start_slice = None
   6766 if start is not None:
-> 6767     start_slice = self.get_slice_bound(start, "left")
   6768 if start_slice is None:
   6769     start_slice = 0

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6686, in Index.get_slice_bound(self, label, side, kind)
   6683         return self._searchsorted_monotonic(label, side)
   6684     except ValueError:
   6685         # raise the original KeyError
-> 6686         raise err
   6688 if isinstance(slc, np.ndarray):
   6689     # get_loc may return a boolean array, which
   6690     # is OK as long as they are representable by a slice.
   6691     assert is_bool_dtype(slc.dtype)

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/base.py:6680, in Index.get_slice_bound(self, label, side, kind)
   6678 # we need to look up the label
   6679 try:
-> 6680     slc = self.get_loc(label)
   6681 except KeyError as err:
   6682     try:

File ~/.conda/envs/aeon/lib/python3.11/site-packages/pandas/core/indexes/datetimes.py:738, in DatetimeIndex.get_loc(self, key, method, tolerance)
    736     return Index.get_loc(self, key, method, tolerance)
    737 except KeyError as err:
--> 738     raise KeyError(orig_key) from err

KeyError: Timestamp('2024-02-25 17:00:00')
ttngu207 commented 6 months ago

Thanks @JaerongA .

@jkbhagatio, these are the files to be manually fixed and placed into the preprocessed directory

ttngu207 commented 6 months ago

@JaerongA, are there any other chunks/files from other streams (potentially ingestion errors from the streams schema)?

JaerongA commented 6 months ago

@ttngu207 Yes, there are 16 keys that errored out in the streams schema.

Here's the code that reproduces the KeyError for the first of these keys; the full list of keys follows below.

from aeon.schema import schemas
from aeon.io.api import load
from aeon.dj_pipeline import acquisition, streams
import pandas as pd
import datetime

key = {'experiment_name': 'social0.1-aeon3',
 'chunk_start': datetime.datetime(2023, 12, 1, 14, 0),
 'device_serial_number': 'COM15',
 'rfid_reader_install_time': datetime.datetime(2023, 11, 22, 10, 19, 33)}

chunk_start, chunk_end, dir_type = (acquisition.Chunk & key).fetch1(
    "chunk_start", "chunk_end", "directory_type"
)
raw_data_dir = acquisition.Experiment.get_data_directory(key, directory_type=dir_type)

device_name = (streams.RfidReader & key).fetch1('rfid_reader_name')

devices_schema = getattr(
    schemas,
    (acquisition.Experiment.DevicesSchema & {"experiment_name": key["experiment_name"]}).fetch1(
        "devices_schema_name"
    ),
)
stream_reader = getattr(getattr(devices_schema, device_name), "RfidEvents")

stream_data = load(
    root=raw_data_dir.as_posix(),
    reader=stream_reader,
    start=pd.Timestamp(chunk_start),
    end=pd.Timestamp(chunk_end),
)

keys

keys = [
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 1, 14, 0),
        "device_serial_number": "COM15",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 15, 47, 19),
        "device_serial_number": "COM12",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 2, 15, 58, 34),
        "device_serial_number": "COM13",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 3, 13, 5, 15),
        "device_serial_number": "COM15",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 13, 0),
        "device_serial_number": "COM13",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 14, 0),
        "device_serial_number": "COM17",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 2, 15, 0),
        "device_serial_number": "COM13",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 3, 13, 30, 30),
        "device_serial_number": "COM15",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 13, 55, 41),
        "device_serial_number": "COM13",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 15, 0),
        "device_serial_number": "COM17",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 15, 47, 19),
        "device_serial_number": "COM17",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 3, 13, 0),
        "device_serial_number": "COM15",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 2, 14, 0),
        "device_serial_number": "COM13",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 14, 0),
        "device_serial_number": "COM12",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 11, 30, 15, 0),
        "device_serial_number": "COM12",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
    {
        "experiment_name": "social0.1-aeon3",
        "chunk_start": datetime.datetime(2023, 12, 1, 14, 0),
        "device_serial_number": "COM12",
        "rfid_reader_install_time": datetime.datetime(2023, 11, 22, 10, 19, 33),
    },
]
ttngu207 commented 6 months ago

Thanks @JaerongA

Looks like the errors are mostly from SubjectVisits and Rfid streams

jkbhagatio commented 6 months ago

Once the corrected data has been put in 'processed', I will let @ttngu207 and @JaerongA know so that they can proceed with ingestion.