Open Mitchellb16 opened 2 months ago
This code will fix this problem in the meantime
# Define a function to apply to each cell of the DataFrame
def extract_value(cell):
try:
# Use np.squeeze to convert array of arrays to single value
return np.squeeze(cell)
except (ValueError, TypeError):
# If unable to extract, return the original value
return cell
# Apply the function to all cells in the DataFrame and drop columns that have NaN
analyze_epochs_clean = analyze_epochs.map(extract_value).dropna(axis=1)
I am not sure I understood what's the issue is, could you paste an example of what the output is currently and how it could be improved?
The dataframe that is returned by ecg_intervalrelated() has columns that are arrays of arrays, i.e. each cell is formatted like [[value]].
So when we call
type(analyze_epochs.HRV_MeanNN[0])
we get numpy.ndarray
but in the ECG_Rate_Mean column
type(analyze_epochs.ECG_Rate_Mean[0])
we get numpy.float64
Many functions want the data of a column to be formatted like the latter. For example, calling pandas plotting functions:
analyze_epochs.ECG_Rate_Mean.plot()
runs just fine, whereas
analyze_epochs.HRV_MeanNN.plot()
returns this error:
TypeError Traceback (most recent call last)
Cell In[12], line 1
----> 1 analyze_epochs.HRV_MeanNN.plot()
File ~/anaconda3/envs/phys-env/lib/python3.12/site-packages/pandas/plotting/_core.py:1030, in PlotAccessor.__call__(self, *args, **kwargs)
1027 label_name = label_kw or data.columns
1028 data.columns = label_name
-> 1030 return plot_backend.plot(data, kind=kind, **kwargs)
File ~/anaconda3/envs/phys-env/lib/python3.12/site-packages/pandas/plotting/_matplotlib/__init__.py:71, in plot(data, kind, **kwargs)
69 kwargs["ax"] = getattr(ax, "left_ax", ax)
70 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 71 plot_obj.generate()
72 plot_obj.draw()
73 return plot_obj.result
File ~/anaconda3/envs/phys-env/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py:499, in MPLPlot.generate(self)
497 @final
498 def generate(self) -> None:
--> 499 self._compute_plot_data()
500 fig = self.fig
501 self._make_plot(fig)
File ~/anaconda3/envs/phys-env/lib/python3.12/site-packages/pandas/plotting/_matplotlib/core.py:698, in MPLPlot._compute_plot_data(self)
696 # no non-numeric frames or series allowed
697 if is_empty:
--> 698 raise TypeError("no numeric data to plot")
700 self.data = numeric_data.apply(type(self)._convert_to_ndarray)
TypeError: no numeric data to plot
So the goal would be to make all the columns look like the ECG_Rate_Mean column
Label ECG_Rate_Mean HRV_MeanNN HRV_SDNN HRV_SDANN1 HRV_SDNNI1 HRV_SDANN2 HRV_SDNNI2 HRV_SDANN5 HRV_SDNNI5 ... HRV_ShanEn HRV_FuzzyEn HRV_MSEn HRV_CMSEn HRV_RCMSEn HRV_CD HRV_HFD HRV_KFD HRV_LZC Result
1 1 309.270961 [[194.01973684210526]] [[11.692288281733303]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] ... [[4.969829777788752]] [[0.553501197143664]] [[0.3023398153051495]] [[0.5160015858082174]] [[0.47375541520904296]] [[0.9551228546531801]] [[1.4211529594205168]] [[1.57705002598466]] [[0.6576765803624827]] 1
2 2 331.799217 [[180.8048780487805]] [[3.889488920027716]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] ... [[4.465374986373309]] [[0.6737756837362977]] [[0.5565117746613248]] [[0.6740184947096288]] [[0.7788313298597798]] [[1.3277555897307036]] [[1.4958067765285579]] [[1.9160438008808878]] [[0.6977800980678385]] 1
3 3 338.470362 [[177.23493975903614]] [[1.094156734540991]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] ... [[2.9651779060178822]] [[1.164101079947016]] [[1.1193441620306803]] [[1.093515223546829]] [[1.3785313363717893]] [[1.316759824479642]] [[1.668715093844506]] [[2.521786615224251]] [[0.9216924479055795]] 1
4 4 338.595333 [[177.21428571428572]] [[1.933044471664326]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] ... [[3.6583790275546058]] [[0.8101207469655183]] [[1.180222840763136]] [[0.898173376947552]] [[0.8624968864578968]] [[1.409783438421504]] [[1.6399397761432095]] [[1.9890208511328105]] [[0.6848911524405815]] 1
looking at the source code, I believe this function in ecg_intervalrelated() is our culprit
def _ecg_intervalrelated_hrv(data, sampling_rate, output={}):
# Sanitize input
colnames = data.columns.values
if len([i for i in colnames if "ECG_R_Peaks" in i]) == 0:
raise ValueError(
"NeuroKit error: ecg_intervalrelated(): Wrong input,"
"we couldn't extract R-peaks. Please make sure"
"your DataFrame contains an `ECG_R_Peaks` column."
)
# Transform rpeaks from "signal" format to "info" format.
rpeaks = np.where(data["ECG_R_Peaks"].values)[0]
rpeaks = {"ECG_R_Peaks": rpeaks}
results = hrv(rpeaks, sampling_rate=sampling_rate)
for column in results.columns:
# Add and convert to float
output[column] = results[[column]].values
return output
I think the double brackets around results[[column]].values may be the problem. I can fork and test it out this afternoon.
fixed this! If you approve, I can open a PR.
Old:
df, info = nk2.ecg_process(data["ECG"], sampling_rate=100)
nk2.ecg_intervalrelated(df, sampling_rate=100)
epochs = nk2.epochs_create(df, events=[0, 15000], sampling_rate=100, epochs_end=150)
nk2.ecg_intervalrelated(epochs)
Label ECG_Rate_Mean HRV_MeanNN HRV_SDNN HRV_SDANN1 HRV_SDNNI1 HRV_SDANN2 HRV_SDNNI2 HRV_SDANN5 HRV_SDNNI5 ... HRV_SampEn HRV_ShanEn HRV_FuzzyEn HRV_MSEn HRV_CMSEn HRV_RCMSEn HRV_CD HRV_HFD HRV_KFD HRV_LZC
1 1 86.389814 [[69.49767441860465]] [[5.167181138835641]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] ... [[1.2527629684953678]] [[4.290505175709077]] [[1.1667625041387144]] [[1.3234302834078784]] [[1.3340298186240032]] [[1.5730504365078102]] [[1.6535777692661076]] [[1.8035044265520663]] [[2.336829556302458]] [[0.8288764443746864]]
2 2 86.394396 [[69.46046511627907]] [[4.648089725942579]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] [[nan]] ... [[1.8817856208857746]] [[4.080221616903997]] [[1.3473463161429127]] [[1.7371494891482802]] [[1.4976794053994398]] [[1.9578060920316949]] [[1.5369379755191352]] [[1.8952794259141006]] [[3.094110204759245]] [[0.9730288694833277]]
New:
df, info = nk2_new.ecg_process(data["ECG"], sampling_rate=100)
nk2_new.ecg_intervalrelated(df, sampling_rate=100)
epochs = nk2_new.epochs_create(df, events=[0, 15000], sampling_rate=100, epochs_end=150)
nk2_new.ecg_intervalrelated(epochs)
Label ECG_Rate_Mean HRV_MeanNN HRV_SDNN HRV_SDANN1 HRV_SDNNI1 HRV_SDANN2 HRV_SDNNI2 HRV_SDANN5 HRV_SDNNI5 ... HRV_SampEn HRV_ShanEn HRV_FuzzyEn HRV_MSEn HRV_CMSEn HRV_RCMSEn HRV_CD HRV_HFD HRV_KFD HRV_LZC
1 1 86.389814 69.497674 5.167181 NaN NaN NaN NaN NaN NaN ... 1.252763 4.290505 1.166763 1.323430 1.334030 1.573050 1.653578 1.803504 2.33683 0.828876
2 2 86.394396 69.460465 4.648090 NaN NaN NaN NaN NaN NaN ... 1.881786 4.080222 1.347346 1.737149 1.497679 1.957806 1.536938 1.895279 3.09411 0.973029
I just changed the last line of _ecg_intervalrelated_hrv():
def _ecg_intervalrelated_hrv(data, sampling_rate, output={}):
# Sanitize input
colnames = data.columns.values
if len([i for i in colnames if "ECG_R_Peaks" in i]) == 0:
raise ValueError(
"NeuroKit error: ecg_intervalrelated(): Wrong input,"
"we couldn't extract R-peaks. Please make sure"
"your DataFrame contains an `ECG_R_Peaks` column."
)
# Transform rpeaks from "signal" format to "info" format.
rpeaks = np.where(data["ECG_R_Peaks"].values)[0]
rpeaks = {"ECG_R_Peaks": rpeaks}
results = hrv(rpeaks, sampling_rate=sampling_rate)
for column in results.columns:
# Add and convert to float
output[column] = results[column].to_numpy()[0]
return output
Just wondering if anyone could explain the rationale for the current formatting output by ecg_intervalrelated, being the HRV features (columns) are series of arrays of arrays of single floats.
This prevents some dataframe functionality, e.g. quick plotting with df.HRV_MeanNN.plot(), but I wanted to know if there are other benefits to it that I'm not seeing. Thanks!