Describe the bug
In the Plugin PeakletClassificationSOM the fields for som_sub_type, loc_x_som and loc_y_som don't seem to give the expected values. If the algorithm is run outside the plugin the values seem correct, I reran the functions withing the plugin with the same data but could not find the bug.
Weirdly enough this does not seem to affect the final peaks results as those are consistent with what is to be expected from the algorithm, so the right data seems to be passed on but the wrong data is stored when I check the corresponding data_types.
To Reproduce
Insert the MWE of how to reproduce the error
import sys
import numpy as np
import matplotlib.pyplot as plt
import strax
import straxen
from joblib import dump, load
from PIL import Image
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import colors
st = straxen.contexts.xenonnt_som(xedocs_version = "global_v13",
_raw_paths='/stor2/data/raw_records',
)
peaklets = st.get_array('049210','peaklets')
peaklet_cls_som = st.get_array('049210','peaklet_classification_som')
peaklet_cls_som['som_sub_type'][:3]
# Result array([4, 4, 5,])
peaklets['type'] = peaklet_cls_som['straxen_type']
# Rest of the relevant code after this chunk.
####### The following block of code are simply the SOM plugging rewritten outside of a class ######
def compute(peaklets, peaklets_classifcation, som_data):
# Current classification
#peaklets_classifcation = super().compute(peaklets)
peaklet_with_som = np.zeros(len(peaklets), dtype=dtype)
strax.copy_to_buffer(peaklets_classifcation, peaklet_with_som, "_copy_peaklets_information")
peaklet_with_som["old_type"] = peaklets_classifcation["type"]
#del peaklets_classifcation
# SOM classification
peaklets_w_type = peaklets.copy()
peaklets_w_type["type"] = peaklet_with_som["type"]
_is_s1_or_s2 = np.isin(peaklets_w_type["type"], [1, 2])
peaklets_w_type = peaklets_w_type[_is_s1_or_s2]
print("length of peaklets_w_type = " + str(len(peaklets_w_type)))
som_sub_type, x_som, y_som = recall_populations(
peaklets_w_type, som_data['weight_cube'],
som_data['som_img'],
som_data['norm_factors']
)
print(som_sub_type[:10])
strax_type = som_type_to_type(
som_sub_type, som_data['s1_array'], som_data['s2_array'],
som_data['s3_array'],
som_data['s0_array']
)
peaklet_with_som["som_sub_type"][_is_s1_or_s2] = som_sub_type
print(peaklet_with_som["som_sub_type"][_is_s1_or_s2][:10])
peaklet_with_som["loc_x_som"][_is_s1_or_s2] = x_som
peaklet_with_som["loc_y_som"][_is_s1_or_s2] = y_som
peaklet_with_som["som_type"][_is_s1_or_s2] = strax_type
if True:
peaklet_with_som["type"][_is_s1_or_s2] = strax_type
else:
peaklet_with_som["type"] = peaklet_with_som["old_type"]
return peaklet_with_som
def som_type_to_type(som_type, s1_array, s2_array, s3_array, s0_array):
"""
Converts the SOM type into either S1 or S2 type (1, 2)
som_type: array with integers corresponding to the different SOM types
s1_array: array containing the number corresponding to the SOM types which should
be converted to S1's
"""
som_type_copy = som_type.copy()
som_type_copy[np.isin(som_type_copy, s1_array)] = 1234
som_type_copy[np.isin(som_type_copy, s2_array)] = 5678
som_type_copy[np.isin(som_type_copy, s3_array)] = -5
som_type_copy[np.isin(som_type_copy, s0_array)] = -250
som_type_copy[som_type_copy == 1234] = 1
som_type_copy[som_type_copy == 5678] = 2
som_type_copy[som_type_copy == -5] = 3
som_type_copy[som_type_copy == -250] = 0
return som_type_copy
def recall_populations(dataset, weight_cube, som_cls_img, norm_factors):
"""Master function that should let the user provide a weightcube, a reference img as a np.array,
a dataset and a set of normalization factors.
In theory, if these 5 things are provided, this function should output
the original data back with one added field with the name "SOM_type"
weight_cube: SOM weight cube (3D array)
som_cls_img: SOM reference image as a numpy array
dataset: Data to preform the recall on (Should be peaklet level data)
normfactos: A set of 11 numbers to normalize the data so we can preform a recall
"""
xdim, ydim, zdim = weight_cube.shape
img_xdim, img_ydim, img_zdim = som_cls_img.shape
unique_colors = np.unique(np.reshape(som_cls_img, [xdim * ydim, 3]), axis=0)
# Checks that the reference image matches the weight cube
assert (
xdim == img_xdim
), f"Dimensions mismatch between SOM weight cube ({xdim}) and reference image ({img_xdim})"
assert (
ydim == img_ydim
), f"Dimensions mismatch between SOM weight cube ({ydim}) and reference image ({img_ydim})"
assert all(dataset["type"] != 0), "Dataset contains unclassified peaklets"
# Get the deciles representation of data for recall
decile_transform_check = data_to_log_decile_log_area_aft(dataset, norm_factors)
# preform a recall of the dataset with the weight cube
# assign each population color a number (can do from previous function)
ref_map = generate_color_ref_map(som_cls_img, unique_colors, xdim, ydim)
som_cls_array = np.empty(len(dataset["area"]))
som_cls_array[:] = np.nan
# Make new numpy structured array to save the SOM cls data
data_with_SOM_cls = rfn.append_fields(dataset, "SOM_type", som_cls_array)
# preforms the recall and assigns SOM_type label
output_data, x_som, y_som = som_cls_recall(
data_with_SOM_cls, decile_transform_check, weight_cube, ref_map
)
return output_data["SOM_type"], x_som, y_som
def som_cls_recall(array_to_fill, data_in_som_fmt, weight_cube, reference_map):
som_xdim, som_ydim, _ = weight_cube.shape
# for data_point in data_in_SOM_fmt:
distances = cdist(
weight_cube.reshape(-1, weight_cube.shape[-1]), data_in_som_fmt, metric="euclidean"
)
w_neuron = np.argmin(distances, axis=0)
x_idx, y_idx = np.unravel_index(w_neuron, (som_xdim, som_ydim))
array_to_fill["SOM_type"] = reference_map[x_idx, y_idx]
return array_to_fill, x_idx, y_idx
def generate_color_ref_map(color_image, unique_colors, xdim, ydim):
ref_map = np.zeros((xdim, ydim))
for color in np.arange(len(unique_colors)):
mask = np.all(np.equal(color_image, unique_colors[color, :]), axis=2)
indices = np.argwhere(mask) # generates a 2d mask
for loc in np.arange(len(indices)):
ref_map[indices[loc][0], indices[loc][1]] = color
return ref_map
def data_to_log_decile_log_area_aft(peaklet_data, normalization_factor):
"""Converts peaklet data into the current best inputs for the SOM, log10(deciles) + log10(area)
+ AFT Since we are dealing with logs, anything less than 1 will be set to 1."""
# turn deciles into approriate 'normalized' format
# (maybe also consider L1 normalization of these inputs)
decile_data = compute_quantiles(peaklet_data, 10)
data = peaklet_data.copy()
decile_data[decile_data < 1] = 1
decile_log = np.log10(decile_data)
decile_log_over_max = np.divide(decile_log, normalization_factor[:10])
# Now lets deal with area
data["area"] = data["area"] + normalization_factor[11] + 1
peaklet_log_area = np.log10(data["area"])
peaklet_aft = (
np.sum(data["area_per_channel"][:, : straxen.n_top_pmts], axis=1) / peaklet_data["area"]
)
peaklet_aft = np.where(peaklet_aft > 0, peaklet_aft, 0)
peaklet_aft = np.where(peaklet_aft < 1, peaklet_aft, 1)
deciles_area_aft = np.concatenate(
(
decile_log_over_max,
np.reshape(peaklet_log_area, (len(peaklet_log_area), 1)) / normalization_factor[10],
np.reshape(peaklet_aft, (len(peaklet_log_area), 1)),
),
axis=1,
)
return deciles_area_aft
def compute_quantiles(peaks: np.ndarray, n_samples: int):
"""Compute waveforms and quantiles for a given number of nodes(attributes) :param peaks:
:param n_samples: number of nodes or attributes
:return: quantiles
"""
data = peaks["data"].copy()
data[data < 0.0] = 0.0
dt = peaks["dt"]
q = compute_wf_attributes(data, dt, n_samples)
return q
#@export
@numba.jit(nopython=True, cache=True)
def compute_wf_attributes(data, sample_length, n_samples: int):
"""
Compute waveform attribures
Quantiles: represent the amount of time elapsed for
a given fraction of the total waveform area to be observed in n_samples
i.e. n_samples = 10, then quantiles are equivalent deciles
Waveforms: downsampled waveform to n_samples
:param data: waveform e.g. peaks or peaklets
:param n_samples: compute quantiles for a given number of samples
:return: waveforms and quantiles of size n_samples
"""
assert data.shape[0] == len(sample_length), "ararys must have same size"
num_samples = data.shape[1]
quantiles = np.zeros((len(data), n_samples), dtype=np.float64)
# Cannot compute with with more samples than actual waveform sample
assert num_samples > n_samples, "cannot compute with more samples than the actual waveform"
assert num_samples % n_samples == 0, "number of samples must be a multiple of n_samples"
# Compute quantiles
inter_points = np.linspace(0.0, 1.0 - (1.0 / n_samples), n_samples)
cumsum_steps = np.zeros(n_samples + 1, dtype=np.float64)
frac_of_cumsum = np.zeros(num_samples + 1)
sample_number_div_dt = np.arange(0, num_samples + 1, 1)
for i, (samples, dt) in enumerate(zip(data, sample_length)):
if np.sum(samples) == 0:
continue
# reset buffers
frac_of_cumsum[:] = 0
cumsum_steps[:] = 0
frac_of_cumsum[1:] = np.cumsum(samples)
frac_of_cumsum[1:] = frac_of_cumsum[1:] / frac_of_cumsum[-1]
cumsum_steps[:-1] = np.interp(inter_points, frac_of_cumsum, sample_number_div_dt * dt)
cumsum_steps[-1] = sample_number_div_dt[-1] * dt
quantiles[i] = cumsum_steps[1:] - cumsum_steps[:-1]
return quantiles
############ End of reinserting code ##################
check_output = compute(peaklets[:3], peaklet_cls_som[:3], som_data)
check_output['som_sub_type']
# Result array([3, 11, 7])
Describe the bug In the Plugin PeakletClassificationSOM the fields for som_sub_type, loc_x_som and loc_y_som don't seem to give the expected values. If the algorithm is run outside the plugin the values seem correct, I reran the functions withing the plugin with the same data but could not find the bug.
Weirdly enough this does not seem to affect the final peaks results as those are consistent with what is to be expected from the algorithm, so the right data seems to be passed on but the wrong data is stored when I check the corresponding data_types.
To Reproduce Insert the MWE of how to reproduce the error
Expected behavior peaklet_cls_som['som_sub_type'][:3]
Result array([3, 11, 7,])
Screenshots If applicable, add screenshots to help explain your problem.
Versions Please add the output of: