XENONnT / straxen

Streaming analysis for XENON
BSD 3-Clause "New" or "Revised" License
21 stars 33 forks source link

som_sub_type in the peaklet_classification_som do not show expected values #1481

Closed LuisSanchez25 closed 1 week ago

LuisSanchez25 commented 1 week ago

Describe the bug In the Plugin PeakletClassificationSOM the fields for som_sub_type, loc_x_som and loc_y_som don't seem to give the expected values. If the algorithm is run outside the plugin the values seem correct, I reran the functions withing the plugin with the same data but could not find the bug.

Weirdly enough this does not seem to affect the final peaks results as those are consistent with what is to be expected from the algorithm, so the right data seems to be passed on but the wrong data is stored when I check the corresponding data_types.

To Reproduce Insert the MWE of how to reproduce the error

import sys
import numpy as np
import matplotlib.pyplot as plt
import strax
import straxen
from joblib import dump, load
from PIL import Image
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import colors

st = straxen.contexts.xenonnt_som(xedocs_version = "global_v13",
                                    _raw_paths='/stor2/data/raw_records',
                                   )

peaklets = st.get_array('049210','peaklets')
peaklet_cls_som = st.get_array('049210','peaklet_classification_som')

peaklet_cls_som['som_sub_type'][:3] 
# Result array([4,  4,  5,])

peaklets['type'] = peaklet_cls_som['straxen_type']

# Rest of the relevant code after this chunk.
####### The following block of code are simply the SOM plugging rewritten outside of a class ######

def compute(peaklets, peaklets_classifcation, som_data):
        # Current classification
        #peaklets_classifcation = super().compute(peaklets)

        peaklet_with_som = np.zeros(len(peaklets), dtype=dtype)
        strax.copy_to_buffer(peaklets_classifcation, peaklet_with_som, "_copy_peaklets_information")
        peaklet_with_som["old_type"] = peaklets_classifcation["type"]
        #del peaklets_classifcation

        # SOM classification
        peaklets_w_type = peaklets.copy()
        peaklets_w_type["type"] = peaklet_with_som["type"]
        _is_s1_or_s2 = np.isin(peaklets_w_type["type"], [1, 2])

        peaklets_w_type = peaklets_w_type[_is_s1_or_s2]

        print("length of peaklets_w_type = " + str(len(peaklets_w_type)))

        som_sub_type, x_som, y_som = recall_populations(
            peaklets_w_type, som_data['weight_cube'], 
            som_data['som_img'], 
            som_data['norm_factors']
        )
        print(som_sub_type[:10])
        strax_type = som_type_to_type(
            som_sub_type, som_data['s1_array'], som_data['s2_array'], 
                                                         som_data['s3_array'], 
                                                        som_data['s0_array']
        )
        peaklet_with_som["som_sub_type"][_is_s1_or_s2] = som_sub_type
        print(peaklet_with_som["som_sub_type"][_is_s1_or_s2][:10])
        peaklet_with_som["loc_x_som"][_is_s1_or_s2] = x_som
        peaklet_with_som["loc_y_som"][_is_s1_or_s2] = y_som
        peaklet_with_som["som_type"][_is_s1_or_s2] = strax_type
        if True:
            peaklet_with_som["type"][_is_s1_or_s2] = strax_type
        else:
            peaklet_with_som["type"] = peaklet_with_som["old_type"]

        return peaklet_with_som

def som_type_to_type(som_type, s1_array, s2_array, s3_array, s0_array):
    """
    Converts the SOM type into either S1 or S2 type (1, 2)
    som_type:    array with integers corresponding to the different SOM types
    s1_array:    array containing the number corresponding to the SOM types which should
                 be converted to S1's
    """
    som_type_copy = som_type.copy()
    som_type_copy[np.isin(som_type_copy, s1_array)] = 1234
    som_type_copy[np.isin(som_type_copy, s2_array)] = 5678
    som_type_copy[np.isin(som_type_copy, s3_array)] = -5
    som_type_copy[np.isin(som_type_copy, s0_array)] = -250
    som_type_copy[som_type_copy == 1234] = 1
    som_type_copy[som_type_copy == 5678] = 2
    som_type_copy[som_type_copy == -5] = 3
    som_type_copy[som_type_copy == -250] = 0

    return som_type_copy

def recall_populations(dataset, weight_cube, som_cls_img, norm_factors):
    """Master function that should let the user provide a weightcube, a reference img as a np.array,
    a dataset and a set of normalization factors.

    In theory, if these 5 things are provided, this function should output
    the original data back with one added field with the name "SOM_type"
    weight_cube:      SOM weight cube (3D array)
    som_cls_img:      SOM reference image as a numpy array
    dataset:          Data to preform the recall on (Should be peaklet level data)
    normfactos:       A set of 11 numbers to normalize the data so we can preform a recall

    """

    xdim, ydim, zdim = weight_cube.shape
    img_xdim, img_ydim, img_zdim = som_cls_img.shape
    unique_colors = np.unique(np.reshape(som_cls_img, [xdim * ydim, 3]), axis=0)
    # Checks that the reference image matches the weight cube
    assert (
        xdim == img_xdim
    ), f"Dimensions mismatch between SOM weight cube ({xdim}) and reference image ({img_xdim})"
    assert (
        ydim == img_ydim
    ), f"Dimensions mismatch between SOM weight cube ({ydim}) and reference image ({img_ydim})"

    assert all(dataset["type"] != 0), "Dataset contains unclassified peaklets"
    # Get the deciles representation of data for recall
    decile_transform_check = data_to_log_decile_log_area_aft(dataset, norm_factors)
    # preform a recall of the dataset with the weight cube
    # assign each population color a number (can do from previous function)
    ref_map = generate_color_ref_map(som_cls_img, unique_colors, xdim, ydim)
    som_cls_array = np.empty(len(dataset["area"]))
    som_cls_array[:] = np.nan
    # Make new numpy structured array to save the SOM cls data
    data_with_SOM_cls = rfn.append_fields(dataset, "SOM_type", som_cls_array)
    # preforms the recall and assigns SOM_type label
    output_data, x_som, y_som = som_cls_recall(
        data_with_SOM_cls, decile_transform_check, weight_cube, ref_map
    )
    return output_data["SOM_type"], x_som, y_som

def som_cls_recall(array_to_fill, data_in_som_fmt, weight_cube, reference_map):
    som_xdim, som_ydim, _ = weight_cube.shape
    # for data_point in data_in_SOM_fmt:
    distances = cdist(
        weight_cube.reshape(-1, weight_cube.shape[-1]), data_in_som_fmt, metric="euclidean"
    )
    w_neuron = np.argmin(distances, axis=0)
    x_idx, y_idx = np.unravel_index(w_neuron, (som_xdim, som_ydim))
    array_to_fill["SOM_type"] = reference_map[x_idx, y_idx]
    return array_to_fill, x_idx, y_idx

def generate_color_ref_map(color_image, unique_colors, xdim, ydim):
    ref_map = np.zeros((xdim, ydim))
    for color in np.arange(len(unique_colors)):
        mask = np.all(np.equal(color_image, unique_colors[color, :]), axis=2)
        indices = np.argwhere(mask)  # generates a 2d mask
        for loc in np.arange(len(indices)):
            ref_map[indices[loc][0], indices[loc][1]] = color
    return ref_map

def data_to_log_decile_log_area_aft(peaklet_data, normalization_factor):
    """Converts peaklet data into the current best inputs for the SOM, log10(deciles) + log10(area)
    + AFT Since we are dealing with logs, anything less than 1 will be set to 1."""
    # turn deciles into approriate 'normalized' format
    # (maybe also consider L1 normalization of these inputs)
    decile_data = compute_quantiles(peaklet_data, 10)
    data = peaklet_data.copy()
    decile_data[decile_data < 1] = 1

    decile_log = np.log10(decile_data)
    decile_log_over_max = np.divide(decile_log, normalization_factor[:10])
    # Now lets deal with area
    data["area"] = data["area"] + normalization_factor[11] + 1
    peaklet_log_area = np.log10(data["area"])
    peaklet_aft = (
        np.sum(data["area_per_channel"][:, : straxen.n_top_pmts], axis=1) / peaklet_data["area"]
    )
    peaklet_aft = np.where(peaklet_aft > 0, peaklet_aft, 0)
    peaklet_aft = np.where(peaklet_aft < 1, peaklet_aft, 1)
    deciles_area_aft = np.concatenate(
        (
            decile_log_over_max,
            np.reshape(peaklet_log_area, (len(peaklet_log_area), 1)) / normalization_factor[10],
            np.reshape(peaklet_aft, (len(peaklet_log_area), 1)),
        ),
        axis=1,
    )
    return deciles_area_aft

def compute_quantiles(peaks: np.ndarray, n_samples: int):
    """Compute waveforms and quantiles for a given number of nodes(attributes) :param peaks:

    :param n_samples: number of nodes or attributes
    :return: quantiles

    """

    data = peaks["data"].copy()
    data[data < 0.0] = 0.0
    dt = peaks["dt"]
    q = compute_wf_attributes(data, dt, n_samples)
    return q

#@export
@numba.jit(nopython=True, cache=True)
def compute_wf_attributes(data, sample_length, n_samples: int):
    """
    Compute waveform attribures
    Quantiles: represent the amount of time elapsed for
    a given fraction of the total waveform area to be observed in n_samples
    i.e. n_samples = 10, then quantiles are equivalent deciles
    Waveforms: downsampled waveform to n_samples
    :param data: waveform e.g. peaks or peaklets
    :param n_samples: compute quantiles for a given number of samples
    :return: waveforms and quantiles of size n_samples
    """

    assert data.shape[0] == len(sample_length), "ararys must have same size"

    num_samples = data.shape[1]

    quantiles = np.zeros((len(data), n_samples), dtype=np.float64)

    # Cannot compute with with more samples than actual waveform sample
    assert num_samples > n_samples, "cannot compute with more samples than the actual waveform"
    assert num_samples % n_samples == 0, "number of samples must be a multiple of n_samples"

    # Compute quantiles
    inter_points = np.linspace(0.0, 1.0 - (1.0 / n_samples), n_samples)
    cumsum_steps = np.zeros(n_samples + 1, dtype=np.float64)
    frac_of_cumsum = np.zeros(num_samples + 1)
    sample_number_div_dt = np.arange(0, num_samples + 1, 1)
    for i, (samples, dt) in enumerate(zip(data, sample_length)):
        if np.sum(samples) == 0:
            continue
        # reset buffers
        frac_of_cumsum[:] = 0
        cumsum_steps[:] = 0
        frac_of_cumsum[1:] = np.cumsum(samples)
        frac_of_cumsum[1:] = frac_of_cumsum[1:] / frac_of_cumsum[-1]
        cumsum_steps[:-1] = np.interp(inter_points, frac_of_cumsum, sample_number_div_dt * dt)
        cumsum_steps[-1] = sample_number_div_dt[-1] * dt
        quantiles[i] = cumsum_steps[1:] - cumsum_steps[:-1]

    return quantiles

############ End of reinserting code ##################

check_output = compute(peaklets[:3], peaklet_cls_som[:3], som_data)
check_output['som_sub_type']
# Result array([3, 11, 7])

Expected behavior peaklet_cls_som['som_sub_type'][:3]

Result array([3, 11, 7,])

Screenshots If applicable, add screenshots to help explain your problem.

Versions Please add the output of:

straxen.print_versions()
version 3.0.0
LuisSanchez25 commented 1 week ago

This was a mistake on my part, was using the wrong model to compare.