Open NGoetz opened 2 months ago
Do you mean something like this (created with ChatGPT for CentralityClasses)? (all documentation deleted to make it shorter). If this is what you mean, then we have to change all variables.
import numpy as np
import warnings
from typing import List, Union
class CentralityClasses:
    """Determine centrality classes from a sample of event multiplicities.

    The events are divided into centrality classes whose edges are given as
    percentiles in ``centrality_bins``. For each class the minimum, maximum
    and average multiplicity are computed; the statistical error on the
    average is estimated from four independent sub-samples.

    Parameters
    ----------
    events_multiplicity : list or numpy.ndarray
        Multiplicity (e.g. dNch/deta) of each event.
    centrality_bins : list or numpy.ndarray
        Percentile edges of the centrality classes, each in [0, 100].

    Raises
    ------
    TypeError
        If an input is neither a list nor a numpy array.
    ValueError
        If ``centrality_bins`` contains values outside [0, 100].
    """

    def __init__(self,
                 events_multiplicity: Union[List[float], np.ndarray],
                 centrality_bins: Union[List[float], np.ndarray]) -> None:
        if not isinstance(events_multiplicity, (list, np.ndarray)):
            raise TypeError("'events_multiplicity' is not list or numpy.ndarray")
        if not isinstance(centrality_bins, (list, np.ndarray)):
            raise TypeError("'centrality_bins' is not list or numpy.ndarray")

        # The bin edges must be monotonic; sort them if they are not.
        if not all(centrality_bins[i] <= centrality_bins[i + 1]
                   for i in range(len(centrality_bins) - 1)):
            warnings.warn("'centrality_bins' is not sorted. Sorting automatically.")
            centrality_bins = sorted(centrality_bins)

        # Remove duplicate edges (they would create empty centrality classes)
        # while preserving order.
        unique_bins: List[float] = []
        seen: set = set()
        multiple_same_entries: bool = False
        for item in centrality_bins:
            if item not in seen:
                unique_bins.append(item)
                seen.add(item)
            else:
                multiple_same_entries = True
        if multiple_same_entries:
            warnings.warn("'centrality_bins' contains duplicate values. They are removed automatically.")

        # Percentile edges must lie in [0, 100].
        if any(value < 0.0 or value > 100.0 for value in centrality_bins):
            raise ValueError("'centrality_bins' contains values less than 0 or greater than 100.")

        self.events_multiplicity_: Union[List[float], np.ndarray] = events_multiplicity
        self.centrality_bins_: List[float] = unique_bins
        # Per-class statistics, filled by __create_centrality_classes();
        # index 0 is the most central class.
        self.dNchdetaMin_: List[float] = []
        self.dNchdetaMax_: List[float] = []
        self.dNchdetaAvg_: List[float] = []
        self.dNchdetaAvgErr_: List[float] = []
        self.__create_centrality_classes()

    def __create_centrality_classes(self) -> None:
        """Compute class boundaries and per-class average multiplicities.

        The events are distributed round-robin over four sub-samples whose
        per-class averages are used to estimate the statistical error.

        Raises
        ------
        ValueError
            If fewer than 4 events are given, or if any multiplicity is
            negative.
        """
        number_events: int = len(self.events_multiplicity_)
        # At least one event per sub-sample is needed for the error estimate.
        # NOTE: the original message said "larger than 4" although 4 events
        # are accepted by this check; the message now matches the condition.
        if number_events < 4:
            raise ValueError("The number of events has to be at least 4")

        # Create four sub-samples for the error determination and
        # distribute the events evenly (round-robin) among them.
        event_sample_A: List[float] = []
        event_sample_B: List[float] = []
        event_sample_C: List[float] = []
        event_sample_D: List[float] = []
        samples = (event_sample_A, event_sample_B, event_sample_C, event_sample_D)
        for i, multiplicity in enumerate(self.events_multiplicity_):
            if multiplicity < 0:
                raise ValueError("Multiplicity in 'events_multiplicity' is negative")
            samples[i % 4].append(multiplicity)
        event_sample_A.sort(reverse=True)
        event_sample_B.sort(reverse=True)
        event_sample_C.sort(reverse=True)
        event_sample_D.sort(reverse=True)

        # Average multiplicity of each class in every sub-sample; the spread
        # of the four averages gives the statistical error (N-1 = 3).
        MinRecord: int = int(number_events / 4 * self.centrality_bins_[0] / 100.0)
        for i in range(1, len(self.centrality_bins_)):
            MaxRecord: int = int(number_events / 4 * self.centrality_bins_[i] / 100.0)
            # NOTE(review): if a class is so narrow that MinRecord == MaxRecord,
            # np.mean of the empty slice yields NaN with a RuntimeWarning.
            AvgA: float = np.mean(event_sample_A[MinRecord:MaxRecord])
            AvgB: float = np.mean(event_sample_B[MinRecord:MaxRecord])
            AvgC: float = np.mean(event_sample_C[MinRecord:MaxRecord])
            AvgD: float = np.mean(event_sample_D[MinRecord:MaxRecord])
            Avg: float = (AvgA + AvgB + AvgC + AvgD) / 4.0
            Err: float = np.sqrt(((AvgA - Avg)**2 + (AvgB - Avg)**2
                                  + (AvgC - Avg)**2 + (AvgD - Avg)**2) / 3.0)
            self.dNchdetaAvg_.append(Avg)
            self.dNchdetaAvgErr_.append(Err)
            MinRecord = MaxRecord

        # Sort all events by multiplicity and determine the multiplicity
        # boundaries of each centrality class from the global record.
        global_event_record: List[float] = sorted(self.events_multiplicity_, reverse=True)
        MinRecord = int(number_events * self.centrality_bins_[0] / 100.0)
        for i in range(1, len(self.centrality_bins_)):
            MaxRecord = int(number_events * self.centrality_bins_[i] / 100.0)
            self.dNchdetaMax_.append(global_event_record[MinRecord])
            self.dNchdetaMin_.append(global_event_record[MaxRecord - 1])
            MinRecord = MaxRecord

    def get_centrality_class(self, dNchdEta: float) -> int:
        """Return the centrality class index for a given multiplicity.

        Index 0 is the most central class.

        Raises
        ------
        ValueError
            If the multiplicity cannot be assigned to any class
            (only possible for non-finite input such as NaN).
        """
        # multiplicity at or above the lower edge of the most central bin
        if dNchdEta >= self.dNchdetaMin_[0]:
            return 0
        # multiplicity below the lower edge of the second-to-last bin
        if dNchdEta < self.dNchdetaMin_[len(self.dNchdetaMin_) - 2]:
            return len(self.dNchdetaMin_) - 1
        # intermediate bins: class i covers [min[i], min[i-1])
        for i in range(1, len(self.dNchdetaMin_) - 1):
            if self.dNchdetaMin_[i - 1] > dNchdEta >= self.dNchdetaMin_[i]:
                return i
        # Unreachable for finite input. The original fell through here and
        # implicitly returned None despite the declared int return type.
        raise ValueError("Multiplicity could not be assigned to a centrality class.")

    def output_centrality_classes(self, fname: str) -> None:
        """Write the centrality class table to the file ``fname``.

        One row is written per centrality class. The original loop ran
        ``range(1, len(self.dNchdetaMin_))`` and silently dropped the last
        (most peripheral) class; the loop now covers all classes.

        Raises
        ------
        TypeError
            If ``fname`` is not a string.
        """
        if not isinstance(fname, str):
            raise TypeError("'fname' should be a string.")
        with open(fname, 'w') as out_stream:
            out_stream.write("# CentralityMin CentralityMax dNchdEtaMin dNchdEtaMax dNchdEtaAvg dNchdEtaAvgErr\n")
            # class i spans the percentile range bins[i] .. bins[i+1]
            for i in range(len(self.dNchdetaMin_)):
                out_stream.write(
                    f"{self.centrality_bins_[i]} - {self.centrality_bins_[i + 1]} "
                    f"{self.dNchdetaMin_[i]} {self.dNchdetaMax_[i]} "
                    f"{self.dNchdetaAvg_[i]} {self.dNchdetaAvgErr_[i]}\n"
                )
Yes this is what I mean. In fact we are not changing the variables, we just decorate the signature. For the interpreter this is the same, but with a linter we have then better control over the types used everywhere.
I see, but wouldn't it be more careful to apply this to all variables in the functions as well?
Strictly speaking, yes. But if we add type hints to all class-method and function signatures and return types, all that is left are local variables, whose types are easily inferred from context.
So yes, it would be more careful, but it is less crucial.
I tested the mypy package with the CentralityClasses class. The branch https://github.com/smash-transport/sparkx/tree/roch/statical_typing is an example of this. I also had to introduce types for the other class member variables in the __init__ function to make it stop complaining about missing types.
We could probably add a check as a GitHub action. If we do this for the whole codebase.
Yes this looks how I imagined. I also think that adding this as a GitHub action in the last step is the best way to go.
OK, then I will just continue with the next classes for the rest of the afternoon.
Well you don't have to do this now and alone, I would also do some at the next SMASH/SPARKX sprint, but you are of course invited to do as much as you like!
I will just start and push the latest version to that branch. When you have some time, you can just continue to edit the branch.
The combination of type hints and linters like mypy allows us to leverage the power of static typing even in the dynamically typed world of Python. This would greatly benefit the stability and robustness of the codebase, at the cost of decorating all function and class signatures, and creating some merge conflicts. I can take this over with the assistance of Copilot or someone of you (logical or). However, this is a project-wide design decision we should make collectively.