smeisegeier / pandas-plots

MIT License
0 stars 0 forks source link

add quartiles, IQR to numerics #41

Open smeisegeier opened 1 week ago

smeisegeier commented 2 days ago
import pandas as pd
from scipy import stats
# Materialize the two columns of interest from the pre-filtered relation
# as a pandas DataFrame.
_df1 = (
    db_diff
    # .filter("Anzahl_Tage_Diagnose_Tod >= 0 and diff >= 0")
    .filter("has_count and has_diff")
    .project("Anzahl_Tage_Diagnose_Tod, diff")
    .to_df()
)

_days = _df1["Anzahl_Tage_Diagnose_Tod"]

# Compute both quartiles in a single pass. `Series.quantile` skips NaN and
# uses linear interpolation, so on clean data it matches `stats.iqr` /
# `stats.scoreatpercentile`, but it does not collapse to NaN when a value
# is missing (the scipy helpers propagate NaN by default).
q1, q3 = _days.quantile([0.25, 0.75])

# Interquartile range is, by definition, the distance between the quartiles.
iqr_value = q3 - q1

# Upper outlier fence: Q3 + 1.5 * IQR.
upper_bound = q3 + 1.5 * iqr_value

print(f"upper_bound für Anzahl_Tage_Diagnose_Tod: {upper_bound:_.0f}")
smeisegeier commented 19 hours ago
    # * numeric columns only: print a one-line summary of descriptive stats
    for col in df.select_dtypes("number").columns:
        _u, _h = get_uniques_header(col)
        _series = df[col]

        # * pandas reductions, each computed once and reused below
        _min = _series.min()
        _max = _series.max()
        _median = _series.median()
        _mean = _series.mean()
        _std = _series.std()
        _sum = _series.sum()

        # * scipy metrics are very vulnerable to NaN, so feed them a
        # * NaN-free plain list
        _clean = _series.dropna().tolist()
        _skew = stats.skew(_clean)
        _kurtosis = stats.kurtosis(_clean)

        print(
            f"{_h} min: {round(_min,3):_}"
            f" | max: {round(_max,3):_}"
            f" | median: {round(_median,3):_}"
            f" | mean: {round(_mean,3):_}"
            f" | std: {round(_std,3):_}"
            f" | cv: {round(_std / _mean,3):_}"
            f" | sum: {round(_sum,3):_}"
            f" | skew: {round(_skew,3)}"
            f" | kurto: {round(_kurtosis,3)}"
        )