pigbreeder / CodeMemo

1 stars 0 forks source link

jupyter pandas #1

Open pigbreeder opened 2 years ago

pigbreeder commented 2 years ago

最简单绘多图

https://stackoverflow.com/questions/40071096/how-to-plot-multiple-lines-in-one-figure-in-pandas-python-based-on-data-from-mul

# Make Jupyter display the result of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

testpppppp commented 1 year ago

nan转成None

# Replace missing values with None.
df = df.where(df.notnull(), None)
# Casting to object first lets None be stored in numeric columns instead of
# being coerced back to NaN (workaround for empty/null values in ODPS tables).
df = df.astype(object).where(pd.notnull(df), None)

float包含nan转int

# Nullable integer dtype: the NaN entry is kept as a missing value while the
# remaining entries stay integers instead of being promoted to float.
arr = pd.array([1, 2, np.nan], dtype="Int64")

找出有null的行

df[df.isna().any(axis=1)]

所有的string都trim

# Strip leading/trailing whitespace in every object-dtype column of df.
df_obj = df.select_dtypes(['object'])
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

百分比输出

t1.style.format({'L1_intra_pct': "{:.2%}",'L1_inter_pct': "{:.2%}",'true_pct': "{:.2%}"})

groupby

df.groupby("A").filter(lambda x: len(x) > 1) # filter
df.groupby('B').apply(lambda x: x.sample(frac=0.5)) # sample

time

df['created_time'] = pd.to_datetime(df['created_at'],unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
testpppppp commented 1 year ago

tqdm

jupyter 中用:

from tqdm.autonotebook import tqdm
from tqdm.notebook import tqdm

pandas

tqdm.pandas()
df.progress_apply()

用完消失

trange(10,leave=False,desc='test')

print可能会导致输出多行进度条,可以将print语句改为tqdm.write,代码如下

for i in tqdm(range(10),ascii=True):
    tqdm.write("come on")
    time.sleep(0.1)

自由操控

from random import random
from time import sleep
from tqdm import tqdm

epochs = 2
train_data_num = 10
for i in range(epochs):
    # One manually driven progress bar per epoch; the context manager
    # closes the bar when the epoch finishes.
    with tqdm(total=train_data_num) as t:
        # Description is displayed on the left; it is constant for the
        # whole epoch, so set it once rather than on every step.
        t.set_description('Epoch %i' % i)
        # Iterate exactly as many steps as the bar's total so the bar
        # stays in sync when train_data_num is changed.
        for _ in range(train_data_num):
            # Postfix is displayed on the right,
            # formatted automatically based on the argument's datatype.
            t.set_postfix(loss=random(), acc=random())
            sleep(0.1)
            t.update(1)
testpppppp commented 1 year ago

绘图hist成比率

import seaborn as sns
sns.histplot(df1.top1_lzd_prob,stat="probability")

双轴

import matplotlib.pyplot as plt
fig, ax1 = plt.subplots(figsize=(12, 8))
tdf = pd.DataFrame(datas,columns=['threshold','accu','name','cnt'])
a=sns.barplot(data = tdf,x='threshold',y='accu',hue='name',palette=sns.color_palette('bright')[:2],ax=ax1)
aa = show_values(a,space=.15)
ax2 = ax1.twinx()
testpppppp commented 1 year ago

sns绘图

# kde/hist  同一个图
# https://stackoverflow.com/questions/46045750/seaborn-distplot-displot-with-multiple-distributions
sns.displot(data=df3, x='pred_score', hue='label', kind='kde', fill=True,palette=sns.color_palette('bright')[:2], height=8, aspect=1.5)
####################

# bar
a=sns.barplot(data = tdf,x='threshold',y='accu',hue='name',palette=sns.color_palette('bright')[:2],ax=ax1)

# https://stackoverflow.com/questions/63945535/seaborn-plotting-histogram-and-lineplot-on-the-same-figure-with-2-y-axis
####################

# write value on plot
# https://juejin.cn/post/7116375204573642759
def show_values(axs, orient="v", space=.01):
    """Write each bar's numeric value next to it on one or more bar-plot axes.

    Parameters
    ----------
    axs : axes object or numpy.ndarray of axes objects
        Each axes object must expose ``patches`` (the bars) and ``text``.
        An array of axes is annotated element by element.
    orient : str
        ``"v"`` labels each bar with its height, centred just above the bar.
        ``"h"`` labels each bar with its width, to the right of the bar.
        Any other value draws nothing.
    space : float
        Gap added after the end of the bar in ``"h"`` mode. It is not used
        in ``"v"`` mode, where the gap is 1% of the bar height.

    Returns
    -------
    None
        The axes are annotated in place.
    """
    def _single(ax):
        if orient not in ("v", "h"):
            return
        for p in ax.patches:
            width, height = p.get_width(), p.get_height()
            if orient == "v":
                _x = p.get_x() + width / 2
                _y = p.get_y() + height + (height * 0.01)
                shown, align = height, "center"
            else:
                _x = p.get_x() + width + float(space)
                _y = p.get_y() + height - (height * 0.5)
                shown, align = width, "left"
            # A bar with a non-finite size or position (e.g. NaN for a
            # missing group) would yield a "nan" label at an undefined
            # location, so it is skipped.
            if not np.isfinite([_x, _y, shown]).all():
                continue
            ax.text(_x, _y, '{:.1f}'.format(shown), ha=align)

    if isinstance(axs, np.ndarray):
        for ax in axs.flat:
            _single(ax)
    else:
        _single(axs)
p = sns.barplot(x="day", y="tip", data=data, ci=None)

#show values on barplot
show_values(p)
####################
testpppppp commented 1 year ago

series转df

df3 = pd.DataFrame(df2.values.T,columns=df2.index) 

df转dict

area_dict = dict(zip(lakes.id, lakes.value))

classification_report转df

    clsf_report = pd.DataFrame(classification_report(y_true = base_label_trun, y_pred = predict_label, output_dict=True)).transpose()
testpppppp commented 11 months ago

groupby 返回多个值,同时有name

pandas中groupby函数中参数as_index和group_keys的区别

df.groupby(['venture','path_en'],as_index=False).apply(lambda x:pd.Series({'a':len(x[x['label']==1])/len(x)}))

testpppppp commented 10 months ago

所有cell处理

data_frame_trimmed = data_frame.apply(lambda x: x.str.strip() if x.dtype == "object" else x)