Open lzachmann opened 2 years ago
The following environment was used during initial testing:
#!/usr/bin/env bash
# Launch an interactive rocker/ml container for testing, with the current
# working directory mounted at /content.
set -euo pipefail

IMAGE=docker.io/rocker/ml:latest
# Quote "$(pwd)" and "$IMAGE": an unquoted command substitution undergoes
# word splitting, so a working directory containing spaces would break the
# -v mount argument.
docker run --rm -it \
  --name iggy \
  -v "$(pwd)":/content \
  -w /content \
  "$IMAGE" \
  /bin/bash
Python scripts used for debugging, interrogating Metaflow objects, and summarizing results:
"""Debugging helper: look up a Metaflow flow by name, grab its latest run,
and print the evaluation results stored on the training step."""
from metaflow import Metaflow
import pandas as pd
import matplotlib.pyplot as plt  # needed if the plotting section below is re-enabled
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--which-flow',
                        default="Flow('IggyBaselineFlowMLP')",
                        type=str,
                        help="The name of the Metaflow Flow object")
    args = parser.parse_args()
    which_flow = args.which_flow

    # Enumerate the flows once (the original listed Metaflow() twice).
    # str(flow) renders as "Flow('Name')", which is the form that
    # --which-flow is matched against.
    flows = list(Metaflow())
    flow_names = [str(x) for x in flows]
    try:
        flow = flows[flow_names.index(which_flow)]
    except ValueError:
        # A missing flow previously surfaced as an opaque ValueError
        # traceback; exit with an actionable message instead.
        raise SystemExit(
            f"Flow {which_flow!r} not found. Available flows: {flow_names}")

    run = flow.latest_run  # print(flow.latest_run)
    step = "feature_selection_and_train_model"  # "train_model"
    # print("Printing run info...")
    # print(list(run))
    print("\nPrinting task data for run...")
    print(run[step].task.data.eval_result)
    # eval_result is a flat mapping of metric name -> scalar; index=[0]
    # turns it into a one-row frame.
    df = pd.DataFrame(run[step].task.data.eval_result, index=[0])
    print(df)
    print(run[step].task.data.tax_district)
    # Optional deeper inspection of the trained model -- re-enable as needed:
    # mlp = run[step].task.data.model
    # X_train = run[step].task.data.dataset.data[0]
    # y_train = run[step].task.data.dataset.data[1]
    # print(mlp.score(X_train, y_train))
    # plt.plot(mlp.loss_curve_)
    # try:
    #     plt.plot(mlp.validation_scores_)
    # except AttributeError:
    #     print("Validation scores not available...")
    # plt.savefig(f"tmp/{which_flow}_loss.png")
    # df.to_csv(f"tmp/{which_flow}_eval_results.csv", index=False)
"""Aggregate per-run stats.csv files under op/ and compute the percent
improvement of each run relative to the baseline (first) row."""
import os
from glob import glob

import pandas as pd

# Root directory containing per-run output subdirectories.
PATH = "op"
# Only aggregate the per-run stats files; use "*.csv" to pick up everything.
EXT = "stats.csv"

# Recursively collect every matching CSV under PATH. Sort the list so row
# order -- and therefore which row is treated as the baseline below -- is
# deterministic; os.walk order is filesystem-dependent.
all_csv_files = sorted(
    file
    for path, subdir, files in os.walk(PATH)
    for file in glob(os.path.join(path, EXT))
)
print(all_csv_files)


def perc_change(new, old):
    """Percent improvement of `new` over `old` (positive = lower error)."""
    return -1 * (new - old) / old * 100


# Stack all per-run stats into one frame and record the source file per row.
data_all = pd.concat(
    pd.read_csv(i) for i in all_csv_files
).reset_index(drop=True)
data_all["dir"] = all_csv_files

# Measure improvement relative to the baseline (first row after sorting).
# NOTE(review): this assumes the baseline run's file sorts first -- confirm.
# perc_change is plain arithmetic, so it vectorizes over the whole column;
# no row-wise apply needed.
baseline_test_unscaled_mae = data_all.test_unscaled_mae[0]
data_all["impr_test_unscaled_mae"] = perc_change(
    data_all.test_unscaled_mae, baseline_test_unscaled_mae
)
baseline_test_unscaled_mae_no_log = data_all.test_unscaled_mae_no_log[0]
data_all["impr_test_unscaled_mae_no_log"] = perc_change(
    data_all.test_unscaled_mae_no_log, baseline_test_unscaled_mae_no_log
)
print(data_all)
data_all.to_csv("op/stats_50_features_new.csv", index=False)
#!/usr/bin/env bash
# Run each Metaflow training flow in sequence:
# baseline, then Iggy-enriched, then per-district parallelized model training.
for flow_script in \
  iggy_baseline_flow_mlp.py \
  iggy_enrich_flow_mlp.py \
  iggy_perdistrict_flow_mlp.py
do
  python "$flow_script" run
done
R script for the post-hoc correlation analysis of model residuals against features:
# Post-hoc correlation analysis: which features correlate with the
# validation residuals (eps = y_hat_val - y_val)?
library(tidyverse)

# Residuals plus feature columns for the validation set.
d <- read_csv('~/Downloads/resids_full.csv')
# Features selected by the model pipeline (first column holds the names).
feat <- read_csv('~/Downloads/selected_features.csv')

# Observed vs. predicted, colored by residual; the 1:1 line marks a
# perfect prediction.
ggplot(d) +
  geom_point(aes(x = y_hat_val, y = y_val, color = eps)) +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +
  labs(x = 'Predicted', y = 'Observed') +
  scale_color_viridis_c('y_hat_val - y_val', direction = -1)

# Flag columns that are binary (0/1) indicators; kept for interactive
# inspection even though nothing downstream uses it yet.
d_discrete <- d %>%
  purrr::map_lgl(~ all(.x %in% c(0, 1)))

# Correlation matrix of eps and every feature, dropping the raw outcomes.
cor_d <- d %>%
  select(-y_hat_val, -y_val) %>%
  cor()
# corr_check(d %>% select(-y_hat_val, -y_val), 0.85)

# Correlation of each feature with the residual. Select the 'eps' column
# by name and exclude the eps row by name -- the original dropped the last
# row positionally, which is only correct when eps happens to sort last
# in the correlation matrix.
cor_eps <- cor_d[setdiff(rownames(cor_d), 'eps'), 'eps']
thresh <- 0.05
other_vars <- names(cor_eps)[which(abs(cor_eps) > thresh)]

# Are the residual-correlated features among the selected features?
other_vars %in% feat[[1]]

# Long format for faceted boxplots of eps by feature value.
d_long <- d %>%
  select(all_of(c('eps', other_vars))) %>%
  pivot_longer(-eps, names_to = 'feature', values_to = 'feature_value')

ggplot(d_long) +
  facet_wrap(~ feature) +
  geom_boxplot(aes(x = factor(feature_value), y = eps)) +
  labs(x = 'Feature value', y = 'Epsilon')
Notes on getting started; we may want to revisit these and update the upstream documentation at some point.