lzachmann / iggy-enrich-demos

Demos of Iggy Enrich (https://www.askiggy.com/)

Getting started #1

lzachmann opened this issue 2 years ago

lzachmann commented 2 years ago

Notes on getting started; we may want to revisit these and update the upstream repo at some point.

#!/usr/bin/env bash

# Un-compress the benchmark dataset in ./iggy-metaflow-demo/data/benchmark
cd iggy-metaflow-demo
tar -xzvf data/benchmark/iggy_re_salesprice_pinellas_20211203.tar.gz -C data/benchmark/
cd ..

# Once downloaded, place the Iggy sample data in ./iggy-data and un-compress it
tar -xzvf ./iggy-data/iggy-package-wkt-20211110214810_fl_pinellas_quadkeys.tar.gz -C ./iggy-data

# From the root directory of the repo, set up a virtual environment and install dependencies
python3 -m venv env
source env/bin/activate
pip install -r iggy-metaflow-demo/requirements.txt

# Export the user name (used by Metaflow, e.g., for the S3 datastore)
export USERNAME=lzachmann
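
# Optionally point Metaflow at an S3 datastore. A sketch -- the env var names are
# standard Metaflow config, but the bucket path here is a placeholder:
# export METAFLOW_DEFAULT_DATASTORE=s3
# export METAFLOW_DATASTORE_SYSROOT_S3=s3://<your-bucket>/metaflow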

# Run the demo with Metaflow:

# For the baseline
python iggy_baseline_flow.py run
# For Iggy Enriched data
python iggy_enrich_flow.py run
# For per-district parallelized model training
python iggy_perdistrict_flow.py run

# Testing
python iggy_baseline_flow_ljz.py run

# Notes
# - No feature importance available from the MLP (see the sketch below)
# - Made some ad hoc changes to feature selection to permit injecting features manually
# - Would like to pick at the model residuals a bit more to understand why we fail
#   to predict a few extreme high cost-per-sqft observations
# - Added some OP plot functionality
# - Need to see if bumping to 70 features yields better results, especially for the
#   Iggy-enriched data
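
Re: the first note above -- the MLP exposes no built-in feature importances, but scikit-learn's model-agnostic permutation importance works for any fitted estimator. A minimal sketch (the data, mlp, X_val, and y_val are stand-ins, not the flow's actual artifacts):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPRegressor

# Stand-in data; in practice pull the validation split and fitted model
# out of the flow run (e.g., run[step].task.data.model)
X_val, y_val = make_regression(n_samples=200, n_features=10, random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(32,), max_iter=500, random_state=0).fit(X_val, y_val)

# Shuffle each feature in turn and measure the drop in score
result = permutation_importance(mlp, X_val, y_val, n_repeats=10, random_state=0)
for i in np.argsort(result.importances_mean)[::-1][:5]:
    print(f"feature {i}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")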
lzachmann commented 2 years ago

Used the following environment during initial testing...

#!/usr/bin/env bash

IMAGE=docker.io/rocker/ml:latest

docker run --rm -it \
    --name iggy \
    -v "$(pwd)":/content \
    -w /content \
    $IMAGE \
    /bin/bash
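
(A note on this setup: the bind mount exposes the repo at /content inside the container; the Python dependencies from requirements.txt above still need to be installed in there.)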
lzachmann commented 2 years ago

Python scripts used for debugging, interrogating Metaflow objects, and summarizing results. The first inspects artifacts from a flow's latest run:

from metaflow import Metaflow
import pandas as pd
import matplotlib.pyplot as plt
import argparse

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--which-flow',
        default="Flow('IggyBaselineFlowMLP')",
        type=str,
        help="The name of the Metaflow Flow object")

    args = parser.parse_args()
    which_flow = args.which_flow

    # List all flows once, then select the requested one by name
    flows = list(Metaflow())
    flow_names = [str(x) for x in flows]

    flow = flows[flow_names.index(which_flow)]
    run = flow.latest_run

    step = "feature_selection_and_train_model" # "train_model"

    # print(f"Printing run info...")
    # print(list(run))

    # print(f"\nPrinting task data for run...")
    # print(run[step].task.data)

    print(f"\nPrinting task data for run...")
    print(run[step].task.data.eval_result)
    df = pd.DataFrame(run[step].task.data.eval_result, index=[0])
    print(df)

    print(run[step].task.data.tax_district)

    # mlp = run[step].task.data.model
    # X_train = run[step].task.data.dataset.data[0]
    # y_train = run[step].task.data.dataset.data[1]
    # print(mlp.score(X_train, y_train))

    # plt.plot(mlp.loss_curve_)
    # try:
    #     plt.plot(mlp.validation_scores_)
    # except:
    #     print("Validation scores not available...")

    # plt.savefig(f"tmp/{which_flow}_loss.png")

    # df.to_csv(f"tmp/{which_flow}_eval_results.csv", index=False)
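
The second script aggregates the evaluation stats CSVs written under op/: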
import os
from glob import glob
import pandas as pd

PATH = "op"
EXT = "stats.csv" # "*.csv"
all_csv_files = [file
                 for path, subdir, files in os.walk(PATH)
                 for file in glob(os.path.join(path, EXT))]
print(all_csv_files)

def perc_change(new, old):
    """Percent improvement relative to `old`: positive when `new` is lower."""
    return -1 * (new - old) / old * 100

data_all = pd.concat((pd.read_csv(i) for i in all_csv_files)).reset_index(drop=True)
data_all["dir"] = all_csv_files

# Measure improvement relative to the baseline (assumed to be the first CSV)
baseline_test_unscaled_mae = data_all.test_unscaled_mae[0]
data_all["impr_test_unscaled_mae"] = data_all.apply(
    lambda x: perc_change(x.test_unscaled_mae, baseline_test_unscaled_mae), axis=1)
baseline_test_unscaled_mae_no_log = data_all.test_unscaled_mae_no_log[0]
data_all["impr_test_unscaled_mae_no_log"] = data_all.apply(
    lambda x: perc_change(x.test_unscaled_mae_no_log, baseline_test_unscaled_mae_no_log), axis=1)
print(data_all)
data_all.to_csv("op/stats_50_features_new.csv", index=False)
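
Note the sign convention in perc_change: a drop in MAE relative to the baseline (first row) comes out positive, e.g. perc_change(90, 100) == 10, so positive values in the impr_ columns mean improvement.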
lzachmann commented 2 years ago
#!/usr/bin/env bash

# For the baseline
python iggy_baseline_flow_mlp.py run
# For Iggy Enriched data
python iggy_enrich_flow_mlp.py run
# For per-district parallelized model training
python iggy_perdistrict_flow_mlp.py run
lzachmann commented 2 years ago

Post-hoc correlation analysis...

library(tidyverse)

d <- read_csv('~/Downloads/resids_full.csv')
feat <- read_csv('~/Downloads/selected_features.csv')

ggplot(d) +
  geom_point(aes(x = y_hat_val, y = y_val, color = eps)) +
  geom_abline(intercept = 0, slope = 1, color = 'orange') +
  labs(x = 'Predicted', y = 'Observed') +
  scale_color_viridis_c('y_hat_val - y_val', direction = -1)

# Flag binary (0/1) columns
d_discrete <- d %>%
  purrr::map_lgl(~all(.x %in% c(0, 1)))
cor_d <- d %>% 
  select(-y_hat_val, -y_val) %>% 
  cor()
# corr_check(d %>% select(-y_hat_val, -y_val), 0.85)

cor_eps <- cor_d[1:(nrow(cor_d) - 1), 'eps'] # drop the last row (eps's self-correlation)
thresh <- 0.05
other_vars <- names(cor_eps)[which(abs(cor_eps) > thresh)]
other_vars %in% feat[[1]] # which of these made it into the selected feature set?

d_long <- d %>% 
  select(all_of(c('eps', other_vars))) %>% 
  pivot_longer(-eps, names_to = 'feature', values_to = 'feature_value')
ggplot(d_long) +
  facet_wrap(~feature) +
  geom_boxplot(aes(x = factor(feature_value), y = eps)) +
  labs(x = 'Feature value', y = 'Epsilon')