Triamus / play

play repo for experiments (mainly with git)
1 stars 0 forks source link

star-like data wrangling #11

Open Triamus opened 6 years ago

Triamus commented 6 years ago

title: "data wrangling on star like data v 1.0" author: "me" classoption: landscape toc: true toc_depth: 2 header-includes:

\pagebreak

release notes

required libraries

library(pander)
library(dplyr)
library(readr)
library(tidyr)
library(xtable)
library(ggplot2)
library(scales)
library(grid)
library(lazyeval)
library(data.table)

global options

# output options
# large scipen penalty suppresses scientific notation in printed numbers
options("scipen"=100)
#opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=TRUE)
# xtable: non-floating tables, no timestamp / comment banner in LaTeX output
options(xtable.floating = FALSE)
options(xtable.timestamp = "")
options(xtable.comment = FALSE)
# tex / pandoc options for pdf creation
# NOTE(review): machine-specific Windows MiKTeX path appended to PATH;
# this will not work on other machines -- consider making it configurable
x <- Sys.getenv("PATH")
y <- paste(x, "E:\\Datenordner\\Downloads\\miktex\\miktex\\bin", sep=";")
Sys.setenv(PATH = y)
# always stringsAsFactors = F; if factors needed, declare them explicitly
options(stringsAsFactors = F)
# set work directory for read / write operations
path <- "E:/Datenordner/raw_data"
# if data is some place else, set root.dir option
# NOTE(review): opts_knit is a knitr object but knitr is not in the
# library() list above -- confirm knitr::opts_knit is attached when knitting
opts_knit$set(root.dir = path)
#setwd(paste(path, sep="/"))
#setwd(paste(path, sep="/"))

\pagebreak

overview

\pagebreak

create simulated data

data structure

knitr::include_graphics("./fdw_star.png")

country table e.g. country of domicile

# Country dimension table: key code, name, region and currency for the
# six simulated domiciles; plain character columns, no factors.
country_table <- data.frame(
  key_var          = c("DE", "FR", "UK", "US", "CN", "JP"),
  country_name     = c("Germany", "France", "United_Kingdom",
                       "United_States", "China", "Japan"),
  country_region   = c("Europe", "Europe", "Europe",
                       "Americas", "Asia", "Asia"),
  country_currency = c("EUR", "EUR", "GBP", "USD", "CNY", "JPY"),
  stringsAsFactors = FALSE
)

month table

# Month dimension table: month number, three-letter label and quarter.
# month.abb is the built-in c("Jan", ..., "Dec") constant; quarters run
# Q1..Q4, each covering three consecutive months.
month_table <- data.frame(
  key_var          = 1:12,
  month_desc       = month.abb,
  month_quarter    = paste0("Q", rep(1:4, each = 3)),
  stringsAsFactors = FALSE
)

ubr table

# number of org units per hierarchy level
n_ubr <- 1000

# Org-hierarchy dimension: each ubr_level* column is a permutation of its
# own disjoint n_ubr-wide integer band, so values never collide across
# levels.
# NOTE(review): ubr_level09 ends at 5*n_ubr but ubr_level08 starts at
# 6*n_ubr + 1 -- the (5*n_ubr, 6*n_ubr] band is skipped; confirm intended.
ubr_table <-
  data.frame(key_var = sample(x = n_ubr, size = n_ubr, rep = F),
             ubr_level12 = sample(x = (1*n_ubr + 1) : (2*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level11 = sample(x = (2*n_ubr + 1) : (3*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level10 = sample(x = (3*n_ubr + 1) : (4*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level09 = sample(x = (4*n_ubr + 1) : (5*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level08 = sample(x = (6*n_ubr + 1) : (7*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level07 = sample(x = (7*n_ubr + 1) : (8*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level06 = sample(x = (8*n_ubr + 1) : (9*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level05 = sample(x = (9*n_ubr + 1) : (10*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level04 = sample(x = (10*n_ubr + 1) : (11*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level03 = sample(x = (11*n_ubr + 1) : (12*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level02 = sample(x = (12*n_ubr + 1) : (13*n_ubr), 
                                  size = n_ubr, rep = F),
             ubr_level01 = sample(x = (13*n_ubr + 1) : (14*n_ubr), 
                                  size = n_ubr, rep = F),
             stringsAsFactors = F
  )

source system table

# Source-system dimension. Mixing numbers and strings in key_var coerces
# the whole column to character; the "NA" descriptions are literal
# strings, not missing values (is.na() is FALSE for them).
inst_id_table <-
  data.frame(key_var = c(100, 200, 300, 400, 500, 600, 700,
                     "9ERG", "ADJB2", "CCS", "CDS"),
             inst_id_desc = c("NA", "NA", "NA", "NA", "NA", "NA", "NA",
                              "who knows", "adjustment basel 2", 
                              "collateral shift", "credit default swap"),
             stringsAsFactors = F
  )

product type table

# Product dimension: 10 random (seed-dependent) numeric keys with
# descriptive labels.
product_type_table <-
  data.frame(key_var = sample(x = 10000:100000, size = 10, rep = F),
             product_type_desc = c("loan", "interest rate swap", 
                                   "interest rate forward",
                                   "repo", "option", "interest rate future",
                                   "asset-backed security", "cash", 
                                   "non-cash collateral", "other"),
             stringsAsFactors = F
  )

counterparty type table

# Counterparty-type dimension: keys A1..C3 (letter block x digit) mapped
# to descriptive labels; character columns, no factors.
counterparty_type_table <- data.frame(
  key_var = paste0(rep(c("A", "B", "C"), each = 3), rep(1:3, times = 3)),
  counterparty_type_desc = c(
    "Securitisation - Originator", "German SME", "Large Corporate",
    "Other SME", "Development Bank", "Commercial Bank",
    "Other FI", "Sovereign", "Central Counterparty"
  ),
  stringsAsFactors = FALSE
)

corep class table

# COREP exposure-class dimension: a single key column holding the 39
# regulatory class codes (no description column yet).
corep_table <- data.frame(
  key_var = c(
    "SOV", "INST", "CORP", "CORP_SL", "CORP_SL_RE", "CORP_SL_NRE",
    "CORP_SME", "CORP_SME_RE", "CORP_SME_NRE", "CORP_SME_SUPPFA",
    "CORP_NSME", "CORP_OTH", "CORP_OTH_RE", "CORP_OTH_NRE",
    "RTL", "RTL_RE", "RTL_RE_SME", "RTL_RE_NSME", "RTL_QR",
    "RTL_OTH", "RTL_OTH_SME", "RTL_OTH_NSME", "RTL_SME",
    "RTL_SME_SUPPFAC", "RTL_NSME", "EQU", "SEC", "OTH", "HR",
    "COVB", "ST", "CIU", "RGOV", "PSE", "MDB", "IORG",
    "SECM", "SECM_SME", "SECM_NSME"
  ),
  stringsAsFactors = FALSE
)

simulation function

create_obs <- function(n_obs, month, year, run = NULL) {
  # Simulate one month-end snapshot of transaction-level risk data.
  #
  # Args:
  #   n_obs: number of transactions (rows) to generate.
  #   month, year: reporting period stamped onto every row.
  #   run: unused; given a NULL default for backward compatibility --
  #        callers in this file omit it, which previously only worked
  #        because the argument was never evaluated (lazy evaluation).
  #
  # Returns: a data.frame with id, period columns, dimension keys
  # (country, ubrtrn, inst_id, product_type, corep, cpy_type) and risk
  # metrics (rwa, el, ec, pd, lgd). Character columns stay characters.
  #
  # Depends on the *_table dimension data.frames defined earlier in the
  # file. The sequence of RNG calls is unchanged from the original, so
  # results under a given set.seed() are identical.
  id <- sample(seq(from = 1, to = n_obs, by = 1), n_obs, replace = FALSE)
  # higher weights for the more important countries; sample() rescales
  # prob to sum to 1 internally
  country <- sample(country_table$key_var, n_obs, replace = TRUE,
                    prob = c(4, 1, 2, 3, 1, 1))
  ubrtrn <- sample(ubr_table$key_var, n_obs, replace = TRUE)
  inst_id <- sample(inst_id_table$key_var, n_obs, replace = TRUE)
  product_type <- sample(product_type_table$key_var, n_obs, replace = TRUE)
  cpy_type <- sample(counterparty_type_table$key_var, n_obs, replace = TRUE)
  corep <- sample(corep_table$key_var, n_obs, replace = TRUE)
  rwa <- runif(n = n_obs, min = 0, max = 100000000)
  el <- runif(n = n_obs, min = 0, max = 10000000)
  ec <- runif(n = n_obs, min = 0, max = 10000000)
  # down-weight high PDs to reflect a better portfolio: weights fall
  # linearly from 500 to 1 over pd 0-0.5, then stay flat at 1 for 0.5-1
  pd <- sample(seq(from = 0, to = 1, by = 0.001), n_obs, replace = TRUE,
               prob = c(seq(from = 500, to = 1, by = -1),
                        rep(x = 1, times = 501)))
  lgd <- sample(seq(from = 0, to = 1, by = 0.2), n_obs, replace = TRUE)
  data.frame(id = id,
             month = month,
             year = year,
             country = country,
             ubrtrn = ubrtrn,
             inst_id = inst_id,
             product_type = product_type,
             corep = corep,
             cpy_type = cpy_type,
             rwa = rwa,
             el = el,
             ec = ec,
             pd = pd,
             lgd = lgd,
             stringsAsFactors = FALSE)
}

create data

base data

# fixed seed so the simulated portfolio is reproducible
set.seed(123)
# NOTE: create_obs is called without its `run` argument; this works only
# because `run` is never evaluated inside the function body
data_201501 <- create_obs(n_obs=10000, month=1, year=2015)
dt_201501 <- data.table(data_201501)
str(data_201501)
str(dt_201501)
dt_201501
# take deep copy rather than reference
data_201502 <- copy(data_201501)
dt_201502 <- copy(dt_201501)
# define columns to change over time
cols <- c("rwa", "el", "ec", "pd", "lgd")
foo <- function(x, start, end, step, repl) {
  # Scale x by ONE random factor drawn from seq(start, end, step).
  #
  # Fix: index the grid via sample.int() instead of calling sample() on
  # the grid directly. sample(v, ...) on a length-1 numeric v samples
  # from 1:v (base-R sample() pitfall), so e.g. start = end = 2 would
  # have drawn from 1:2 rather than returning 2. For grids of length > 1
  # the drawn values are identical to the original under any seed.
  grid <- seq(from = start, to = end, by = step)
  x * grid[sample.int(length(grid), size = 1, replace = repl)]
}

# vectorize function by removing size argument
# vectorize function by removing size argument
foo_vec <- function(x, start, end, step, repl) {
  # Vectorised variant: draws length(grid) random factors (sample()'s
  # default size) and multiplies them onto x with recycling.
  #
  # Fix: index via sample.int() to avoid the base-R pitfall where
  # sample(v, ...) on a length-1 numeric v samples from 1:v. For grids
  # of length > 1 this draws exactly the same values as the original.
  grid <- seq(from = start, to = end, by = step)
  x * grid[sample.int(length(grid), size = length(grid), replace = repl)]
}

dplyr

# using standard evaluation version of mutate_each function to operate on vector
# of strings representing columns
# NOTE(review): mutate_each_() and funs() are long-deprecated dplyr APIs
# (superseded by mutate(across(...))); confirm the installed dplyr
# version still exports them
data_201502 <- 
  data_201502 %>%
  mutate_each_(funs(foo(., start = 0.9, end = 1.1, step = 0.01, repl = T)), cols)

# adjust month-end
data_201502 <- 
  data_201502 %>% 
  mutate(month = 2,
         year = 2015)

data.table

# same shock in data.table: := updates the cols by reference, one
# random factor vector recycled down the rows
dt_201502[, (cols) := foo_vec(.SD, start = 0.9, end = 1.1, 
                              step = 0.01, repl = T), .SDcols = cols]

# adjust month-end
dt_201502[, c("year", "month") := .(2015, 2)]

dplyr

# cap pd and lgd at 1 after the multiplicative shock
data_201502 <-
  data_201502 %>%
  mutate(pd = replace(pd, which(pd > 1), 1),
         lgd = replace(lgd, which(lgd > 1), 1))

data.table

# same capping, by reference
dt_201502[pd > 1, pd := 1]
dt_201502[lgd > 1, lgd := 1]

stress data

# stress scenario: start from deep copies of the base snapshots
data_str_201501 <- copy(data_201501)
dt_str_201501 <- copy(dt_201501)
data_str_201502 <- copy(data_201502)
dt_str_201502 <- copy(dt_201502)

# different seed to change values
set.seed(321)

# dplyr
# scale the risk metrics up by a random factor in [1, 2]
# (mutate_each_/funs are deprecated dplyr APIs)
data_str_201501 <- 
  data_str_201501 %>%
  mutate_each_(funs(foo(., start = 1, end = 2, step = 0.01, repl = T)), cols)

data_str_201502 <- 
  data_str_201502 %>%
  mutate_each_(funs(foo(., start = 1, end = 2, step = 0.01, repl = T)), cols)

# cap pd / lgd at 1 after the shock
data_str_201501 <-
  data_str_201501 %>%
  mutate(pd = replace(pd, which(pd > 1), 1),
         lgd = replace(lgd, which(lgd > 1), 1))

data_str_201502 <-
  data_str_201502 %>%
  mutate(pd = replace(pd, which(pd > 1), 1),
         lgd = replace(lgd, which(lgd > 1), 1))

# data.table
# same shocks applied by reference
dt_str_201501[, (cols) := foo_vec(.SD, start = 1, end = 2, 
                                  step = 0.01, repl = T), .SDcols = cols]

dt_str_201502[, (cols) := foo_vec(.SD, start = 1, end = 2, 
                                  step = 0.01, repl = T), .SDcols = cols]

# cap pd / lgd at 1, by reference
dt_str_201501[pd > 1, pd := 1]
dt_str_201501[lgd > 1, lgd := 1]
dt_str_201502[pd > 1, pd := 1]
dt_str_201502[lgd > 1, lgd := 1]

basic operations

slicing, dicing, wrangling

dplyr

select columns

# first two rows, all columns
slice(data_201501, 1:2)
data_201501[1:2, ]
head(data_201501, 2)
# last two rows, all columns
tail(data_201501, 2)
# select columns base r
data_201501[1:2, c("id", "rwa")]
data_201501[1:2, 1:5]

## select columns dplyr
head(select(data_201501, id, rwa), 2)
head(select(data_201501, id, rwa:ec), 2)
head(select(data_201501, -(rwa:ec)), 2)
# tidyselect helpers (matches() takes a regular expression)
head(select(data_201501, starts_with("r")), 2)
head(select(data_201501, ends_with("c")), 2)
head(select(data_201501, contains("product")), 2)
head(select(data_201501, matches(".r.")), 2)
head(select(data_201501, one_of("rwa", "ec", "el")), 2)
# select columns as variable
# NOTE: this reassigns `cols`, clobbering the earlier 5-column shock vector
cols <- c("id", "rwa")
col_nums <- match(cols, names(data_201501))
head(select(data_201501, col_nums), 2)
# using standard evaluation equivalent (select_ is deprecated in dplyr)
head(select_(data_201501, "id", "rwa"), 2)

rename

# rename rwa to new_col (returns a modified copy; data_201501 unchanged)
head(rename(data_201501, new_col = rwa), 2)

filter rows

# combined condition: pd above 0.3 and country outside DE/US
head(filter(data_201501, pd > 0.3 & !(country %in% c("DE", "US"))), 2)

# filter with variable using interp from lazyeval library
# determine metrics and filters dynamically e.g. as input in shiny app
# NOTE: lazyeval is already attached at the top of the file; this call
# is redundant but harmless
library(lazyeval)
metric_1 <- "pd"
metric_2 <- "rwa"
metric_3 <- "lgd"
metric_4 <- "ec"
# build an unevaluated filter expression with the metric names spliced in
criteria <- 
  interp(~ ((var1 <= 0.5 & var2 > 0) | 
                        (var3 <= 0.5 & var4 > 0)), 
                   var1 = as.name(metric_1),
                   var2 = as.name(metric_2),
                   var3 = as.name(metric_3),
                   var4 = as.name(metric_4))

# filter_ is the (deprecated) standard-evaluation variant of filter
head(filter_(data_201501, criteria), 2)

# mix variable and hard-coded filter with "~" operator
tag1 <- "A3"
head(filter_(data_201501, ~cpy_type == tag1 & corep == "OTH"), 2)

arrange / order

# sort ascending by country, then product_type
head(arrange(data_201501, country, product_type), 2)
# descending
head(arrange(data_201501, country, desc(rwa)), 2)

combining / chaining operations

# pipe: filter -> project -> sort -> first two rows
data_201501 %>%
  filter(country == "DE") %>%
  select(id, ec) %>%
  arrange(desc(ec)) %>%
  head(., 2)

distinct values

head(distinct(data_201501, id), 2)

adding / removing / adjusting columns

# mutate can reference columns created earlier in the same call
data_201501 %>%
  mutate(new_col = rwa - el,
         ec = ec * 2,
         new_col_2 = new_col / 2) %>%
  head(., 2)

# if you only want to keep the new variables, use transmute()
head(transmute(data_201501,
               new_col = rwa - el,
               new_col_2 = new_col / 2), 2)

summarise

# whole-table summary statistics for rwa
summarise(data_201501,
          rwa_min = min(rwa, na.rm = T),
          rwa_median = median(rwa, na.rm = T),
          rwa_mean = mean(rwa, na.rm = T),
          rwa_max = max(rwa, na.rm = T))

random sampling

# fixed row count vs. fraction of rows (with replacement)
head(sample_n(data_201501, 2))
head(sample_frac(data_201501, 0.005, replace = T), 2)

data.table

select columns

# first row (data.table recycles a single i as a row index)
dt_201501[1]
dt_201501[1, ]
dt_201501[1:2, ]
dt_201501$id[1:5]
# extracting one column does not work
# NOTE(review): behaviour is version-dependent -- older data.table
# returned the literal 1 here, newer versions return the first column
dt_201501[, 1]
# instead give column name
head(dt_201501[, country], 2)
# above subset returns a vector; Try DT[,.(country)] instead. .() is an
# alias for list() and ensures a data.table is returned.
dt_201501[1:2, .(country)]
dt_201501[1:2, list(country)]
mycol <- "country"
# returns a vector
head(dt_201501[[mycol]], 2)
# with = F returns table
dt_201501[1:2, mycol, with = FALSE]
# grab multiple columns
dt_201501[1:2, list(country, cpy_type)]
# penultimate row of DT using `.N`
dt_201501[.N-1]
# dimensions and names
colnames(dt_201501)
dim(dt_201501)
# select row 2 twice and row 3 for selected columns
dt_201501[c(2,2,3), .(id, country)]

rename

# setnames renames in place, hence the deep copy first
dt_temp <- copy(dt_201501)
setnames(dt_temp, c("rwa", "ec"), c("rwa_new", "ec_new"))
dt_temp[1:2]

filter

dt_201501[pd > 0.5 & lgd > 0.2][1:2,]
# use variable filter with criteria parsed as text
# NOTE(review): eval(parse(text = ...)) is fragile and unsafe for
# untrusted input; prefer building expressions programmatically
criteria <- "pd > 0.5 & lgd > 0.2"
dt_201501[eval(parse(text = criteria))][1:2,]
# use get function
mycol <- "rwa"
dt_201501[get(mycol) > 1000000][1:2,]

arrange / order

head(dt_201501[order(rwa), ], 2)
head(dt_201501[order(-rwa), ], 2)

combining / chaining operations

# chained [] calls: filter, then sort the result
head(dt_201501[inst_id == "ADJB2" & rwa > 13^7, ][order(rwa)], 2)

distinct values and duplicates

dt_with_key <- copy(dt_201501)
# use setkeyv to set more than one key
setkeyv(dt_with_key, c("country", "inst_id"))
key(dt_with_key)
head(dt_with_key, 3)
duplicated(dt_with_key, by=c("country", "inst_id"))[1:5]
# since key is already set, also works without explicitly setting it
duplicated(dt_with_key)[1:5]
# will not work on ad-hoc keys -> result is wrong
head(duplicated(dt_201501, by=c("country", "inst_id")), 2)
# for table with no key entire row will be evaluated as key
duplicated(dt_201501)[1:5]
# unique() on a keyed table de-duplicates by the key columns
dt_with_key_unique <- unique(dt_with_key)
dt_with_key_unique[1:5, .(country, inst_id)]
# return index of first duplicate if there is one, otherwise 0
# note by=key(NULL) sets any pre-set key to NULL
anyDuplicated(dt_with_key, by=key(NULL))
anyDuplicated(dt_with_key)
uniqueN(dt_with_key)

adding / removing / adjusting columns

# set country code with less than 2 characters to unknown
dt_201501[nchar(country) < 2, country := "Unknown"]
# set na values to zero
dt_201501[is.na(rwa), rwa := 0]
# add new columns via custom function by group
vars <- c("rwa", "ec", "el")
dt_temp <- copy(dt_201501)
dt_temp[, paste0(vars,"_","sum") := lapply(.SD, sum), 
        .SDcols = vars, by = country]
dt_temp[1:2]

# alternatively with pre-determined functions
funs <- c("min", "max", "mean", "sum") # <- define your function
for(i in funs){
  dt_temp[, paste0(vars, "_", i) := lapply(.SD, eval(i)), .SDcols = vars, 
          by = cpy_type] 
  }

# remove column
dt_temp[, el_sum := NULL]

summarise

dt_temp[inst_id == "ADJB2", list(rwa_min = min(rwa, na.rm = T), 
                                 rwa_avg = mean(rwa, na.rm = T), 
                                 rwa_max = max(rwa, na.rm = T),
                                 count = .N), 
        by = c("country")]

random sampling

grouped operations

general

dplyr

# summarise by group
data_201501 %>%
  group_by(country) %>%
  summarise(count = n(),
            rwa_min = min(rwa, na.rm = T), 
            rwa_avg = mean(rwa, na.rm = T), 
            rwa_max = max(rwa, na.rm = T)) %>%
  head(., 2)

data.table

# data.table equivalent: filter + grouped aggregate + first two groups
dt_temp[inst_id == "ADJB2", list(rwa_min = min(rwa, na.rm = T), 
                                 rwa_avg = mean(rwa, na.rm = T), 
                                 rwa_max = max(rwa, na.rm = T),
                                 count = .N), 
        by = c("country")][1:2]

merge / join / lookup

general

create base table

# dimension-key-only base table (no metric columns)
data_base_table <- 
  data_201501 %>%
  select(id, month, year, country, ubrtrn, inst_id, product_type,
         corep, cpy_type)

dt_base_table <- dt_201501[, .(id, month, year, country, ubrtrn, inst_id,
                               product_type, corep, cpy_type)]

dplyr

# left join -> see vignette for more
data_merged_table <-
  left_join(x = data_base_table,
            y = data_201501[, c("id", "rwa", "el", "ec", "pd", "lgd")],
            by = c("id" = "id"))

data.table

## left join -> see vignette for more
# X[Y] looks up X's rows using Y's key; j selects the output columns
setkey(dt_base_table, id)
setkey(dt_201501, id)
dt_merged_table <-
  dt_base_table[dt_201501,
                list(id, month, year, country, ubrtrn, inst_id, product_type,
                     corep, cpy_type, rwa, el, ec, pd, lgd)]
# check how to avoid having to state all column names of base table,
# maybe using setdiff

## not using pre-key setting but rather ad-hoc key setting
# clear keys
setkey(dt_base_table, NULL)
setkey(dt_201501, NULL)
# generic syntax: on= supplies the join columns ad hoc
dt_merged_table <- 
  dt_base_table[dt_201501,
                list(id, month, year, country, ubrtrn, inst_id, product_type,
                     corep, cpy_type, rwa, el, ec, pd, lgd),
                on = c(id = "id")]

# using data.table::merge function
dt_merged_table <- 
  merge(dt_base_table, dt_201501[, list(id, rwa, el, ec, pd, lgd)],
        by.x = "id", by.y = "id")

sort

general

create unsorted table

data_unsorted <- copy(data_201501)
dt_unsorted <- copy(dt_201501)

dplyr

data_sorted <- arrange(data_unsorted, desc(year), desc(month), country, inst_id)

data.table

# order() in i returns a reordered copy
dt_sorted <- dt_unsorted[order(-year, -month, country, inst_id)]
# use setorder for order by reference (in-place), without making any additional copies
# NOTE: setorder reorders dt_unsorted itself, so after this line
# dt_sorted and dt_unsorted are the same object
dt_sorted <- setorder(dt_unsorted, -year, -month, country, inst_id)

reshape

general

create long data

# three monthly snapshots stacked into one long table
set.seed(123)
dt_201601 <- data.table(create_obs(n_obs=10000, month=1, year=2016))
dt_201602 <- data.table(create_obs(n_obs=10000, month=2, year=2016))
dt_201603 <- data.table(create_obs(n_obs=10000, month=3, year=2016))
data <- rbind(dt_201601, dt_201602, dt_201603)

tidyr

# spread rwa i.e. long to wide
# NOTE(review): spread()/gather() are superseded by pivot_wider()/
# pivot_longer() in tidyr >= 1.0
temp <- spread(data = data[,.(id, year, month, rwa)], 
               key = month, 
               value = rwa)
# rename cols
names(temp) <- c("id", "year", "rwa_1", "rwa_2", "rwa_3")
str(temp)
# gather i.e. wide to long
temp <- gather(data = temp, key = month, value = rwa, rwa_1, rwa_2, rwa_3)
str(temp)

data.table

# long to wide
temp <- dcast(data = data[,.(id, year, month, rwa)],
              formula = id + year ~ month, 
              value.var = "rwa")
# rename cols
names(temp) <- c("id", "year", "rwa_1", "rwa_2", "rwa_3")
str(temp)

# wide to long
temp <- melt(data = temp, 
             id.vars = c("id", "year"),
             measure.vars = c("rwa_1", "rwa_2", "rwa_3"),
             variable.name = "month",
             value.name = "rwa")
str(temp)

import / export

general

export with base r

# Write to a file, suppress row names
write.csv(data_201501, "./data.csv", row.names=FALSE)

# Same, except that instead of "NA", output dot as in sas
write.csv(data_201501, "./data.csv", row.names=FALSE, na=".")

# Use tabs, suppress row names
write.table(data_201501, "./data.csv", sep="\t", row.names=FALSE)

# use readr write_delim which is about twice as fast as write.csv, 
# and never writes row names.
# will not introduce quotes around text as write.table
# NOTE(review): write_delim's `path` argument was renamed to `file` in
# readr 1.4 -- confirm the installed readr still accepts `path`
write_delim(x=data_201501, path="./data.csv", delim="\t", append=FALSE)

readr (hadley.wickham)

# tab-separated read-back with a header row
data_import <- read_delim("./data.csv", delim="\t", col_names=TRUE)

data.table

# use fread auto setting to detect all parameters automatically or set explicitly
dt_import <- fread("./data.csv", sep="\t", header="auto")

miscellaneous

apply multiple pre-defined named functions to selected columns

general

dplyr

data.table

# http://stackoverflow.com/questions/29620783/data-table-in-r-apply-multiple-functions-to-multiple-columns

# returns a named list so each column yields column.mean / column.median
summary_functions <- function(x) list(mean = mean(x, na.rm = T), 
                                      median = median(x, na.rm = T))

# this gives a vector output
dt_201501[, unlist(lapply(.SD, summary_functions)), 
          .SDcols = c("rwa", "ec", "el")]

# this gives a data.table output
dt_201501[, as.list(unlist(lapply(.SD, summary_functions))), 
          .SDcols = c("rwa", "ec", "el")]

# simplify call
# (redefines summary_functions to return a named vector instead of a list)
summary_functions <- function(x) c(mean = mean(x, na.rm = T), 
                                   median = median(x, na.rm = T))
# this gives a vector output
dt_201501[, sapply(.SD, summary_functions), 
          .SDcols = c("rwa", "ec", "el")]

a case study with data.table

# split-apply-combine illustrations for the case study
knitr::include_graphics("./split-apply-combine.png")
knitr::include_graphics("./split-apply-combine-flat.png")

transform meta tables to data.table with keys

# keyed data.table versions of the dimension tables, all keyed on key_var
country_dt <- data.table(country_table, key = c("key_var"))
month_dt <- data.table(month_table, key = c("key_var"))
ubr_dt <- data.table(ubr_table, key = c("key_var"))
inst_id_dt <- data.table(inst_id_table, key = c("key_var"))
product_type_dt <- data.table(product_type_table, key = c("key_var"))
counterparty_type_dt <- data.table(counterparty_type_table, key = c("key_var"))
corep_dt <- data.table(corep_table, key = c("key_var"))

join base and stress and calculate delta on the fly

# key all snapshots on id for the joins below
setkey(dt_201501, id)
setkey(dt_str_201501, id)
setkey(dt_201502, id)
setkey(dt_str_201502, id)
# shock a random 1% of rows more heavily
n <- dim(dt_str_201501)[1]
sample_rows <- sample(x = n, size = n/100, rep = F)
# NOTE: redefines `cols` again, now to the three amount columns
cols <- c("rwa", "el", "ec")

# lapply(.SD, foo, ...) draws one random factor per column
dt_str_201501[sample_rows, (cols) := lapply(.SD, foo, start=2, end=4, 
                                            step=1, repl=T),
              .SDcols = cols]

dt_str_201502[sample_rows, (cols) := lapply(.SD, foo, start=2, end=4, 
                                            step=1, repl=T),
              .SDcols = cols]
# using direct syntax (interpreted as left join, i.e. X[Y] is a join, looking up X's 
# rows using Y (or Y's key if it has one) as an index.)
# using direct syntax will modify original object by reference. if a copy is needed,
# copy base table first and perform join afterwards (memory-inefficient)
# for unkeyed tables use parameter on = "id" as ad-hoc key
# Update join: look up the stress rows by id and add stress-minus-base
# deltas onto the copy by reference (`i.` prefixes columns of the
# joining stress table).
# Fix: removed `with = F` -- combining with=FALSE and `:=` is
# deprecated/removed in data.table, and it is unnecessary when the LHS
# of `:=` is a character vector of column names.
dt_join_201501 <- copy(dt_201501)
dt_join_201501[dt_str_201501,
               c("rwa_delta", "ec_delta", "el_delta", "pd_delta",
                 "lgd_delta") :=
                 .(i.rwa - rwa, i.ec - ec, i.el - el, i.pd - pd,
                   i.lgd - lgd),
               on = "id"]

# alternative syntax which some may find easier to debug
dt_join_201501 <- copy(dt_201501)
dt_join_201501[dt_str_201501, `:=` (rwa_delta = i.rwa - rwa,
                                    ec_delta = i.ec - ec,
                                    el_delta = i.el - el,
                                    pd_delta = i.pd - pd,
                                    lgd_delta = i.lgd - lgd),
               on = "id"]

# regenerate the base snapshot (dt_201501 was modified in place above)
dt_201501 <- data.table(create_obs(n_obs=10000, month=1, year=2015))
setkey(dt_201501, id)

# using merge function to create new table, use all=T if full join is wanted.
# cannot perform delta computation and join in one step?
dt_join_201501 <- 
  merge(dt_201501, dt_str_201501[, .(id, rwa, el, ec, pd, lgd)], 
        suffixes=c("_base", "_stress"))

# absolute deltas, stress minus base
dt_join_201501[, `:=` (rwa_delta = rwa_stress - rwa_base,
                       ec_delta = ec_stress - ec_base,
                       el_delta = el_stress - el_base,
                       pd_delta = pd_stress - pd_base,
                       lgd_delta = lgd_stress - lgd_base)]

# remove unnecessary cols
dt_join_201501[, c("rwa_stress", "ec_stress", 
                   "el_stress", "pd_stress", "lgd_stress") := NULL]

# compute relative delta
dt_join_201501[, `:=` (rwa_delta_rel = rwa_delta / rwa_base,
                       ec_delta_rel = ec_delta / ec_base,
                       el_delta_rel = el_delta / el_base)]

exploratory analysis

impact by various aggregations for different vars

# aggregate base RWA and delta statistics by country
dt_join_201501[, .(rwa_base = sum(rwa_base, na.rm = T),
                   rwa_delta = sum(rwa_delta, na.rm = T),
                   rwa_delta_min = min(rwa_delta, na.rm = T), 
                   rwa_delta_mean = mean(rwa_delta, na.rm = T),
                   rwa_delta_median = median(rwa_delta, na.rm = T),
                   rwa_delta_max = max(rwa_delta, na.rm = T),
                   count = .N), by = c("country")]

# rescale amounts to millions for plotting
scale <- 1*10^6

ggplot(dt_join_201501, aes(country, rwa_delta/scale)) +
  geom_boxplot(fill = "white", colour = "darkblue", 
               outlier.colour = "red", outlier.shape = 1) +
    scale_y_continuous(labels=comma) +
    labs(title="RWA Delta (in mn)", x="Country", y="RWA Delta (in mn) \n")

# do the same with qplot for inst_id and ec
# NOTE(review): stat_summary's fun.y argument was renamed to fun in
# ggplot2 3.3 -- confirm the installed version still accepts fun.y
qplot(data = dt_join_201501, x = inst_id, y = ec_delta/scale, fill = inst_id, 
  geom = "boxplot", xlab = "\n inst_id", ylab = "EC Delta (in mn) \n", 
  main="EC Delta (in mn) \n") + 
  theme(legend.position="bottom") +
  # adding the mean with black circle
  geom_point(stat = "summary", fun.y = "mean", size = I(3), color = I("black")) + 
  geom_point(stat = "summary", fun.y = "mean", size = I(2.2), color = I("orange"))

# distribution of base PDs
ggplot(dt_join_201501, aes(x = pd_base)) + 
  geom_histogram(binwidth=.01, colour="darkblue", fill="white")

# NOTE(review): ..density.. is superseded by after_stat(density) in
# newer ggplot2 -- confirm version compatibility
qplot(data = dt_join_201501, x = el_delta/scale, geom = "histogram", y = ..density.., 
      binwidth = 0.5, colour = I("white"), fill = I("orange"), 
      xlab = "\n EL Delta (in mn)", 
      ylab = "Density", main = "Histogram of EL Delta (in mn) \n")

# per-country PD densities; low alpha keeps overlaps readable
ggplot(dt_join_201501, aes(pd_base, fill = country)) +
  geom_density(alpha = 0.1)

# base vs delta scatter, both axes in millions
dt_join_201501 %>%
  ggplot(aes(x=rwa_base/1000000, y=rwa_delta/1000000)) +
  geom_point(shape=20, col="darkblue") +
  scale_y_continuous(labels=comma) +
  labs(title="RWA Base vs. RWA Stress Delta (in mn)", 
       x="Base (in mn)", y="Stress Delta (in mn)")

ggplot(dt_join_201501, aes(rwa_delta, fill = country)) +
  geom_density(alpha = 0.2)

pivot-like / olap-like queries

aggregation by meta table attributes

# Aggregate RWA figures by product description pulled in from the keyed
# product dimension. The update join adds product_type_desc to `test`
# by reference (`i.` = columns of product_type_dt), then the chained []
# aggregates by the new column.
# Fix: removed `with = F` -- combining with=FALSE and `:=` is
# deprecated/removed in data.table, and it is unnecessary when the LHS
# is a character vector.
test <- copy(dt_join_201501)
setkey(test, product_type)

test[product_type_dt, c("product_type_desc") :=
              .(i.product_type_desc)
     ][, .(rwa_base = sum(rwa_base, na.rm = TRUE),
           rwa_delta = sum(rwa_delta, na.rm = TRUE),
           rwa_delta_min = min(rwa_delta, na.rm = TRUE),
           rwa_delta_mean = mean(rwa_delta, na.rm = TRUE),
           rwa_delta_median = median(rwa_delta, na.rm = TRUE),
           rwa_delta_max = max(rwa_delta, na.rm = TRUE),
           count = .N), by = c("product_type_desc")]

# do the same with ad-hoc-key
# Same aggregation using an ad-hoc key (on=) instead of setkey().
# Fix: removed the deprecated `with = F` alongside `:=` (unnecessary
# with a character-vector LHS, and rejected by current data.table).
test <- copy(dt_join_201501)
setkey(test, NULL)

test[product_type_dt, c("product_type_desc") :=
              .(i.product_type_desc),
     on = c(product_type = "key_var")
     ][, .(rwa_base = sum(rwa_base, na.rm = TRUE),
           rwa_delta = sum(rwa_delta, na.rm = TRUE),
           rwa_delta_min = min(rwa_delta, na.rm = TRUE),
           rwa_delta_mean = mean(rwa_delta, na.rm = TRUE),
           rwa_delta_median = median(rwa_delta, na.rm = TRUE),
           rwa_delta_max = max(rwa_delta, na.rm = TRUE),
           count = .N), by = c("product_type_desc")]

building an ad-hoc aggregation function

# http://stackoverflow.com/questions/37706385/r-data-table-function-wrapper-around-ad-hoc-join-with-aggregation-in-a-chain
# all efforts to avoid magrittr chain failed. removing it, fails to return
# result table, assigning result and then return fails to delete aggregation var
# after return
# Ad-hoc join + aggregation wrapper.
#   x        -- fact data.table, MODIFIED IN PLACE: the agg_by column is
#               added by reference, printed aggregation uses it, then it
#               is removed again by reference
#   meta_tbl -- dimension data.table supplying the agg_by column
#   x_key / meta_key -- join columns on x and meta_tbl respectively
#   agg_by   -- name (string) of the description column to aggregate by
# Prints the aggregate; returns x invisibly (result of the last :=).
agg_foo <- function(x, meta_tbl, x_key, meta_key, agg_by) { 
  x[meta_tbl, 
      (agg_by) := get(agg_by),
      on=setNames(meta_key, x_key)][, .(rwa_base = sum(rwa_base, na.rm = T),
                                        rwa_delta = sum(rwa_delta, na.rm = T),
                                        count = .N), by = c(agg_by)] %>%
    print(.)

  x[, (agg_by) := .(NULL)]
  }

test2 <- copy(test)
agg_foo(x=test2, meta_tbl=product_type_dt, 
        x_key="product_type", meta_key="key_var", 
        agg_by="product_type_desc")

# try without chain - check performance difference if any
# Same contract as the chained version, but the aggregate is assigned to
# a temporary and returned explicitly instead of printed. x is still
# modified in place (agg_by column added, then removed by reference).
agg_foo <- function(x, meta_tbl, x_key, meta_key, agg_by) { 

  x[meta_tbl, (agg_by) := get(agg_by), on=setNames(meta_key, x_key)]

  temp <-
    x[, .(rwa_base = sum(rwa_base, na.rm = T),
          rwa_delta = sum(rwa_delta, na.rm = T),
          count = .N), by = c(agg_by)]

  x[, (agg_by) := .(NULL)]

  return(temp)
  }

test2 <- copy(test)
temp <-
  agg_foo(x=test2, meta_tbl=product_type_dt, 
          x_key="product_type", meta_key="key_var", 
          agg_by="product_type_desc")
temp

visualization with ggplot

general

data

# Re-shock a random subset of rows, rebuild the base-vs-stress join and
# recompute absolute deltas.
dt_str_201501[sample_rows, (cols) := foo_vec(.SD, start = -1, end = 5,
                                             step = 1, repl = TRUE),
              .SDcols = cols]

dt_str_201502[sample_rows, (cols) := foo_vec(.SD, start = -1, end = 5,
                                             step = 1, repl = TRUE),
              .SDcols = cols]

dt_join_201501 <- merge(dt_201501,
                        dt_str_201501[, .(id, rwa, el, ec, pd, lgd)],
                        by.x = c("id"), by.y = c("id"),
                        suffixes = c("_base", "_stress"))

# Fix: the functional form of := must be backquoted -- a bare `:= (...)`
# in j is a parse error in R.
dt_join_201501[, `:=` (rwa_delta = rwa_stress - rwa_base,
                       ec_delta = ec_stress - ec_base,
                       el_delta = el_stress - el_base,
                       pd_delta = pd_stress - pd_base,
                       lgd_delta = lgd_stress - lgd_base)]

remove unnecessary cols

dt_join_201501[, c("rwa_stress", "ec_stress", "el_stress", "pd_stress", "lgd_stress") := NULL]

compute relative delta

dt_join_201501[, := (rwa_delta_rel = rwa_delta / rwa_base, ec_delta_rel = ec_delta / ec_base, el_delta_rel = el_delta / el_base)]


```{r}
dt1 <- data.table(a=rep(10, 30))
dt2 <- data.table(b=rep(10, 30))

dt1[, c := foo_vec(.SD, start = -2, end = 10, step = 1,
                   repl = T), .SDcols = c("a")]

dt2[, d := foo_vec(.SD, start = -2, end = 10, step = 1,
                   repl = T), .SDcols = c("b")]

temp <- cbind(dt1, dt2)

temp %>%
  ggplot(aes(x = c, 
             y = d)) +
  geom_point(shape=20, col="darkblue") +
  scale_y_continuous(labels=comma) +
  labs(title="RWA Base vs. RWA Stress Delta (in mn)",
       x="Base (in mn)", y="Stress Delta (in mn)")
# Report monetary metrics in millions.
scale <- 1e6

# Base RWA vs. stressed RWA (base + delta), both in millions.
ggplot(dt_join_201501,
       aes(x = rwa_base / scale, y = (rwa_base + rwa_delta) / scale)) +
  geom_point(shape = 20, col = "darkblue") +
  scale_y_continuous(labels = comma) +
  labs(title = "RWA Base vs. RWA Stress (in mn)",
       x = "Base (in mn)", y = "Stress (in mn)")

# Base RWA vs. the delta alone, one panel per country.
ggplot(dt_join_201501,
       aes(x = rwa_base / scale, y = rwa_delta / scale)) +
  geom_point(shape = 20, col = "darkblue") +
  scale_y_continuous(labels = comma) +
  labs(title = "RWA Base vs. RWA Stress Delta (in mn)",
       x = "Base (in mn)", y = "Stress Delta (in mn)") +
  facet_wrap(~ country)

# Adding a smoother to a plot; geom_smooth also supports loess, gam, etc.
ggplot(dt_join_201501,
       aes(x = rwa_base / scale, y = (rwa_base + rwa_delta) / scale)) +
  geom_point(shape = 20, col = "darkblue") +
  scale_y_continuous(labels = comma) +
  geom_smooth(method = "lm", colour = "red")

# Per-country scatter of the RWA deltas (boxplots and jittered points).
ggplot(dt_join_201501, aes(x = country, y = rwa_delta / scale)) +
  geom_point(shape = 20, col = "darkblue") +
  scale_y_continuous(labels = comma)

# Boxplot of per-country RWA deltas; red open circles mark outliers.
dt_join_201501 %>%
  ggplot(aes(country, rwa_delta / scale)) +
  geom_boxplot(fill = "white", colour = "darkblue",
               outlier.colour = "red", outlier.shape = 1) +
  scale_y_continuous(labels = comma) +
  labs(title = "RWA Delta (in mn)", x = "Country", y = "RWA Delta (in mn) \n")

# Histograms and frequency polygons of base PD.
dt_join_201501 %>%
  ggplot(aes(pd_base)) +
  geom_histogram(binwidth = 0.01, fill = "darkblue")

dt_join_201501 %>%
  ggplot(aes(pd_base)) +
  geom_freqpoly(binwidth = 0.01, colour = "darkblue")

# Stressed PD, one frequency polygon per country (default binwidth).
dt_join_201501 %>%
  ggplot(aes(pd_base + pd_delta, colour = country)) +
  geom_freqpoly()

dt_join_201501 %>%
  ggplot(aes(pd_base + pd_delta, fill = country)) +
  geom_histogram(binwidth = 0.01) +
  facet_wrap(~ country, ncol = 2)

# Bar chart: observation counts per country.
dt_join_201501 %>%
  ggplot(aes(country)) +
  geom_bar(fill = "darkblue")

utilities

# Session context: current date, date-time, time zone, working directory.
Sys.Date()
Sys.time()
Sys.timezone()
getwd()

# create dummy file
cat("file A\n", file = "./a.txt")

# list all files starting with a-b or r
list.files(path = ".", pattern = "^[a-br]", all.files = FALSE,
           full.names = FALSE, recursive = FALSE,
           ignore.case = FALSE, include.dirs = FALSE, no.. = FALSE)

# list directories, can include subdirectories via recursive = T
list.dirs(path = ".", full.names = T, recursive = F)

# file existence and edit rights
file.exists("./a.txt")
file.exists("./nosuchfile")   # FALSE for a missing path

# file.access returns 0 when access is granted, -1 otherwise
# test for existence.
file.access("./a.txt", mode = 0)
# test for execute permission.
file.access("./a.txt", mode = 1)
# test for write permission.
file.access("./a.txt", mode = 2)
# test for read permission.
file.access("./a.txt", mode = 4)

# file info (size, mode, timestamps, ...)
file.info("./a.txt")

# copy
file.copy("a.txt", "b.txt", overwrite = T)

# append the contents of b.txt to a.txt, then rename a.txt
file.append("./a.txt", "./b.txt")
file.rename("a.txt", "a_new.txt")

# remove; removing a non-existent file warns and returns FALSE
file.remove(c("./a_new.txt", "./b.txt"))
file.remove("./nosuchfile")

# create and remove directory (unlink needs recursive = T for directories)
dir.create("./folder1")
unlink(c("./folder1"), recursive = T)

benchmarking

# Timing workload: draw x standard-normal deviates.
fn_test <- function(x) {
  rnorm(n = x, mean = 0, sd = 1)
}

# One-shot wall-clock timing, then 10 repetitions via microbenchmark.
system.time(fn_test(1*10^6))
library(microbenchmark)
res <- microbenchmark(fn_test(1*10^6), times = 10L)
boxplot(res)
# autoplot only if ggplot2 is available (require returns FALSE otherwise)
if (require("ggplot2")) {
  autoplot(res)
}

aggregation functions

library(data.table)
library(dplyr)
options(datatable.verbose = FALSE)

# Simulated input: 1e6 numeric draws and letter labels. NA padded at opposite
# ends so every column has exactly one NA (columns are n_size + 1 long).
n_size <- 1*10^6
# FIX: replace = TRUE spelled out; the original rep = T relied on partial
# argument matching and the reassignable T alias.
sample_metrics <- sample(seq(from = 1, to = 100, by = 1), n_size, replace = TRUE)
sample_dimensions <- sample(letters[10:12], n_size, replace = TRUE)
df <- 
  data.frame(
    a = c(NA, sample_metrics),
    b = c(sample_metrics, NA),
    c = c(NA, sample_dimensions),
    d = c(sample_dimensions, NA),
    x = c(NA, sample_metrics),
    y = c(sample_dimensions, NA),
    stringsAsFactors = FALSE)

# data.table copy of the same data for the data.table-based variants
dt <- as.data.table(df)
# Sum metric columns of a data.table grouped by dimension columns.
#
# dt             data.table to aggregate (not modified; returns a new table)
# metric         character vector of metric column names (passed to .SDcols)
# metric_name    names for the aggregated columns in the result
# dimension      character vector of grouping columns (keyby -> keyed, sorted)
# dimension_name accepted for interface symmetry but currently unused
fn_dt_agg1 <- 
  function(dt, metric, metric_name, dimension, dimension_name) {

  # setNames + lapply allows passing variable column names through .SDcols.
  # FIX: na.rm = TRUE spelled out (T is a reassignable alias, not a keyword).
  temp <- dt[, setNames(lapply(.SD, function(x) {sum(x, na.rm = TRUE)}), 
                        metric_name), 
             keyby = dimension, .SDcols = metric]

  #setorderv(temp, dimension) in case order is different than dimension

  # trailing [] forces printing when the function is called at top level
  temp[]
  }

# Baseline: sum a and b grouped by c and d (output names unchanged).
res_dt1 <- 
  fn_dt_agg1(
    dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
    dimension = c("c", "d"), dimension_name = c("c", "d"))
# Aggregate by building the data.table j-expression programmatically:
# .(name1 = agg(var1, na.rm = TRUE), ...), then evaluating it per group.
#
# agg_type       name of the aggregation function, e.g. "sum" or "mean"
# dimension_name accepted for interface symmetry but currently unused
# (other arguments as in fn_dt_agg1)
fn_dt_agg2 <- 
  function(dt, metric, metric_name, dimension, dimension_name,
           agg_type) {

  # FIX: `<-` instead of `=` for assignment; lapply replaces
  # sapply(..., simplify = FALSE), which is the same operation.
  j_call <- as.call(c(
    as.name("."),
    lapply(setNames(metric, metric_name), 
           function(var) as.call(list(as.name(agg_type), 
                                      as.name(var), na.rm = TRUE)))
    ))

  # keyby sorts and keys the result; trailing [] forces printing
  dt[, eval(j_call), keyby = dimension][]
  }

# Same aggregation as res_dt1, via the constructed-call variant.
res_dt2 <- 
  fn_dt_agg2(
    dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
    dimension = c("c", "d"), dimension_name = c("c", "d"),
    agg_type = c("sum"))

# both approaches must agree with the hard-coded sum baseline
all.equal(res_dt1, res_dt2)
# Variant building the aggregation function from a text template.
# WARNING: eval(parse(text = ...)) executes arbitrary code supplied through
# agg_type -- acceptable for this interactive comparison, but prefer
# match.fun()/getFunction() (see fn_dt_agg4) in production code.
fn_dt_agg3 <- 
  function(dt, metric, metric_name, dimension, dimension_name, agg_type) {

  # FIX: generated code now spells out na.rm = TRUE (was T, reassignable)
  e <- eval(parse(text=paste0("function(x) {", 
                              agg_type, "(", "x, na.rm = TRUE)}"))) 

  temp <- dt[, setNames(lapply(.SD, e), 
                        metric_name), 
             keyby = dimension, .SDcols = metric]

  # trailing [] forces printing when called at top level
  temp[]
  }

# Same aggregation via the parsed-text variant; must match the baseline.
res_dt3 <- 
  fn_dt_agg3(
    dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
    dimension = c("c", "d"), dimension_name = c("c", "d"), 
    agg_type = "sum")

all.equal(res_dt1, res_dt3)
# Variant resolving agg_type by name via getFunction() (methods package)
# instead of eval(parse()). base::match.fun() would work equally well and
# needs no extra package.
fn_dt_agg4 <- 
  function(dt, metric, metric_name, dimension, dimension_name, agg_type) {

    # FIX: na.rm = TRUE spelled out (was T, which is reassignable)
    e <- function(x) getFunction(agg_type)(x, na.rm = TRUE)

    temp <- dt[, setNames(lapply(.SD, e), 
                          metric_name), 
               keyby = dimension, .SDcols = metric]
    # trailing [] forces printing when called at top level
    temp[]
  }

# Same aggregation via the getFunction variant; must match the baseline.
res_dt4 <- 
  fn_dt_agg4(
    dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
    dimension = c("c", "d"), dimension_name = c("c", "d"), 
    agg_type = "sum")

all.equal(res_dt1, res_dt4)
# dplyr counterpart of the data.table aggregators.
#
# FIX: the original used the underscore SE verbs (select_, group_by_,
# summarise_each_, rename_), which are defunct in current dplyr, plus
# eval(parse()) to build the aggregation function. Rewritten with the
# supported across()/all_of() idioms and match.fun(); same arguments,
# same result.
fn_df_agg1 <-
  function(df, metric, metric_name, dimension, dimension_name, agg_type) {

    # resolve the aggregation function once by name, e.g. "sum" -> sum
    agg_fun <- match.fun(agg_type)

    # .groups = "drop_last" mirrors the classic summarise behavior of
    # peeling off the innermost grouping level
    df %>%
      select(all_of(c(dimension, metric))) %>%
      group_by(across(all_of(dimension))) %>%
      summarise(across(all_of(metric),
                       function(x) agg_fun(x, na.rm = TRUE)),
                .groups = "drop_last") %>%
      rename(all_of(setNames(c(dimension, metric),
                             c(dimension_name, metric_name))))
  }

res_df1 <- 
  fn_df_agg1(
    df = df, metric = c("a", "b"), metric_name = c("a", "b"),
    dimension = c("c", "d"), dimension_name = c("c", "d"),
    agg_type = "sum")

# dplyr result should match the data.table baseline after conversion
all.equal(res_dt1, as.data.table(res_df1))
# Pick the aggregation for the benchmark below by running ONE of these lines;
# executed top to bottom, only the last assignment ("sum") takes effect.
test_agg_type <- c("min")
test_agg_type <- c("max")
test_agg_type <- c("median")
test_agg_type <- c("mean")
test_agg_type <- c("sum")

#library(microbenchmark)
# Benchmark all five implementations on identical inputs, 100 runs each,
# using the aggregation selected via test_agg_type (fn_dt_agg1 is sum-only).
bench_res <- 
  microbenchmark(
    fn_dt_agg1 = 
      fn_dt_agg1(
        dt = dt, metric = c("a", "b"), metric_name = c("a", "b"), 
        dimension = c("c", "d"), dimension_name = c("c", "d")), 
    fn_dt_agg2 = 
      fn_dt_agg2(
        dt = dt, metric = c("a", "b"), metric_name = c("a", "b"), 
        dimension = c("c", "d"), dimension_name = c("c", "d"),
        agg_type = test_agg_type),
    fn_dt_agg3 =
      fn_dt_agg3(
        dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
        dimension = c("c", "d"), dimension_name = c("c", "d"),
        agg_type = test_agg_type),
    fn_dt_agg4 = 
      fn_dt_agg4(
        dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
        dimension = c("c", "d"), dimension_name = c("c", "d"), 
        agg_type = test_agg_type),
    fn_df_agg1 =
      fn_df_agg1(
        df = df, metric = c("a", "b"), metric_name = c("a", "b"),
        dimension = c("c", "d"), dimension_name = c("c", "d"),
        agg_type = test_agg_type),
    times = 100L)

# summary table plus two visualisations of the timing distributions
bench_res
boxplot(bench_res)
if (require("ggplot2")) {
  autoplot(bench_res)
  }
# Consolidated aggregation helper (final version of the variants above).
#
# dt          data.table to aggregate (returned as a new keyed table)
# metric      character vector of metric column names (passed to .SDcols)
# metric_name result names for the metrics; defaults to metric
# dimension   character vector of grouping columns; the c() (= NULL) default
#             aggregates the whole table to a single row
# agg_type    name of the aggregation function, e.g. "sum", "mean"
# na.rm       forwarded to the aggregation function
fn_dt_agg <- 
  function(dt, metric, metric_name = metric, dimension = c(), 
           agg_type = c("sum"), na.rm = TRUE) {

    # FIX: base::match.fun replaces methods::getFunction -- identical lookup
    # for ordinary functions, no dependency on the methods package.
    e <- function(x) match.fun(agg_type)(x, na.rm = na.rm)

    # trailing [] forces printing when called at top level
    dt[, setNames(lapply(.SD, e), metric_name), 
       keyby = dimension, .SDcols = metric][]
  }

test_agg_type <- c("sum")
# FIX: fn_dt_agg has no dimension_name formal, so passing it raised an
# "unused argument" error; the argument is dropped here.
fn_dt_agg(
        dt = dt, metric = c("a", "b"), metric_name = c("a", "b"),
        dimension = c("c", "d"),
        agg_type = test_agg_type)

# call with default parameters
fn_dt_agg(dt = dt, metric = c("a", "b"))

# NOTE(review): the call below cannot work as written -- `dimension` is
# evaluated in the calling frame, where column `a` is not visible, and
# fn_dt_agg expects a character vector of column names. Kept for reference.
#fn_dt_agg(dt = dt, metric = c("a", "b"), dimension = c(test = a > 1))

testing

http://stackoverflow.com/questions/29282994/how-to-write-a-testthat-unit-test-for-a-function-that-returns-a-data-frame

resources

general

dplyr

data table