Open twoolston opened 2 months ago
My best guess is that the newest version of CatBoost has an issue. Is there something new in their update that you'd like to see incorporated?
Wow, thanks for the quick response. Not in particular - always looking for the latest and greatest ML. Got a minute for a phone call, the company I’m putting my time into is located in Scottsdale.
On Aug 14, 2024, at 9:47 PM, Adrian Antico @.***> wrote:
My best guess is that the newest version of CatBoost has an issue. Is there something new in their update that you'd like to see incorporated?
— Reply to this email directly, view it on GitHub https://github.com/AdrianAntico/AutoQuant/issues/86#issuecomment-2290314700, or unsubscribe https://github.com/notifications/unsubscribe-auth/AA5EN7BXPZO2ATBLAOJVHOLZRQCEBAVCNFSM6AAAAABMQSIAHKVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJQGMYTINZQGA. You are receiving this because you authored the thread.
I'm happy to. Send me an email and I can share my number
@.***
On Aug 20, 2024, at 4:07 PM, Adrian Antico @.***> wrote:
I'm happy to. Send me an email and I can share my number
— Reply to this email directly, view it on GitHub https://github.com/AdrianAntico/AutoQuant/issues/86#issuecomment-2299674402, or unsubscribe https://github.com/notifications/unsubscribe-auth/AA5EN7GJEQPUPJGTYWI725LZSOOYJAVCNFSM6AAAAABMQSIAHKVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMZDEOJZGY3TINBQGI. You are receiving this because you authored the thread.
It looks like github masks emails listed in comments. You can find my email on my main profile page, under my pic.
Sorry Adrian — I had this working, but while futzing around with GPU and NVIDIA setup on Azure I installed the latest version of CatBoost.
Now AutoQuant returns this error:
Error in dyn.load(file, DLLpath = DLLpath, ...) : unable to load shared object '/home/flipai/R/x86_64-pc-linux-gnu-library/4.3/catboost/libs/libcatboostr.so': /home/flipai/R/x86_64-pc-linux-gnu-library/4.3/catboost/libs/libcatboostr.so: cannot open shared object file: No such file or directory
I am running a pure cut-and-paste of your Walmart example from the AutoQuant package. Note: TimeSeriesFill is exported from Rodeo, not AutoQuant.
# Set up your output file path for saving results as a .csv
Path <- '~/flip/production/boxcar/functions'

# Run on GPU or CPU (some options in the grid tuning force usage of CPU for some runs)
TaskType <- 'GPU'

# Define number of CPU threads to allow data.table to utilize.
# NOTE(review): the original passed a core COUNT to `percent =`, which
# data.table interprets as a percentage of cores (0-100) -- on most machines
# that silently collapses to a single thread. `threads =` matches the intent.
data.table::setDTthreads(threads = max(1L, parallel::detectCores() - 2L))

# Load data
data <- data.table::fread('https://www.dropbox.com/s/2str3ek4f4cheqi/walmart_train.csv?dl=1')

# Alternative source: pull the same table from a local Postgres instance.
# Left commented out so it does not clobber the fread() result above.
# WARNING(review): never hard-code credentials -- the original paste embedded a
# plaintext password; read it from the environment instead.
# data <- Rappture::DM.pgQuery(Host = 'localhost', DataBase = 'AutoQuant', SELECT = NULL, FROM = 'WalmartFull', User = 'postgres', Port = 5432, Password = Sys.getenv('PG_PASSWORD'))
# Ensure series have no missing dates (also remove series with more than 25% missing values).
# TimeSeriesFill is exported from Rodeo, not AutoQuant.
data <- Rodeo::TimeSeriesFill(
  data,
  DateColumnName = 'Date',
  GroupVariables = c('Store', 'Dept'),
  TimeUnit = 'weeks',
  FillType = 'maxmax',
  MaxMissingPercent = 0.25,
  SimpleImpute = TRUE
)

# Set negative sales to 0. `:=` modifies data in place, so no reassignment is
# needed; subset-assign is equivalent to the original fifelse() on every row.
data[Weekly_Sales < 0, Weekly_Sales := 0]

# Remove IsHoliday column
data[, IsHoliday := NULL]

# Create xregs (this includes the categorical variables instead of utilizing
# only the interaction of them)
xregs <- data[, .SD, .SDcols = c('Date', 'Store', 'Dept')]
# Change data types: treat the grouping keys as categorical (character)
data[, ':=' (Store = as.character(Store), Dept = as.character(Dept))]
xregs[, ':=' (Store = as.character(Store), Dept = as.character(Dept))]

# Subset data so we have an out-of-time sample: keep the first 125 rows per
# Store/Dept series; the temporary ID column is dropped from both copies.
data1 <- data.table::copy(data[, ID := 1L:.N, by = c('Store', 'Dept')][ID <= 125L][, ID := NULL])
data[, ID := NULL]

# Define values for SplitRatios and FCWindow args: rows per series in the
# training subset (N1) and in the full xregs table (N2). Assumes every
# Store/Dept series has the same length after TimeSeriesFill.
N1 <- data1[, .N, by = c('Store', 'Dept')][1L, N]
N2 <- xregs[, .N, by = c('Store', 'Dept')][1L, N]
# Setup grid tuning & feature tuning data.table using a cross join of vectors.
# NOTE: mixing strings ('None', 'NULL', 'TRUE') with numerics coerces every
# column to character -- that is intentional; downstream code parses them.
Tuning <- data.table::CJ(
  TimeWeights = c('None', 0.999),
  MaxTimeGroups = c('weeks', 'months'),
  TargetTransformation = c('TRUE', 'FALSE'),
  Difference = c('TRUE', 'FALSE'),
  HoldoutTrain = c(6, 18),
  Langevin = c('TRUE', 'FALSE'),
  NTrees = c(2500, 5000),
  Depth = c(6, 9),
  RandomStrength = c(0.75, 1),
  L2_Leaf_Reg = c(3.0, 4.0),
  RSM = c(0.75, 'NULL'),
  GrowPolicy = c('SymmetricTree', 'Lossguide', 'Depthwise'),
  BootStrapType = c('Bayesian', 'MVS', 'No')
)

# Remove options that are not compatible with GPU (skip over this otherwise)
Tuning <- Tuning[Langevin == 'TRUE' | (Langevin == 'FALSE' & RSM == 'NULL' & BootStrapType %in% c('Bayesian', 'No'))]

# Randomize order of Tuning data.table
Tuning <- Tuning[order(runif(.N))]

# Load grid results and remove rows that have already been tested.
# Columns 4:(ncol-1) of the combined table are the tuning parameters (the
# metrics file prepends RunNumber + two MAPE columns and appends RunTime);
# rows with NA RunTime are the not-yet-run combinations.
if (file.exists(file.path(Path, 'Walmart_CARMA_Metrics.csv'))) {
  Metrics <- data.table::fread(file.path(Path, 'Walmart_CARMA_Metrics.csv'))
  temp <- data.table::rbindlist(list(Metrics, Tuning), fill = TRUE)
  temp <- unique(temp, by = c(4:(ncol(temp) - 1)))
  Tuning <- temp[is.na(RunTime)][, .SD, .SDcols = names(Tuning)]
  rm(Metrics, temp)
}

# Define the total number of runs
TotalRuns <- Tuning[, .N]
# Kick off feature + grid tuning: one model fit + evaluation per grid row.
for (Run in seq_len(TotalRuns)) {

  # Print run number (repeated so it is visible in a fast-scrolling console)
  for (zz in seq_len(100)) print(Run)

  # Use fresh data for each run (CARMA mutates its inputs)
  xregs_new <- data.table::copy(xregs)
  data_new <- data.table::copy(data1)

  # Timer start
  StartTime <- Sys.time()

  # Run CARMA system.
  # NOTE(review): the argument list of this call was lost when the script was
  # pasted into the issue -- restore it from the AutoQuant Walmart example
  # before running. Closed here only so the script parses.
  CatBoostResults <- AutoQuant::AutoCatBoostCARMA(
  )

  # Timer end
  EndTime <- Sys.time()

  # Prepare data for evaluation: keep only forecast-horizon rows (actuals
  # missing in the merge => is.na(bla)) and recover actuals from `data`.
  Results <- CatBoostResults$Forecast
  data.table::setnames(Results, 'Weekly_Sales', 'bla')
  Results <- merge(Results, data, by = c('Store', 'Dept', 'Date'), all = FALSE)
  Results <- Results[is.na(bla)][, bla := NULL]

  # Create totals and subtotals
  Results <- data.table::groupingsets(
    x = Results,
    j = list(Predictions = sum(Predictions), Weekly_Sales = sum(Weekly_Sales)),
    by = c('Date', 'Store', 'Dept'),
    sets = list(c('Date', 'Store', 'Dept'), c('Store', 'Dept'), 'Store', 'Dept', 'Date')
  )

  # Fill NAs with 'Total' for totals and subtotals
  for (cols in c('Store', 'Dept')) {
    Results[, eval(cols) := data.table::fifelse(is.na(get(cols)), 'Total', get(cols))]
  }

  # Add error measures. NOTE(review): Weekly_MAPE is Inf/NaN where
  # Weekly_Sales == 0 (weeks floored to zero earlier) -- confirm downstream
  # means are meant to include those.
  Results[, Weekly_MAE := abs(Weekly_Sales - Predictions)]
  Results[, Weekly_MAPE := Weekly_MAE / Weekly_Sales]

  # Weekly results
  Weekly_MAPE <- Results[, list(Weekly_MAPE = mean(Weekly_MAPE)), by = list(Store, Dept)]

  # Monthly results: re-aggregate to month level, then recompute MAPE
  temp <- data.table::copy(Results)
  temp <- temp[, Date := lubridate::floor_date(Date, unit = 'months')]
  temp <- temp[, lapply(.SD, sum), by = c('Date', 'Store', 'Dept'), .SDcols = c('Predictions', 'Weekly_Sales')]
  temp[, Monthly_MAE := abs(Weekly_Sales - Predictions)]
  temp[, Monthly_MAPE := Monthly_MAE / Weekly_Sales]
  Monthly_MAPE <- temp[, list(Monthly_MAPE = mean(Monthly_MAPE)), by = list(Store, Dept)]

  # Collect metrics for Total (feel free to switch to something else or no filter at all)
  Metrics <- data.table::data.table(
    RunNumber = Run,
    Total_Weekly_MAPE = Weekly_MAPE[Store == 'Total' & Dept == 'Total', Weekly_MAPE],
    Total_Monthly_MAPE = Monthly_MAPE[Store == 'Total' & Dept == 'Total', Monthly_MAPE],
    Tuning[Run],
    RunTime = EndTime - StartTime
  )

  # Append to file (not overwrite)
  data.table::fwrite(Metrics, file = file.path(Path, 'Walmart_CARMA_Metrics.csv'), append = TRUE)

  # Remove objects (clear space before new runs). The original also listed
  # Weekly_MAE and Monthly_MAE, but those are columns, not standalone objects,
  # so rm() would only emit "object not found" warnings for them.
  rm(CatBoostResults, Results, temp, Weekly_MAPE, Monthly_MAPE)

  # Garbage collection because of GPU
  gc()
}