Closed aaron-scheffler closed 5 years ago
This is not exactly the same issue, but a similar one, since you can stumble upon the same problem by defining just one variable (column) as categorical. When using MultinomialLoss()
or OvALoss()
, the number of columns allotted to a single categorical feature is treated as the number of categories by get_yidxs
. This is also why the example in the comment above does not work.
Here's a minimal working example where it fails (I borrowed some functions from censored.jl). It fails both when keeping the variables cat1 and cat2 in Xtrain (defined in main()) and when commenting out either one of them.
using DataFrames
using LowRankModels
using Compat
# Seed the global RNG (Julia 0.6 API) so the example is reproducible.
srand(0)
"""
    censored_regularization_path(train_glrm, test_glrm; params, reg_params,
                                 holdout_proportion, verbose, ch)

Fit `train_glrm` once per regularization strength in `reg_params`, recording
the per-observation train and test objective at each strength.

Returns `(train_error, test_error, train_time, reg_params, solution)`, where
`solution[i]` is the tuple `(sum(X)+sum(Y), ||X||_1 + ||Y||_1)` for the fit
at `reg_params[i]`.
"""
function censored_regularization_path(train_glrm::GLRM, test_glrm::GLRM; params=Params(), reg_params=logspace(2,-2,5),
                                      holdout_proportion=.1, verbose=true,
                                      ch::ConvergenceHistory=ConvergenceHistory("reg_path"))
    m, n = size(train_glrm.A)
    ntrain = sum(map(length, train_glrm.observed_features))
    ntest = sum(map(length, test_glrm.observed_features))
    train_error = @compat Array{Float64}(length(reg_params))
    test_error = @compat Array{Float64}(length(reg_params))
    # FIX(review): the original line applied `@compat` twice
    # (`@compat solution = @compat Array{...}`); a single application
    # suffices and matches the sibling allocations above.
    solution = @compat Array{Tuple{Float64,Float64}}(length(reg_params))
    train_time = @compat Array{Float64}(length(reg_params))
    for iparam = 1:length(reg_params)
        reg_param = reg_params[iparam]
        if verbose println("fitting train GLRM for reg_param $reg_param") end
        # Re-scale the regularizers, then refit from a fresh random start.
        # NOTE(review): assumes LowRankModels' scale!(reg, c) *sets* the
        # regularizer scale rather than multiplying it, so strengths do not
        # compound across iterations -- confirm against the package source.
        scale!(train_glrm.rx, reg_param)
        scale!(train_glrm.ry, reg_param)
        train_glrm.X, train_glrm.Y = randn(train_glrm.k, m), randn(train_glrm.k, n)
        X, Y, ch = fit!(train_glrm; params=params, ch=ch, verbose=verbose)
        train_time[iparam] = ch.times[end]
        if verbose println("computing train and test error for reg_param $reg_param:") end
        # Per-observation objective, excluding the regularization penalty.
        train_error[iparam] = objective(train_glrm, X, Y, include_regularization=false) / ntrain
        if verbose println("\ttrain error: $(train_error[iparam])") end
        test_error[iparam] = objective(test_glrm, X, Y, include_regularization=false) / ntest
        if verbose println("\ttest error: $(test_error[iparam])") end
        solution[iparam] = (sum(X)+sum(Y), sum(abs.(X))+sum(abs.(Y)))
        if verbose println("\tsum of solution, one norm of solution: $(solution[iparam])") end
    end
    return train_error, test_error, train_time, reg_params, solution
end
"""
    lossfunctions(df::DataFrame)

Build a vector of LowRankModels losses, one per column of `df`, chosen from
the column name: "num" -> QuadLoss, "ord" -> OrdinalHingeLoss over the
observed value range, "bin" -> LogisticLoss, "cat" -> MultinomialLoss sized
by the column maximum. Columns matching none of these substrings contribute
no loss, so the result may be shorter than the number of columns.
"""
function lossfunctions(df::DataFrame)
    chosen = Array{LowRankModels.Loss}(0)
    for col in names(df)
        # Column names come back as Symbols; match on their string form.
        label = convert(String, col)
        if contains(label, "num")
            push!(chosen, QuadLoss())
        elseif contains(label, "ord")
            push!(chosen, OrdinalHingeLoss(minimum(df[col]), maximum(df[col]), 1.0))
        elseif contains(label, "bin")
            push!(chosen, LogisticLoss())
        elseif contains(label, "cat")
            # Category count inferred from the largest observed value.
            push!(chosen, MultinomialLoss(maximum(df[col]), 1.0))
        end
    end
    return chosen
end
"""
    main()

Minimal reproduction from the issue: builds a mixed-type data frame
(numeric, ordinal, categorical, binary columns), derives per-column losses,
splits the observed entries into train/test GLRMs, and runs
`censored_regularization_path` over three regularization strengths.
"""
function main()
    nobs = 100
    # Column construction order matters: each randn/rand call below consumes
    # global RNG state in the same sequence as the original example, so the
    # seeded run produces identical data.
    Xtrain = DataFrame(num1 = randn(nobs),
                       num2 = randn(nobs),
                       num3 = randn(nobs),
                       ord1 = rand(1:10, nobs),
                       ord2 = rand(1:10, nobs),
                       ord3 = rand(1:10, nobs),
                       cat1 = rand(11:20, nobs),
                       cat2 = rand(1:10, nobs),
                       bin1 = rand(Bool, nobs),
                       bin2 = rand(Bool, nobs),
                       bin3 = rand(Bool, nobs))
    nfeat = size(Xtrain, 2)
    obs = observations(Xtrain)
    losses = lossfunctions(Xtrain)
    # Hold out 20% of the observed entries for the test GLRM.
    (train_observed_features, train_observed_examples,
     test_observed_features, test_observed_examples) =
        get_train_and_test(obs, nobs, nfeat, .2)
    reg = QuadReg(.1)
    kdim = 7
    train_glrm = GLRM(Xtrain, losses, reg, reg, kdim,
                      observed_features=train_observed_features,
                      observed_examples=train_observed_examples)
    test_glrm = GLRM(Xtrain, losses, reg, reg, kdim,
                     observed_features=test_observed_features,
                     observed_examples=test_observed_examples)
    train_error, test_error, train_time, reg_params, solution =
        censored_regularization_path(train_glrm, test_glrm,
            params=ProxGradParams(1, max_iter=50, abs_tol=.001, min_stepsize=.1),
            reg_params=logspace(2,-2,3))
end
# Run the reproduction script.
main()
This bug has been fixed.
When specifying the loss function for several different categorical variables with varying numbers of factors, I encountered the following error, which can be traced back to the source: "Y must be of size (k,d) where d is the sum of the embedding dimensions of all the losses (1 for real-valued losses, and the number of categories for categorical losses)."
The error is raised when size(Y) != (k, sum(map(embedding_dim, losses))). However, the size of Y is determined by Y = randn(k, embedding_dim(losses)), so if you specify multiple loss functions there is bound to be a mismatch: without the map, embedding_dim extracts the number of categories from only the first loss function.
If this is a misunderstanding, I apologize.