Closed aaron-scheffler closed 5 years ago
This is not exactly the same issue, but a similar one, since you can stumble upon the same problem by defining just one variable (column) as categorical. When using MultinomialLoss()
or OvALoss()
, the number of columns allotted to a single categorical feature is treated as the number of categories by get_yidxs
. This is also why the example in the comment above does not work.
Here's a minimal working example where it fails (I borrowed some functions from censored.jl). It fails both when keeping the variables cat1 and cat2 in Xtrain (defined in main()) and when commenting out either one of them.
using DataFrames
using LowRankModels
using Compat
# Seed the global RNG (Julia 0.6 API) so the example is reproducible.
srand(0)
"""
    censored_regularization_path(train_glrm, test_glrm; params, reg_params,
                                 holdout_proportion, verbose, ch)

Fit `train_glrm` once per regularization strength in `reg_params`, recording
the per-observation train and test objective at each strength.

Returns `(train_error, test_error, train_time, reg_params, solution)`, where
`solution[i]` is the tuple `(sum(X)+sum(Y), ||X||_1 + ||Y||_1)` for the fit
at `reg_params[i]`.
"""
function censored_regularization_path(train_glrm::GLRM, test_glrm::GLRM; params=Params(), reg_params=logspace(2,-2,5),
                                      holdout_proportion=.1, verbose=true,
                                      ch::ConvergenceHistory=ConvergenceHistory("reg_path"))
    m, n = size(train_glrm.A)
    ntrain = sum(map(length, train_glrm.observed_features))
    ntest = sum(map(length, test_glrm.observed_features))
    train_error = @compat Array{Float64}(length(reg_params))
    test_error = @compat Array{Float64}(length(reg_params))
    # FIX(review): the original line applied `@compat` twice
    # (`@compat solution = @compat Array{...}`); a single application
    # suffices and matches the sibling allocations above.
    solution = @compat Array{Tuple{Float64,Float64}}(length(reg_params))
    train_time = @compat Array{Float64}(length(reg_params))
    for iparam = 1:length(reg_params)
        reg_param = reg_params[iparam]
        if verbose println("fitting train GLRM for reg_param $reg_param") end
        # Re-scale the regularizers, then refit from a fresh random start.
        # NOTE(review): assumes LowRankModels' scale!(reg, c) *sets* the
        # regularizer scale rather than multiplying it, so strengths do not
        # compound across iterations -- confirm against the package source.
        scale!(train_glrm.rx, reg_param)
        scale!(train_glrm.ry, reg_param)
        train_glrm.X, train_glrm.Y = randn(train_glrm.k, m), randn(train_glrm.k, n)
        X, Y, ch = fit!(train_glrm; params=params, ch=ch, verbose=verbose)
        train_time[iparam] = ch.times[end]
        if verbose println("computing train and test error for reg_param $reg_param:") end
        # Per-observation objective, excluding the regularization penalty.
        train_error[iparam] = objective(train_glrm, X, Y, include_regularization=false) / ntrain
        if verbose println("\ttrain error: $(train_error[iparam])") end
        test_error[iparam] = objective(test_glrm, X, Y, include_regularization=false) / ntest
        if verbose println("\ttest error: $(test_error[iparam])") end
        solution[iparam] = (sum(X)+sum(Y), sum(abs.(X))+sum(abs.(Y)))
        if verbose println("\tsum of solution, one norm of solution: $(solution[iparam])") end
    end
    return train_error, test_error, train_time, reg_params, solution
end
"""
    lossfunctions(df::DataFrame)

Build a vector of LowRankModels losses, one per column of `df`, chosen from
the column name: "num" -> QuadLoss, "ord" -> OrdinalHingeLoss over the
observed value range, "bin" -> LogisticLoss, "cat" -> MultinomialLoss sized
by the column maximum. Columns matching none of these substrings contribute
no loss, so the result may be shorter than the number of columns.
"""
function lossfunctions(df::DataFrame)
    chosen = Array{LowRankModels.Loss}(0)
    for col in names(df)
        # Column names come back as Symbols; match on their string form.
        label = convert(String, col)
        if contains(label, "num")
            push!(chosen, QuadLoss())
        elseif contains(label, "ord")
            push!(chosen, OrdinalHingeLoss(minimum(df[col]), maximum(df[col]), 1.0))
        elseif contains(label, "bin")
            push!(chosen, LogisticLoss())
        elseif contains(label, "cat")
            # Category count inferred from the largest observed value.
            push!(chosen, MultinomialLoss(maximum(df[col]), 1.0))
        end
    end
    return chosen
end
"""
    main()

Minimal reproduction from the issue: builds a mixed-type data frame
(numeric, ordinal, categorical, binary columns), derives per-column losses,
splits the observed entries into train/test GLRMs, and runs
`censored_regularization_path` over three regularization strengths.
"""
function main()
    nobs = 100
    # Column construction order matters: each randn/rand call below consumes
    # global RNG state in the same sequence as the original example, so the
    # seeded run produces identical data.
    Xtrain = DataFrame(num1 = randn(nobs),
                       num2 = randn(nobs),
                       num3 = randn(nobs),
                       ord1 = rand(1:10, nobs),
                       ord2 = rand(1:10, nobs),
                       ord3 = rand(1:10, nobs),
                       cat1 = rand(11:20, nobs),
                       cat2 = rand(1:10, nobs),
                       bin1 = rand(Bool, nobs),
                       bin2 = rand(Bool, nobs),
                       bin3 = rand(Bool, nobs))
    nfeat = size(Xtrain, 2)
    obs = observations(Xtrain)
    losses = lossfunctions(Xtrain)
    # Hold out 20% of the observed entries for the test GLRM.
    (train_observed_features, train_observed_examples,
     test_observed_features, test_observed_examples) =
        get_train_and_test(obs, nobs, nfeat, .2)
    reg = QuadReg(.1)
    kdim = 7
    train_glrm = GLRM(Xtrain, losses, reg, reg, kdim,
                      observed_features=train_observed_features,
                      observed_examples=train_observed_examples)
    test_glrm = GLRM(Xtrain, losses, reg, reg, kdim,
                     observed_features=test_observed_features,
                     observed_examples=test_observed_examples)
    train_error, test_error, train_time, reg_params, solution =
        censored_regularization_path(train_glrm, test_glrm,
            params=ProxGradParams(1, max_iter=50, abs_tol=.001, min_stepsize=.1),
            reg_params=logspace(2,-2,3))
end
# Run the reproduction script.
main()
This bug has been fixed.
When specifying the loss function for several different categorical variables with varying numbers of factors, I encountered the following error, which can be traced back to the source: "Y must be of size (k,d) where d is the sum of the embedding dimensions of all the losses (1 for real-valued losses, and the number of categories for categorical losses)."
The error is raised when size(Y) != (k, sum(map(embedding_dim, losses))). However, the size of Y is determined by Y = randn(k, embedding_dim(losses)), so if you specify multiple loss functions there is bound to be a mismatch: without the map, embedding_dim extracts the number of categories from only the first loss function.
If this is a misunderstanding, I apologize.