Tractables / ProbabilisticCircuits.jl

Probabilistic Circuits from the Juice library
https://tractables.github.io/ProbabilisticCircuits.jl/dev
Apache License 2.0

MAP on HCLTs - Failed to compile PTX code #131

Closed · loreloc closed this issue 1 year ago

loreloc commented 1 year ago

Hi, I am trying to perform MAP inference on HCLTs using ProbabilisticCircuits.jl v0.4.0, with CUDA 11.7 on an RTX A6000. However, there are some compilation issues: the following exception is thrown when calling map_downward!:

ERROR: LoadError: Failed to compile PTX code (ptxas received signal 11)
Invocation arguments: --generate-line-info --verbose --gpu-name sm_86

I have also attached the .ptx file and here is the source code.

ptx file here

[map_hclt.zip](https://github.com/Juice-jl/ProbabilisticCircuits.jl/files/11302214/map_hclt.zip)

using CUDA
using ProbabilisticCircuits
using ProbabilisticCircuits: BitsProbCircuit, CuBitsProbCircuit, loglikelihoods, full_batch_em, mini_batch_em
using MLDatasets

function mnist_cpu()
    train_cpu = collect(transpose(reshape(MNIST.traintensor(UInt8), 28*28, :)))
    test_cpu = collect(transpose(reshape(MNIST.testtensor(UInt8), 28*28, :)))
    train_cpu, test_cpu
end

function mnist_gpu()
    cu.(mnist_cpu())
end

function truncate(data::Matrix; bits)
    # integer division keeps the high bits: 0..255 -> 0..15 for bits = 4
    data .÷ 2^bits
end

function run(; batch_size = 512, num_epochs1 = 3, num_epochs2 = 3, num_epochs3 = 1, 
             pseudocount = 0.1, latents = 32, param_inertia1 = 0.2, param_inertia2 = 0.9, param_inertia3 = 0.95)
    train, test = mnist_cpu()
    train_gpu, test_gpu = mnist_gpu()
    trunc_train = cu(truncate(train; bits = 4))
    println("Generating HCLT structure with $latents latents... ");
    @time pc = hclt(trunc_train[1:5000,:], latents; num_cats = 256, pseudocount = 0.1, input_type = Categorical);
    init_parameters(pc; perturbation = 0.4);
    println("Number of free parameters: $(num_parameters(pc))")

    print("Moving circuit to GPU... ")
    CUDA.@time bpc = CuBitsProbCircuit(pc)
    softness    = 0
    @time mini_batch_em(bpc, train_gpu, num_epochs1; batch_size, pseudocount, 
                 softness, param_inertia = param_inertia1, param_inertia_end = param_inertia2, debug = false)
    ll1 = loglikelihood(bpc, test_gpu; batch_size)
    println("test LL: $(ll1)") 
    @time mini_batch_em(bpc, train_gpu, num_epochs2; batch_size, pseudocount, 
                 softness, param_inertia = param_inertia2, param_inertia_end = param_inertia3)
    ll2 = loglikelihood(bpc, test_gpu; batch_size)
    println("test LL: $(ll2)")
    @time full_batch_em(bpc, train_gpu, num_epochs3; batch_size, pseudocount, softness)
    ll3 = loglikelihood(bpc, test_gpu; batch_size)
    println("test LL: $(ll3)")
    print("update parameters")
    @time ProbabilisticCircuits.update_parameters(bpc)

    try_map(bpc)

    ll1, ll2, ll3, batch_size, pseudocount, latents
end

function try_map(bpc)
    @info "MAP"
    train_gpu, _ = mnist_gpu();
    data = Array{Union{Missing, UInt32}}(train_gpu[1:10, :]);
    data[:, 1:400] .= missing;
    data_gpu = cu(data);
    MAP(bpc, data_gpu; batch_size=10)  # Error is thrown here, inside "map_downward!"
end

run()
khosravipasha commented 1 year ago

I think the most likely issue is with the data type when you get an error like this; Julia's CUDA kernel compiler usually does not give nice error messages when there are type issues.

Also note that MAP is not exact on HCLTs since they are not deterministic circuits.
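To see why on a tiny example: a max-product-style downward pass maximizes over the children of a sum node instead of summing them, so on a non-deterministic mixture the highest-scoring state need not be the most probable one. A minimal sketch in plain Julia (toy numbers for illustration, not the ProbabilisticCircuits API):

# Mixture over one variable x in {:a, :b}:  p(x) = 0.3*p1(x) + 0.7*p2(x)
p1 = Dict(:a => 1.0,  :b => 0.0)   # component 1: all mass on :a
p2 = Dict(:a => 0.45, :b => 0.55)  # component 2
w1, w2 = 0.3, 0.7

for x in (:a, :b)
    exact   = w1 * p1[x] + w2 * p2[x]      # true marginal: the sum node sums
    maxprod = max(w1 * p1[x], w2 * p2[x])  # max pass: the sum is replaced by max
    println("$x: p = $exact, max-product score = $maxprod")
end
# a: p ≈ 0.615, max-product score ≈ 0.315   <- the true MAP state
# b: p ≈ 0.385, max-product score ≈ 0.385   <- the state a max pass would pick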

Please try changing it to the following and see what happens; it should work. Let me know if it does not.

data = Array{Union{Missing, UInt8}}(train_gpu[1:10, :]);
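With that change, try_map from the script above becomes (only the element type differs):

function try_map(bpc)
    @info "MAP"
    train_gpu, _ = mnist_gpu();
    # UInt8 matches the element type of the data the circuit was trained on
    data = Array{Union{Missing, UInt8}}(train_gpu[1:10, :]);
    data[:, 1:400] .= missing;
    data_gpu = cu(data);
    MAP(bpc, data_gpu; batch_size=10)
end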
loreloc commented 1 year ago

Thank you! It seems to work now.

I was trying to perform approximate MAP using the same code as in the RAT_mnist.jl example.

khosravipasha commented 1 year ago

Makes sense. Try to aim for the same types as the original data the circuit was trained on; some of the types get picked up automatically for the input nodes. In that one it seems we used UInt32 for the data when training the circuit.
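If it helps future readers: one way to avoid hard-coding the wrong type is to derive the query's element type from the training matrix itself. A small sketch, assuming train and bpc are the objects from the script above:

# Build the masked MAP query with the same element type as the training data,
# so the query matches what the input nodes expect.
train, _ = mnist_cpu()
T = eltype(train)                                # UInt8 for this script
data = Array{Union{Missing, T}}(train[1:10, :])
data[:, 1:400] .= missing
data_gpu = cu(data)
MAP(bpc, data_gpu; batch_size = 10)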