JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/
Other
1.21k stars 218 forks source link

segmented reduction #176

Open pevnak opened 6 years ago

pevnak commented 6 years ago

I would like to ask, if someone would be willing to write function(s) for segmented reductions. These functions are handy for multiple-instance learning problems, where sample is represented by an unordered set of vectors (each sample can have different number of vectors). It would be nice to have these functions available for use with Flux.

My implementation of the normal Julia code for the forward pass looks like

function segmented_max(x::Matrix{T},bags::Vector) where{T}
  assert(checkbounds(x,bags))
  o = zeros(eltype(x),size(x,1),length(bags))
  fill!(o,typemin(T));
  @inbounds for i in 1:length(bags) #iterate over bags
    for j in bags[i]  #iterate over items (vectors) in bags
      for k in 1:size(x,1)
        if x[k,j]>o[k,i]
          o[k,i]=x[k,j];
        end
      end
    end
  end
  o
end

while for the reverse part it is

    assert(checkbounds(x,bags))
    maxI=zeros(Int,size(x,1),length(bags))
    gx = zeros(x)
    for i in 1:length(bags) #iterate over bags
        bagsize=length(bags[i])
        for j in bags[i]  #iterate over subbags
            for k in 1:size(x,1)
                if x[k,j]>gx[k,i]
                    gx[k,i]=x[k,j];
                    maxI[k,i]=j;
                end
            end
        end
    end

    fill!(gx,0)
    @inbounds for I in CartesianRange(size(Δ))
        if maxI[I]>0 
            gx[I[1],maxI[I]]=Δ[I];    
        end
    end
    gx
end

I have to confess that I have never programmed for Cuda, therefore I am writing here. Thanks for any help.

pevnak commented 6 years ago

I have first dirty version as

function kernel_segmented_sum(c, a, start_idx, end_idx, d)
    b = blockIdx().x
    i = threadIdx().x
    o_idx = (b-1)*d+i
    s_idx = (start_idx[b]-1)*d + i
    for k in start_idx[b]:end_idx[b]
    c[o_idx] +=  a[s_idx]
    s_idx += d
  end
  return nothing
end

function segmented_sum(a::CuArray{T,2},bags)
    start_idx = cu(map(i ->Int32(i.start),bags))
    end_idx = cu(map(i ->Int32(i.stop),bags))
    c = similar(a,size(x,1),length(bags))  # output array
    @cuda (length(bags),size(c,1)) kernel_segmented_sum(c,a,start_idx,end_idx,size(x,1))
end

Advices wanted!!!