bytedeco / javacpp-presets

The missing Java distribution of native C++ libraries
Other
2.65k stars 738 forks source link

how to free my cuda memory using nvcomp #1542

Open yijinsheng opened 1 week ago

yijinsheng commented 1 week ago

I followed the nvcompLZ4Example example to compress my YUV data on an RTX 3090 Ti, but I have two questions about it.

  1. After all of the code has run, some GPU memory is still in use.
  2. I printed the compression and decompression times, and the decompression time is much longer than the compression time, which doesn't make sense. Here is my code:

    /**
     * Round-trips a file through nvCOMP's Zstd manager on the GPU and prints, in order:
     * the compression time in milliseconds, the compression ratio (compressed / original),
     * the decompression time in milliseconds, and finally "done".
     *
     * Both timings bracket the GPU work with cudaStreamSynchronize(), because compress()
     * and decompress() only enqueue work on the stream; without the synchronization the
     * host clock measures launch overhead rather than the operation itself.
     */
    public class nvcompLZ4Example {
    /** Numeric value of cudaSuccess; every checked CUDA runtime call is compared against it. */
    private static final int CUDA_OK = 0;

    /**
     * Fails fast when a CUDA runtime call reports an error, so that a later call does not
     * fail mysteriously because of an earlier, ignored error.
     *
     * @param cuda_error status code returned by the CUDA runtime call
     * @param call       human-readable name of the call, used in the exception message
     * @throws IllegalStateException if {@code cuda_error} is not cudaSuccess
     */
    private static void check_cuda(int cuda_error, String call) {
        if (cuda_error != CUDA_OK) {
            throw new IllegalStateException(call + " failed with CUDA error code " + cuda_error);
        }
    }

    /**
     * Compresses a device buffer with a ZstdManager, then decompresses it again with a
     * manager reconstructed from the compressed buffer itself.
     *
     * @param device_input_ptrs device pointer to the uncompressed input
     * @param input_buffer_len  size of the uncompressed input in bytes
     * @throws IllegalStateException if any checked CUDA runtime call fails
     */
    private static void decomp_compressed_with_manager_factory_example(BytePointer device_input_ptrs, long input_buffer_len) {
        CUstream_st stream = new CUstream_st();
        check_cuda(cudaStreamCreate(stream), "cudaStreamCreate");

        PimplManager nvcomp_manager = null;
        nvcompManagerBase decomp_nvcomp_manager = null;
        // Null device pointers until cudaMalloc fills them in; cudaFree on a null pointer is a no-op.
        BytePointer comp_buffer = new BytePointer();
        BytePointer res_decomp_buffer = new BytePointer();
        try {
            long chunk_size = 1 << 16;

            nvcompBatchedZstdOpts_t format_opts = new nvcompBatchedZstdOpts_t();
    //        format_opts.data_type(NVCOMP_TYPE_CHAR);
            nvcomp_manager = new ZstdManager(chunk_size, format_opts, stream, 0, nvcomp.NoComputeNoVerify);
            CompressionConfig comp_config = nvcomp_manager.configure_compression(input_buffer_len);

            check_cuda(cudaMalloc(comp_buffer, comp_config.max_compressed_buffer_size()), "cudaMalloc(comp_buffer)");

            // Time only the compression itself: setup and allocation are excluded, and the
            // stream is drained before the clock is read so the GPU work is fully included.
            long start = System.currentTimeMillis();
            nvcomp_manager.compress(device_input_ptrs, comp_buffer, comp_config);
            check_cuda(cudaStreamSynchronize(stream), "cudaStreamSynchronize(after compress)");
            long compress = System.currentTimeMillis();
            System.out.println(compress - start);

            long compressedOutputSize = nvcomp_manager.get_compressed_output_size(comp_buffer);
            System.out.println((double) compressedOutputSize / (double) input_buffer_len);

            // Construct a new nvcomp manager from the compressed buffer.
            // Note we could use the nvcomp_manager from above, but here we demonstrate how to create a manager
            // for the use case where a buffer is received and the user doesn't know how it was compressed
            // Also note, creating the manager in this way synchronizes the stream, as the compressed buffer must be read to
            // construct the manager
            decomp_nvcomp_manager = create_manager(comp_buffer, stream, 0, NoComputeNoVerify);

            DecompressionConfig decomp_config = decomp_nvcomp_manager.configure_decompression(comp_buffer);
            check_cuda(cudaMalloc(res_decomp_buffer, decomp_config.decomp_data_size()), "cudaMalloc(res_decomp_buffer)");

            // Same discipline as above: measure decompress() plus the time the GPU needs to finish it.
            long decompress_start = System.currentTimeMillis();
            decomp_nvcomp_manager.decompress(res_decomp_buffer, comp_buffer, decomp_config);
            check_cuda(cudaStreamSynchronize(stream), "cudaStreamSynchronize(after decompress)");
            long decompress = System.currentTimeMillis();
            System.out.println(decompress - decompress_start);
        } finally {
            // Best-effort cleanup: return codes are deliberately not checked here so that a
            // cleanup failure cannot mask the exception that brought us into this block.
            cudaStreamSynchronize(stream);
            // Release the native managers now rather than whenever the garbage collector runs,
            // and do it while the stream they were created with still exists.
            if (decomp_nvcomp_manager != null) {
                decomp_nvcomp_manager.close();
            }
            if (nvcomp_manager != null) {
                nvcomp_manager.close();
            }
            cudaFree(res_decomp_buffer);
            cudaFree(comp_buffer);
            cudaStreamDestroy(stream);
        }
    }

    /**
     * Reads the input file, uploads it to the GPU, runs the round-trip example and frees
     * the device input buffer.
     *
     * @param args unused
     * @throws FileNotFoundException    if the input file does not exist
     * @throws IllegalArgumentException if the input file is empty
     * @throws IllegalStateException    if a checked CUDA runtime call fails
     */
    public static void main(String[] args) throws FileNotFoundException {
        Loader.load(nvcomp.class);
        String file_path="/home/yijinsheng/output.yuv";
        byte[]  uncompressed_data=IoUtil.readBytes(new FileInputStream(file_path));
        // Initialize a random array of chars
    //        int input_buffer_len = 1000000;
    //        byte[] uncompressed_data = new byte[input_buffer_len];
    //
    //        for (int i = 0; i < input_buffer_len; i++) {
    //            uncompressed_data[i] = (byte) (Math.random() * 26 + 'a');
    //        }

        long input_buffer_len=uncompressed_data.length;
        if (input_buffer_len == 0) {
            // Nothing to compress, and the ratio printed later would be 0/0.
            throw new IllegalArgumentException("input file is empty: " + file_path);
        }

        BytePointer uncompressed_data_ptr = new BytePointer(uncompressed_data);

        BytePointer device_input_ptrs = new BytePointer();

        check_cuda(cudaMalloc(device_input_ptrs, input_buffer_len), "cudaMalloc(device_input_ptrs)");
        try {
            check_cuda(cudaMemcpy(device_input_ptrs, uncompressed_data_ptr, input_buffer_len, cudaMemcpyDefault), "cudaMemcpy(host to device)");

            decomp_compressed_with_manager_factory_example(device_input_ptrs, input_buffer_len);
        } finally {
            // Free the device input even when the example throws.
            cudaFree(device_input_ptrs);
        }
        // Device memory still reported as used after this point belongs to the CUDA context
        // that was created implicitly by the first runtime call; cudaDeviceReset() releases
        // it, although that is not required before process exit.
        System.out.println("done");
    }
    }
    image
saudet commented 1 week ago

It's possible nvCOMP allocates memory as work area that we need to deallocate some other way...

TomaszTB commented 1 day ago

The memory you see is related to the CUDA state (the primary CUDA context, etc). You can see this memory get allocated when running cudaSetDevice(), which initializes CUDA. If you don't call that, CUDA will automatically initialize itself upon the first call to a CUDA function (in this case the call to cudaMalloc()).

To manually free this memory you can use cudaDeviceReset(), though it doesn't seem to be necessary.