Error while calling cudnnGetConvolutionForwardWorkspaceSize (CUDNN_STATUS_NOT_SUPPORTED) when using dlib dnn detector in multiple processes on the same machine

Foreword

I'm wondering if this is a cuDNN multi-process issue rather than a dlib issue, or it may be the way I'm creating and using the detector...

Expected Behavior

Thread safe face detections and descriptor extractions using cuDNN on CUDA with the cnn detector.

Current Behavior

I'm running a face detection and descriptor extraction program. The code of interest is as follows (full code at the bottom):

// Runs the CNN (loss_mmod) face detector on `img` and, for every detection, appends a
// 150x150 aligned face chip to `faces` and the detection rectangle to `coords`.
// `sp` is the landmark model used to align each face before the chip is extracted.
void extract_faces_cnn(std::vector<dlib::matrix<rgb_pixel>> &faces, std::vector<dlib::rectangle> &coords, dlib::cv_image<rgb_pixel> &img, shape_predictor &sp) {
    // The detector is run on a dlib matrix built from the OpenCV-backed image.
    matrix<rgb_pixel> imgmat = mat(img);

    auto dets = cnn_detector(imgmat);
    for (auto&& face : dets)
    {
        // Landmarks for this detection drive the chip's rotation/scale.
        auto shape = sp(imgmat, face); 
        matrix<rgb_pixel> face_chip;
        // 150 is the chip side length in pixels, 0.25 the padding passed to get_face_chip_details.
        extract_image_chip(img, get_face_chip_details(shape,150,0.25), face_chip);
        faces.push_back(move(face_chip));
        coords.push_back(face.rect);
    }
    // Called after every image: per the author, without it GPU memory use keeps growing
    // until the device is exhausted.
    cnn_detector.clean(); //Seem to need this otherwise memory usage creeps up and up and gobbles all the GPU mems.
}

The code is called from an Apache Storm bolt written in Java (so it's part of a JNI wrapper). This all works fine so long as there's only one instance of the bolt. dlib works perfectly, and is nice and fast and we get lovely descriptors. Awesome. If more than one instance of the bolt is created (each one is created and run in its own JVM), we start getting this error:

2020-10-15 14:59:38.370 STDERR Thread-0 [INFO] terminate called after throwing an instance of 'dlib::cudnn_error'
2020-10-15 14:59:38.371 STDERR Thread-0 [INFO]   what():  Error while calling cudnnGetConvolutionForwardWorkspaceSize( context(), descriptor(data), (const cudnnFilterDescriptor_t)filter_handle, (const cudnnConvolutionDescriptor_t)conv_handle, descriptor(dest_desc), (cudnnConvolutionFwdAlgo_t)forward_algo, &forward_workspace_size_in_bytes) in file /home/ubuntu/dlib/dlib/cuda/cudnn_dlibapi.cpp:1026. code: 9, reason: CUDNN_STATUS_NOT_SUPPORTED

Steps to Reproduce

On a g4dn.4xlarge instance install CUDA 11.1, cuDNN 8 (8_8.0.4.30-1+cuda11.1_amd64 and libcudnn8-dev_8.0.4.30-1+cuda11.1_amd64 deb packages to be exact)
sudo apt-get install gfortran libopenblas-dev liblapack-dev gcc-8 g++-8
git clone https://github.com/davisking/dlib.git && cd dlib && mkdir build && cd build
cmake -DCMAKE_C_COMPILER=/usr/bin/gcc-8 -DCMAKE_CXX_COMPILER=/usr/bin/g++-8 -DUSE_AVX_INSTRUCTIONS=ON -DBUILD_SHARED_LIBS=1 -DCUDA_HOST_COMPILER=/usr/bin/gcc-8 ../

Output is:

-- The C compiler identification is GNU 8.4.0
-- The CXX compiler identification is GNU 8.4.0
-- Check for working C compiler: /usr/bin/gcc-8
-- Check for working C compiler: /usr/bin/gcc-8 -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/g++-8
-- Check for working CXX compiler: /usr/bin/g++-8 -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Using CMake version: 3.16.3
-- Compiling dlib version: 19.21.99
-- Enabling AVX instructions
-- Looking for pthread.h
-- Looking for pthread.h - found
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - found
-- Found Threads: TRUE  
-- Found X11: /usr/include   
-- Looking for XOpenDisplay in /usr/lib/x86_64-linux-gnu/libX11.so;/usr/lib/x86_64-linux-gnu/libXext.so
-- Looking for XOpenDisplay in /usr/lib/x86_64-linux-gnu/libX11.so;/usr/lib/x86_64-linux-gnu/libXext.so - found
-- Looking for gethostbyname
-- Looking for gethostbyname - found
-- Looking for connect
-- Looking for connect - found
-- Looking for remove
-- Looking for remove - found
-- Looking for shmat
-- Looking for shmat - found
-- Looking for IceConnectionNumber in ICE
-- Looking for IceConnectionNumber in ICE - found
-- Found system copy of libpng: /usr/lib/x86_64-linux-gnu/libpng.so;/usr/lib/x86_64-linux-gnu/libz.so
-- Searching for BLAS and LAPACK
-- Searching for BLAS and LAPACK
-- Found PkgConfig: /usr/bin/pkg-config (found version \"0.29.1\") 
-- Checking for module 'cblas'
--   No package 'cblas' found
-- Checking for module 'lapack'
--   Found lapack, version 0.3.8+ds
-- Looking for cblas_ddot
-- Looking for cblas_ddot - not found
-- Looking for sys/types.h
-- Looking for sys/types.h - found
-- Looking for stdint.h
-- Looking for stdint.h - found
-- Looking for stddef.h
-- Looking for stddef.h - found
-- Check size of void*
-- Check size of void* - done
-- Found OpenBLAS library
-- Looking for sgetrf_single
-- Looking for sgetrf_single - found
-- Using OpenBLAS's built in LAPACK
-- Looking for cblas_ddot
-- Looking for cblas_ddot - found
-- Looking for sgesv
-- Looking for sgesv - not found
-- Looking for sgesv_
-- Looking for sgesv_ - not found
-- Found CUDA: /usr/local/cuda (found suitable version \"11.1\", minimum required is \"7.5\") 
-- Looking for cuDNN install...
-- Found cuDNN: /usr/lib/x86_64-linux-gnu/libcudnn.so
-- Building a CUDA test project to see if your compiler is compatible with CUDA...
-- Building a cuDNN test project to check if you have the right version of cuDNN installed...
-- Enabling CUDA support for dlib.  DLIB WILL USE CUDA
-- C++11 activated.
-- Configuring done
-- Generating done
-- Build files have been written to: /home/ubuntu/dlib/build

cmake --build . --config Release
make install
Build the JNI wrapper (code below): cmake -DCMAKE_C_COMPILER=/usr/bin/gcc-8 -DCMAKE_CXX_COMPILER=/usr/bin/g++-8 -DUSE_AVX_INSTRUCTIONS=ON ../ && cmake --build . --config Release
Copy so files into java library path so java app can load the library: sudo cp *.so /usr/local/lib/
git clone https://github.com/davisking/dlib-models.git && cp /home/ubuntu/dlib-models/*bz2 /opt/dlib/models/ && bzip2 -d /opt/dlib/models/*.bz2

Full C++ Code:

#include <dlib/dnn.h>
#include <dlib/gui_widgets.h>
#include <dlib/clustering.h>
#include <dlib/string.h>
#include <dlib/image_io.h>
#include <dlib/opencv/cv_image.h>
#include <dlib/image_processing/frontal_face_detector.h>

#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <sys/time.h>
#include <jni.h>

#include <exception>
#include <string>
#include <utility>
#include <vector>

using namespace dlib;
using namespace std;

// ---------------------------------------------------------------------------------
// Face recognition network definition (type of the global `net`, which init() fills
// from the dlib_face_recognition_resnet_model_v1 file). The layer layout has to match
// the serialized model, so none of the numbers below should be changed casually.
// ---------------------------------------------------------------------------------

// Wraps `block` (instantiated with stride 1) and combines its output with the
// tag1-marked input through add_prev1.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>;

// Same idea with a stride-2 `block`; the skip path (skip1) goes through a 2x2
// avg_pool with stride 2 before being combined via add_prev2.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET>
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>;

// Two 3x3 convolutions with N filters, each followed by BN, with a relu in between.
// Only the first convolution uses `stride`.
template <int N, template <typename> class BN, int stride, typename SUBNET> 
using block  = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

// relu-terminated residual units using `affine` as the BN layer.
template <int N, typename SUBNET> using ares      = relu<residual<block,N,affine,SUBNET>>;
template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>;

// Stacked levels of the network; alevel4 is closest to the input, alevel0 closest to the output.
template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>;
template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>;
template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>;
template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>;
template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>;

// Complete recognition network: input is a 150-sized RGB image (input_rgb_image_sized<150>),
// the last fully connected layer has 128 outputs (fc_no_bias<128>).
using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything<
                            alevel0<
                            alevel1<
                            alevel2<
                            alevel3<
                            alevel4<
                            max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2,
                            input_rgb_image_sized<150>
                            >>>>>>>>>>>>;
// CNN Method
// Network definition for the CNN face detector (type of the global `cnn_detector`,
// which init() fills from the mmod_human_face_detector file). As with the recognition
// network, the layout has to match the serialized model.

// 5x5 convolutions: con5d uses stride 2, con5 uses stride 1.
template <long num_filters, typename SUBNET> using con5d = con<num_filters,5,5,2,2,SUBNET>;
template <long num_filters, typename SUBNET> using con5  = con<num_filters,5,5,1,1,SUBNET>;

// Three stride-2 con5d stages (16, 32, 32 filters), each followed by affine + relu.
template <typename SUBNET> using downsampler  = relu<affine<con5d<32, relu<affine<con5d<32, relu<affine<con5d<16,SUBNET>>>>>>>>>;
// One stride-1 con5 stage with 45 filters followed by affine + relu.
template <typename SUBNET> using rcon5  = relu<affine<con5<45,SUBNET>>>;

// Detector: image pyramid input (pyramid_down<6>), downsampler, three rcon5 stages and a
// final single-filter 9x9 convolution feeding the loss_mmod layer.
using net_type = loss_mmod<con<1,9,9,1,1,rcon5<rcon5<rcon5<downsampler<input_rgb_image_pyramid<pyramid_down<6>>>>>>>>;

// Model instances. They are thread_local, so every thread that calls into this
// library owns (and must load) its own copy of each model.
thread_local frontal_face_detector hog_detector;
thread_local shape_predictor sp_small;
thread_local shape_predictor sp_large;
thread_local anet_type net;
thread_local net_type cnn_detector;

// Non-zero once init() has loaded the models above. This flag has to be thread_local
// as well: with a process-wide flag, init() called on a second thread would see
// initialized == 1, skip loading, and leave that thread's thread_local models empty.
thread_local int initialized = 0;

// Values accepted for the `method` argument of extractDetections().
const int METHOD_HOG = 0;
const int METHOD_CNN = 1;

// Values accepted for the `model` argument of extractDetections(); they select
// sp_small or sp_large respectively.
const int MODEL_SMALL = 0;
const int MODEL_LARGE = 1;

// Wall-clock helpers built on gettimeofday().
using sys_time_t = timeval;

// Stores the current time of day in *t.
inline void system_time(sys_time_t* t) {
    gettimeofday(t, nullptr);
}

// Converts a timeval to milliseconds; the sub-millisecond part of tv_usec is dropped.
inline long time_to_msec(const sys_time_t& t) {
    const long long from_seconds = t.tv_sec * 1000LL;
    const long long from_micros = t.tv_usec / 1000;
    return from_seconds + from_micros;
}

// Returns the current time of day in milliseconds.
long get_current_time_millis() {
    sys_time_t now;
    system_time(&now);
    return time_to_msec(now);
}

// Raises a Java exception of class `type` (JNI class descriptor, e.g.
// "java/lang/RuntimeException") with the given message in the calling Java thread.
//
// Returns the result of ThrowNew (0 on success, negative on failure), or JNI_ERR when
// the exception class itself cannot be found. In that last case FindClass has already
// left its own exception pending, so the caller still unwinds into Java with an error.
jint throw_java_exception( JNIEnv *env, const char *type, const char *message )
{
    jclass exClass = env->FindClass(type);
    if (exClass == nullptr)
    {
        // Passing a NULL class to ThrowNew is invalid; keep FindClass's pending exception.
        return JNI_ERR;
    }

    const jint status = env->ThrowNew(exClass, message);
    // Local refs are cheap but finite; release the class ref as soon as it is unused.
    env->DeleteLocalRef(exClass);
    return status;
}

// Converts a Java String to a std::string holding its UTF-8 bytes.
//
// The conversion goes through String.getBytes("UTF-8") on the Java side rather than
// GetStringUTFChars. Returns an empty string when jStr is null or when any JNI step
// fails; in the failure case a Java exception may be left pending for the caller.
std::string jstring2string(JNIEnv *env, jstring jStr) {
    if (!jStr)
        return "";

    std::string ret;

    const jclass stringClass = env->GetObjectClass(jStr);
    const jmethodID getBytes = env->GetMethodID(stringClass, "getBytes", "(Ljava/lang/String;)[B");
    // Keep the charset name in a variable so its local ref can be released below
    // (passing NewStringUTF(...) inline leaked one local ref per call).
    const jstring charsetName = (getBytes != nullptr) ? env->NewStringUTF("UTF-8") : nullptr;

    if (getBytes != nullptr && charsetName != nullptr)
    {
        const jbyteArray stringJbytes = (jbyteArray) env->CallObjectMethod(jStr, getBytes, charsetName);

        // Only touch the result if getBytes() completed without throwing.
        if (!env->ExceptionCheck() && stringJbytes != nullptr)
        {
            const size_t length = (size_t) env->GetArrayLength(stringJbytes);
            jbyte* pBytes = env->GetByteArrayElements(stringJbytes, nullptr);
            if (pBytes != nullptr)
            {
                ret.assign(reinterpret_cast<const char*>(pBytes), length);
                // JNI_ABORT: the bytes were only read, nothing needs copying back.
                env->ReleaseByteArrayElements(stringJbytes, pBytes, JNI_ABORT);
            }
            env->DeleteLocalRef(stringJbytes);
        }
    }

    if (charsetName != nullptr)
        env->DeleteLocalRef(charsetName);
    env->DeleteLocalRef(stringClass);
    return ret;
}

// Run the HOG face detector on the image, and for each face extract a
// copy that has been normalized to 150x150 pixels in size and appropriately rotated
// and centered.
//
// faces  - receives one aligned face chip per detection (appended).
// coords - receives the matching detection rectangle, in coordinates of `img` (appended).
// img    - image to search.
// sp     - landmark model used to align each face.
// numberOfTimesToUpsample - how many times the image is enlarged with pyramid_up before
//          detection, so that smaller faces can be found; values <= 0 mean no upsampling.
//
// The second argument of the detector's operator() is a threshold adjustment, not an
// upsample count, so the count must not be forwarded to it: doing so made detection
// stricter the more "upsampling" was requested. Upsampling is done explicitly here
// on a dlib matrix copy, and detections are mapped back to `img` coordinates.
void extract_faces_hog(std::vector<dlib::matrix<rgb_pixel>> &faces, std::vector<dlib::rectangle> &coords, dlib::cv_image<rgb_pixel> &img, shape_predictor &sp, int numberOfTimesToUpsample) {
    const unsigned int levels = numberOfTimesToUpsample > 0 ? static_cast<unsigned int>(numberOfTimesToUpsample) : 0u;

    std::vector<dlib::rectangle> dets;
    if (levels == 0)
    {
        // No upsampling requested: detect directly on the caller's image, no copy needed.
        dets = hog_detector(img);
    }
    else
    {
        // pyramid_up needs a resizable image, so work on a matrix copy rather than
        // on the cv_image wrapper around the OpenCV buffer.
        pyramid_down<2> pyr;
        matrix<rgb_pixel> upsampled = mat(img);
        for (unsigned int i = 0; i < levels; ++i)
        {
            pyramid_up(upsampled, pyr);
        }

        dets = hog_detector(upsampled);
        // Bring every rectangle back into the coordinate system of the original image.
        for (auto& det : dets)
        {
            det = pyr.rect_down(det, levels);
        }
    }

    for (const auto& face : dets)
    {
        auto shape = sp(img, face);
        matrix<rgb_pixel> face_chip;
        extract_image_chip(img, get_face_chip_details(shape,150,0.25), face_chip);
        faces.push_back(std::move(face_chip));
        coords.push_back(face);
    }
}

// CNN counterpart of extract_faces_hog: runs the loss_mmod detector on the image and,
// for each detection, appends a 150x150 aligned face chip to `faces` and the detection
// rectangle to `coords`. `sp` supplies the landmarks used for alignment.
void extract_faces_cnn(std::vector<dlib::matrix<rgb_pixel>> &faces, std::vector<dlib::rectangle> &coords, dlib::cv_image<rgb_pixel> &img, shape_predictor &sp) {
    // The detector and the landmark model both operate on this matrix copy.
    matrix<rgb_pixel> imgmat = mat(img);

    auto detections = cnn_detector(imgmat);
    for (auto it = detections.begin(); it != detections.end(); ++it)
    {
        auto landmarks = sp(imgmat, *it);
        const auto chip_spec = get_face_chip_details(landmarks, 150, 0.25);

        matrix<rgb_pixel> chip;
        extract_image_chip(img, chip_spec, chip);

        faces.push_back(std::move(chip));
        coords.push_back(it->rect);
    }

    // Reset the detector after every image; the author observed GPU memory growing
    // without this call.
    cnn_detector.clean();
}

#ifndef _Included_com_application_dlib_FaceRecognition
#define _Included_com_application_dlib_FaceRecognition

#ifdef __cplusplus
extern "C" {
#endif

// This gets run when the java object that uses this wrapper initialises, so it only ever gets run once.
//
// Loads the HOG detector, the CNN detector, both landmark models and the recognition
// network from the given file paths. Does nothing if `initialized` is already non-zero.
//
// Returns `initialized` (1 on success). If loading fails, a java.lang.RuntimeException
// carrying the C++ error message is raised and 0 is returned. A C++ exception must
// never propagate out of a JNI function: it would call std::terminate and kill the JVM.
JNIEXPORT jint JNICALL
    Java_com_application_dlib_FaceRecognition_init(JNIEnv* env, jclass _this,
        jstring mmod_human_face_detector,
        jstring shape_predictor_5_face_landmarks,
        jstring shape_predictor_68_face_landmarks,
        jstring dlib_face_recognition_resnet_model_v1) {
    if (initialized == 0)
    {
        try
        {
            // TODO: Make locations flexible!
            // First, since we need to find faces in the image we will need a face detector:
            hog_detector = get_frontal_face_detector();
            deserialize(jstring2string(env, mmod_human_face_detector)) >> cnn_detector;
            // We will also use a face landmarking model to align faces to a standard pose:  (see face_landmark_detection_ex.cpp for an introduction)
            deserialize(jstring2string(env, shape_predictor_5_face_landmarks)) >> sp_small;
            deserialize(jstring2string(env, shape_predictor_68_face_landmarks)) >> sp_large;
            // And finally we load the DNN responsible for face recognition.
            deserialize(jstring2string(env, dlib_face_recognition_resnet_model_v1)) >> net;

            // Only mark as initialized once every model has loaded successfully.
            initialized = 1;
        }
        catch (const std::exception& e)
        {
            // Do not stack a second exception on top of one a JNI call already raised.
            if (!env->ExceptionCheck())
            {
                throw_java_exception(env, "java/lang/RuntimeException", e.what());
            }
        }
        catch (...)
        {
            if (!env->ExceptionCheck())
            {
                throw_java_exception(env, "java/lang/RuntimeException", "Unknown native error while loading dlib models");
            }
        }
    }
    return (jint)initialized;
}

// Decodes an encoded image, detects the faces in it and computes one descriptor per face.
//
// imageBytes - encoded image file contents (decoded with cv::imdecode).
// method     - METHOD_HOG or METHOD_CNN, selects the face detector.
// model      - MODEL_LARGE selects sp_large, anything else sp_small.
// numberOfTimesToUpsample - forwarded to extract_faces_hog; not used by the CNN path.
//
// Returns a java.util.ArrayList of com.application.dlib.Face, each constructed from a
// float[] descriptor and a long[8] holding the rectangle corners as x,y pairs in the
// order top-left, top-right, bottom-left, bottom-right. The list is empty when no face
// is found.
//
// Returns null with a Java exception pending when:
//   - the image cannot be decoded (InvalidImageException),
//   - `method` is unknown (InvalidExtractionTypeException),
//   - a JNI lookup/allocation fails (exception raised by the JVM),
//   - dlib/OpenCV/CUDA throws a C++ exception (java.lang.RuntimeException with the
//     native message). Letting such an exception leave this function would call
//     std::terminate and take the whole JVM down.
JNIEXPORT jobject JNICALL
    Java_com_application_dlib_FaceRecognition_extractDetections(JNIEnv* env, jclass _this, jbyteArray imageBytes, jint method, jint model, jint numberOfTimesToUpsample) {
    try
    {
        // Copy the encoded bytes straight into the buffer OpenCV decodes from.
        const jsize len = env->GetArrayLength(imageBytes);
        std::vector<uint8_t> encoded(len > 0 ? static_cast<size_t>(len) : 0);
        if (len > 0)
        {
            env->GetByteArrayRegion(imageBytes, 0, len, reinterpret_cast<jbyte*>(encoded.data()));
        }

        // An empty buffer is reported as an undecodable image instead of being handed to OpenCV.
        cv::Mat image;
        if (!encoded.empty())
        {
            image = cv::imdecode(encoded, 1);
        }

        // The emptiness check has to come before cvtColor, which rejects empty input.
        if (image.empty())
        {
            throw_java_exception(env, "com/application/dlib/exceptions/InvalidImageException", "Image could not be decoded");
            return nullptr;
        }
        cv::cvtColor(image, image, CV_BGR2RGB);

        dlib::cv_image<rgb_pixel> img(image); // this image can be used inside dlib

        // Note from the original author: an in-place pyramid_up(img) was tried here and
        // "just makes everything black", so no upsampling is done at this level.

        // Bind by reference: copying a shape_predictor copies the whole landmark model.
        shape_predictor& sp = ((int) model == MODEL_LARGE) ? sp_large : sp_small;

        std::vector<dlib::matrix<rgb_pixel>> faces;
        std::vector<dlib::rectangle> coords;
        switch ((int) method)
        {
            case METHOD_HOG:
                extract_faces_hog(faces, coords, img, sp, numberOfTimesToUpsample);
                break;
            case METHOD_CNN:
                extract_faces_cnn(faces, coords, img, sp);
                break;
            default:
                throw_java_exception(env, "com/application/dlib/exceptions/InvalidExtractionTypeException", "Invalid extraction type");
                return nullptr;
        }

        // Build the result list. Every failed JNI call below leaves an exception pending,
        // so the only correct reaction is to return immediately.
        jclass alCls = env->FindClass("java/util/ArrayList");
        if (alCls == nullptr)
        {
            return nullptr;
        }

        jmethodID alInit = env->GetMethodID(alCls, "<init>", "(I)V");
        if (alInit == nullptr)
        {
            return nullptr;
        }
        jmethodID alAddId = env->GetMethodID(alCls, "add", "(Ljava/lang/Object;)Z");
        if (alAddId == nullptr)
        {
            return nullptr;
        }

        jobject result = env->NewObject(alCls, alInit, static_cast<jint>(faces.size()));
        if (result == nullptr)
        {
            return nullptr;
        }

        if (!faces.empty())
        {
            // 16 is the batch size handed to the recognition network.
            std::vector<dlib::matrix<float,0,1>> descriptors = net(faces, 16);

            // A local class reference is enough for the duration of this call; the
            // previous global reference was never released and leaked on every call.
            jclass faceCls = env->FindClass("com/application/dlib/Face");
            if (faceCls == nullptr)
            {
                return nullptr;
            }
            jmethodID faceCtor = env->GetMethodID(faceCls, "<init>", "([F[J)V");
            if (faceCtor == nullptr)
            {
                return nullptr;
            }

            for (size_t idx = 0; idx < descriptors.size(); ++idx)
            {
                const dlib::matrix<float,0,1>& m = descriptors[idx];
                const std::vector<float> descriptor(m.begin(), m.end());
                const dlib::rectangle& faceRect = coords.at(idx);

                // Size the Java array from the descriptor itself instead of assuming 128.
                const jsize descriptorLen = static_cast<jsize>(descriptor.size());
                jfloatArray descriptorArray = env->NewFloatArray(descriptorLen);
                if (descriptorArray == nullptr)
                {
                    return nullptr;
                }
                env->SetFloatArrayRegion(descriptorArray, 0, descriptorLen, descriptor.data());

                jlongArray rectArray = env->NewLongArray(4*2);
                if (rectArray == nullptr)
                {
                    return nullptr;
                }
                const jlong faceCoords[] = {
                    static_cast<jlong>(faceRect.tl_corner().x()), static_cast<jlong>(faceRect.tl_corner().y()),
                    static_cast<jlong>(faceRect.tr_corner().x()), static_cast<jlong>(faceRect.tr_corner().y()),
                    static_cast<jlong>(faceRect.bl_corner().x()), static_cast<jlong>(faceRect.bl_corner().y()),
                    static_cast<jlong>(faceRect.br_corner().x()), static_cast<jlong>(faceRect.br_corner().y()) };
                env->SetLongArrayRegion(rectArray, 0, 4*2, faceCoords);

                jobject javaFace = env->NewObject(faceCls, faceCtor, descriptorArray, rectArray);
                if (javaFace == nullptr)
                {
                    return nullptr;
                }

                env->CallBooleanMethod(result, alAddId, javaFace);
                if (env->ExceptionCheck())
                {
                    return nullptr;
                }

                // Release per-face refs eagerly so images with many faces do not
                // exhaust the local reference table.
                env->DeleteLocalRef(descriptorArray);
                env->DeleteLocalRef(rectArray);
                env->DeleteLocalRef(javaFace);
            }
        }
        return result;
    }
    catch (const std::exception& e)
    {
        // Covers dlib errors (including cuDNN/CUDA failures), OpenCV errors and
        // standard library exceptions.
        if (!env->ExceptionCheck())
        {
            throw_java_exception(env, "java/lang/RuntimeException", e.what());
        }
        return nullptr;
    }
    catch (...)
    {
        if (!env->ExceptionCheck())
        {
            throw_java_exception(env, "java/lang/RuntimeException", "Unknown native error during face extraction");
        }
        return nullptr;
    }
}

#ifdef __cplusplus
}
#endif
#endif

CMakeLists.txt used to build it:

# Build script for the dlib JNI wrapper: produces the shared library `dlib_jni`
# from src/dlib-jni.cpp and links it against dlib, OpenCV, LAPACK and the JVM.
cmake_minimum_required(VERSION 2.8.12)

project(dlib-jni)

set(CMAKE_CXX_STANDARD 11)
set(USE_AVX_INSTRUCTIONS ON CACHE BOOL "Use AVX instructions")
#set(CMAKE_TOOLCHAIN_FILE /home/ubuntu/vcpkg/scripts/buildsystems/vcpkg.cmake CACHE STRING "Vcpkg toolchain file") - this doesn't work

# OpenCV 3 is required; the definition below is passed to the compiler for the OpenCV headers.
add_definitions(-DOPENCV_TRAITS_ENABLE_DEPRECATED)
find_package(OpenCV 3 REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# Uses an installed dlib (see the `make install` step), not an in-tree copy.
find_package(dlib REQUIRED) #possibly supporting components!
message(STATUS "Using dlib-${dlib_VERSION}")

# Restrict the BLAS search to OpenBLAS.
set(BLA_VENDOR OpenBLAS)
find_package(BLAS REQUIRED)
include_directories(${BLAS_INCLUDE_DIRS})

include_directories(src)

# Third-party libraries linked by name.
# NOTE(review): the OpenCV modules are hard-coded here rather than taken from
# ${OpenCV_LIBS} - confirm this is intentional.
set(DLIB_JNI_THIRDPARTY_LIBS opencv_highgui
                        opencv_imgcodecs
                        opencv_imgproc
                        opencv_core
                        lapack)

# Locate the JNI headers/libraries and print what was found to ease debugging.
find_package(JNI REQUIRED)
message (STATUS "JNI_INCLUDE_DIRS=${JNI_INCLUDE_DIRS}")
message (STATUS "JAVA_INCLUDE_PATH =${JAVA_INCLUDE_PATH}")
message (STATUS "JNI_LIBRARIES=${JNI_LIBRARIES}")
message (STATUS "JAVA_JVM_LIBRARY=${JAVA_JVM_LIBRARY}")
include_directories(${JNI_INCLUDE_DIRS})

# The wrapper is a shared library so the JVM can load it at runtime.
add_library(dlib_jni SHARED src/dlib-jni.cpp)
set_target_properties(dlib_jni PROPERTIES POSITION_INDEPENDENT_CODE ON) # This made it work
target_link_libraries(dlib_jni dlib::dlib ${DLIB_JNI_THIRDPARTY_LIBS} ${JAVA_JVM_LIBRARY})

Version: 19.21.99
Where did you get dlib: Cloned from master
Platform: Ubuntu 20.04
Compiler: gcc-8 / g++-8

davisking / dlib