cudaErrorIllegalAddress----batched-wav-nnet3-cuda.cc

Hi, a `cudaErrorIllegalAddress` error occurred when I tried to decode utterances with my own customized version of batched-wav-nnet3-cuda.cc. The only differences between my customized code and the original are that my code is a JNI toy and that all config variables are assigned in the code rather than passed as command-line arguments. The code runs OK up to the line `valid = cuda_pipeline->GetLattice(key, &clat);`. Any help is appreciated.

logs

WARNING ([5.5]:SelectGpuId():cu-device.cc:221) Not in compute-exclusive mode.  Suggestion: use 'nvidia-smi -c 3' to set compute exclusive mode
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:349) Selecting from 6 GPUs
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:364) cudaSetDevice(0): Tesla V100-PCIE-32GB   free:20292M, used:12210M, total:32502M, free/total:0.624334
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:364) cudaSetDevice(1): Tesla V100-PCIE-32GB   free:20276M, used:12226M, total:32502M, free/total:0.623841
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:364) cudaSetDevice(2): Tesla V100-PCIE-32GB   free:20292M, used:12210M, total:32502M, free/total:0.624334
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:364) cudaSetDevice(3): Tesla V100-PCIE-32GB   free:20292M, used:12210M, total:32502M, free/total:0.624334
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:364) cudaSetDevice(4): Tesla V100-PCIE-32GB   free:32070M, used:432M, total:32502M, free/total:0.986709
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:364) cudaSetDevice(5): Tesla V100-PCIE-32GB   free:32070M, used:432M, total:32502M, free/total:0.986709
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:411) Trying to select device: 4 (automatically), mem_ratio: 0.986709
LOG ([5.5]:SelectGpuIdAuto():cu-device.cc:430) Success selecting device 4 free mem ratio: 0.986709
LOG ([5.5]:FinalizeActiveGpu():cu-device.cc:284) The active GPU is [4]: Tesla V100-PCIE-32GB    free:31974M, used:528M, total:32502M, free/total:0.983755 version 7.0
LOG ([5.5]:Initialize():batched-threaded-nnet3-cuda-pipeline.cc:32) BatchedThreadedNnet3CudaPipeline Initialize with 1 control threads, 5 worker threads and batch size 50
LOG ([5.5]:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG ([5.5]:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG ([5.5]:CheckAndFixConfigs():nnet-batch-compute.cc:363) Increasing --frames-per-chunk from 50 to 51 to make it a multiple of --frame-subsampling-factor=3

******** initialize cuda_pipeline ********

**** Read Wav ****

**** scp:/data1/wjb/kaldi/egs/aishell/s5/data/test/wav-100.scp ****

**** Enter In While ****

**** Start GetLattice ****
LOG ([5.5]:ExecuteWorker():batched-threaded-nnet3-cuda-pipeline.cc:611) Error unrecoverable cuda decoder error 'cuda-decoder-kernels.cu:1698 :cudaErrorIllegalAddress'

ASSERTION_FAILED ([5.5]:ExecuteWorker():batched-threaded-nnet3-cuda-pipeline.cc:613) Assertion failed: (UNRECOVERABLE_EXCEPTION)

[ Stack-Trace: ]
kaldi::MessageLogger::LogMessage() const
kaldi::KaldiAssertFailure_(char const*, char const*, int, char const*)
kaldi::cuda_decoder::BatchedThreadedNnet3CudaPipeline::ExecuteWorker(int)
std::thread::_Impl<std::_Bind_simple<std::_Mem_fn<void (kaldi::cuda_decoder::BatchedThreadedNnet3CudaPipeline::*)(int)> (kaldi::cuda_decoder::BatchedThreadedNnet3CudaPipeline*, int)> >::_M_run()

clone

Aborted (core dumped)

And the code:

```cpp
#if HAVE_CUDA == 1

#include <cuda.h>
#include <cuda_profiler_api.h>
#include <nvToolsExt.h>
#include <sstream>
#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline.h"
#include "cudamatrix/cu-allocator.h"
#include "fstext/fstext-lib.h"
#include "lat/lattice-functions.h"
#include "nnet3/am-nnet-simple.h"
#include "nnet3/nnet-utils.h"
#include "util/kaldi-thread.h"
#include "kaldi-jni.h"
#include <jni.h>

using namespace kaldi;
using namespace cuda_decoder;

// Extracts the best path from `clat`, adds its frame count and likelihood to
// the running totals, and (when a symbol table is supplied) logs the decoded
// word sequence for utterance `utt`.
//
//   utt             utterance id, used only in log messages
//   word_syms       symbol table used to print words; may be NULL, in which
//                   case no transcript is logged
//   clat            lattice to analyse; an empty lattice only produces a
//                   warning and leaves the totals untouched
//   tot_num_frames  incremented by the length of the best-path alignment
//   tot_like        incremented by the negated sum of the two weight
//                   components of the best path
void GetDiagnosticsAndPrintOutput(const std::string &utt,
                                  const fst::SymbolTable *word_syms,
                                  const CompactLattice &clat,
                                  int64 *tot_num_frames, double *tot_like) {
  if (clat.NumStates() == 0) {
    KALDI_WARN << "Empty lattice.";
    return;
  }

  // Reduce the lattice to its single best path, in non-compact form.
  CompactLattice compact_best;
  CompactLatticeShortestPath(clat, &compact_best);
  Lattice best;
  ConvertLattice(compact_best, &best);

  // Pull the alignment, word ids and total weight off the best path.
  std::vector<int32> ali;
  std::vector<int32> word_ids;
  LatticeWeight path_weight;
  GetLinearSymbolSequence(best, &ali, &word_ids, &path_weight);

  const int32 frame_count = ali.size();
  const double path_like = -(path_weight.Value1() + path_weight.Value2());
  *tot_num_frames += frame_count;
  *tot_like += path_like;
  KALDI_VLOG(2) << "Likelihood per frame for utterance " << utt << " is "
                << (path_like / frame_count) << " over " << frame_count
                << " frames.";

  // Without a symbol table there is nothing more to print.
  if (word_syms == NULL) return;

  std::ostringstream transcript;
  transcript << utt << " ";
  for (int32 word_id : word_ids) {
    const std::string symbol = word_syms->Find(word_id);
    if (symbol.empty())
      transcript << "Word-id " << word_id << " not in symbol table.";
    transcript << symbol << " ";
  }
  KALDI_WARN << transcript.str();
}

// Collects the result of the oldest pending utterance in `processed`, reports
// diagnostics for it, releases its decode handle and removes it from the
// queue.
//
//   batched_decoder_config  only `determinize_lattice` is read: when true the
//                           result is fetched with GetLattice(), otherwise
//                           with GetRawLattice() followed by ConvertLattice()
//   word_syms               forwarded to GetDiagnosticsAndPrintOutput()
//   write_lattice           in this version only brackets an empty NVTX
//                           range; no lattice is actually written
//   total_audio             audio amount per iteration, used in the log line
//   count_per_iteration     number of utterances that make up one iteration
//   cuda_pipeline           pipeline that owns the decode handles
//   processed               FIFO of (utt, key) pairs; must be non-empty
//   timer                   read when an iteration completes
//   current_count           utterances finished in the current iteration;
//                           reset to 0 when an iteration completes
//   num_frames, tot_like    running totals updated by the diagnostics call
//   output_iter             iteration counter, incremented when an iteration
//                           completes
void FinishOneDecode(
    const BatchedThreadedNnet3CudaPipelineConfig &batched_decoder_config,
    const fst::SymbolTable *word_syms, const bool write_lattice,
    const int32 total_audio, const int32 count_per_iteration,
    BatchedThreadedNnet3CudaPipeline *cuda_pipeline,
    std::queue<std::pair<std::string, std::string>> *processed,
    Timer *timer, int32 *current_count,
    int64 *num_frames, int32 *output_iter, double *tot_like) {
  // These are references into the queue's front element, so they must not be
  // used after processed->pop() below.
  std::string &utt = processed->front().first;
  std::string &key = processed->front().second;
  CompactLattice clat;
  bool valid;

  if (batched_decoder_config.determinize_lattice) {
    std::cout << std::endl << "**** Start GetLattice ****" << std::endl;
    valid = cuda_pipeline->GetLattice(key, &clat);
    std::cout << std::endl << "**** End GetLattice ****" << std::endl;
  } else {
    Lattice lat;
    valid = cuda_pipeline->GetRawLattice(key, &lat);
    ConvertLattice(lat, &clat);
  }
  if (valid) {
    GetDiagnosticsAndPrintOutput(utt, word_syms, clat, num_frames, tot_like);
    if (write_lattice && key == utt) { /*only write output on first iteration*/
      // No writer is wired in here; the range is kept as a profiling marker.
      nvtxRangePushA("Lattice Write");
      nvtxRangePop();
    }
  }
  cuda_pipeline->CloseDecodeHandle(key);
  processed->pop();
  if (++(*current_count) ==
      count_per_iteration) { /*this utt is the last in an iter*/
    double total_time = timer->Elapsed();
    KALDI_VLOG(2) << "Iteration: " << *output_iter
                  << " ~Aggregate Total Time: " << total_time
                  << " Total Audio: " << total_audio * *output_iter
                  << " RealTimeX: " << *output_iter * total_audio / total_time;
    // Reset the caller's counter. The previous `current_count = 0` only
    // nulled the local pointer and left the counter untouched.
    *current_count = 0;
    (*output_iter)++;
  }
}

using namespace kaldi;
using namespace fst;

// Converts an opaque Java-side handle back into the ModelData pointer it is
// taken to represent. No validation is performed on the handle.
kaldi::jni::ModelData *instance(jlong handle) {
  kaldi::jni::ModelData *model =
      reinterpret_cast<kaldi::jni::ModelData *>(handle);
  return model;
}

//Initializing model
JNIEXPORT jlong JNICALL Java_KaldiWrapper_initialize(JNIEnv *env, jobject obj, jstring model_path_str, jstring fst_path_str, jstring symbol_path_str, jstring wav_scp_str)
{
    using namespace kaldi;
    using namespace fst;

    bool write_lattice = false;
    int pipeline_length = 4000;
    int num_todo = -1;

    typedef kaldi::int32 int32;
    typedef kaldi::int64 int64;

    const char *m_chars = env->GetStringUTFChars(model_path_str, 0);  
    const std::string model_path(m_chars);
    std::cout << std::endl << "=============== Initializing with model =================   " << model_path << std::endl;    

    TransitionModel trans_model;    
    kaldi::nnet3::AmNnetSimple am_nnet;
    {
        bool binary;
        kaldi::Input ki(model_path, &binary);
        trans_model.Read(ki.Stream(), binary);
        am_nnet.Read(ki.Stream(), binary);
        kaldi::nnet3::SetBatchnormTestMode(true, &(am_nnet.GetNnet()));
        kaldi::nnet3::SetDropoutTestMode(true, &(am_nnet.GetNnet()));
        kaldi::nnet3::CollapseModel(kaldi::nnet3::CollapseModelConfig(), &(am_nnet.GetNnet()));
    }    

    const char *f_chars = env->GetStringUTFChars(fst_path_str, 0);
    const std::string fst_path(f_chars);
    fst::Fst<fst::StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_path);   

    const char *s_chars = env->GetStringUTFChars(symbol_path_str, 0);
    const std::string symbol_path(s_chars);

    fst::SymbolTable *word_syms = fst::SymbolTable::ReadText(symbol_path);

    kaldi::CuAllocatorOptions g_allocator_options;
    g_allocator_options.cache_memory = true;
    g_allocator_options.memory_proportion = 0.5;
    g_allocator_options.num_subregions = 20;        
    std::cout << std::endl << "******** setup g_allocator_options ********" << std::endl;    

    /** feature_opts */
    kaldi::OnlineNnet2FeaturePipelineConfig feature_opts;  
    feature_opts.feature_type = "mfcc";
    feature_opts.mfcc_config = "/data1/wjb/kaldi/egs/aishell/s5/exp/chain/nnet_online/conf/mfcc.conf";
    feature_opts.ivector_extraction_config = "/data1/wjb/kaldi/egs/aishell/s5/exp/chain/nnet_online/conf/ivector_extractor.conf";
    feature_opts.add_pitch = true;
    feature_opts.online_pitch_config= "/data1/wjb/kaldi/egs/aishell/s5/exp/chain/nnet_online/conf/online_pitch.conf"; 
    std::cout << std::endl << "******** setup feature_opts ********" << std::endl;    

    /** decoder_opts */
    kaldi::cuda_decoder::CudaDecoderConfig decoder_opts;
    decoder_opts.default_beam = 15.0;
    decoder_opts.lattice_beam = 8.0;//10
    decoder_opts.ntokens_pre_allocated = 2000000;
    decoder_opts.main_q_capacity = 50000;
    decoder_opts.aux_q_capacity = 500000;
    decoder_opts.max_active = 7000;//10000
    std::cout << std::endl << "******** setup decoder_opts ********" << std::endl;

    /** det_opts */
    fst::DeterminizeLatticePhonePrunedOptions det_opts;
    det_opts.max_mem = 50000000;
    det_opts.phone_determinize = true;
    det_opts.word_determinize = true;
    det_opts.minimize = false;  
    std::cout << std::endl << "******** setup det_opts ********" << std::endl;    

    /** compute_opts */
    nnet3::NnetBatchComputerOptions compute_opts;
    compute_opts.minibatch_size = 128;//128
    compute_opts.edge_minibatch_size = 32;//32
    compute_opts.ensure_exact_final_context = true;
    compute_opts.partial_minibatch_factor = 0.5;

    compute_opts.frames_per_chunk = 50;
    compute_opts.extra_left_context_initial = 0;
    compute_opts.frame_subsampling_factor = 3;
    compute_opts.acoustic_scale=1.0; 
    std::cout << std::endl << "******** setup compute_opts ********" << std::endl;  

    /** batched_decoder_config */
    kaldi::cuda_decoder::BatchedThreadedNnet3CudaPipelineConfig batched_decoder_config;    
    batched_decoder_config.max_batch_size = 50;//100
    batched_decoder_config.batch_drain_size = 5;//10
    batched_decoder_config.num_control_threads = 1;//2
    batched_decoder_config.num_worker_threads = 5;//20
    batched_decoder_config.determinize_lattice = true;//true
    batched_decoder_config.max_pending_tasks = 40;//4000

    batched_decoder_config.feature_opts = feature_opts;
    batched_decoder_config.decoder_opts = decoder_opts;
    batched_decoder_config.det_opts = det_opts;
    batched_decoder_config.compute_opts = compute_opts;
    std::cout << std::endl << "******** setup batched_decoder_config ********" << std::endl;    

    CuDevice::Instantiate().SelectGpuId("yes");
    CuDevice::Instantiate().AllowMultithreading();
    g_cuda_allocator.SetOptions(g_allocator_options);       

    BatchedThreadedNnet3CudaPipeline cuda_pipeline(batched_decoder_config);
    cuda_pipeline.Initialize(*decode_fst, am_nnet, trans_model);
    std::cout << std::endl << "******** initialize cuda_pipeline ********" << std::endl;
    delete decode_fst;

    int32 num_done = 0, num_err = 0;
    double tot_like = 0.0;
    int64 num_frames = 0;
    double total_audio = 0;

    nvtxRangePush("Global Timer");

    // starting timer here so we
    // can measure throughput
    // without allocation
    // overheads
    // using kaldi timer, which starts counting in the constructor
    Timer timer;

    int count_per_iteration = 0;
    int current_count = 0;
    int output_iter = 1;

    /** Read data */
    std::cout << std::endl << "**** Read Wav ****" << std::endl;    
    const char *w_chars = env->GetStringUTFChars(wav_scp_str, 0);
    const std::string wav_scp(w_chars);
    const std::string wav_rspecifier = "scp:" + wav_scp;
    std::cout << std::endl << "**** " << wav_rspecifier << " ****" << std::endl;
    std::queue<std::pair<std::string, std::string>> processed;
    for (int iter = 0; iter < 1; iter++) {
      SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
      for (; !wav_reader.Done(); wav_reader.Next()) {
        nvtxRangePushA("Utterance Iteration");

        std::string utt = wav_reader.Key();
        std::string key = utt;
        if (iter > 0) {
          // make key unique for subsequent iterations
          key = key + "-" + std::to_string(iter);
        }
        const WaveData &wave_data = wav_reader.Value();

        if (iter == 0) {
          // calculating number of utterances per iteration
          count_per_iteration++;
          // calculating total audio time per iteration
          total_audio += wave_data.Duration();
        }

        cuda_pipeline.OpenDecodeHandle(key, wave_data);
        processed.push(pair<string, string>(utt, key));
        num_done++;

        while (processed.size() >= pipeline_length) {
          //std::cout << std::endl << "**** Enter In While ****" << std::endl;
          FinishOneDecode(batched_decoder_config, word_syms, write_lattice,
                          total_audio, count_per_iteration, &cuda_pipeline,
                          &processed, &timer, &current_count,
                          &num_frames, &output_iter, &tot_like);
        }  // end while

        nvtxRangePop();
        if (num_todo != -1 && num_done >= num_todo)
          break;
        } // end utterance loop

    } // end iterations loop

    while (processed.size() > 0) {
      std::cout << std::endl << "**** Enter In While ****" << std::endl;
      FinishOneDecode(batched_decoder_config, word_syms, write_lattice,
                      total_audio, count_per_iteration, &cuda_pipeline,
                      &processed, &timer, &current_count,
                      &num_frames, &output_iter, &tot_like);
    } // end while

    nvtxRangePop();

    delete word_syms; // will delete if non-NULL.

    cuda_pipeline.Finalize();
    cudaDeviceSynchronize();

    return 12345678;
}

#endif  // if HAVE_CUDA == 1
```

kaldi-asr / kaldi

cudaErrorIllegalAddress----batched-wav-nnet3-cuda.cc #3451