microsoft / onnxruntime-genai

Generative AI extensions for onnxruntime

The option `use_env_allocators` crashes the inference with CPU EP #929

Open skyline75489 opened 4 days ago

skyline75489 commented 4 days ago

Describe the bug

Introduced in #907: when no EP is specified, `use_env_allocators` is set to true. This crashes inference on iOS with the following stack trace:

* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x11217c000)
    frame #0: 0x000000010543b7a8 TestGenAI.debug.dylib`onnxruntime::common::Status onnxruntime::contrib::RunRotaryEmbedding<float>(onnxruntime::concurrency::ThreadPool*, onnxruntime::contrib::rotary_embedding_helper::RotaryParameters, float const*, long long const*, float const*, float const*, float*, bool)::'lambda'(long, long)::operator()(long, long) const + 488
    frame #1: 0x0000000104d3e950 TestGenAI.debug.dylib`onnxruntime::concurrency::ThreadPool::ParallelFor(long, onnxruntime::TensorOpCost const&, std::__1::function<void (long, long)> const&) + 484
    frame #2: 0x000000010543a19c TestGenAI.debug.dylib`onnxruntime::common::Status onnxruntime::contrib::RunRotaryEmbedding<float>(onnxruntime::concurrency::ThreadPool*, onnxruntime::contrib::rotary_embedding_helper::RotaryParameters, float const*, long long const*, float const*, float const*, float*, bool) + 276
    frame #3: 0x000000010525a25c TestGenAI.debug.dylib`onnxruntime::contrib::GroupQueryAttention<float>::Compute(onnxruntime::OpKernelContext*) const + 2980
    frame #4: 0x0000000104dfc51c TestGenAI.debug.dylib`onnxruntime::ExecuteKernel(onnxruntime::StreamExecutionContext&, unsigned long, unsigned long, bool const&, onnxruntime::SessionScope&) + 312
    frame #5: 0x0000000104dbd3b4 TestGenAI.debug.dylib`onnxruntime::LaunchKernelStep::Execute(onnxruntime::StreamExecutionContext&, unsigned long, onnxruntime::SessionScope&, bool const&, bool&) + 48
    frame #6: 0x0000000104e2f3f8 TestGenAI.debug.dylib`onnxruntime::RunSince(unsigned long, onnxruntime::StreamExecutionContext&, onnxruntime::SessionScope&, bool const&, unsigned long) + 216
    frame #7: 0x0000000104dfd230 TestGenAI.debug.dylib`onnxruntime::ExecuteThePlan(onnxruntime::SessionState const&, gsl::span<int const, 18446744073709551615ul>, gsl::span<OrtValue const, 18446744073709551615ul>, gsl::span<int const, 18446744073709551615ul>, std::__1::vector<OrtValue, std::__1::allocator<OrtValue>>&, std::__1::unordered_map<unsigned long, std::__1::function<onnxruntime::common::Status (onnxruntime::TensorShape const&, OrtDevice const&, OrtValue&, bool&)>, std::__1::hash<unsigned long>, std::__1::equal_to<unsigned long>, std::__1::allocator<std::__1::pair<unsigned long const, std::__1::function<onnxruntime::common::Status (onnxruntime::TensorShape const&, OrtDevice const&, OrtValue&, bool&)>>>> const&, onnxruntime::logging::Logger const&, onnxruntime::DeviceStreamCollection const*, bool const&, bool, bool) + 804
    frame #8: 0x0000000104e54204 TestGenAI.debug.dylib`onnxruntime::utils::ExecuteGraphImpl(onnxruntime::SessionState const&, onnxruntime::FeedsFetchesManager const&, gsl::span<OrtValue const, 18446744073709551615ul>, std::__1::vector<OrtValue, std::__1::allocator<OrtValue>>&, std::__1::unordered_map<unsigned long, std::__1::function<onnxruntime::common::Status (onnxruntime::TensorShape const&, OrtDevice const&, OrtValue&, bool&)>, std::__1::hash<unsigned long>, std::__1::equal_to<unsigned long>, std::__1::allocator<std::__1::pair<unsigned long const, std::__1::function<onnxruntime::common::Status (onnxruntime::TensorShape const&, OrtDevice const&, OrtValue&, bool&)>>>> const&, ExecutionMode, bool const&, onnxruntime::logging::Logger const&, onnxruntime::DeviceStreamCollection*, bool, onnxruntime::Stream*) + [...]

Additional context

The model is Phi3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4.

Tested with ORT 1.19.0 and ORT 1.19.2; the crash happens with both versions.

skyline75489 commented 4 days ago

Seems like an iOS-only issue. The same model works on macOS.

Update: this seems to be ARM64-only. On the x86_64 iOS simulator, there's no such crash.

skyline75489 commented 3 days ago

Update: I'm also hitting this on win-x64, with the Phi3.5-vision model:

Exception thrown at 0x00007FFC186E53F9 (onnxruntime.dll) in phi3v.exe: 0xC0000005: Access violation reading location 0x000001E3AC256080.
>   onnxruntime.dll!onnxruntime::contrib::RunRotaryEmbedding::__l2::<lambda>(__int64 begin, __int64 end) Line 90    C++
    [External Code] 
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::ParallelForFixedBlockSizeScheduling::__l10::<lambda_1>::operator()(unsigned int idx) Line 433 C++
    [External Code] 
    onnxruntime.dll!onnxruntime::concurrency::ThreadPoolTempl<onnxruntime::Env>::RunInParallel(std::function<void __cdecl(unsigned int)> fn, unsigned int n, __int64 block_size) Line 1310  C++
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::RunInParallel(std::function<void __cdecl(unsigned int)> fn, unsigned int n, __int64 block_size) Line 525  C++
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::ParallelForFixedBlockSizeScheduling(__int64 total, __int64 block_size, const std::function<void __cdecl(__int64,__int64)> & fn) Line 440  C++
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::ParallelFor(__int64 n, const onnxruntime::TensorOpCost & c, const std::function<void __cdecl(__int64,__int64)> & f) Line 626  C++
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::TryParallelFor(onnxruntime::concurrency::ThreadPool * tp, __int64 total, const onnxruntime::TensorOpCost & cost_per_unit, const std::function<void __cdecl(__int64,__int64)> & fn) Line 703   C++
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::TryParallelFor(onnxruntime::concurrency::ThreadPool * tp, __int64 total, double cost_per_unit, const std::function<void __cdecl(__int64,__int64)> & fn) Line 251  C++
    onnxruntime.dll!onnxruntime::contrib::RunRotaryEmbedding<float>(onnxruntime::concurrency::ThreadPool * tp, onnxruntime::contrib::rotary_embedding_helper::RotaryParameters parameters, const float * input, const __int64 * position_ids, const float * cos_cache, const float * sin_cache, float * output, bool interleaved) Line 58   C++
    onnxruntime.dll!onnxruntime::contrib::GroupQueryAttention<float>::Compute(onnxruntime::OpKernelContext * context) Line 152  C++
    onnxruntime.dll!onnxruntime::ExecuteKernel(onnxruntime::StreamExecutionContext & ctx, unsigned __int64 idx, unsigned __int64 stream_idx, const bool & terminate_flag, onnxruntime::SessionScope & session_scope) Line 495   C++
    onnxruntime.dll!onnxruntime::LaunchKernelStep::Execute(onnxruntime::StreamExecutionContext & ctx, unsigned __int64 stream_idx, onnxruntime::SessionScope & session_scope, const bool & terminate_flag, bool & continue_flag) Line 73    C++
    onnxruntime.dll!onnxruntime::RunSince(unsigned __int64 stream_idx, onnxruntime::StreamExecutionContext & ctx, onnxruntime::SessionScope & session_scope, const bool & terminate_flag, unsigned __int64 since) Line 222  C++
    onnxruntime.dll!onnxruntime::ExecuteThePlan::__l23::<lambda>() Line 589 C++
    [External Code] 
    onnxruntime.dll!onnxruntime::concurrency::ThreadPool::Schedule(onnxruntime::concurrency::ThreadPool * tp, std::function<void __cdecl(void)> fn) Line 233    C++
    onnxruntime.dll!onnxruntime::ExecuteThePlan(const onnxruntime::SessionState & session_state, gsl::span<int const ,-1> feed_mlvalue_idxs, gsl::span<OrtValue const ,-1> feeds, gsl::span<int const ,-1> fetch_mlvalue_idxs, std::vector<OrtValue,std::allocator<OrtValue>> & fetches, const std::unordered_map<unsigned __int64,std::function<onnxruntime::common::Status __cdecl(onnxruntime::TensorShape const &,OrtDevice const &,OrtValue &,bool &)>,std::hash<unsigned __int64>,std::equal_to<unsigned __int64>,std::allocator<std::pair<unsigned __int64 const ,std::function<onnxruntime::common::Status __cdecl(onnxruntime::TensorShape const &,OrtDevice const &,OrtValue &,bool &)>>>> & fetch_allocators, const onnxruntime::logging::Logger & logger, const onnxruntime::DeviceStreamCollection * device_streams, const bool & terminate_flag, const bool only_execute_path_to_fetches, bool single_thread_mode) Line 588   C++
    onnxruntime.dll!onnxruntime::utils::ExecuteGraphImpl(const onnxruntime::SessionState & session_state, const onnxruntime::FeedsFetchesManager & feeds_fetches_manager, gsl::span<OrtValue const ,-1> feeds, std::vector<OrtValue,std::allocator<OrtValue>> & fetches, const std::unordered_map<unsigned __int64,std::function<onnxruntime::common::Status __cdecl(onnxruntime::TensorShape const &,OrtDevice const &,OrtValue &,bool &)>,std::hash<unsigned __int64>,std::equal_to<unsigned __int64>,std::allocator<std::pair<unsigned __int64 const ,std::function<onnxruntime::common::Status __cdecl(onnxruntime::TensorShape const &,OrtDevice const &,OrtValue &,bool &)>>>> & fetch_allocators, ExecutionMode execution_mode, const bool & terminate_flag, const onnxruntime::logging::Logger & logger, onnxruntime::DeviceStreamCollection * device_stream_collection, const bool only_execute_path_to_fetches, onnxruntime::Stream * parent_stream) Line 649 C++
    onnxruntime.dll!onnxruntime::utils::ExecuteGraph(const onnxruntime::SessionState & session_state, onnxruntime::FeedsFetchesManager & feeds_fetches_manager, gsl::span<OrtValue const ,-1> feeds, std::vector<OrtValue,std::allocator<OrtValue>> & fetches, ExecutionMode execution_mode, const bool & terminate_flag, const onnxruntime::logging::Logger & logger, onnxruntime::DeviceStreamCollectionHolder & device_stream_collection_holder, bool only_execute_path_to_fetches, onnxruntime::Stream * parent_stream) Line 752    C++
    onnxruntime.dll!onnxruntime::utils::ExecuteGraph(const onnxruntime::SessionState & session_state, onnxruntime::FeedsFetchesManager & feeds_fetches_manager, gsl::span<OrtValue const ,-1> feeds, std::vector<OrtValue,std::allocator<OrtValue>> & fetches, ExecutionMode execution_mode, const OrtRunOptions & run_options, onnxruntime::DeviceStreamCollectionHolder & device_stream_collection_holder, const onnxruntime::logging::Logger & logger) Line 774  C++
    onnxruntime.dll!onnxruntime::InferenceSession::Run(const OrtRunOptions & run_options, gsl::span<std::string const ,-1> feed_names, gsl::span<OrtValue const ,-1> feeds, gsl::span<std::string const ,-1> output_names, std::vector<OrtValue,std::allocator<OrtValue>> * p_fetches, const std::vector<OrtDevice,std::allocator<OrtDevice>> * p_fetches_device_info) Line 2592    C++
    onnxruntime.dll!onnxruntime::InferenceSession::Run(const OrtRunOptions & run_options, gsl::span<char const * const,-1> feed_names, gsl::span<OrtValue const * const,-1> feeds, gsl::span<char const * const,-1> fetch_names, gsl::span<OrtValue *,-1> fetches) Line 2720    C++
    onnxruntime.dll!OrtApis::Run(OrtSession * sess, const OrtRunOptions * run_options, const char * const * input_names, const OrtValue * const * input, unsigned __int64 input_len, const char * const * output_names, unsigned __int64 output_names_len, OrtValue * * output) Line 831    C++
    [Inline Frame] onnxruntime-genai.dll!OrtSession::Run(const OrtRunOptions *) Line 819    C++
    onnxruntime-genai.dll!Generators::State::Run(OrtSession & session, OrtRunOptions & run_options, int new_batch_size) Line 69 C++
    onnxruntime-genai.dll!Generators::DecoderState::Run(int current_length, Generators::RoamingArray<int> next_tokens, Generators::RoamingArray<int> next_indices) Line 131 C++
    onnxruntime-genai.dll!Generators::MultiModalPipelineState::Run(int current_length, Generators::RoamingArray<int> next_tokens, Generators::RoamingArray<int> next_indices) Line 173  C++
    onnxruntime-genai.dll!Generators::Generator::ComputeLogits() Line 161   C++
    onnxruntime-genai.dll!OgaGenerator_ComputeLogits(OgaGenerator * generator) Line 257 C++
    phi3v.exe!OgaGenerator::ComputeLogits() Line 240    C++
    phi3v.exe!CXX_API(const char * model_path) Line 80  C++
    phi3v.exe!main(int argc, char * * argv) Line 206    C++
    [External Code] 
