Non-deterministic behavior

While running one of the MLPerf benchmarks with NVBit's opcode histogram tool (opcode_hist), I see that the order of the invoked kernels differs across runs. For example:

Run 1:

kernel 0 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 5364219, total instructions 5364219
kernel 1 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 5359877, total instructions 10724096
kernel 2 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 21086275, total instructions 31810371
kernel 3 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 21082484, total instructions 52892855

Run 2:

kernel 0 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 5357397, total instructions 5357397
kernel 1 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 5356351, total instructions 10713748
kernel 2 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 21070770, total instructions 31784518
kernel 3 - void tensorflow::functor::FillPhiloxRandomKernelLaunch<tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float> >(unsigned long const*, unsigned long const*, tsl::random::PhiloxRandom, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>::ResultElementType*, long, tsl::random::TruncatedNormalDistribution<tsl::random::SingleSampleAdapter<tsl::random::PhiloxRandom>, float>) - #thread-blocks 102,  kernel instructions 5358753, total instructions 37143271

If you scroll the outputs to the right, you will see that kernels 0, 1 and 2 each have nearly the same instruction count in both runs, but kernel 3 has 21,082,484 instructions in the first run and only 5,358,753 in the second run.

Is there any way to make NVBit's behavior deterministic?

NVlabs / NVBit

Non-deterministic behavior #118