aiqm / torchani

Accurate Neural Network Potential on PyTorch
https://aiqm.github.io/torchani/
MIT License
459 stars 127 forks source link

CUAEV Torchscript compatible #518

Closed yueyericardo closed 3 years ago

yueyericardo commented 3 years ago

I followed the "Extending TorchScript with Custom C++ Operators" tutorial to try to register cuaev with TorchScript.

I made the following changes (based on the error messages):

diff --git a/torchani/extension/aev.cu b/torchani/extension/aev.cu
index c4de878..31c1ac3 100644
--- a/torchani/extension/aev.cu
+++ b/torchani/extension/aev.cu
@@ -8,6 +8,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/equal.h>
 #include <torch/extension.h>
+#include <torch/script.h>^M

 #include <c10/cuda/CUDACachingAllocator.h>
 #include <THC/THC.h>
@@ -427,7 +428,7 @@ void cuComputeAEV(torch::Tensor coordinates_t, torch::Tensor species_t,
                   ScalarRealT Rcr, ScalarRealT Rca, torch::Tensor EtaR_t,
                   torch::Tensor ShfR_t, torch::Tensor EtaA_t,
                   torch::Tensor Zeta_t, torch::Tensor ShfA_t,
-                  torch::Tensor ShfZ_t, torch::Tensor aev_t, int num_species) {
+                  torch::Tensor ShfZ_t, torch::Tensor aev_t, int64_t num_species) {^M
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   auto thrust_allocator = THCThrustAllocator(at::globalContext().lazyInitCUDA());
   auto policy = thrust::cuda::par(thrust_allocator).on(stream);
@@ -439,17 +440,17 @@ void cuComputeAEV(torch::Tensor coordinates_t, torch::Tensor species_t,
   // max_natoms_per_mol << "\n";

   AEVScalarParams<float> aev_params;
-  aev_params.Rca = Rca;
-  aev_params.Rcr = Rcr;
-  aev_params.num_species = num_species;
+  aev_params.Rca = (float) Rca;^M
+  aev_params.Rcr = (float) Rcr;^M
+  aev_params.num_species = (int) num_species;^M

   aev_params.radial_sublength = EtaR_t.size(0) * ShfR_t.size(0);
-  aev_params.radial_length = aev_params.radial_sublength * num_species;
+  aev_params.radial_length = aev_params.radial_sublength * aev_params.num_species;^M

   aev_params.angular_sublength =
       EtaA_t.size(0) * Zeta_t.size(0) * ShfA_t.size(0) * ShfZ_t.size(0);
   aev_params.angular_length =
-      aev_params.angular_sublength * (num_species * (num_species + 1) / 2);
+      aev_params.angular_sublength * (aev_params.num_species * (aev_params.num_species + 1) / 2);^M

   if (EtaR_t.size(0) != 1 || EtaA_t.size(0) != 1 || Zeta_t.size(0) != 1) {
     std::cerr << "cuda extension is currently not supported for the specified "
@@ -582,3 +583,5 @@ void cuComputeAEV(torch::Tensor coordinates_t, torch::Tensor species_t,
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("cuComputeAEV", &cuComputeAEV<float>, "CUDA method to compute AEVs");
 }
+^M
+static auto registry = torch::RegisterOperators("cuaev::cuComputeAEV", &cuComputeAEV<double>);
\ No newline at end of file

But I got the errors below:

building 'cuaev' extension
Emitting ninja build file /home/richard/dev/torchani_cuaev/torchani/extension/build/temp.linux-x86_64-3.6/build.ninja...
Compiling objects...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
[1/1] /usr/local/cuda-10.1/bin/nvcc -I/home/richard/dev/torchani_cuaev/torchani/extension/include -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/api/include -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/TH -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/THC -I/usr/local/cuda-10.1/include -I/home/richard/program/anaconda3/envs/ml/include/python3.6m -c -c /home/richard/dev/torchani_cuaev/torchani/extension/aev.cu -o /home/richard/dev/torchani_cuaev/torchani/extension/build/temp.linux-x86_64-3.6/aev.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -Xptxas=-v --expt-extended-lambda -use_fast_math -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=cuaev -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14
FAILED: /home/richard/dev/torchani_cuaev/torchani/extension/build/temp.linux-x86_64-3.6/aev.o
/usr/local/cuda-10.1/bin/nvcc -I/home/richard/dev/torchani_cuaev/torchani/extension/include -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/api/include -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/TH -I/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/THC -I/usr/local/cuda-10.1/include -I/home/richard/program/anaconda3/envs/ml/include/python3.6m -c -c /home/richard/dev/torchani_cuaev/torchani/extension/aev.cu -o /home/richard/dev/torchani_cuaev/torchani/extension/build/temp.linux-x86_64-3.6/aev.o -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr --compiler-options ''"'"'-fPIC'"'"'' -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -Xptxas=-v --expt-extended-lambda -use_fast_math -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=cuaev -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++14
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "__visibility__" does not apply here

/home/richard/dev/torchani_cuaev/torchani/extension/aev.cu(238): warning: variable "aev_offset" was declared but never referenced
            detected during:
            instantiation of "void cuAngularAEVs(at::PackedTensorAccessor32<SpeciesT, 2UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, PairDist<DataT> *, PairDist<DataT> *, int *, int *, AEVScalarParams<DataT, IndexT>, int, int, int) [with SpeciesT=int, DataT=float, IndexT=int, TILEX=8, TILEY=4]"
(576): here
            instantiation of "void cuComputeAEV(at::Tensor, at::Tensor, ScalarRealT, ScalarRealT, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, int64_t) [with ScalarRealT=float]"
(584): here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "__visibility__" does not apply here

/home/richard/dev/torchani_cuaev/torchani/extension/aev.cu(238): warning: variable "aev_offset" was declared but never referenced
            detected during:
            instantiation of "void cuAngularAEVs(at::PackedTensorAccessor32<SpeciesT, 2UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, PairDist<DataT> *, PairDist<DataT> *, int *, int *, AEVScalarParams<DataT, IndexT>, int, int, int) [with SpeciesT=int, DataT=float, IndexT=int, TILEX=8, TILEY=4]"
(576): here
            instantiation of "void cuComputeAEV(at::Tensor, at::Tensor, ScalarRealT, ScalarRealT, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, int64_t) [with ScalarRealT=float]"
(584): here

ptxas info    : 14 bytes gmem, 672 bytes cmem[3]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_' for 'sm_50'
ptxas info    : Function properties for _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 352 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv' for 'sm_50'
ptxas info    : Function properties for _ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 320 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 4608 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 4608 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 48 bytes smem, 348 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 30 registers, 48 bytes smem, 381 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 48 bytes smem, 348 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 41 registers, 7696 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 37 registers, 6160 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 1552 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 9232 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 42 registers, 6160 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 45 registers, 8080 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 2320 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i' for 'sm_50'
ptxas info    : Function properties for _ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 332 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 53 registers, 6160 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 344 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 4608 bytes smem, 376 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 4608 bytes smem, 376 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_' for 'sm_50'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 344 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_50'
ptxas info    : Function properties for _ZN3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 320 bytes cmem[0]
ptxas info    : Compiling entry function '_Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii' for 'sm_50'
ptxas info    : Function properties for _Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 60 registers, 544 bytes cmem[0], 328 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi' for 'sm_50'
ptxas info    : Function properties for _Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 30 registers, 448 bytes cmem[0], 100 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_' for 'sm_50'
ptxas info    : Function properties for _Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 22 registers, 388 bytes cmem[0]
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "__visibility__" does not apply here

/home/richard/dev/torchani_cuaev/torchani/extension/aev.cu(238): warning: variable "aev_offset" was declared but never referenced
            detected during:
            instantiation of "void cuAngularAEVs(at::PackedTensorAccessor32<SpeciesT, 2UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, PairDist<DataT> *, PairDist<DataT> *, int *, int *, AEVScalarParams<DataT, IndexT>, int, int, int) [with SpeciesT=int, DataT=float, IndexT=int, TILEX=8, TILEY=4]"
(576): here
            instantiation of "void cuComputeAEV(at::Tensor, at::Tensor, ScalarRealT, ScalarRealT, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, int64_t) [with ScalarRealT=float]"
(584): here

ptxas info    : 14 bytes gmem, 672 bytes cmem[3]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_' for 'sm_60'
ptxas info    : Function properties for _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 352 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv' for 'sm_60'
ptxas info    : Function properties for _ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 320 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 4608 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 4608 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 34 registers, 48 bytes smem, 348 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 28 registers, 48 bytes smem, 381 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 34 registers, 48 bytes smem, 348 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 41 registers, 7696 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 37 registers, 6160 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 1552 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 9232 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 42 registers, 6160 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 45 registers, 8080 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 2320 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i' for 'sm_60'
ptxas info    : Function properties for _ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 332 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 53 registers, 6160 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 344 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 4608 bytes smem, 376 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 4608 bytes smem, 376 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_' for 'sm_60'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 344 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_60'
ptxas info    : Function properties for _ZN3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 320 bytes cmem[0]
ptxas info    : Compiling entry function '_Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii' for 'sm_60'
ptxas info    : Function properties for _Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 60 registers, 544 bytes cmem[0], 328 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi' for 'sm_60'
ptxas info    : Function properties for _Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 30 registers, 448 bytes cmem[0], 100 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_' for 'sm_60'
ptxas info    : Function properties for _Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 22 registers, 388 bytes cmem[0]
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "__visibility__" does not apply here

/home/richard/dev/torchani_cuaev/torchani/extension/aev.cu(238): warning: variable "aev_offset" was declared but never referenced
            detected during:
            instantiation of "void cuAngularAEVs(at::PackedTensorAccessor32<SpeciesT, 2UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 1UL, at::RestrictPtrTraits>, at::PackedTensorAccessor32<DataT, 3UL, at::RestrictPtrTraits>, PairDist<DataT> *, PairDist<DataT> *, int *, int *, AEVScalarParams<DataT, IndexT>, int, int, int) [with SpeciesT=int, DataT=float, IndexT=int, TILEX=8, TILEY=4]"
(576): here
            instantiation of "void cuComputeAEV(at::Tensor, at::Tensor, ScalarRealT, ScalarRealT, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, int64_t) [with ScalarRealT=float]"
(584): here

ptxas info    : 14 bytes gmem, 672 bytes cmem[3]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_' for 'sm_61'
ptxas info    : Function properties for _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 352 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv' for 'sm_61'
ptxas info    : Function properties for _ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 320 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 4608 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 4608 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 34 registers, 48 bytes smem, 348 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 28 registers, 48 bytes smem, 381 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 34 registers, 48 bytes smem, 348 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 7696 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 38 registers, 6160 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 1552 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 9232 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 6160 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 45 registers, 8080 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 39 registers, 2320 bytes smem, 360 bytes cmem[0], 4 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i' for 'sm_61'
ptxas info    : Function properties for _ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 332 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 45 registers, 6160 bytes smem, 380 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 344 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 4608 bytes smem, 376 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 31 registers, 4608 bytes smem, 376 bytes cmem[0], 8 bytes cmem[2]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_' for 'sm_61'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 8 registers, 344 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_61'
ptxas info    : Function properties for _ZN3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 320 bytes cmem[0]
ptxas info    : Compiling entry function '_Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii' for 'sm_61'
ptxas info    : Function properties for _Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 60 registers, 544 bytes cmem[0], 328 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi' for 'sm_61'
ptxas info    : Function properties for _Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 30 registers, 448 bytes cmem[0], 100 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_' for 'sm_61'
ptxas info    : Function properties for _Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 22 registers, 388 bytes cmem[0]
ptxas info    : 14 bytes gmem, 672 bytes cmem[3]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_' for 'sm_70'
ptxas info    : Function properties for _ZN6thrust8cuda_cub4core13_kernel_agentINS0_14__parallel_for16ParallelForAgentINS0_6__fill7functorIP8PairDistIfES8_EElEESA_lEEvT0_T1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 14 registers, 384 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv' for 'sm_70'
ptxas info    : Function properties for _ZN6thrust8cuda_cub3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 4 registers, 352 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 4608 bytes smem, 412 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIdEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 4608 bytes smem, 412 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPiS5_iS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 48 bytes smem, 380 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub18DeviceReduceKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_EEvT0_T1_T2_NS_13GridEvenShareISA_EET3_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 48 bytes smem, 413 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub28DeviceReduceSingleTileKernelINS_18DeviceReducePolicyIiiiNS_3MaxEE9Policy600EPKiPiiS2_iEEvT0_T1_T2_T3_T4_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 48 bytes smem, 380 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi15EiLNS_18BlockLoadAlgorithmE2ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE2ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi15EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 48 registers, 7696 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 40 registers, 6160 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE0ELNS_17CacheLoadModifierE5ELNS_19BlockStoreAlgorithmE4ELNS_18BlockScanAlgorithmE0ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 44 registers, 1552 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi256ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi256ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 46 registers, 9232 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi128ELi12EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi128ELi12EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 48 registers, 6160 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi96ELi21EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE1ENS_15MemBoundScalingILi96ELi21EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 56 registers, 8080 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub16DeviceScanKernelINS_15AgentScanPolicyILi64ELi9EiLNS_18BlockLoadAlgorithmE3ELNS_17CacheLoadModifierE0ELNS_19BlockStoreAlgorithmE3ELNS_18BlockScanAlgorithmE2ENS_15MemBoundScalingILi64ELi9EiEEEEPKiPiNS_13ScanTileStateIiLb1EEENS_3SumEiiEEvT0_T1_T2_iT3_T4_T5_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 43 registers, 2320 bytes smem, 392 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i' for 'sm_70'
ptxas info    : Function properties for _ZN3cub20DeviceScanInitKernelINS_13ScanTileStateIiLb1EEEEEvT_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 10 registers, 364 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceReduceByKeyKernelINS_19DispatchReduceByKeyIPK8PairDistIfEPS3_NS_21ConstantInputIteratorIiiEEPiS9_NS_8EqualityENS_3SumEiE20PtxReduceByKeyPolicyES5_S6_S8_S9_S9_NS_24ReduceByKeyScanTileStateIiiLb1EEESA_SB_iEEvT0_T1_T2_T3_T4_T5_iT6_T7_T8_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 54 registers, 6160 bytes smem, 412 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_24ReduceByKeyScanTileStateIiiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 14 registers, 376 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E0_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 37 registers, 4608 bytes smem, 408 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceSelectSweepKernelINS_16DispatchSelectIfIPK8PairDistIfEPNS_8NullTypeEPS3_PiZ12cuComputeAEVIfEvN2at6TensorESC_T_SD_SC_SC_SC_SC_SC_SC_SC_lEUlS3_E_S6_iLb0EE18PtxSelectIfPolicyTES5_S7_S8_S9_NS_13ScanTileStateIiLb1EEESE_S6_iLb0EEEvT0_T1_T2_T3_T4_T5_T6_T7_i
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 37 registers, 4608 bytes smem, 408 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_' for 'sm_70'
ptxas info    : Function properties for _ZN3cub23DeviceCompactInitKernelINS_13ScanTileStateIiLb1EEEPiEEvT_iT0_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 12 registers, 376 bytes cmem[0]
ptxas info    : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_70'
ptxas info    : Function properties for _ZN3cub11EmptyKernelIvEEvv
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 4 registers, 352 bytes cmem[0]
ptxas info    : Compiling entry function '_Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii' for 'sm_70'
ptxas info    : Function properties for _Z13cuAngularAEVsIifiLi8ELi4EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEENS1_IS5_Lm1ES3_iEES7_S7_S7_S6_P8PairDistIS5_ESA_PiSB_15AEVScalarParamsIS5_T1_Eiii
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 64 registers, 576 bytes cmem[0], 288 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi' for 'sm_70'
ptxas info    : Function properties for _Z12cuRadialAEVsIifLi8EEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm1ES3_iEES6_NS1_IS5_Lm3ES3_iEEP8PairDistIS5_E15AEVScalarParamsIS5_iEi
    48 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 32 registers, 480 bytes cmem[0], 64 bytes cmem[2]
ptxas info    : Function properties for __internal_trig_reduction_slowpathd
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Compiling entry function '_Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_' for 'sm_70'
ptxas info    : Function properties for _Z16pairwiseDistanceIifiEvN2at27GenericPackedTensorAccessorIT_Lm2ENS0_17RestrictPtrTraitsEiEENS1_IT0_Lm3ES3_iEEP8PairDistIS5_ET1_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 22 registers, 420 bytes cmem[0]
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/record_function.h(18): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(97): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/torch/csrc/autograd/profiler.h(126): warning: attribute "__visibility__" does not apply here

/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h: In instantiation of ‘constexpr std::array<c10::detail::infer_schema::ArgumentDef, sizeof... (Ts)> c10::detail::infer_schema::createArgumentVectorFromTypes(std::index_sequence<I ...>) [with Ts = {at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int}; long unsigned int ...Is = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; std::index_sequence<I ...> = std::integer_sequence<long unsigned int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11>]’:
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:63:58:   required from ‘static constexpr std::array<c10::detail::infer_schema::ArgumentDef, sizeof... (Ts)> c10::detail::infer_schema::createArguments<c10::guts::typelist::typelist<Types ...> >::call() [with ParameterTypes = {at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int}]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:119:94:   required from ‘c10::FunctionSchema c10::detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns(std::string&&, std::string&&) [with FunctionTraits = c10::guts::function_traits<void(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int)>; std::string = std::basic_string<char>]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:146:121:   required from ‘c10::FunctionSchema c10::inferFunctionSchemaFlattenedReturns(std::string&&, std::string&&) [with FuncType = void(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int); std::string = std::basic_string<char>]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/op_registration.h:23:152:   required from ‘std::unique_ptr<c10::FunctionSchema> c10::detail::inferFunctionSchemaFromFunctor() [with KernelFunctor = c10::impl::detail::WrapFunctionIntoRuntimeFunctor_<void (*)(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int), void, c10::guts::typelist::typelist<at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int> >]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/op_registration.h:547:317:   required from ‘std::enable_if_t<(c10::guts::is_function_type<T>::value && (! std::is_same<FuncType, void(const c10::OperatorHandle&, std::vector<c10::IValue>*)>::value)), c10::RegisterOperators&&> c10::RegisterOperators::op(const string&, FuncType*, c10::RegisterOperators::Options&&) && [with FuncType = void(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int); std::enable_if_t<(c10::guts::is_function_type<T>::value && (! std::is_same<FuncType, void(const c10::OperatorHandle&, std::vector<c10::IValue>*)>::value)), c10::RegisterOperators&&> = c10::RegisterOperators&&; std::string = std::basic_string<char>]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/op_registration.h:506:1:   required from ‘c10::RegisterOperators::RegisterOperators(const string&, FuncType&&, c10::RegisterOperators::Options&&) [with FuncType = void (*)(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int); std::string = std::basic_string<char>]’
/home/richard/dev/torchani_cuaev/torchani/extension/aev.cu:587:95:   required from here
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:48:81: error: no matching function for call to ‘c10::detail::infer_schema::ArgumentDef::ArgumentDef(c10::TypePtr (*)())’
    return (
                                                                                    ^
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:22:8: note: candidate: c10::detail::infer_schema::ArgumentDef::ArgumentDef()
    struct ArgumentDef final {
        ^~~~~~~~~~~
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:22:8: note:   candidate expects 0 arguments, 1 provided
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:22:8: note: candidate: constexpr c10::detail::infer_schema::ArgumentDef::ArgumentDef(const c10::detail::infer_schema::ArgumentDef&)
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:22:8: note:   no known conversion for argument 1 from ‘c10::TypePtr (*)() {aka std::shared_ptr<c10::Type> (*)()}’ to ‘const c10::detail::infer_schema::ArgumentDef&’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:22:8: note: candidate: constexpr c10::detail::infer_schema::ArgumentDef::ArgumentDef(c10::detail::infer_schema::ArgumentDef&&)
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:22:8: note:   no known conversion for argument 1 from ‘c10::TypePtr (*)() {aka std::shared_ptr<c10::Type> (*)()}’ to ‘c10::detail::infer_schema::ArgumentDef&&’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h: In instantiation of ‘c10::FunctionSchema c10::detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns(std::string&&, std::string&&) [with FunctionTraits = c10::guts::function_traits<void(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int)>; std::string = std::basic_string<char>]’:
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:146:121:   required from ‘c10::FunctionSchema c10::inferFunctionSchemaFlattenedReturns(std::string&&, std::string&&) [with FuncType = void(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int); std::string = std::basic_string<char>]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/op_registration.h:23:152:   required from ‘std::unique_ptr<c10::FunctionSchema> c10::detail::inferFunctionSchemaFromFunctor() [with KernelFunctor = c10::impl::detail::WrapFunctionIntoRuntimeFunctor_<void (*)(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int), void, c10::guts::typelist::typelist<at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int> >]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/op_registration.h:547:317:   required from ‘std::enable_if_t<(c10::guts::is_function_type<T>::value && (! std::is_same<FuncType, void(const c10::OperatorHandle&, std::vector<c10::IValue>*)>::value)), c10::RegisterOperators&&> c10::RegisterOperators::op(const string&, FuncType*, c10::RegisterOperators::Options&&) && [with FuncType = void(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int); std::enable_if_t<(c10::guts::is_function_type<T>::value && (! std::is_same<FuncType, void(const c10::OperatorHandle&, std::vector<c10::IValue>*)>::value)), c10::RegisterOperators&&> = c10::RegisterOperators&&; std::string = std::basic_string<char>]’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/op_registration.h:506:1:   required from ‘c10::RegisterOperators::RegisterOperators(const string&, FuncType&&, c10::RegisterOperators::Options&&) [with FuncType = void (*)(at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int); std::string = std::basic_string<char>]’
/home/richard/dev/torchani_cuaev/torchani/extension/aev.cu:587:95:   required from here
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:119:94:   in constexpr expansion of ‘c10::detail::infer_schema::createArguments<c10::guts::typelist::typelist<at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int> >::call()’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:63:58:   in constexpr expansion of ‘c10::detail::infer_schema::createArgumentVectorFromTypes<at::Tensor, at::Tensor, double, double, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, long int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11>((std::make_index_sequence<12>(), std::make_index_sequence<12>()))’
/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/include/ATen/core/op_registration/infer_schema.h:119:16: error: constexpr call flows off the end of the function
    constexpr auto arguments = createArguments<ParameterTypes>::call();
                ^~~~~~~~~
ninja: build stopped: subcommand failed.
Traceback (most recent call last):
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/utils/cpp_extension.py", line 1515, in _run_ninja_build
    env=env)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/subprocess.py", line 438, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
    File "<string>", line 1, in <module>
    File "/home/richard/dev/torchani_cuaev/torchani/extension/setup.py", line 47, in <module>
    'build_ext': BuildExtension
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/setuptools/__init__.py", line 144, in setup
    return distutils.core.setup(**attrs)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/core.py", line 148, in setup
    dist.run_commands()
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/dist.py", line 955, in run_commands
    self.run_command(cmd)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/dist.py", line 974, in run_command
    cmd_obj.run()
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/setuptools/command/develop.py", line 38, in run
    self.install_for_development()
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/setuptools/command/develop.py", line 140, in install_for_development
    self.run_command('build_ext')
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/cmd.py", line 313, in run_command
    self.distribution.run_command(command)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/dist.py", line 974, in run_command
    cmd_obj.run()
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/setuptools/command/build_ext.py", line 87, in run
    _build_ext.run(self)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/Cython/Distutils/old_build_ext.py", line 186, in run
    _build_ext.build_ext.run(self)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/command/build_ext.py", line 339, in run
    self.build_extensions()
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/utils/cpp_extension.py", line 649, in build_extensions
    build_ext.build_extensions(self)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/Cython/Distutils/old_build_ext.py", line 195, in build_extensions
    _build_ext.build_ext.build_extensions(self)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/command/build_ext.py", line 448, in build_extensions
    self._build_extensions_serial()
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/command/build_ext.py", line 473, in _build_extensions_serial
    self.build_extension(ext)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/setuptools/command/build_ext.py", line 208, in build_extension
    _build_ext.build_extension(self, ext)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/distutils/command/build_ext.py", line 533, in build_extension
    depends=ext.depends)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/utils/cpp_extension.py", line 478, in unix_wrap_ninja_compile
    with_cuda=with_cuda)
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/utils/cpp_extension.py", line 1233, in _write_ninja_file_and_compile_objects
    error_prefix='Error compiling objects for extension')
    File "/home/richard/program/anaconda3/envs/ml/lib/python3.6/site-packages/torch/utils/cpp_extension.py", line 1529, in _run_ninja_build
    raise RuntimeError(message)
RuntimeError: Error compiling objects for extension
zasdfgbnm commented 3 years ago

We cannot build a CUDA extension with TorchScript support because of a PyTorch bug (https://github.com/pytorch/pytorch/issues/47493). This bug is fixed in https://github.com/pytorch/pytorch/pull/47492. Once that fix is merged, I will change https://github.com/aiqm/torchani/pull/516 and the master branch to register the operator in a way that supports TorchScript. As a result, everyone will need to update their PyTorch to the latest nightly build as of that day.

zasdfgbnm commented 3 years ago

This is fixed