NVlabs / NVBit

220 stars 20 forks source link

Assertion #48

Closed gxsaccount closed 3 years ago

gxsaccount commented 3 years ago

hi,
I want to debug my own tools, so I added "-G" to my Makefile and ran it with vectoradd, but I get this assertion:

vectoradd: nvbit_imp.cpp:702: void Nvbit::func_loading(CUcontext, CUfunction): Assertion `sizeof(_text_nvbit_nvbit_write_reg75) <= function->nbytes' failed.

I want to know the meaning of this assertion and fix the problem.

thanks.

gxsaccount commented 3 years ago

I found that the tool mov_replace also has this problem. It seems the assertion is caused by some function in "nvbit_reg_rw.h".

x-y-z commented 3 years ago

Can you try to apply the patch below to nvbit_reg_rw.h? In addition, we recently found bugs in nvbit_read/write_{ureg,pred_reg,upred_reg}() and will release the fix to them soon.

--- a/nvbit_reg_rw.h
+++ b/nvbit_reg_rw.h
@@ -42,14 +42,36 @@ extern "C" __device__ __noinline__ int32_t nvbit_read_reg(uint64_t reg_num) {
 #pragma unroll
     for (int i = 0; i < 1024; i++) __nvbit_var += i;
     assert(__nvbit_var == reg_num);
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
     return __nvbit_var;
 }

 extern "C" __device__ __noinline__ void nvbit_write_reg(uint64_t reg_num,
                                                         int32_t reg_val) {
-#pragma unroll
+#pragma unroll 1024
     for (int i = 0; i < 1024; i++) __nvbit_var += i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
     assert(__nvbit_var == reg_num + reg_val);
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
+#pragma unroll 1024
+    for (int i = 0; i < 1024; i++) __nvbit_var += reg_num*i;
 }

 extern "C" __device__ __noinline__ int32_t nvbit_read_ureg(uint64_t reg_num) {