mrzhuzhe / riven

CPU Memory Compiler and Parallel programming
24 stars 2 forks source link

使用中间变量后,在 Intel i9 CPU 上速度大幅提升,但结果产生 1e-14 量级的微小误差 #3

Open mrzhuzhe opened 1 year ago

mrzhuzhe commented 1 year ago

代码:

代码来源于

blis https://github.com/flame/how-to-optimize-gemm/wiki#step-by-step-optimizations 中 optimize 07 https://github.com/flame/how-to-optimize-gemm/wiki/Optimization_1x4_7

可以cd 到 gemm/src/ 下 make 即可

不用中间变量就不会有这个问题:

riven/gemm/src/MMult7.c

/*
 * Update the four entries C(0, 0) .. C(0, 3) with the products of A(0, p)
 * and B(p, 0) .. B(p, 3) for p = 0 .. k-1.
 *
 * Every product is added straight into C inside the loop, so each C entry
 * is read and written once per iteration.
 *
 * k            number of loop steps (products accumulated per C entry)
 * a, lda       storage and leading dimension used by the A(i, j) macro
 * b, ldb       storage and leading dimension used by the B(i, j) macro
 * c, ldc       storage and leading dimension used by the C(i, j) macro
 *
 * NOTE(review): the A/B/C macros are defined outside this block; the
 * parameter descriptions above assume they index a/b/c with lda/ldb/ldc.
 */
void AddDot1x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc)
{
    int p = 0;

    while (p < k) {
        /* A(0, p) is shared by all four updates of this step. */
        const double a_0p = A(0, p);

        /* Addresses of the current element in each of the four columns of B. */
        const double *col0 = &B(p, 0);
        const double *col1 = &B(p, 1);
        const double *col2 = &B(p, 2);
        const double *col3 = &B(p, 3);

        C(0, 0) += a_0p * *col0;
        C(0, 1) += a_0p * *col1;
        C(0, 2) += a_0p * *col2;
        C(0, 3) += a_0p * *col3;

        ++p;
    }
}

与之对比的代码 Bad case

riven/gemm/src/MMult7_bad.c

/*
 * AddDot1x4, local-accumulator variant (the "bad case" of this report).
 *
 * For p = 0 .. k-1 the product A(0, p) * B(p, j) is added to a local
 * running sum for each j = 0 .. 3.  C(0, 0) .. C(0, 3) are touched only
 * once, after the loop, when the four sums are added to them.
 *
 * Numerical note: the k products are first summed starting from 0.0 and
 * only then added to the existing C value, whereas the direct version adds
 * every product to C immediately.  Floating-point addition is not
 * associative, so the two orders can round differently; this report
 * measured differences on the order of 1e-14.
 *
 * k            number of loop steps (products accumulated per C entry)
 * a, lda       storage and leading dimension used by the A(i, j) macro
 * b, ldb       storage and leading dimension used by the B(i, j) macro
 * c, ldc       storage and leading dimension used by the C(i, j) macro
 *
 * NOTE(review): the A/B/C macros are defined outside this block; the
 * parameter descriptions above assume they index a/b/c with lda/ldb/ldc.
 */
void AddDot1x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc)
{
    /* has a benefit to use register:
       https://zh.wikipedia.org/zh-hk/%E5%AF%84%E5%AD%98%E5%99%A8 */
    register double reg_a_0 = 0.0;
    register double reg_a_1 = 0.0;
    register double reg_a_2 = 0.0;
    register double reg_a_3 = 0.0;

    for (int p = 0; p < k; p++) {
        /* A(0, p) is shared by all four products of this step. */
        const double reg_a = A(0, p);

        reg_a_0 += reg_a * B(p, 0);
        reg_a_1 += reg_a * B(p, 1);
        reg_a_2 += reg_a * B(p, 2);
        reg_a_3 += reg_a * B(p, 3);
    }

    /* Single write-back per C entry. */
    C(0, 0) += reg_a_0;
    C(0, 1) += reg_a_1;
    C(0, 2) += reg_a_2;
    C(0, 3) += reg_a_3;
}


## lscpu

Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 39 bits physical, 48 bits virtual
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 2
Core(s) per socket: 8
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 167
Model name: 11th Gen Intel(R) Core(TM) i9-11900 @ 2.50GHz
Stepping: 1
CPU MHz: 2500.000
CPU max MHz: 5200.0000
CPU min MHz: 800.0000
BogoMIPS: 4992.00
Virtualization: VT-x
L1d cache: 384 KiB
L1i cache: 256 KiB
L2 cache: 4 MiB
L3 cache: 16 MiB
NUMA node0 CPU(s): 0-15
Vulnerability Itlb multihit: Not affected
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Not affected
Vulnerability Mmio stale data: Mitigation; Clear CPU buffers; SMT vulnerable
Vulnerability Retbleed: Mitigation; Enhanced IBRS
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
Vulnerability Srbds: Not affected
Vulnerability Tsx async abort: Not affected
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap avx512ifma clflushopt intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp hwp_notify hwp_act_window hwp_epp hwp_pkg_req avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm md_clear flush_l1d arch_capabilities


误差文件:

version = 'MMult7_bad';
MY_MMult = [
40 8.533333e+00 2.664535e-15
80 8.325203e+00 7.105427e-15
120 8.793893e+00 1.065814e-14
160 8.808602e+00 1.421085e-14
200 8.757526e+00 1.421085e-14
240 8.807901e+00 1.776357e-14
280 8.801925e+00 1.776357e-14
320 8.784987e+00 2.842171e-14
360 8.741990e+00 2.309264e-14
400 8.673850e+00 2.842171e-14
440 8.766943e+00 3.197442e-14
480 8.664708e+00 3.907985e-14
520 8.537478e+00 3.907985e-14
560 8.408714e+00 4.618528e-14
600 8.279986e+00 4.263256e-14
640 8.222449e+00 4.440892e-14
680 8.137895e+00 5.329071e-14
720 8.238834e+00 4.973799e-14
760 8.274526e+00 4.973799e-14
800 8.087957e+00 4.973799e-14
];

mrzhuzhe commented 1 year ago

浮点计算中,不同的相加顺序会得出略有差别的结果(浮点加法不满足结合律)

检查一下asm

mrzhuzhe commented 1 year ago

./outputs/MMult7_bad.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <AddDot1x4>:
   0:   f3 0f 1e fa             endbr64 
   4:   53                      push   %rbx
   5:   85 ff                   test   %edi,%edi
   7:   0f 8e bb 00 00 00       jle    c8 <AddDot1x4+0xc8>
   d:   47 8d 14 00             lea    (%r8,%r8,1),%r10d
  11:   48 63 d2                movslq %edx,%rdx
  14:   66 0f ef c9             pxor   %xmm1,%xmm1
  18:   47 8d 1c 02             lea    (%r10,%r8,1),%r11d
  1c:   66 0f 28 d1             movapd %xmm1,%xmm2
  20:   66 0f 28 d9             movapd %xmm1,%xmm3
  24:   4d 63 c0                movslq %r8d,%r8
  27:   48 8d 04 d5 00 00 00    lea    0x0(,%rdx,8),%rax
  2e:   00 
  2f:   8d 57 ff                lea    -0x1(%rdi),%edx
  32:   66 0f 28 e1             movapd %xmm1,%xmm4
  36:   49 63 fb                movslq %r11d,%rdi
  39:   48 8d 5c d1 08          lea    0x8(%rcx,%rdx,8),%rbx
  3e:   49 63 d2                movslq %r10d,%rdx
  41:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  48:   f2 0f 10 06             movsd  (%rsi),%xmm0
  4c:   f2 0f 10 29             movsd  (%rcx),%xmm5
  50:   48 01 c6                add    %rax,%rsi
  53:   f2 0f 59 e8             mulsd  %xmm0,%xmm5
  57:   f2 0f 58 e5             addsd  %xmm5,%xmm4
  5b:   f2 42 0f 10 2c c1       movsd  (%rcx,%r8,8),%xmm5
  61:   f2 0f 59 e8             mulsd  %xmm0,%xmm5
  65:   f2 0f 58 dd             addsd  %xmm5,%xmm3
  69:   f2 0f 10 2c d1          movsd  (%rcx,%rdx,8),%xmm5
  6e:   f2 0f 59 e8             mulsd  %xmm0,%xmm5
  72:   f2 0f 59 04 f9          mulsd  (%rcx,%rdi,8),%xmm0
  77:   48 83 c1 08             add    $0x8,%rcx
  7b:   f2 0f 58 d5             addsd  %xmm5,%xmm2
  7f:   f2 0f 58 c8             addsd  %xmm0,%xmm1
  83:   48 39 cb                cmp    %rcx,%rbx
  86:   75 c0                   jne    48 <AddDot1x4+0x48>
  88:   48 63 44 24 10          movslq 0x10(%rsp),%rax
  8d:   f2 41 0f 58 21          addsd  (%r9),%xmm4
  92:   48 8d 14 c5 00 00 00    lea    0x0(,%rax,8),%rdx
  99:   00 
  9a:   49 8d 04 11             lea    (%r9,%rdx,1),%rax
  9e:   f2 41 0f 11 21          movsd  %xmm4,(%r9)
  a3:   f2 0f 58 18             addsd  (%rax),%xmm3
  a7:   f2 0f 11 18             movsd  %xmm3,(%rax)
  ab:   48 01 d0                add    %rdx,%rax
  ae:   f2 0f 58 10             addsd  (%rax),%xmm2
  b2:   f2 0f 11 10             movsd  %xmm2,(%rax)
  b6:   48 01 d0                add    %rdx,%rax
  b9:   f2 0f 58 08             addsd  (%rax),%xmm1
  bd:   f2 0f 11 08             movsd  %xmm1,(%rax)
  c1:   5b                      pop    %rbx
  c2:   c3                      retq   
  c3:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  c8:   66 0f ef c9             pxor   %xmm1,%xmm1
  cc:   66 0f 28 d1             movapd %xmm1,%xmm2
  d0:   66 0f 28 d9             movapd %xmm1,%xmm3
  d4:   66 0f 28 e1             movapd %xmm1,%xmm4
  d8:   eb ae                   jmp    88 <AddDot1x4+0x88>
  da:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

00000000000000e0 <MY_MMult>:
  e0:   f3 0f 1e fa             endbr64 
  e4:   41 57                   push   %r15
  e6:   41 56                   push   %r14
  e8:   41 55                   push   %r13
  ea:   41 54                   push   %r12
  ec:   55                      push   %rbp
  ed:   53                      push   %rbx
  ee:   48 83 ec 38             sub    $0x38,%rsp
  f2:   89 7c 24 18             mov    %edi,0x18(%rsp)
  f6:   44 8b 6c 24 70          mov    0x70(%rsp),%r13d
  fb:   48 89 4c 24 28          mov    %rcx,0x28(%rsp)
 100:   44 8b b4 24 80 00 00    mov    0x80(%rsp),%r14d
 107:   00 
 108:   4c 89 4c 24 30          mov    %r9,0x30(%rsp)
 10d:   85 f6                   test   %esi,%esi
 10f:   0f 8e e2 00 00 00       jle    1f7 <MY_MMult+0x117>
 115:   42 8d 04 b5 00 00 00    lea    0x0(,%r14,4),%eax
 11c:   00 
 11d:   83 ee 01                sub    $0x1,%esi
 120:   89 d5                   mov    %edx,%ebp
 122:   45 89 c4                mov    %r8d,%r12d
 125:   89 44 24 1c             mov    %eax,0x1c(%rsp)
 129:   83 e6 fc                and    $0xfffffffc,%esi
 12c:   42 8d 04 ad 00 00 00    lea    0x0(,%r13,4),%eax
 133:   00 
 134:   89 44 24 20             mov    %eax,0x20(%rsp)
 138:   8d 46 04                lea    0x4(%rsi),%eax
 13b:   89 44 24 24             mov    %eax,0x24(%rsp)
 13f:   8d 47 ff                lea    -0x1(%rdi),%eax
 142:   48 8d 44 c1 08          lea    0x8(%rcx,%rax,8),%rax
 147:   c7 44 24 14 00 00 00    movl   $0x0,0x14(%rsp)
 14e:   00 
 14f:   48 89 04 24             mov    %rax,(%rsp)
 153:   c7 44 24 10 00 00 00    movl   $0x0,0x10(%rsp)
 15a:   00 
 15b:   c7 44 24 0c 00 00 00    movl   $0x0,0xc(%rsp)
 162:   00 
 163:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
 168:   8b 44 24 18             mov    0x18(%rsp),%eax
 16c:   85 c0                   test   %eax,%eax
 16e:   7e 64                   jle    1d4 <MY_MMult+0xf4>
 170:   48 63 44 24 14          movslq 0x14(%rsp),%rax
 175:   48 8b 5c 24 30          mov    0x30(%rsp),%rbx
 17a:   44 89 6c 24 70          mov    %r13d,0x70(%rsp)
 17f:   45 89 e5                mov    %r12d,%r13d
 182:   48 8b 74 24 78          mov    0x78(%rsp),%rsi
 187:   41 89 ec                mov    %ebp,%r12d
 18a:   4c 8b 7c 24 28          mov    0x28(%rsp),%r15
 18f:   48 8d 1c c3             lea    (%rbx,%rax,8),%rbx
 193:   48 63 44 24 10          movslq 0x10(%rsp),%rax
 198:   8b 6c 24 70             mov    0x70(%rsp),%ebp
 19c:   4c 8d 0c c6             lea    (%rsi,%rax,8),%r9
 1a0:   41 56                   push   %r14
 1a2:   44 89 ea                mov    %r13d,%edx
 1a5:   4c 89 fe                mov    %r15,%rsi
 1a8:   41 89 e8                mov    %ebp,%r8d
 1ab:   48 89 d9                mov    %rbx,%rcx
 1ae:   44 89 e7                mov    %r12d,%edi
 1b1:   49 83 c7 08             add    $0x8,%r15
 1b5:   e8 00 00 00 00          callq  1ba <MY_MMult+0xda>
 1ba:   5a                      pop    %rdx
 1bb:   49 83 c1 08             add    $0x8,%r9
 1bf:   4c 3b 3c 24             cmp    (%rsp),%r15
 1c3:   75 db                   jne    1a0 <MY_MMult+0xc0>
 1c5:   89 6c 24 70             mov    %ebp,0x70(%rsp)
 1c9:   44 89 e5                mov    %r12d,%ebp
 1cc:   45 89 ec                mov    %r13d,%r12d
 1cf:   44 8b 6c 24 70          mov    0x70(%rsp),%r13d
 1d4:   83 44 24 0c 04          addl   $0x4,0xc(%rsp)
 1d9:   8b 7c 24 1c             mov    0x1c(%rsp),%edi
 1dd:   8b 74 24 20             mov    0x20(%rsp),%esi
 1e1:   01 7c 24 10             add    %edi,0x10(%rsp)
 1e5:   01 74 24 14             add    %esi,0x14(%rsp)
 1e9:   8b 44 24 0c             mov    0xc(%rsp),%eax
 1ed:   3b 44 24 24             cmp    0x24(%rsp),%eax
 1f1:   0f 85 71 ff ff ff       jne    168 <MY_MMult+0x88>
 1f7:   48 83 c4 38             add    $0x38,%rsp
 1fb:   5b                      pop    %rbx
 1fc:   5d                      pop    %rbp
 1fd:   41 5c                   pop    %r12
 1ff:   41 5d                   pop    %r13
 201:   41 5e                   pop    %r14
 203:   41 5f                   pop    %r15
 205:   c3                      retq   
mrzhuzhe commented 1 year ago

./outputs/MMult7.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <AddDot1x4>:
   0:   f3 0f 1e fa             endbr64 
   4:   85 ff                   test   %edi,%edi
   6:   0f 8e 9c 00 00 00       jle    a8 <AddDot1x4+0xa8>
   c:   41 54                   push   %r12
   e:   83 ef 01                sub    $0x1,%edi
  11:   48 63 d2                movslq %edx,%rdx
  14:   55                      push   %rbp
  15:   4c 8d 64 f9 08          lea    0x8(%rcx,%rdi,8),%r12
  1a:   48 c1 e2 03             shl    $0x3,%rdx
  1e:   53                      push   %rbx
  1f:   48 63 44 24 20          movslq 0x20(%rsp),%rax
  24:   43 8d 1c 00             lea    (%r8,%r8,1),%ebx
  28:   42 8d 2c 03             lea    (%rbx,%r8,1),%ebp
  2c:   48 63 fb                movslq %ebx,%rdi
  2f:   4d 63 c0                movslq %r8d,%r8
  32:   48 c1 e0 03             shl    $0x3,%rax
  36:   48 63 dd                movslq %ebp,%rbx
  39:   4d 8d 1c 01             lea    (%r9,%rax,1),%r11
  3d:   4d 8d 14 03             lea    (%r11,%rax,1),%r10
  41:   4c 01 d0                add    %r10,%rax
  44:   0f 1f 40 00             nopl   0x0(%rax)
  48:   f2 0f 10 06             movsd  (%rsi),%xmm0
  4c:   f2 0f 10 09             movsd  (%rcx),%xmm1
  50:   48 01 d6                add    %rdx,%rsi
  53:   f2 0f 59 c8             mulsd  %xmm0,%xmm1
  57:   f2 41 0f 58 09          addsd  (%r9),%xmm1
  5c:   f2 41 0f 11 09          movsd  %xmm1,(%r9)
  61:   f2 42 0f 10 0c c1       movsd  (%rcx,%r8,8),%xmm1
  67:   f2 0f 59 c8             mulsd  %xmm0,%xmm1
  6b:   f2 41 0f 58 0b          addsd  (%r11),%xmm1
  70:   f2 41 0f 11 0b          movsd  %xmm1,(%r11)
  75:   f2 0f 10 0c f9          movsd  (%rcx,%rdi,8),%xmm1
  7a:   f2 0f 59 c8             mulsd  %xmm0,%xmm1
  7e:   f2 41 0f 58 0a          addsd  (%r10),%xmm1
  83:   f2 41 0f 11 0a          movsd  %xmm1,(%r10)
  88:   f2 0f 59 04 d9          mulsd  (%rcx,%rbx,8),%xmm0
  8d:   48 83 c1 08             add    $0x8,%rcx
  91:   f2 0f 58 00             addsd  (%rax),%xmm0
  95:   f2 0f 11 00             movsd  %xmm0,(%rax)
  99:   49 39 cc                cmp    %rcx,%r12
  9c:   75 aa                   jne    48 <AddDot1x4+0x48>
  9e:   5b                      pop    %rbx
  9f:   5d                      pop    %rbp
  a0:   41 5c                   pop    %r12
  a2:   c3                      retq   
  a3:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  a8:   c3                      retq   
  a9:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)

00000000000000b0 <MY_MMult>:
  b0:   f3 0f 1e fa             endbr64 
  b4:   41 57                   push   %r15
  b6:   41 56                   push   %r14
  b8:   41 55                   push   %r13
  ba:   41 54                   push   %r12
  bc:   55                      push   %rbp
  bd:   53                      push   %rbx
  be:   48 83 ec 38             sub    $0x38,%rsp
  c2:   89 7c 24 18             mov    %edi,0x18(%rsp)
  c6:   44 8b 6c 24 70          mov    0x70(%rsp),%r13d
  cb:   48 89 4c 24 28          mov    %rcx,0x28(%rsp)
  d0:   44 8b b4 24 80 00 00    mov    0x80(%rsp),%r14d
  d7:   00 
  d8:   4c 89 4c 24 30          mov    %r9,0x30(%rsp)
  dd:   85 f6                   test   %esi,%esi
  df:   0f 8e e2 00 00 00       jle    1c7 <MY_MMult+0x117>
  e5:   42 8d 04 b5 00 00 00    lea    0x0(,%r14,4),%eax
  ec:   00 
  ed:   83 ee 01                sub    $0x1,%esi
  f0:   89 d5                   mov    %edx,%ebp
  f2:   45 89 c4                mov    %r8d,%r12d
  f5:   89 44 24 1c             mov    %eax,0x1c(%rsp)
  f9:   83 e6 fc                and    $0xfffffffc,%esi
  fc:   42 8d 04 ad 00 00 00    lea    0x0(,%r13,4),%eax
 103:   00 
 104:   89 44 24 20             mov    %eax,0x20(%rsp)
 108:   8d 46 04                lea    0x4(%rsi),%eax
 10b:   89 44 24 24             mov    %eax,0x24(%rsp)
 10f:   8d 47 ff                lea    -0x1(%rdi),%eax
 112:   48 8d 44 c1 08          lea    0x8(%rcx,%rax,8),%rax
 117:   c7 44 24 14 00 00 00    movl   $0x0,0x14(%rsp)
 11e:   00 
 11f:   48 89 04 24             mov    %rax,(%rsp)
 123:   c7 44 24 10 00 00 00    movl   $0x0,0x10(%rsp)
 12a:   00 
 12b:   c7 44 24 0c 00 00 00    movl   $0x0,0xc(%rsp)
 132:   00 
 133:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
 138:   8b 44 24 18             mov    0x18(%rsp),%eax
 13c:   85 c0                   test   %eax,%eax
 13e:   7e 64                   jle    1a4 <MY_MMult+0xf4>
 140:   48 63 44 24 14          movslq 0x14(%rsp),%rax
 145:   48 8b 5c 24 30          mov    0x30(%rsp),%rbx
 14a:   44 89 6c 24 70          mov    %r13d,0x70(%rsp)
 14f:   45 89 e5                mov    %r12d,%r13d
 152:   48 8b 74 24 78          mov    0x78(%rsp),%rsi
 157:   41 89 ec                mov    %ebp,%r12d
 15a:   4c 8b 7c 24 28          mov    0x28(%rsp),%r15
 15f:   48 8d 1c c3             lea    (%rbx,%rax,8),%rbx
 163:   48 63 44 24 10          movslq 0x10(%rsp),%rax
 168:   8b 6c 24 70             mov    0x70(%rsp),%ebp
 16c:   4c 8d 0c c6             lea    (%rsi,%rax,8),%r9
 170:   41 56                   push   %r14
 172:   44 89 ea                mov    %r13d,%edx
 175:   4c 89 fe                mov    %r15,%rsi
 178:   41 89 e8                mov    %ebp,%r8d
 17b:   48 89 d9                mov    %rbx,%rcx
 17e:   44 89 e7                mov    %r12d,%edi
 181:   49 83 c7 08             add    $0x8,%r15
 185:   e8 00 00 00 00          callq  18a <MY_MMult+0xda>
 18a:   5a                      pop    %rdx
 18b:   49 83 c1 08             add    $0x8,%r9
 18f:   4c 3b 3c 24             cmp    (%rsp),%r15
 193:   75 db                   jne    170 <MY_MMult+0xc0>
 195:   89 6c 24 70             mov    %ebp,0x70(%rsp)
 199:   44 89 e5                mov    %r12d,%ebp
 19c:   45 89 ec                mov    %r13d,%r12d
 19f:   44 8b 6c 24 70          mov    0x70(%rsp),%r13d
 1a4:   83 44 24 0c 04          addl   $0x4,0xc(%rsp)
 1a9:   8b 7c 24 1c             mov    0x1c(%rsp),%edi
 1ad:   8b 74 24 20             mov    0x20(%rsp),%esi
 1b1:   01 7c 24 10             add    %edi,0x10(%rsp)
 1b5:   01 74 24 14             add    %esi,0x14(%rsp)
 1b9:   8b 44 24 0c             mov    0xc(%rsp),%eax
 1bd:   3b 44 24 24             cmp    0x24(%rsp),%eax
 1c1:   0f 85 71 ff ff ff       jne    138 <MY_MMult+0x88>
 1c7:   48 83 c4 38             add    $0x38,%rsp
 1cb:   5b                      pop    %rbx
 1cc:   5d                      pop    %rbp
 1cd:   41 5c                   pop    %r12
 1cf:   41 5d                   pop    %r13
 1d1:   41 5e                   pop    %r14
 1d3:   41 5f                   pop    %r15
 1d5:   c3                      retq