Open mrzhuzhe opened 1 year ago
浮点计算中，不同的相加路径（累加顺序）会得出略有区别的结果
检查一下asm
./outputs/MMult7_bad.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <AddDot1x4>:
0: f3 0f 1e fa endbr64
4: 53 push %rbx
5: 85 ff test %edi,%edi
7: 0f 8e bb 00 00 00 jle c8 <AddDot1x4+0xc8>
d: 47 8d 14 00 lea (%r8,%r8,1),%r10d
11: 48 63 d2 movslq %edx,%rdx
14: 66 0f ef c9 pxor %xmm1,%xmm1
18: 47 8d 1c 02 lea (%r10,%r8,1),%r11d
1c: 66 0f 28 d1 movapd %xmm1,%xmm2
20: 66 0f 28 d9 movapd %xmm1,%xmm3
24: 4d 63 c0 movslq %r8d,%r8
27: 48 8d 04 d5 00 00 00 lea 0x0(,%rdx,8),%rax
2e: 00
2f: 8d 57 ff lea -0x1(%rdi),%edx
32: 66 0f 28 e1 movapd %xmm1,%xmm4
36: 49 63 fb movslq %r11d,%rdi
39: 48 8d 5c d1 08 lea 0x8(%rcx,%rdx,8),%rbx
3e: 49 63 d2 movslq %r10d,%rdx
41: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
48: f2 0f 10 06 movsd (%rsi),%xmm0
4c: f2 0f 10 29 movsd (%rcx),%xmm5
50: 48 01 c6 add %rax,%rsi
53: f2 0f 59 e8 mulsd %xmm0,%xmm5
57: f2 0f 58 e5 addsd %xmm5,%xmm4
5b: f2 42 0f 10 2c c1 movsd (%rcx,%r8,8),%xmm5
61: f2 0f 59 e8 mulsd %xmm0,%xmm5
65: f2 0f 58 dd addsd %xmm5,%xmm3
69: f2 0f 10 2c d1 movsd (%rcx,%rdx,8),%xmm5
6e: f2 0f 59 e8 mulsd %xmm0,%xmm5
72: f2 0f 59 04 f9 mulsd (%rcx,%rdi,8),%xmm0
77: 48 83 c1 08 add $0x8,%rcx
7b: f2 0f 58 d5 addsd %xmm5,%xmm2
7f: f2 0f 58 c8 addsd %xmm0,%xmm1
83: 48 39 cb cmp %rcx,%rbx
86: 75 c0 jne 48 <AddDot1x4+0x48>
88: 48 63 44 24 10 movslq 0x10(%rsp),%rax
8d: f2 41 0f 58 21 addsd (%r9),%xmm4
92: 48 8d 14 c5 00 00 00 lea 0x0(,%rax,8),%rdx
99: 00
9a: 49 8d 04 11 lea (%r9,%rdx,1),%rax
9e: f2 41 0f 11 21 movsd %xmm4,(%r9)
a3: f2 0f 58 18 addsd (%rax),%xmm3
a7: f2 0f 11 18 movsd %xmm3,(%rax)
ab: 48 01 d0 add %rdx,%rax
ae: f2 0f 58 10 addsd (%rax),%xmm2
b2: f2 0f 11 10 movsd %xmm2,(%rax)
b6: 48 01 d0 add %rdx,%rax
b9: f2 0f 58 08 addsd (%rax),%xmm1
bd: f2 0f 11 08 movsd %xmm1,(%rax)
c1: 5b pop %rbx
c2: c3 retq
c3: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
c8: 66 0f ef c9 pxor %xmm1,%xmm1
cc: 66 0f 28 d1 movapd %xmm1,%xmm2
d0: 66 0f 28 d9 movapd %xmm1,%xmm3
d4: 66 0f 28 e1 movapd %xmm1,%xmm4
d8: eb ae jmp 88 <AddDot1x4+0x88>
da: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
00000000000000e0 <MY_MMult>:
e0: f3 0f 1e fa endbr64
e4: 41 57 push %r15
e6: 41 56 push %r14
e8: 41 55 push %r13
ea: 41 54 push %r12
ec: 55 push %rbp
ed: 53 push %rbx
ee: 48 83 ec 38 sub $0x38,%rsp
f2: 89 7c 24 18 mov %edi,0x18(%rsp)
f6: 44 8b 6c 24 70 mov 0x70(%rsp),%r13d
fb: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
100: 44 8b b4 24 80 00 00 mov 0x80(%rsp),%r14d
107: 00
108: 4c 89 4c 24 30 mov %r9,0x30(%rsp)
10d: 85 f6 test %esi,%esi
10f: 0f 8e e2 00 00 00 jle 1f7 <MY_MMult+0x117>
115: 42 8d 04 b5 00 00 00 lea 0x0(,%r14,4),%eax
11c: 00
11d: 83 ee 01 sub $0x1,%esi
120: 89 d5 mov %edx,%ebp
122: 45 89 c4 mov %r8d,%r12d
125: 89 44 24 1c mov %eax,0x1c(%rsp)
129: 83 e6 fc and $0xfffffffc,%esi
12c: 42 8d 04 ad 00 00 00 lea 0x0(,%r13,4),%eax
133: 00
134: 89 44 24 20 mov %eax,0x20(%rsp)
138: 8d 46 04 lea 0x4(%rsi),%eax
13b: 89 44 24 24 mov %eax,0x24(%rsp)
13f: 8d 47 ff lea -0x1(%rdi),%eax
142: 48 8d 44 c1 08 lea 0x8(%rcx,%rax,8),%rax
147: c7 44 24 14 00 00 00 movl $0x0,0x14(%rsp)
14e: 00
14f: 48 89 04 24 mov %rax,(%rsp)
153: c7 44 24 10 00 00 00 movl $0x0,0x10(%rsp)
15a: 00
15b: c7 44 24 0c 00 00 00 movl $0x0,0xc(%rsp)
162: 00
163: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
168: 8b 44 24 18 mov 0x18(%rsp),%eax
16c: 85 c0 test %eax,%eax
16e: 7e 64 jle 1d4 <MY_MMult+0xf4>
170: 48 63 44 24 14 movslq 0x14(%rsp),%rax
175: 48 8b 5c 24 30 mov 0x30(%rsp),%rbx
17a: 44 89 6c 24 70 mov %r13d,0x70(%rsp)
17f: 45 89 e5 mov %r12d,%r13d
182: 48 8b 74 24 78 mov 0x78(%rsp),%rsi
187: 41 89 ec mov %ebp,%r12d
18a: 4c 8b 7c 24 28 mov 0x28(%rsp),%r15
18f: 48 8d 1c c3 lea (%rbx,%rax,8),%rbx
193: 48 63 44 24 10 movslq 0x10(%rsp),%rax
198: 8b 6c 24 70 mov 0x70(%rsp),%ebp
19c: 4c 8d 0c c6 lea (%rsi,%rax,8),%r9
1a0: 41 56 push %r14
1a2: 44 89 ea mov %r13d,%edx
1a5: 4c 89 fe mov %r15,%rsi
1a8: 41 89 e8 mov %ebp,%r8d
1ab: 48 89 d9 mov %rbx,%rcx
1ae: 44 89 e7 mov %r12d,%edi
1b1: 49 83 c7 08 add $0x8,%r15
1b5: e8 00 00 00 00 callq 1ba <MY_MMult+0xda>
1ba: 5a pop %rdx
1bb: 49 83 c1 08 add $0x8,%r9
1bf: 4c 3b 3c 24 cmp (%rsp),%r15
1c3: 75 db jne 1a0 <MY_MMult+0xc0>
1c5: 89 6c 24 70 mov %ebp,0x70(%rsp)
1c9: 44 89 e5 mov %r12d,%ebp
1cc: 45 89 ec mov %r13d,%r12d
1cf: 44 8b 6c 24 70 mov 0x70(%rsp),%r13d
1d4: 83 44 24 0c 04 addl $0x4,0xc(%rsp)
1d9: 8b 7c 24 1c mov 0x1c(%rsp),%edi
1dd: 8b 74 24 20 mov 0x20(%rsp),%esi
1e1: 01 7c 24 10 add %edi,0x10(%rsp)
1e5: 01 74 24 14 add %esi,0x14(%rsp)
1e9: 8b 44 24 0c mov 0xc(%rsp),%eax
1ed: 3b 44 24 24 cmp 0x24(%rsp),%eax
1f1: 0f 85 71 ff ff ff jne 168 <MY_MMult+0x88>
1f7: 48 83 c4 38 add $0x38,%rsp
1fb: 5b pop %rbx
1fc: 5d pop %rbp
1fd: 41 5c pop %r12
1ff: 41 5d pop %r13
201: 41 5e pop %r14
203: 41 5f pop %r15
205: c3 retq
./outputs/MMult7.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <AddDot1x4>:
0: f3 0f 1e fa endbr64
4: 85 ff test %edi,%edi
6: 0f 8e 9c 00 00 00 jle a8 <AddDot1x4+0xa8>
c: 41 54 push %r12
e: 83 ef 01 sub $0x1,%edi
11: 48 63 d2 movslq %edx,%rdx
14: 55 push %rbp
15: 4c 8d 64 f9 08 lea 0x8(%rcx,%rdi,8),%r12
1a: 48 c1 e2 03 shl $0x3,%rdx
1e: 53 push %rbx
1f: 48 63 44 24 20 movslq 0x20(%rsp),%rax
24: 43 8d 1c 00 lea (%r8,%r8,1),%ebx
28: 42 8d 2c 03 lea (%rbx,%r8,1),%ebp
2c: 48 63 fb movslq %ebx,%rdi
2f: 4d 63 c0 movslq %r8d,%r8
32: 48 c1 e0 03 shl $0x3,%rax
36: 48 63 dd movslq %ebp,%rbx
39: 4d 8d 1c 01 lea (%r9,%rax,1),%r11
3d: 4d 8d 14 03 lea (%r11,%rax,1),%r10
41: 4c 01 d0 add %r10,%rax
44: 0f 1f 40 00 nopl 0x0(%rax)
48: f2 0f 10 06 movsd (%rsi),%xmm0
4c: f2 0f 10 09 movsd (%rcx),%xmm1
50: 48 01 d6 add %rdx,%rsi
53: f2 0f 59 c8 mulsd %xmm0,%xmm1
57: f2 41 0f 58 09 addsd (%r9),%xmm1
5c: f2 41 0f 11 09 movsd %xmm1,(%r9)
61: f2 42 0f 10 0c c1 movsd (%rcx,%r8,8),%xmm1
67: f2 0f 59 c8 mulsd %xmm0,%xmm1
6b: f2 41 0f 58 0b addsd (%r11),%xmm1
70: f2 41 0f 11 0b movsd %xmm1,(%r11)
75: f2 0f 10 0c f9 movsd (%rcx,%rdi,8),%xmm1
7a: f2 0f 59 c8 mulsd %xmm0,%xmm1
7e: f2 41 0f 58 0a addsd (%r10),%xmm1
83: f2 41 0f 11 0a movsd %xmm1,(%r10)
88: f2 0f 59 04 d9 mulsd (%rcx,%rbx,8),%xmm0
8d: 48 83 c1 08 add $0x8,%rcx
91: f2 0f 58 00 addsd (%rax),%xmm0
95: f2 0f 11 00 movsd %xmm0,(%rax)
99: 49 39 cc cmp %rcx,%r12
9c: 75 aa jne 48 <AddDot1x4+0x48>
9e: 5b pop %rbx
9f: 5d pop %rbp
a0: 41 5c pop %r12
a2: c3 retq
a3: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
a8: c3 retq
a9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
00000000000000b0 <MY_MMult>:
b0: f3 0f 1e fa endbr64
b4: 41 57 push %r15
b6: 41 56 push %r14
b8: 41 55 push %r13
ba: 41 54 push %r12
bc: 55 push %rbp
bd: 53 push %rbx
be: 48 83 ec 38 sub $0x38,%rsp
c2: 89 7c 24 18 mov %edi,0x18(%rsp)
c6: 44 8b 6c 24 70 mov 0x70(%rsp),%r13d
cb: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
d0: 44 8b b4 24 80 00 00 mov 0x80(%rsp),%r14d
d7: 00
d8: 4c 89 4c 24 30 mov %r9,0x30(%rsp)
dd: 85 f6 test %esi,%esi
df: 0f 8e e2 00 00 00 jle 1c7 <MY_MMult+0x117>
e5: 42 8d 04 b5 00 00 00 lea 0x0(,%r14,4),%eax
ec: 00
ed: 83 ee 01 sub $0x1,%esi
f0: 89 d5 mov %edx,%ebp
f2: 45 89 c4 mov %r8d,%r12d
f5: 89 44 24 1c mov %eax,0x1c(%rsp)
f9: 83 e6 fc and $0xfffffffc,%esi
fc: 42 8d 04 ad 00 00 00 lea 0x0(,%r13,4),%eax
103: 00
104: 89 44 24 20 mov %eax,0x20(%rsp)
108: 8d 46 04 lea 0x4(%rsi),%eax
10b: 89 44 24 24 mov %eax,0x24(%rsp)
10f: 8d 47 ff lea -0x1(%rdi),%eax
112: 48 8d 44 c1 08 lea 0x8(%rcx,%rax,8),%rax
117: c7 44 24 14 00 00 00 movl $0x0,0x14(%rsp)
11e: 00
11f: 48 89 04 24 mov %rax,(%rsp)
123: c7 44 24 10 00 00 00 movl $0x0,0x10(%rsp)
12a: 00
12b: c7 44 24 0c 00 00 00 movl $0x0,0xc(%rsp)
132: 00
133: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
138: 8b 44 24 18 mov 0x18(%rsp),%eax
13c: 85 c0 test %eax,%eax
13e: 7e 64 jle 1a4 <MY_MMult+0xf4>
140: 48 63 44 24 14 movslq 0x14(%rsp),%rax
145: 48 8b 5c 24 30 mov 0x30(%rsp),%rbx
14a: 44 89 6c 24 70 mov %r13d,0x70(%rsp)
14f: 45 89 e5 mov %r12d,%r13d
152: 48 8b 74 24 78 mov 0x78(%rsp),%rsi
157: 41 89 ec mov %ebp,%r12d
15a: 4c 8b 7c 24 28 mov 0x28(%rsp),%r15
15f: 48 8d 1c c3 lea (%rbx,%rax,8),%rbx
163: 48 63 44 24 10 movslq 0x10(%rsp),%rax
168: 8b 6c 24 70 mov 0x70(%rsp),%ebp
16c: 4c 8d 0c c6 lea (%rsi,%rax,8),%r9
170: 41 56 push %r14
172: 44 89 ea mov %r13d,%edx
175: 4c 89 fe mov %r15,%rsi
178: 41 89 e8 mov %ebp,%r8d
17b: 48 89 d9 mov %rbx,%rcx
17e: 44 89 e7 mov %r12d,%edi
181: 49 83 c7 08 add $0x8,%r15
185: e8 00 00 00 00 callq 18a <MY_MMult+0xda>
18a: 5a pop %rdx
18b: 49 83 c1 08 add $0x8,%r9
18f: 4c 3b 3c 24 cmp (%rsp),%r15
193: 75 db jne 170 <MY_MMult+0xc0>
195: 89 6c 24 70 mov %ebp,0x70(%rsp)
199: 44 89 e5 mov %r12d,%ebp
19c: 45 89 ec mov %r13d,%r12d
19f: 44 8b 6c 24 70 mov 0x70(%rsp),%r13d
1a4: 83 44 24 0c 04 addl $0x4,0xc(%rsp)
1a9: 8b 7c 24 1c mov 0x1c(%rsp),%edi
1ad: 8b 74 24 20 mov 0x20(%rsp),%esi
1b1: 01 7c 24 10 add %edi,0x10(%rsp)
1b5: 01 74 24 14 add %esi,0x14(%rsp)
1b9: 8b 44 24 0c mov 0xc(%rsp),%eax
1bd: 3b 44 24 24 cmp 0x24(%rsp),%eax
1c1: 0f 85 71 ff ff ff jne 138 <MY_MMult+0x88>
1c7: 48 83 c4 38 add $0x38,%rsp
1cb: 5b pop %rbx
1cc: 5d pop %rbp
1cd: 41 5c pop %r12
1cf: 41 5d pop %r13
1d1: 41 5e pop %r14
1d3: 41 5f pop %r15
1d5: c3 retq
代码:
代码来源于
可以cd 到 gemm/src/ 下 make 即可
不用中间变量就不会有这个问题:
与之对比的代码 Bad case
/*
 * AddDot1x4 ("bad case" variant): compute four inner products between one
 * row of A and four columns of B, and add each of them to the matching
 * element of C.
 *
 * This variant accumulates the four sums in local variables and adds them to
 * C only once, after the loop.  The variant it is compared against adds every
 * product straight into C(0, j) inside the loop.  Floating-point addition is
 * not associative, so the two orders of summation can round differently and
 * the results may differ in the last few bits.
 *
 *   k    number of terms in each inner product (loop trip count)
 *   a    pointer to the current position in A, leading dimension lda
 *   b    pointer to the current position in B, leading dimension ldb
 *   c    pointer to the current position in C, leading dimension ldc
 *
 * A(i, j), B(i, j) and C(i, j) are indexing macros over a/lda, b/ldb and
 * c/ldc that are defined elsewhere in the source file.
 * NOTE(review): the macro definitions are not visible here; the generated
 * code is consistent with column-major indexing -- confirm against the file.
 */
void AddDot1x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc )
{
  /* The `register` keyword is only a hint that these should live in registers
     (see https://zh.wikipedia.org/zh-hk/%E5%AF%84%E5%AD%98%E5%99%A8). */
  register double reg_a_0 = 0.0, reg_a_1 = 0.0, reg_a_2 = 0.0, reg_a_3 = 0.0;

  /* Current element A(0, p), reused for all four columns of B. */
  double reg_a;

  /* Point to the current elements in the four columns of B */
  double *bp0_pntr, *bp1_pntr, *bp2_pntr, *bp3_pntr;

  for ( int p = 0; p < k; p++ ) {
    reg_a = A( 0, p );

    bp0_pntr = &B( p, 0 );
    bp1_pntr = &B( p, 1 );
    bp2_pntr = &B( p, 2 );
    bp3_pntr = &B( p, 3 );

    /* Accumulate into the local sums, NOT into C: this is what makes the
       summation order differ from the in-place version. */
    reg_a_0 += reg_a * *bp0_pntr;
    reg_a_1 += reg_a * *bp1_pntr;
    reg_a_2 += reg_a * *bp2_pntr;
    reg_a_3 += reg_a * *bp3_pntr;
  }

  /* Single update of C per column: C(0, j) + (sum of all k products). */
  C( 0, 0 ) += reg_a_0;
  C( 0, 1 ) += reg_a_1;
  C( 0, 2 ) += reg_a_2;
  C( 0, 3 ) += reg_a_3;
}
Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian Address sizes: 39 bits physical, 48 bits virtual CPU(s): 16 On-line CPU(s) list: 0-15 Thread(s) per core: 2 Core(s) per socket: 8 Socket(s): 1 NUMA node(s): 1 Vendor ID: GenuineIntel CPU family: 6 Model: 167 Model name: 11th Gen Intel(R) Core(TM) i9-11900 @ 2.50GHz Stepping: 1 CPU MHz: 2500.000 CPU max MHz: 5200.0000 CPU min MHz: 800.0000 BogoMIPS: 4992.00 Virtualization: VT-x L1d cache: 384 KiB L1i cache: 256 KiB L2 cache: 4 MiB L3 cache: 16 MiB NUMA node0 CPU(s): 0-15 Vulnerability Itlb multihit: Not affected Vulnerability L1tf: Not affected Vulnerability Mds: Not affected Vulnerability Meltdown: Not affected Vulnerability Mmio stale data: Mitigation; Clear CPU buffers; SMT vulnerable Vulnerability Retbleed: Mitigation; Enhanced IBRS Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Not affected Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx avx512f avx512dq rdseed adx smap avx512ifma clflushopt intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts hwp
hwp_notify hwp_act_window hwp_epp hwp_pkg_req avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm md_clear flush_l1d arch_capabilities
version = 'MMult7_bad'; MY_MMult = [ 40 8.533333e+00 2.664535e-15 80 8.325203e+00 7.105427e-15 120 8.793893e+00 1.065814e-14 160 8.808602e+00 1.421085e-14 200 8.757526e+00 1.421085e-14 240 8.807901e+00 1.776357e-14 280 8.801925e+00 1.776357e-14 320 8.784987e+00 2.842171e-14 360 8.741990e+00 2.309264e-14 400 8.673850e+00 2.842171e-14 440 8.766943e+00 3.197442e-14 480 8.664708e+00 3.907985e-14 520 8.537478e+00 3.907985e-14 560 8.408714e+00 4.618528e-14 600 8.279986e+00 4.263256e-14 640 8.222449e+00 4.440892e-14 680 8.137895e+00 5.329071e-14 720 8.238834e+00 4.973799e-14 760 8.274526e+00 4.973799e-14 800 8.087957e+00 4.973799e-14 ];