RRZE-HPC / OSACA

Open Source Architecture Code Analyzer
GNU Affero General Public License v3.0
296 stars 18 forks source link

osaca not terminating #72

Closed jdomke closed 3 years ago

jdomke commented 3 years ago

Hi, I have an asm sequence which forces osaca into a livelock. I tried running with python -m trace to find out what's wrong, but had no luck pinpointing the issue without deep knowledge about the osaca internals, so I have to hand it off to you.

The problem exists in osaca-0.3.14 and osaca-0.4.0 (for latter: after fixing the parser bug at semantics/kernel_dg.py line 316 which prevents using hex for callq and other instructions).

My call: python3 -m osaca --arch BDW --ignore-unknown --verbose /dev/shm/osaca_43305_556_575.s

⇒  cat /dev/shm/osaca_43305_556_575.s
# OSACA-BEGIN
# Reproducer kernel for OSACA issue #72 (analysis appeared not to terminate).
# AT&T syntax, x86-64 SysV. This is a straight-line disassembly excerpt,
# not a complete function: NOTE(review) r12-r15/rbp are pushed below but
# never popped before the final retq, and %r13/%rbp/%r14d/%r12d written
# here are never read again within this window.

# Save callee-saved registers and shuffle arguments, then call out.
push   %r12
push   %r13
push   %r14
push   %r15
push   %rbp
mov    %ecx,%r12d
mov    %esi,%r14d
mov    %r12d,%ecx
mov    %r14d,%esi
mov    %rdx,%r13
mov    %rdi,%rbp
callq  0x4210d0 #<x264_pixel_satd_8x4> (File Offset:  0x210d0)
# Gather phase: movzbl + three vpinsrb per vector assemble 4 bytes from
# two strided byte arrays (base %rdi with stride %rsi, base %r8 with
# stride %rcx; %rdx/%rax hold 3*stride), then vpmovzxbd widens to dwords
# and vpsubd takes element-wise differences. Presumably pixel differences
# feeding a 4x4 transform (cf. the x264 SATD context) — TODO confirm.
mov    %rdx,%r8
movzbl (%rdi),%r9d
movslq %esi,%rsi
movslq %ecx,%rcx
movzbl (%r8),%r10d
vmovd  %r9d,%xmm13
movzbl 0x4(%r8),%r9d
vpinsrb $0x1,(%rsi,%rdi,1),%xmm13,%xmm14
lea    (%rsi,%rsi,2),%rdx
vmovd  %r10d,%xmm1
vpinsrb $0x1,(%rcx,%r8,1),%xmm1,%xmm0
vmovd  %r9d,%xmm7
vpinsrb $0x1,0x4(%rcx,%r8,1),%xmm7,%xmm5
vpinsrb $0x2,(%rdi,%rsi,2),%xmm14,%xmm15
vpinsrb $0x2,(%r8,%rcx,2),%xmm0,%xmm6
vpinsrb $0x2,0x4(%r8,%rcx,2),%xmm5,%xmm9
vpinsrb $0x3,(%rdx,%rdi,1),%xmm15,%xmm4
movzbl 0x4(%rdi),%r11d
lea    (%rcx,%rcx,2),%rax
vpinsrb $0x3,(%rax,%r8,1),%xmm6,%xmm10
vpinsrb $0x3,0x4(%rax,%r8,1),%xmm9,%xmm11
vmovd  %r11d,%xmm2
vpinsrb $0x1,0x4(%rsi,%rdi,1),%xmm2,%xmm8
vpinsrb $0x2,0x4(%rdi,%rsi,2),%xmm8,%xmm3
movzbl 0x1(%rdi),%r10d
movzbl 0x5(%rdi),%r9d
movzbl 0x1(%r8),%r11d
vmovd  %r10d,%xmm1
movzbl 0x5(%r8),%r10d
vmovd  %r9d,%xmm7
vpmovzxbd %xmm4,%xmm4
vmovd  %r11d,%xmm2
vpmovzxbd %xmm10,%xmm10
vpinsrb $0x3,0x4(%rdx,%rdi,1),%xmm3,%xmm12
vpsubd %xmm10,%xmm4,%xmm14
vpinsrb $0x1,0x5(%rsi,%rdi,1),%xmm7,%xmm5
vmovd  %r10d,%xmm4
vpinsrb $0x1,0x5(%rcx,%r8,1),%xmm4,%xmm10
vpinsrb $0x1,0x1(%rcx,%r8,1),%xmm2,%xmm8
vpinsrb $0x1,0x1(%rsi,%rdi,1),%xmm1,%xmm0
vpinsrb $0x2,0x5(%rdi,%rsi,2),%xmm5,%xmm9
vpinsrb $0x2,0x1(%r8,%rcx,2),%xmm8,%xmm3
vpinsrb $0x2,0x1(%rdi,%rsi,2),%xmm0,%xmm6
vpmovzxbd %xmm12,%xmm12
vpmovzxbd %xmm11,%xmm11
# Interleaved with further gathers: vpslld $0x10 packs one difference
# into the upper 16 bits before adding, i.e. two 16-bit lanes are carried
# per 32-bit element — a common SATD packing trick; verify against x264.
vpsubd %xmm11,%xmm12,%xmm13
vpinsrb $0x2,0x5(%r8,%rcx,2),%xmm10,%xmm11
vpslld $0x10,%xmm13,%xmm15
vpinsrb $0x3,0x1(%rdx,%rdi,1),%xmm6,%xmm13
vpaddd %xmm15,%xmm14,%xmm12
vpinsrb $0x3,0x5(%rdx,%rdi,1),%xmm9,%xmm15
vpinsrb $0x3,0x1(%rax,%r8,1),%xmm3,%xmm14
vpinsrb $0x3,0x5(%rax,%r8,1),%xmm11,%xmm1
movzbl 0x2(%rdi),%r11d
movzbl 0x2(%r8),%r9d
vpmovzxbd %xmm15,%xmm15
vmovd  %r11d,%xmm8
vmovd  %r9d,%xmm5
vpinsrb $0x1,0x2(%rsi,%rdi,1),%xmm8,%xmm3
vpinsrb $0x1,0x2(%rcx,%r8,1),%xmm5,%xmm9
vpinsrb $0x2,0x2(%rdi,%rsi,2),%xmm3,%xmm7
vpinsrb $0x2,0x2(%r8,%rcx,2),%xmm9,%xmm4
vpinsrb $0x3,0x2(%rdx,%rdi,1),%xmm7,%xmm3
vpinsrb $0x3,0x2(%rax,%r8,1),%xmm4,%xmm7
vpmovzxbd %xmm1,%xmm1
movzbl 0x6(%r8),%r11d
vpsubd %xmm1,%xmm15,%xmm0
vpmovzxbd %xmm13,%xmm13
vpslld $0x10,%xmm0,%xmm2
vpmovzxbd %xmm14,%xmm14
vpsubd %xmm14,%xmm13,%xmm6
vpaddd %xmm2,%xmm6,%xmm11
vmovd  %r11d,%xmm6
vpinsrb $0x1,0x6(%rcx,%r8,1),%xmm6,%xmm2
movzbl 0x6(%rdi),%r10d
vpinsrb $0x2,0x6(%r8,%rcx,2),%xmm2,%xmm8
vmovd  %r10d,%xmm10
vpinsrb $0x1,0x6(%rsi,%rdi,1),%xmm10,%xmm1
vpinsrb $0x3,0x6(%rax,%r8,1),%xmm8,%xmm9
vpinsrb $0x2,0x6(%rdi,%rsi,2),%xmm1,%xmm0
movzbl 0x3(%rdi),%r9d
movzbl 0x7(%rdi),%r11d
vpmovzxbd %xmm3,%xmm3
vpmovzxbd %xmm7,%xmm7
vmovd  %r9d,%xmm14
vmovd  %r11d,%xmm8
vpsubd %xmm7,%xmm3,%xmm10
vpinsrb $0x1,0x3(%rsi,%rdi,1),%xmm14,%xmm15
vpinsrb $0x1,0x7(%rsi,%rdi,1),%xmm8,%xmm3
vpinsrb $0x3,0x6(%rdx,%rdi,1),%xmm0,%xmm5
vpinsrb $0x2,0x3(%rdi,%rsi,2),%xmm15,%xmm1
vpinsrb $0x2,0x7(%rdi,%rsi,2),%xmm3,%xmm7
vpaddd %xmm11,%xmm12,%xmm3
vpmovzxbd %xmm5,%xmm5
vpmovzxbd %xmm9,%xmm9
vpsubd %xmm9,%xmm5,%xmm4
vpslld $0x10,%xmm4,%xmm13
vpinsrb $0x3,0x7(%rdx,%rdi,1),%xmm7,%xmm15
vpaddd %xmm13,%xmm10,%xmm10
vpinsrb $0x3,0x3(%rdx,%rdi,1),%xmm1,%xmm13
movzbl 0x7(%r8),%edx
movzbl 0x3(%r8),%r10d
vpmovzxbd %xmm15,%xmm15
vmovd  %edx,%xmm5
vpinsrb $0x1,0x7(%rcx,%r8,1),%xmm5,%xmm9
vmovd  %r10d,%xmm0
vpinsrb $0x1,0x3(%rcx,%r8,1),%xmm0,%xmm6
vpinsrb $0x2,0x7(%r8,%rcx,2),%xmm9,%xmm4
vpinsrb $0x2,0x3(%r8,%rcx,2),%xmm6,%xmm2
vpinsrb $0x3,0x7(%rax,%r8,1),%xmm4,%xmm1
vpinsrb $0x3,0x3(%rax,%r8,1),%xmm2,%xmm14
vpmovzxbd %xmm1,%xmm1
vpmovzxbd %xmm13,%xmm13
vpsubd %xmm1,%xmm15,%xmm0
vpmovzxbd %xmm14,%xmm14
vpslld $0x10,%xmm0,%xmm2
vpsubd %xmm14,%xmm13,%xmm6
# Butterfly stage: pairwise vpaddd/vpsubd followed by vunpck{l,h}p{s,d}
# transposes — the add/sub-then-transpose pattern of a Hadamard-style
# transform (NOTE(review): consistent with SATD, confirm against x264).
vpsubd %xmm11,%xmm12,%xmm1
vpaddd %xmm2,%xmm6,%xmm8
vpaddd %xmm8,%xmm10,%xmm12
vpsubd %xmm8,%xmm10,%xmm0
vpaddd %xmm12,%xmm3,%xmm8
vpaddd %xmm0,%xmm1,%xmm7
vpsubd %xmm12,%xmm3,%xmm3
vpsubd %xmm0,%xmm1,%xmm5
vunpcklps %xmm7,%xmm8,%xmm6
vunpcklps %xmm5,%xmm3,%xmm2
vunpckhps %xmm7,%xmm8,%xmm9
vunpckhps %xmm5,%xmm3,%xmm4
vunpcklpd %xmm2,%xmm6,%xmm10
vunpckhpd %xmm2,%xmm6,%xmm11
vunpcklpd %xmm4,%xmm9,%xmm12
vpaddd %xmm11,%xmm10,%xmm14
vunpckhpd %xmm4,%xmm9,%xmm13
vpsubd %xmm11,%xmm10,%xmm1
vpaddd %xmm13,%xmm12,%xmm15
vpsubd %xmm13,%xmm12,%xmm0
vpaddd %xmm15,%xmm14,%xmm9
vpaddd %xmm0,%xmm1,%xmm7
vpsubd %xmm15,%xmm14,%xmm8
vpsubd %xmm0,%xmm1,%xmm6
# Absolute-value fixup using RIP-relative constant masks: shift in the
# sign bit (vpsrld $0xf), mask (vpand), multiply (vpmulld), add and xor —
# a branchless per-16-bit-lane abs; exact constants live at the two
# addresses below and are not visible here.
vmovdqu 0x279d68(%rip),%xmm15        # 0x69b170 #<hpel_ref1+0x130> (File Offset:  0x29b170)
vpsrld $0xf,%xmm9,%xmm2
vpsrld $0xf,%xmm7,%xmm10
vpand  %xmm15,%xmm2,%xmm3
vmovdqu 0x279d40(%rip),%xmm4        # 0x69b160 #<hpel_ref1+0x120> (File Offset:  0x29b160)
vpand  %xmm15,%xmm10,%xmm11
vpsrld $0xf,%xmm8,%xmm12
vpsrld $0xf,%xmm6,%xmm14
vpmulld %xmm3,%xmm4,%xmm5
vpand  %xmm15,%xmm12,%xmm13
vpmulld %xmm11,%xmm4,%xmm3
vpand  %xmm15,%xmm14,%xmm1
vpmulld %xmm13,%xmm4,%xmm2
vpaddd %xmm3,%xmm7,%xmm7
vpmulld %xmm1,%xmm4,%xmm0
vpaddd %xmm5,%xmm9,%xmm4
vpxor  %xmm5,%xmm4,%xmm5
vpxor  %xmm3,%xmm7,%xmm9
vpaddd %xmm2,%xmm8,%xmm8
vpaddd %xmm9,%xmm5,%xmm3
vpxor  %xmm2,%xmm8,%xmm2
vpaddd %xmm0,%xmm6,%xmm6
vpaddd %xmm2,%xmm3,%xmm4
vpxor  %xmm0,%xmm6,%xmm0
vpaddd %xmm0,%xmm4,%xmm2
# Horizontal reduction: zero an accumulator, add, then fold the upper
# 64 bits (vpsrldq $0x8) and the upper dword (vpsrlq $0x20) into lane 0.
vpxor  %xmm1,%xmm1,%xmm1
vpaddd %xmm2,%xmm1,%xmm1
vpsrldq $0x8,%xmm1,%xmm3
vpaddd %xmm3,%xmm1,%xmm5
vpsrlq $0x20,%xmm5,%xmm6
vpaddd %xmm6,%xmm5,%xmm7
# Scalar tail: sum the two packed 16-bit halves and halve the result.
# The maintainer's comment below identifies these last instructions as
# the trigger: writing %ecx here creates a (false, since this is not a
# loop) loop-carried dependency back to the %ecx uses above, exploding
# the dependency-path enumeration.
vmovd  %xmm7,%ecx
movzwl %cx,%eax
shr    $0x10,%ecx
add    %ecx,%eax
shr    %eax
retq
# OSACA-END
JanLJL commented 3 years ago

Hi Jens, thanks for the issue and also for the bugfix in the new version ;) I could reproduce the error and it happens in the NetworkX module function for calculating the dependency paths throughout the kernel, so it might be a bug in their code. I will have a deeper look at it over the weekend and will try to come up with a fix!

JanLJL commented 3 years ago

Hi @jdomke, I dug a bit deeper and the good news is: it's no livelock. On the other hand, the resulting bad news is: the creation time of all possible paths throughout the kernel graph increases tremendously with a growing number of edges.

In your special case, the last 6 instructions starting from vmovd %xmm7,%ecx create a dependency on ecx, which is used in the beginning of your kernel and throughout it for multiple memory addressing operations. So running OSACA with --lines 0-185 on my PC takes about 6s, but as soon as you add the next line, we get into this livelock-like state of almost endless computation.

Since your code does not look like an actual loop, an LCD (loop-carried dependency) analysis seems not to be necessary, so we added multiple enhancements which hopefully also meet your requirements:

Those features are included in the latest release (0.4.1). Feel free to re-open this issue in case this is not sufficient and we can think about other solutions!