sifadil / pcsx2-playground

Automatically exported from code.google.com/p/pcsx2-playground
2 stars 0 forks source link

Some optimizing ideas... (VU micro ESUM) #142

Open GoogleCodeExporter opened 8 years ago

GoogleCodeExporter commented 8 years ago
I tested the ESUM code on SSE~SSE4.1.
HADDPS is not a good method for ESUM...

// tested on Penryn-3M
Vec4Sum_Init:    6.00
 Vec4Sum_SSE:   10.33
 Vec4Sum1SSE:   11.00
Vec4Sum_SSE2:   10.08
Vec4Sum_SSE3:   14.40
Vec4Sum_SSE4:   11.00
Vec4Sum1SSE4:   10.12
Vec4Sum2SSE4:   10.00

// LFENCE
Vec4Sum_Init:   16.00
 Vec4Sum_SSE:   20.00
 Vec4Sum1SSE:   21.00
Vec4Sum_SSE2:   21.00
Vec4Sum_SSE3:   24.25
Vec4Sum_SSE4:   25.50
Vec4Sum1SSE4:   21.00
Vec4Sum2SSE4:   20.00

// SFENCE
Vec4Sum_Init:   22.01
 Vec4Sum_SSE:   21.00
 Vec4Sum1SSE:   21.00
Vec4Sum_SSE2:   21.00
Vec4Sum_SSE3:   21.18
Vec4Sum_SSE4:   33.00
Vec4Sum1SSE4:   21.00
Vec4Sum2SSE4:   21.00

// MFENCE
Vec4Sum_Init:   15.00
 Vec4Sum_SSE:   14.22
 Vec4Sum1SSE:   14.40
Vec4Sum_SSE2:   14.20
Vec4Sum_SSE3:   14.02
Vec4Sum_SSE4:   26.00
Vec4Sum1SSE4:   14.26
Vec4Sum2SSE4:   14.20

// Horizontal sum of the four floats in xmm0; scalar result stored to [eax].
// Register contract for all these naked benchmark stubs: input vector in
// xmm0, destination address in eax (custom convention — no prologue/frame).
// NOTE(review): MOVSHDUP is an SSE3 instruction; the "SSE4" suffix appears
// to name the benchmark variant, not the ISA level actually required.
__declspec(naked) void Vec4Sum2SSE4() {
  __asm {
   movshdup xmm1, xmm0   // xmm1 = [x1, x1, x3, x3]
    addps   xmm1, xmm0   // xmm1 = [x0+x1, 2*x1, x2+x3, 2*x3]
    movhlps xmm0, xmm1   // xmm0 low lane = x2+x3 (high half of xmm1)
    addss   xmm0, xmm1   // xmm0[0] = (x2+x3) + (x0+x1) = total

    movss   [eax] ,xmm0  // store scalar sum to *eax
    ret
  }
}

// Horizontal sum of xmm0, variant 1: fold the high half onto the low half
// first, then reach the second lane with MOVSHDUP (an SSE3 instruction,
// despite the "SSE4" name). Result stored to [eax]; input in xmm0.
__declspec(naked) void Vec4Sum1SSE4() {
  __asm {
    movhlps xmm1, xmm0   // xmm1 low lanes = [x2, x3]
    addps   xmm1, xmm0   // xmm1 = [x0+x2, x1+x3, ...]
   movshdup xmm0, xmm1   // xmm0[0] = x1+x3
    addss   xmm0, xmm1   // xmm0[0] = (x1+x3) + (x0+x2) = total

    movss   [eax] ,xmm0  // store scalar sum to *eax
    ret
  }
}

// 16-byte-aligned all-ones vector: DPPS operand in Vec4Sum_SSE4 and the
// load source in Vec4Sum_Init. Alignment is required for the full-width
// SSE memory operands.
__declspec(align(16)) static float _one[4]={1,1,1,1};

// Horizontal sum of xmm0 via SSE4.1 DPPS: dot product with the all-ones
// vector. Immediate 11110001b = multiply/accumulate all four lanes, write
// the result to lane 0 only. Result stored to [eax]; input in xmm0.
__declspec(naked) void Vec4Sum_SSE4() {
  __asm {
    dpps    xmm0, _one, 11110001b  // xmm0[0] = x0+x1+x2+x3

    movss   [eax] ,xmm0            // store scalar sum to *eax
    ret
  }
}

// Horizontal sum of xmm0 via two SSE3 HADDPS passes — the shortest code,
// but the first timing table above shows it is the slowest variant on the
// tested Penryn part (HADDPS has poor latency there).
__declspec(naked) void Vec4Sum_SSE3() {
  __asm {
    haddps  xmm0, xmm0   // xmm0 = [x0+x1, x2+x3, x0+x1, x2+x3]
    haddps  xmm0, xmm0   // xmm0[0] = (x0+x1) + (x2+x3) = total

    movss   [eax] ,xmm0  // store scalar sum to *eax
    ret
  }
}

// Horizontal sum of xmm0 using SSE2 PSHUFD for the final lane swap.
// Result stored to [eax]; input in xmm0.
__declspec(naked) void Vec4Sum_SSE2() {
  __asm {
    movhlps xmm1, xmm0             // xmm1 low lanes = [x2, x3]
    addps   xmm1, xmm0             // xmm1 = [x0+x2, x1+x3, ...]
    pshufd  xmm0, xmm1, 00000001b  // xmm0[0] = xmm1[1] = x1+x3
    addss   xmm0, xmm1             // xmm0[0] = (x1+x3) + (x0+x2) = total

    movss   [eax] ,xmm0            // store scalar sum to *eax
    ret
  }
}

// Horizontal sum of xmm0 using only SSE1 instructions, variant 1:
// UNPCKLPS + MOVHLPS instead of SHUFPS to bring the second partial sum
// into the low lane. Result stored to [eax]; input in xmm0.
__declspec(naked) void Vec4Sum1SSE() {
  __asm {
    movhlps xmm1, xmm0   // xmm1 low lanes = [x2, x3]
    addps   xmm1, xmm0   // xmm1 = [a, b, ...] with a = x0+x2, b = x1+x3
   unpcklps xmm1, xmm1   // xmm1 = [a, a, b, b]
    movhlps xmm0, xmm1   // xmm0[0] = b
    addss   xmm0, xmm1   // xmm0[0] = b + a = total

    movss   [eax] ,xmm0  // store scalar sum to *eax

    ret
  }
}

// Horizontal sum of xmm0 using only SSE1 instructions: fold the high half
// down, copy, then SHUFPS to expose the second partial sum for a scalar
// add. Result stored to [eax]; input in xmm0.
__declspec(naked) void Vec4Sum_SSE() {
  __asm {
    movhlps xmm1, xmm0             // xmm1 low lanes = [x2, x3]
    addps   xmm1, xmm0             // xmm1 = [a, b, ...], a = x0+x2, b = x1+x3
    movaps  xmm0, xmm1             // keep a in xmm0[0]
    shufps  xmm1, xmm1, 00000001b  // xmm1[0] = b
    addss   xmm0, xmm1             // xmm0[0] = a + b = total

    movss   [eax] ,xmm0            // store scalar sum to *eax

    ret
  }
}

// Baseline stub for the benchmark: performs only a vector load and the
// same scalar store as the real variants, so its timing (the "Init" rows
// in the tables above) measures the harness + load/store overhead.
// Always stores 1.0f to [eax].
__declspec(naked) void Vec4Sum_Init() {
  __asm {
    movaps  xmm0, _one   // xmm0 = [1, 1, 1, 1]

    movss   [eax] ,xmm0  // store 1.0f to *eax

    ret
  }
}

Original issue reported on code.google.com by w0w71...@gmail.com on 1 Feb 2009 at 10:55

GoogleCodeExporter commented 8 years ago
ESUM isn't called enough to make a difference speed-wise.
so small optimizations probably won't be noticeable.

i'll keep this open in case any other dev wants to implement the code.

Original comment by cottonvibes on 2 Feb 2009 at 8:27

GoogleCodeExporter commented 8 years ago
Yes. For the dot product, using a scalar add for the final step is faster than a vector add (except with DPPS).
SHUFPS and PSHUFD have bad latency on Core 2 Duo (Conroe) only,
but using MOVSHDUP gives lower latency.

// Dot product of xmm0 with itself (squared length): square each lane with
// MULPS, then the same SSE2 horizontal sum as Vec4Sum_SSE2.
// Result stored to [eax]; input vector in xmm0.
__declspec(naked) void Vec4Dot_SSE2() {
  __asm {
    mulps   xmm0, xmm0             // xmm0 = [x0^2, x1^2, x2^2, x3^2]
    movhlps xmm1, xmm0             // xmm1 low lanes = high half of squares
    addps   xmm1, xmm0             // xmm1 = [x0^2+x2^2, x1^2+x3^2, ...]
    pshufd  xmm0, xmm1, 00000001b  // xmm0[0] = x1^2+x3^2
    addss   xmm0, xmm1             // xmm0[0] = sum of all four squares

    movss   [eax] ,xmm0            // store scalar result to *eax
    ret
  }
}

// Dot product of xmm0 with itself: MULPS then the MOVSHDUP-based
// horizontal sum (same sequence as Vec4Sum2SSE4).
// NOTE(review): MOVSHDUP is SSE3, not SSSE3 — the name overstates the
// required ISA level. Result stored to [eax]; input vector in xmm0.
__declspec(naked) void Vec4Dot_SSSE3() {
  __asm {
    mulps   xmm0, xmm0   // xmm0 = [x0^2, x1^2, x2^2, x3^2]
   movshdup xmm1, xmm0   // xmm1 = [x1^2, x1^2, x3^2, x3^2]
    addps   xmm1, xmm0   // xmm1 = [x0^2+x1^2, ..., x2^2+x3^2, ...]
    movhlps xmm0, xmm1   // xmm0[0] = x2^2+x3^2
    addss   xmm0, xmm1   // xmm0[0] = sum of all four squares

    movss   [eax] ,xmm0  // store scalar result to *eax
    ret
  }
}

Original comment by w0w71...@gmail.com on 3 Feb 2009 at 3:48