Open vfdff opened 1 year ago
simplified case: https://godbolt.org/z/q59hczjWG
// Simplified reproducer: a 4-iteration sum/difference transform from `d` into `dct`.
// Iteration i loads the four contiguous elements d[i*4+0] .. d[i*4+3] and
// stores one result to each of dct[0*4+i], dct[1*4+i], dct[2*4+i], dct[3*4+i],
// so the loads are contiguous while the stores are 4 elements apart.
// `d` and `dct` are __restrict-qualified; `pix1` and `pix2` are not used in the body.
void sub4x4_dct_simple (int16_t *__restrict d,
int16_t *__restrict dct,
uint8_t *pix1, uint8_t *pix2 )
{
for( int i = 0; i < 4; i++ )
{
// Sums and differences of the outer pair (0,3) and the inner pair (1,2),
// held in int temporaries.
int s03 = d[i*4+0] + d[i*4+3];
int s12 = d[i*4+1] + d[i*4+2];
int d03 = d[i*4+0] - d[i*4+3];
int d12 = d[i*4+1] - d[i*4+2];
// The int results are converted to int16_t when stored into dct.
dct[0*4+i] = s03 + s12;
dct[1*4+i] = 2*d03 + d12;
dct[2*4+i] = s03 - s12;
dct[3*4+i] = d03 - 2*d12;
}
}
@llvm/issue-subscribers-backend-aarch64
Author: Allen (vfdff)
This seems to be a cost-model issue (commit d827865e9). SLP vectorization is generated when we increase the cost for fadd (x86 currently sets a cost of 2 for double fadd):
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2900,7 +2900,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
(Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
return 2 * LT.first;
if (!Ty->getScalarType()->isFP128Ty())
- return LT.first;
+ return 2 * LT.first;
Alternatively, add -aarch64-insert-extract-base-cost=1 for Arm: https://godbolt.org/z/eras8WG91
// Get the smaller of the legalized or original pow2-extended number of
// vector elements, which represents the number of unpacks we'll end up
// performing.
unsigned NumElts = LT.second.getVectorNumElements();
unsigned Pow2Elts =
PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
/// Returns the base cost used for vector insert/extract operations.
///
/// If OverrideVectorInsertExtractBaseCost has occurred at least once, its
/// value is returned; otherwise the subtarget's VectorInsertExtractBaseCost
/// is returned.
/// NOTE(review): the override is presumably the
/// -aarch64-insert-extract-base-cost command-line option; its declaration is
/// not visible here, so confirm.
unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  const bool HasOverride =
      OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0;
  if (!HasOverride)
    return VectorInsertExtractBaseCost;
  return OverrideVectorInsertExtractBaseCost;
}
GCC has a related improvement idea recorded at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138.
test: https://godbolt.org/z/11TbEx119