Closed fxcoudert closed 10 months ago
Not ready for merging at this stage, first rough implementation. Will need tests.
In addition to AARCH64_BUILTIN_HUGE_VALQ AARCH64_BUILTIN_INFQ we add AARCH64_BUILTIN_COPYSIGNQ AARCH64_BUILTIN_FABSQ AARCH64_BUILTIN_NANQ AARCH64_BUILTIN_NANSQ
- builtin_nanq() and __builtin_nansq() are constant and always folded, like builtin_infq() and __builtin_huge_valq() already were. I need to check that the generated NaNs have the correct payload.
So we can (if necessary) split these out and put them in sooner.
- __builtin_fabsq() and __builtin_copysignq() generate library calls. The next step should be to avoid the library calls, and use the
abstf2
andcopysigntf3
patterns. I'm not sure how to do that, I think it needsemit_insn()
but I'm not entirely sure how to use that.
I guess looking at what aarch64-linux does for the long double case (I should check if it supports __float128 / _Float128, if so that ought to be even better as a guide).
I had a build of aarch64-linux already ...
_Float128 f (_Float128 c)
{
return __builtin_copysign (42.0F128, c);
}
seems to generate:
bl __trunctfdf2
fmov d31, d0
mov x0, 4631107791820423168
fmov d30, x0
mov x0, -9223372036854775808
fmov d29, x0
bif v31.8b, v30.8b, v29.8b
fmov d0, d31
bl __extenddftf2
Which, if I read it correctly, is horrible, losing all the precision of the float128 in the process....
edit: I suppose because __builtin_copysign is somehow defaulting to double
.
whereas....
long double ffd (long double c)
{
return __builtin_copysignl (42.0L, c);
}
<snip>
and x0, x0, -9223372036854775808
adrp x1, .LC0
add x1, x1, :lo12:.LC0
ldr q30, [x1]
cmp x0, 0
beq .L5
adrp x0, .LC1
add x0, x0, :lo12:.LC1
ldr q30, [x0]
.L5:
mov v0.16b, v30.16b
<snip>
.LC0:
.word 0
.word 0
.word 0
.word 1074024448
.align 4
.LC1:
.word 0
.word 0
.word 0
.word -1073459200
So it seems we need to figure out what is done for the long double case.
I'm wondering if there needs to be an optab entry for the "q" versions (and how that is arranged).
ah but, if I do...
_Float128 f128 (_Float128 c)
{
return __builtin_copysignl (42.0F128, c);
}
that is also accepted and produces the inline code without truncations.
(but presumably it is not for Darwin's __float128)
_Float128 f128 (_Float128 c)
{
return __builtin_copysignf128 (42.0F128, c);
}
__float128 ffd (__float128 c)
{
return __builtin_copysignf128 (42.0f128, c);
}
produces the right output, so maybe the solution is to alias __builtin_copysignq => __builtin_copysignf128
_Float128 f128 (_Float128 c) { return __builtin_copysignf128 (42.0F128, c); } __float128 ffd (__float128 c) { return __builtin_copysignf128 (42.0f128, c); }
produces the right output, so maybe the solution is to alias __builtin_copysignq => __builtin_copysignf128
edit: it does not seem that the core builtins have "q" as one of the options.
I have a further patch for fabsq():
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index cd9e94117f1..46d099784a3 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2483,6 +2483,22 @@ aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
return target;
}
+/* Function to expand TFmode fabs builtin. */
+
+static rtx
+aarch64_expand_builtin_fabsq (tree exp, rtx target)
+{
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ rtx op0 = expand_normal (arg0);
+
+ if (!target)
+ target = gen_reg_rtx (GET_MODE (op0));
+
+ emit_insn (gen_abstf2 (target, op0));
+
+ return target;
+}
+
/* Expand a FCMLA lane expression EXP with code FCODE and
result going to TARGET if that is convenient. */
@@ -2926,8 +2942,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
case AARCH64_BUILTIN_RNG_RNDRRS:
return aarch64_expand_rng_builtin (exp, target, fcode, ignore);
- case AARCH64_BUILTIN_COPYSIGNQ:
case AARCH64_BUILTIN_FABSQ:
+ return aarch64_expand_builtin_fabsq (exp, target);
+ case AARCH64_BUILTIN_COPYSIGNQ:
return expand_call (exp, target, ignore);
}
but it fails to build because gen_abstf2
is not defined on the target. I only have:
$ grep gen_abs.f2 *.h
insn-flags.h:extern rtx gen_abshf2 (rtx, rtx);
insn-flags.h:extern rtx gen_abssf2 (rtx, rtx);
insn-flags.h:extern rtx gen_absdf2 (rtx, rtx);
$ grep gen_copysign.f3 *.h
insn-flags.h:extern rtx gen_copysignsf3_insn (rtx, rtx, rtx, rtx);
insn-flags.h:extern rtx gen_copysigndf3_insn (rtx, rtx, rtx, rtx);
insn-flags.h:extern rtx gen_copysignsf3 (rtx, rtx, rtx);
insn-flags.h:extern rtx gen_copysigndf3 (rtx, rtx, rtx);
so the TFmode variants are not exposed. I can see two places where that could come from in aarch64.md
:
(define_expand "abs<mode>2"
[(set (match_operand:GPI 0 "register_operand")
(abs:GPI (match_operand:GPI 1 "register_operand")))]
""
{
if (!TARGET_CSSC)
{
rtx ccreg = aarch64_gen_compare_reg (LT, operands[1], const0_rtx);
rtx x = gen_rtx_LT (VOIDmode, ccreg, const0_rtx);
emit_insn (gen_csneg3<mode>_insn (operands[0], x, operands[1], operands[1]));
DONE;
}
}
)
or
(define_insn "abs<mode>2"
[(set (match_operand:GPF_F16 0 "register_operand" "=w")
(abs:GPF_F16 (match_operand:GPF_F16 1 "register_operand" "w")))]
"TARGET_FLOAT"
"fabs\\t%<s>0, %<s>1"
[(set_attr "type" "ffarith<stype>")]
)
(but I think the latter one is for gen_abshf2
). I'm not sure how to change the patterns to create an abstf2
yes, for the standard optabs to expand that, there has to be a match like:
(define_expand "xxxxxxx<TF:mode>3"
[(match_operand:TF 0 "register_operand")
etc...
in aarch64.md
.
but, this does not seem to be necessary for things like copysign for f128 - the generic expansion code seems to work ..
_Float128 a = __builtin_fabsf128 (c);
also seems to work OK - so maybe it's a question of invoking that (figuring out what the spelling for the IFN_ is..) .. or maybe the generic expansion code can be called ?
also: looking at the output of fdump-tree-gimple for
_Float128 a = __builtin_fabsf128 (x);
_Float128 b = __builtin_huge_valf128 ();
_Float128 c = __builtin_inff128 ();
return __builtin_copysignf128 (42.0F128, x);
It seems that the front end already lowers the first three:
_Float128 a = ABS_EXPR <x>;
_Float128 b = Inf;
_Float128 c = Inf;
return __builtin_copysignf128 (4.2e+1, x);
I did not look at how it matches those.
Yes of course, I can fold fabsq(), it's much easier:
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index cd9e94117f1..da42f7a6352 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2927,7 +2927,6 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target,
return aarch64_expand_rng_builtin (exp, target, fcode, ignore);
case AARCH64_BUILTIN_COPYSIGNQ:
- case AARCH64_BUILTIN_FABSQ:
return expand_call (exp, target, ignore);
}
@@ -3053,6 +3052,9 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
return build_real (type, nan);
return NULL_TREE;
}
+ case AARCH64_BUILTIN_FABSQ:
+ gcc_assert (n_args == 1);
+ return fold_build1 (ABS_EXPR, type, args[0]);
default:
break;
}
Pushed as a second commit. Only __builtin_copysignq() remains to be dealt with.
I wonder if there's a way to fold the __builtin_copysignq => builtin_copysignf128. (or if declaring it with the IFN_xxxx for builtin_copysignf128 would just make it into an alias, which would mean we did not have to do anything special).
There are a lot of builtin_xxxxf128, AFAICS from looking in gcc/builtins.cc .. I don't think we'd want to do all of them manually.
Closing in favour of https://github.com/iains/gcc-darwin-arm64/pull/122
Not ready for merging at this stage, first rough implementation. Will need tests.
abstf2
andcopysigntf3
patterns. I'm not sure how to do that, I think it needsemit_insn()
but I'm not entirely sure how to use that.