Open flang-cavium opened 6 years ago
Is that your typical workload? I almost exclusively do numerical computing, where SLP vectorization is critical to performance.
Consider the following example, which is an unrolled 8x8 matrix multiplication:
module mul8mod
! Fully unrolled 8x8 double-precision matrix multiply, C = A * B, with the
! 8x8 matrices stored column-major in length-64 rank-1 arrays (element (i,j)
! at index 8*(j-1)+i). Generated code: each element of C is computed by two
! statements (columns 1-4 of A, then columns 5-8), deliberately leaving the
! compiler maximum freedom in how to (SLP-)vectorize it.
implicit none
contains
subroutine mul8x8(A, B, C)
! A, B: input matrices, column-major, 64 elements each.
! C:    output matrix, C = A * B.
real(8), dimension(64), intent(in) :: A, B
real(8), dimension(64), intent(out) :: C
! column 1 of C: C(1:8)
C(1) = A(1) * B(1) + A(9) * B(2) + A(17) * B(3) + A(25) * B(4)
C(1) = C(1) + A(33) * B(5) + A(41) * B(6) + A(49) * B(7) + A(57) * B(8)
C(2) = A(2) * B(1) + A(10) * B(2) + A(18) * B(3) + A(26) * B(4)
C(2) = C(2) + A(34) * B(5) + A(42) * B(6) + A(50) * B(7) + A(58) * B(8)
C(3) = A(3) * B(1) + A(11) * B(2) + A(19) * B(3) + A(27) * B(4)
C(3) = C(3) + A(35) * B(5) + A(43) * B(6) + A(51) * B(7) + A(59) * B(8)
C(4) = A(4) * B(1) + A(12) * B(2) + A(20) * B(3) + A(28) * B(4)
C(4) = C(4) + A(36) * B(5) + A(44) * B(6) + A(52) * B(7) + A(60) * B(8)
C(5) = A(5) * B(1) + A(13) * B(2) + A(21) * B(3) + A(29) * B(4)
C(5) = C(5) + A(37) * B(5) + A(45) * B(6) + A(53) * B(7) + A(61) * B(8)
C(6) = A(6) * B(1) + A(14) * B(2) + A(22) * B(3) + A(30) * B(4)
C(6) = C(6) + A(38) * B(5) + A(46) * B(6) + A(54) * B(7) + A(62) * B(8)
C(7) = A(7) * B(1) + A(15) * B(2) + A(23) * B(3) + A(31) * B(4)
C(7) = C(7) + A(39) * B(5) + A(47) * B(6) + A(55) * B(7) + A(63) * B(8)
C(8) = A(8) * B(1) + A(16) * B(2) + A(24) * B(3) + A(32) * B(4)
C(8) = C(8) + A(40) * B(5) + A(48) * B(6) + A(56) * B(7) + A(64) * B(8)
! column 2 of C: C(9:16)
C(9) = A(1) * B(9) + A(9) * B(10) + A(17) * B(11) + A(25) * B(12)
C(9) = C(9) + A(33) * B(13) + A(41) * B(14) + A(49) * B(15) + A(57) * B(16)
C(10) = A(2) * B(9) + A(10) * B(10) + A(18) * B(11) + A(26) * B(12)
C(10) = C(10) + A(34) * B(13) + A(42) * B(14) + A(50) * B(15) + A(58) * B(16)
C(11) = A(3) * B(9) + A(11) * B(10) + A(19) * B(11) + A(27) * B(12)
C(11) = C(11) + A(35) * B(13) + A(43) * B(14) + A(51) * B(15) + A(59) * B(16)
C(12) = A(4) * B(9) + A(12) * B(10) + A(20) * B(11) + A(28) * B(12)
C(12) = C(12) + A(36) * B(13) + A(44) * B(14) + A(52) * B(15) + A(60) * B(16)
C(13) = A(5) * B(9) + A(13) * B(10) + A(21) * B(11) + A(29) * B(12)
C(13) = C(13) + A(37) * B(13) + A(45) * B(14) + A(53) * B(15) + A(61) * B(16)
C(14) = A(6) * B(9) + A(14) * B(10) + A(22) * B(11) + A(30) * B(12)
C(14) = C(14) + A(38) * B(13) + A(46) * B(14) + A(54) * B(15) + A(62) * B(16)
C(15) = A(7) * B(9) + A(15) * B(10) + A(23) * B(11) + A(31) * B(12)
C(15) = C(15) + A(39) * B(13) + A(47) * B(14) + A(55) * B(15) + A(63) * B(16)
C(16) = A(8) * B(9) + A(16) * B(10) + A(24) * B(11) + A(32) * B(12)
C(16) = C(16) + A(40) * B(13) + A(48) * B(14) + A(56) * B(15) + A(64) * B(16)
! column 3 of C: C(17:24)
C(17) = A(1) * B(17) + A(9) * B(18) + A(17) * B(19) + A(25) * B(20)
C(17) = C(17) + A(33) * B(21) + A(41) * B(22) + A(49) * B(23) + A(57) * B(24)
C(18) = A(2) * B(17) + A(10) * B(18) + A(18) * B(19) + A(26) * B(20)
C(18) = C(18) + A(34) * B(21) + A(42) * B(22) + A(50) * B(23) + A(58) * B(24)
C(19) = A(3) * B(17) + A(11) * B(18) + A(19) * B(19) + A(27) * B(20)
C(19) = C(19) + A(35) * B(21) + A(43) * B(22) + A(51) * B(23) + A(59) * B(24)
C(20) = A(4) * B(17) + A(12) * B(18) + A(20) * B(19) + A(28) * B(20)
C(20) = C(20) + A(36) * B(21) + A(44) * B(22) + A(52) * B(23) + A(60) * B(24)
C(21) = A(5) * B(17) + A(13) * B(18) + A(21) * B(19) + A(29) * B(20)
C(21) = C(21) + A(37) * B(21) + A(45) * B(22) + A(53) * B(23) + A(61) * B(24)
C(22) = A(6) * B(17) + A(14) * B(18) + A(22) * B(19) + A(30) * B(20)
C(22) = C(22) + A(38) * B(21) + A(46) * B(22) + A(54) * B(23) + A(62) * B(24)
C(23) = A(7) * B(17) + A(15) * B(18) + A(23) * B(19) + A(31) * B(20)
C(23) = C(23) + A(39) * B(21) + A(47) * B(22) + A(55) * B(23) + A(63) * B(24)
C(24) = A(8) * B(17) + A(16) * B(18) + A(24) * B(19) + A(32) * B(20)
C(24) = C(24) + A(40) * B(21) + A(48) * B(22) + A(56) * B(23) + A(64) * B(24)
! column 4 of C: C(25:32)
C(25) = A(1) * B(25) + A(9) * B(26) + A(17) * B(27) + A(25) * B(28)
C(25) = C(25) + A(33) * B(29) + A(41) * B(30) + A(49) * B(31) + A(57) * B(32)
C(26) = A(2) * B(25) + A(10) * B(26) + A(18) * B(27) + A(26) * B(28)
C(26) = C(26) + A(34) * B(29) + A(42) * B(30) + A(50) * B(31) + A(58) * B(32)
C(27) = A(3) * B(25) + A(11) * B(26) + A(19) * B(27) + A(27) * B(28)
C(27) = C(27) + A(35) * B(29) + A(43) * B(30) + A(51) * B(31) + A(59) * B(32)
C(28) = A(4) * B(25) + A(12) * B(26) + A(20) * B(27) + A(28) * B(28)
C(28) = C(28) + A(36) * B(29) + A(44) * B(30) + A(52) * B(31) + A(60) * B(32)
C(29) = A(5) * B(25) + A(13) * B(26) + A(21) * B(27) + A(29) * B(28)
C(29) = C(29) + A(37) * B(29) + A(45) * B(30) + A(53) * B(31) + A(61) * B(32)
C(30) = A(6) * B(25) + A(14) * B(26) + A(22) * B(27) + A(30) * B(28)
C(30) = C(30) + A(38) * B(29) + A(46) * B(30) + A(54) * B(31) + A(62) * B(32)
C(31) = A(7) * B(25) + A(15) * B(26) + A(23) * B(27) + A(31) * B(28)
C(31) = C(31) + A(39) * B(29) + A(47) * B(30) + A(55) * B(31) + A(63) * B(32)
C(32) = A(8) * B(25) + A(16) * B(26) + A(24) * B(27) + A(32) * B(28)
C(32) = C(32) + A(40) * B(29) + A(48) * B(30) + A(56) * B(31) + A(64) * B(32)
! column 5 of C: C(33:40)
C(33) = A(1) * B(33) + A(9) * B(34) + A(17) * B(35) + A(25) * B(36)
C(33) = C(33) + A(33) * B(37) + A(41) * B(38) + A(49) * B(39) + A(57) * B(40)
C(34) = A(2) * B(33) + A(10) * B(34) + A(18) * B(35) + A(26) * B(36)
C(34) = C(34) + A(34) * B(37) + A(42) * B(38) + A(50) * B(39) + A(58) * B(40)
C(35) = A(3) * B(33) + A(11) * B(34) + A(19) * B(35) + A(27) * B(36)
C(35) = C(35) + A(35) * B(37) + A(43) * B(38) + A(51) * B(39) + A(59) * B(40)
C(36) = A(4) * B(33) + A(12) * B(34) + A(20) * B(35) + A(28) * B(36)
C(36) = C(36) + A(36) * B(37) + A(44) * B(38) + A(52) * B(39) + A(60) * B(40)
C(37) = A(5) * B(33) + A(13) * B(34) + A(21) * B(35) + A(29) * B(36)
C(37) = C(37) + A(37) * B(37) + A(45) * B(38) + A(53) * B(39) + A(61) * B(40)
C(38) = A(6) * B(33) + A(14) * B(34) + A(22) * B(35) + A(30) * B(36)
C(38) = C(38) + A(38) * B(37) + A(46) * B(38) + A(54) * B(39) + A(62) * B(40)
C(39) = A(7) * B(33) + A(15) * B(34) + A(23) * B(35) + A(31) * B(36)
C(39) = C(39) + A(39) * B(37) + A(47) * B(38) + A(55) * B(39) + A(63) * B(40)
C(40) = A(8) * B(33) + A(16) * B(34) + A(24) * B(35) + A(32) * B(36)
C(40) = C(40) + A(40) * B(37) + A(48) * B(38) + A(56) * B(39) + A(64) * B(40)
! column 6 of C: C(41:48)
C(41) = A(1) * B(41) + A(9) * B(42) + A(17) * B(43) + A(25) * B(44)
C(41) = C(41) + A(33) * B(45) + A(41) * B(46) + A(49) * B(47) + A(57) * B(48)
C(42) = A(2) * B(41) + A(10) * B(42) + A(18) * B(43) + A(26) * B(44)
C(42) = C(42) + A(34) * B(45) + A(42) * B(46) + A(50) * B(47) + A(58) * B(48)
C(43) = A(3) * B(41) + A(11) * B(42) + A(19) * B(43) + A(27) * B(44)
C(43) = C(43) + A(35) * B(45) + A(43) * B(46) + A(51) * B(47) + A(59) * B(48)
C(44) = A(4) * B(41) + A(12) * B(42) + A(20) * B(43) + A(28) * B(44)
C(44) = C(44) + A(36) * B(45) + A(44) * B(46) + A(52) * B(47) + A(60) * B(48)
C(45) = A(5) * B(41) + A(13) * B(42) + A(21) * B(43) + A(29) * B(44)
C(45) = C(45) + A(37) * B(45) + A(45) * B(46) + A(53) * B(47) + A(61) * B(48)
C(46) = A(6) * B(41) + A(14) * B(42) + A(22) * B(43) + A(30) * B(44)
C(46) = C(46) + A(38) * B(45) + A(46) * B(46) + A(54) * B(47) + A(62) * B(48)
C(47) = A(7) * B(41) + A(15) * B(42) + A(23) * B(43) + A(31) * B(44)
C(47) = C(47) + A(39) * B(45) + A(47) * B(46) + A(55) * B(47) + A(63) * B(48)
C(48) = A(8) * B(41) + A(16) * B(42) + A(24) * B(43) + A(32) * B(44)
C(48) = C(48) + A(40) * B(45) + A(48) * B(46) + A(56) * B(47) + A(64) * B(48)
! column 7 of C: C(49:56)
C(49) = A(1) * B(49) + A(9) * B(50) + A(17) * B(51) + A(25) * B(52)
C(49) = C(49) + A(33) * B(53) + A(41) * B(54) + A(49) * B(55) + A(57) * B(56)
C(50) = A(2) * B(49) + A(10) * B(50) + A(18) * B(51) + A(26) * B(52)
C(50) = C(50) + A(34) * B(53) + A(42) * B(54) + A(50) * B(55) + A(58) * B(56)
C(51) = A(3) * B(49) + A(11) * B(50) + A(19) * B(51) + A(27) * B(52)
C(51) = C(51) + A(35) * B(53) + A(43) * B(54) + A(51) * B(55) + A(59) * B(56)
C(52) = A(4) * B(49) + A(12) * B(50) + A(20) * B(51) + A(28) * B(52)
C(52) = C(52) + A(36) * B(53) + A(44) * B(54) + A(52) * B(55) + A(60) * B(56)
C(53) = A(5) * B(49) + A(13) * B(50) + A(21) * B(51) + A(29) * B(52)
C(53) = C(53) + A(37) * B(53) + A(45) * B(54) + A(53) * B(55) + A(61) * B(56)
C(54) = A(6) * B(49) + A(14) * B(50) + A(22) * B(51) + A(30) * B(52)
C(54) = C(54) + A(38) * B(53) + A(46) * B(54) + A(54) * B(55) + A(62) * B(56)
C(55) = A(7) * B(49) + A(15) * B(50) + A(23) * B(51) + A(31) * B(52)
C(55) = C(55) + A(39) * B(53) + A(47) * B(54) + A(55) * B(55) + A(63) * B(56)
C(56) = A(8) * B(49) + A(16) * B(50) + A(24) * B(51) + A(32) * B(52)
C(56) = C(56) + A(40) * B(53) + A(48) * B(54) + A(56) * B(55) + A(64) * B(56)
! column 8 of C: C(57:64)
C(57) = A(1) * B(57) + A(9) * B(58) + A(17) * B(59) + A(25) * B(60)
C(57) = C(57) + A(33) * B(61) + A(41) * B(62) + A(49) * B(63) + A(57) * B(64)
C(58) = A(2) * B(57) + A(10) * B(58) + A(18) * B(59) + A(26) * B(60)
C(58) = C(58) + A(34) * B(61) + A(42) * B(62) + A(50) * B(63) + A(58) * B(64)
C(59) = A(3) * B(57) + A(11) * B(58) + A(19) * B(59) + A(27) * B(60)
C(59) = C(59) + A(35) * B(61) + A(43) * B(62) + A(51) * B(63) + A(59) * B(64)
C(60) = A(4) * B(57) + A(12) * B(58) + A(20) * B(59) + A(28) * B(60)
C(60) = C(60) + A(36) * B(61) + A(44) * B(62) + A(52) * B(63) + A(60) * B(64)
C(61) = A(5) * B(57) + A(13) * B(58) + A(21) * B(59) + A(29) * B(60)
C(61) = C(61) + A(37) * B(61) + A(45) * B(62) + A(53) * B(63) + A(61) * B(64)
C(62) = A(6) * B(57) + A(14) * B(58) + A(22) * B(59) + A(30) * B(60)
C(62) = C(62) + A(38) * B(61) + A(46) * B(62) + A(54) * B(63) + A(62) * B(64)
C(63) = A(7) * B(57) + A(15) * B(58) + A(23) * B(59) + A(31) * B(60)
C(63) = C(63) + A(39) * B(61) + A(47) * B(62) + A(55) * B(63) + A(63) * B(64)
C(64) = A(8) * B(57) + A(16) * B(58) + A(24) * B(59) + A(32) * B(60)
C(64) = C(64) + A(40) * B(61) + A(48) * B(62) + A(56) * B(63) + A(64) * B(64)
end subroutine mul8x8
end module mul8mod
The above was a generated function. Ie, automated output (where the size of the matrices "A" and "B" was the input). It was meant to give the compiler a lot of flexibility in deciding how to optimize the code. But, the idea I specifically had in mind was something like this:
module mul8mod
! Explicitly vectorized 8x8 double-precision matrix multiply, C = A * B,
! matrices stored column-major in length-64 arrays. Each statement forms
! one 8-element column of C as a linear combination of the eight columns
! of A, so each scalar B(k) is meant to be broadcast across a vector
! register while the columns of A are loaded as contiguous 8-wide vectors
! and combined with fused multiply-add instructions.
implicit none
contains
subroutine mul8x8(A, B, C)
! A, B: input matrices (column-major, 64 elements each); C: output, C = A * B.
real(8), dimension(64), intent(in) :: A, B
real(8), dimension(64), intent(out) :: C
C(1:8) = B(1) * A(1:8) + B(2) * A(9:16) + B(3) * A(17:24) + B(4) * A(25:32) &
+ B(5) * A(33:40) + B(6) * A(41:48) + B(7) * A(49:56) + B(8) * A(57:64)
C(9:16) = B(9) * A(1:8) + B(10) * A(9:16) + B(11) * A(17:24) + B(12) * A(25:32) &
+ B(13) * A(33:40) + B(14) * A(41:48) + B(15) * A(49:56) + B(16) * A(57:64)
C(17:24) = B(17) * A(1:8) + B(18) * A(9:16) + B(19) * A(17:24) + B(20) * A(25:32) &
+ B(21) * A(33:40) + B(22) * A(41:48) + B(23) * A(49:56) + B(24) * A(57:64)
C(25:32) = B(25) * A(1:8) + B(26) * A(9:16) + B(27) * A(17:24) + B(28) * A(25:32) &
+ B(29) * A(33:40) + B(30) * A(41:48) + B(31) * A(49:56) + B(32) * A(57:64)
C(33:40) = B(33) * A(1:8) + B(34) * A(9:16) + B(35) * A(17:24) + B(36) * A(25:32) &
+ B(37) * A(33:40) + B(38) * A(41:48) + B(39) * A(49:56) + B(40) * A(57:64)
C(41:48) = B(41) * A(1:8) + B(42) * A(9:16) + B(43) * A(17:24) + B(44) * A(25:32) &
+ B(45) * A(33:40) + B(46) * A(41:48) + B(47) * A(49:56) + B(48) * A(57:64)
C(49:56) = B(49) * A(1:8) + B(50) * A(9:16) + B(51) * A(17:24) + B(52) * A(25:32) &
+ B(53) * A(33:40) + B(54) * A(41:48) + B(55) * A(49:56) + B(56) * A(57:64)
C(57:64) = B(57) * A(1:8) + B(58) * A(9:16) + B(59) * A(17:24) + B(60) * A(25:32) &
+ B(61) * A(33:40) + B(62) * A(41:48) + B(63) * A(49:56) + B(64) * A(57:64)
end subroutine mul8x8
end module mul8mod
That is, that elements of "B" get broadcasted across a register, while "A" gets loaded in vectors, and fused multiply-add instructions get used.
Compiling both the implied and explicit versions with these flags:
gfortran -Ofast -march=native -shared -fPIC mul8mod.f90 -o libmul8x8gfortran.so
flang -Ofast -march=native -shared -fPIC mul8mod.f90 -o libmul8x8flang.so
ifort -fast -xHost -shared -fPIC mul8mod.f90 -o libmul8x8ifort.so
and benchmarking the first one in Julia on a Ryzen processor:
julia> using Compat, BenchmarkTools
julia> A = randn(8,8);
julia> B = randn(8,8);
julia> C = similar(A);
julia> const gfortran_path = joinpath(pwd(), "libmul8x8gfortran.so")
"/home/chris/Documents/progwork/fortran/libmul8x8gfortran.so"
julia> const flang_path = joinpath(pwd(), "libmul8x8flang.so")
"/home/chris/Documents/progwork/fortran/libmul8x8flang.so"
julia> function gfortran!(C, A, B)
ccall((:__mul8mod_MOD_mul8x8, gfortran_path), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}), A, B, C)
C
end
gfortran! (generic function with 1 method)
julia> function flang!(C, A, B)
ccall((:mul8mod_mul8x8_, flang_path), Cvoid, (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}), A, B, C)
C
end
flang! (generic function with 1 method)
julia> @benchmark flang!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 50.064 ns (0.00% GC)
median time: 51.536 ns (0.00% GC)
mean time: 53.237 ns (0.00% GC)
maximum time: 146.304 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 987
julia> @benchmark gfortran!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 81.001 ns (0.00% GC)
median time: 83.449 ns (0.00% GC)
mean time: 85.937 ns (0.00% GC)
maximum time: 181.615 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 966
Looking at the assembly, Flang/LLVM did exactly what I had in mind with the ymm (ie, 256 bit registers), so that it was doing 4 multiplications and 4 additions per instruction. gfortran used xmm (128-bit) exclusively, without the broadcasts. The results of the explicit version:
julia> @benchmark flang2!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 47.457 ns (0.00% GC)
median time: 53.897 ns (0.00% GC)
mean time: 51.737 ns (0.00% GC)
maximum time: 139.817 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 988
julia> @benchmark gfortran2!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 46.376 ns (0.00% GC)
median time: 51.153 ns (0.00% GC)
mean time: 50.903 ns (0.00% GC)
maximum time: 137.984 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 989
gfortran now takes the hint, and performance is optimal.
Moving now instead to a Skylake-X processor... Here is a dump of results. Endings with "1" mean it was the first, more flexibly written, version given above, and "2" the one written with the explicit vector operations. Because this was on an Intel processor, I also added ifort.
julia> @benchmark flang1!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 18.224 ns (0.00% GC)
median time: 18.592 ns (0.00% GC)
mean time: 18.969 ns (0.00% GC)
maximum time: 45.079 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 998
julia> @benchmark flang2!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 33.877 ns (0.00% GC)
median time: 34.032 ns (0.00% GC)
mean time: 34.997 ns (0.00% GC)
maximum time: 61.429 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 993
julia> @benchmark gfortran1!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 124.716 ns (0.00% GC)
median time: 125.780 ns (0.00% GC)
mean time: 130.495 ns (0.00% GC)
maximum time: 180.812 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 898
julia> @benchmark gfortran2!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 28.680 ns (0.00% GC)
median time: 28.976 ns (0.00% GC)
mean time: 29.762 ns (0.00% GC)
maximum time: 74.363 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 995
julia> @benchmark ifort1!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 103.649 ns (0.00% GC)
median time: 104.381 ns (0.00% GC)
mean time: 108.445 ns (0.00% GC)
maximum time: 182.637 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 939
julia> @benchmark ifort2!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 55.687 ns (0.00% GC)
median time: 56.185 ns (0.00% GC)
mean time: 57.715 ns (0.00% GC)
maximum time: 95.370 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 984
With the explicitly vectorized version, gfortran was slightly faster than Flang. But, with the more flexibly written version, Flang/LLVM's excellent vectorizer used a different strategy than I had in mind (actually, it didn't broadcast for either 1 or 2), and achieved by far the fastest time at under 19 ns.
gfortran used xmm and ymm registers for 1 and 2 respectively, while Flang used zmm (512 bit) for both.
While in most benchmarks I've seen online gcc beats Clang, as far as I can tell, recent versions of LLVM (e.g., 6.0) are king of autovectorization, meaning if you like number crunching, LLVM may be the way to go. I get similar results with Clang(++) and Julia v0.7.
Don't quite see how matrix vectorization on Ryzen is relevant to trimming a string on AArch64.
Ryzen being X86_64. T99 (AArch64) not having 256-bit vector registers. And my examples having nothing to do with vectorization in the first place.
@chriselrod I believe the current behavior for llvm is not to generate AVX-512 instructions on skylake; you have to enable the zmm registers. In my experience, PGI and Intel are better at vectorization than GCC and LLVM. Have you presented the unexpanded matmul to the compiler and measured the results?
@flang-cavium I agree, although it's possible to vectorize the trim routine. I've never seen that but I have seen other vectorized string routines, especially copies. I've been meaning to ask: are you seeing the slow TRIM in a real app or a benchmark?
TRIM shows up in SPEC (628.pop2_s) as being very expensive when run under perf record and compiled with flang. As compared to the same 628.pop2_s being compiled with gfortran 7.3, where TRIM incurs a 0 (zero) cost.
And 628.pop2_s TRIMs a lot because it keeps reading a whole bunch of text files.
So, it's affecting the overall performance of 628.pop2_s.
@flang-cavium Sorry, you're obviously right. The optimizations I was talking about aren't applicable, so your specific workload doesn't really matter there. (and it wasn't just Ryzen, but I suspect avx capable x86 generally).
@sscalpone
The only optimization flags I used were -Ofast -march=native
. This is the assembly Flang generated:
.loc 1 13 1 prologue_end # mul8module1.f90:13:1
vmovupd (%rdi), %zmm0
vmovupd 64(%rdi), %zmm1
vmovupd 128(%rdi), %zmm2
vmovupd 192(%rdi), %zmm6
vmulpd 24(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 16(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 8(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd (%rsi){1to8}, %zmm0, %zmm8
.loc 1 14 1 # mul8module1.f90:14:1
vmovupd 448(%rdi), %zmm7
vmovupd 384(%rdi), %zmm5
vmovupd 320(%rdi), %zmm4
vmovupd 256(%rdi), %zmm3
vfmadd231pd 56(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 48(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 40(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 32(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, (%rdx)
.loc 1 29 1 # mul8module1.f90:29:1
vmulpd 88(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 80(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 72(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd 64(%rsi){1to8}, %zmm0, %zmm8
.loc 1 30 1 # mul8module1.f90:30:1
vfmadd231pd 120(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 112(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 104(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 96(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, 64(%rdx)
.loc 1 45 1 # mul8module1.f90:45:1
vmulpd 152(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 144(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 136(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd 128(%rsi){1to8}, %zmm0, %zmm8
.loc 1 46 1 # mul8module1.f90:46:1
vfmadd231pd 184(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 176(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 168(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 160(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, 128(%rdx)
.loc 1 61 1 # mul8module1.f90:61:1
vmulpd 216(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 208(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 200(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd 192(%rsi){1to8}, %zmm0, %zmm8
.loc 1 62 1 # mul8module1.f90:62:1
vfmadd231pd 248(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 240(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 232(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 224(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, 192(%rdx)
.loc 1 77 1 # mul8module1.f90:77:1
vmulpd 280(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 272(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 264(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd 256(%rsi){1to8}, %zmm0, %zmm8
.loc 1 78 1 # mul8module1.f90:78:1
vfmadd231pd 312(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 304(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 296(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 288(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, 256(%rdx)
.loc 1 93 1 # mul8module1.f90:93:1
vmulpd 344(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 336(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 328(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd 320(%rsi){1to8}, %zmm0, %zmm8
.loc 1 94 1 # mul8module1.f90:94:1
vfmadd231pd 376(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 368(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 360(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 352(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, 320(%rdx)
.loc 1 109 1 # mul8module1.f90:109:1
vmulpd 408(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 400(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 392(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd 384(%rsi){1to8}, %zmm0, %zmm8
.loc 1 110 1 # mul8module1.f90:110:1
vfmadd231pd 440(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 432(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 424(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 416(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, 384(%rdx)
.loc 1 125 1 # mul8module1.f90:125:1
vmulpd 472(%rsi){1to8}, %zmm6, %zmm6
vfmadd231pd 464(%rsi){1to8}, %zmm2, %zmm6
vfmadd231pd 456(%rsi){1to8}, %zmm1, %zmm6
vfmadd231pd 448(%rsi){1to8}, %zmm0, %zmm6
.loc 1 126 1 # mul8module1.f90:126:1
vfmadd231pd 504(%rsi){1to8}, %zmm7, %zmm6
vfmadd231pd 496(%rsi){1to8}, %zmm5, %zmm6
vfmadd231pd 488(%rsi){1to8}, %zmm4, %zmm6
vfmadd231pd 480(%rsi){1to8}, %zmm3, %zmm6
vmovupd %zmm6, 448(%rdx)
.loc 1 141 1 # mul8module1.f90:141:1
vzeroupper
retq
I did not have to coerce it with -mprefer-vector-width=512
.
I haven't tested different versions of Flang/Clang. I spend the vast majority of my time with Julia, where I realized that the current development version of Julia (using LLVM 6) vectorizes the above example "correctly", but the latest release (still on LLVM 3.9) does not. So I'm guessing one or more optimization passes got added at some point in the interim.
Perhaps Flang release_60 does dramatically better than Flang release_40 or release_50? Or maybe it's specific to blocks like the above. Vectorizing blocks vs loops, handling different sorts of transformations (eg, a/b -> a * (1/b) ) etc are all different questions.
But at least for simple loops, like a dot product, LLVM also seems to do much better than gcc.
LLVM 3.9 is ancient.
GCC/gfortran: -ftree-vectorize -mavx -mavx2?
When I compile the first version with just
gfortran -O2 -ftree-vectorize -mavx -mavx2 -mfma -S mul8module1.f90 -o libmul8x8gfortran1v2.s
it only uses xmm registers. I had to add -mfma
to get it to use fma instructions.
Alternatively,
gfortran -O2 -ftree-vectorize -mavx512f -S mul8module1.f90 -o libmul8x8gfortran1v2.s
also just uses the xmm registers, but now it uses xmm0
through xmm31
instead of just through xmm15
, because that instruction set doubled the number.
I also realized that I don't need -funsafe-math-optimizations -fassociative-math
to get gfortran or Flang to use fma instructions. Thinking I had to is why I was compiling with -Ofast
. In Julia, you do need to annotate the code blocks with @fastmath
to allow fma, because it changes rounding (...by making it more accurate?).
Using the second version, I could get gfortran to use zmm registers via
gfortran -Ofast -march=native -mprefer-vector-width=512 -S mul8module8.f90 -o libmul8x8gfortran2z.s
gfortran -Ofast -march=native -mprefer-vector-width=512 -shared -fPIC mul8module8.f90 -o libmul8x8gfortran2z.so
and this was actually really fast
julia> @benchmark gfortran2z!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 24.248 ns (0.00% GC)
median time: 24.688 ns (0.00% GC)
mean time: 24.736 ns (0.00% GC)
maximum time: 57.837 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 996
although still not quite as fast as Flang on the first version (19 ns), it is much faster than Flang on the same version (34 ns).
Looking more at the assembly...
For version 1 of the code, gfortran simply fails to vectorize beyond xmm
registers, even with -mprefer-vector-width=512
or -mprefer-vector-width=256
.
For version 2, looking at the assembly...
gfortran's assembly seems to imply syntax for vfmaddIJKpd
is 1*2 + 3
; looking at a snapshot (from version 2 of the code)
vmovupd 128(%rdi), %zmm3
vmovupd 192(%rdi), %zmm7
vbroadcastsd 16(%rsi), %zmm4
vbroadcastsd 24(%rsi), %zmm9
vmulpd %zmm7, %zmm9, %zmm9
vfmadd132pd %zmm3, %zmm9, %zmm4
%rdi
refers to matrix A, and %rsi
to matrix B.
Load 8 elements from A into zmm3, and 8 more into zmm7.
Broadcast two elements of B across registers zmm4 and zmm9.
Then zmm9 = zmm7 * zmm9
and zmm4 = zmm3 * zmm4 + zmm9
makes sense, implying it is 1 * 2 + 3.
This is supported by the next chunk, which follows the same pattern, and add its result to zmm4
calculated above:
vmovupd (%rdi), %zmm0
vmovupd 64(%rdi), %zmm8
vbroadcastsd (%rsi), %zmm9
vbroadcastsd 8(%rsi), %zmm10
vmulpd %zmm8, %zmm10, %zmm10
vfmadd132pd %zmm0, %zmm10, %zmm9
vaddpd %zmm9, %zmm4, %zmm4
I also wonder why it isn't properly chaining fma
instructions, Ie, why it's doing this
fma(x,y,z) = x*y+z
fma(b, x, a*w) + fma(d, z, c*y) +
instead of a bunch of successive fmas:
fma( ... fma(d, z, fma(c, y, fma(b, x, a*w) ) ) ... )
Maybe it wants to break up the dependency chain, for out of order execution?
Maybe if I change the order I can get it to use 7 fma
instructions instead of 4 per column of output matrix C. Or maybe I could add parenthesis.
I suspect that the reason it's slower than Flang is that it's only using 4 out of the possible 7 fma instructions per column of C.
Looking at Flang's assembly for version 1 of the code, it looks like the vfmaddIJKpd syntax must be
2*3 + 1 instead:
vmovupd (%rdi), %zmm0
vmovupd 64(%rdi), %zmm1
vmovupd 128(%rdi), %zmm2
vmovupd 192(%rdi), %zmm6
vmulpd 24(%rsi){1to8}, %zmm6, %zmm8
vfmadd231pd 16(%rsi){1to8}, %zmm2, %zmm8
vfmadd231pd 8(%rsi){1to8}, %zmm1, %zmm8
vfmadd231pd (%rsi){1to8}, %zmm0, %zmm8
.loc 1 14 1 # mul8module1.f90:14:1
vmovupd 448(%rdi), %zmm7
vmovupd 384(%rdi), %zmm5
vmovupd 320(%rdi), %zmm4
vmovupd 256(%rdi), %zmm3
vfmadd231pd 56(%rsi){1to8}, %zmm7, %zmm8
vfmadd231pd 48(%rsi){1to8}, %zmm5, %zmm8
vfmadd231pd 40(%rsi){1to8}, %zmm4, %zmm8
vfmadd231pd 32(%rsi){1to8}, %zmm3, %zmm8
vmovupd %zmm8, (%rdx)
zmm8
must be a column of C.
I am guessing this means that the {1to8}
syntax means it is an implicit broadcast, without having to fill a register? That would make the code make sense, especially given that they're all offset by 8.
But then I'd wonder why neither Flang nor gfortran could figure it out from the more explicitly written version 2 of the code.
I wonder if the fused broadcast-multiplication from the mask {1to8}
is faster than actually broadcasting, or if it's the difference in number of fused operations.
Flang, in version 2 of the code, reloads the entire matrix A for every column of C it calculates. For absolutely zero reason; the first 2 columns of C:
vmovupd 448(%rdi), %zmm0
vmulpd 56(%rsi){1to8}, %zmm0, %zmm0
vmovupd 384(%rdi), %zmm1
vmovupd 320(%rdi), %zmm2
vmovupd 256(%rdi), %zmm3
vmovupd (%rdi), %zmm4
vmovupd 64(%rdi), %zmm5
vmovupd 128(%rdi), %zmm6
vmovupd 192(%rdi), %zmm7
vfmadd132pd 48(%rsi){1to8}, %zmm0, %zmm1
vfmadd231pd 40(%rsi){1to8}, %zmm2, %zmm1
vfmadd231pd 32(%rsi){1to8}, %zmm3, %zmm1
vfmadd231pd 24(%rsi){1to8}, %zmm7, %zmm1
vfmadd231pd 16(%rsi){1to8}, %zmm6, %zmm1
vfmadd231pd 8(%rsi){1to8}, %zmm5, %zmm1
vfmadd231pd (%rsi){1to8}, %zmm4, %zmm1
vmovupd %zmm1, (%rdx)
.loc 1 16 1 # mul8module8.f90:16:1
vmovupd 448(%rdi), %zmm0
vmulpd 120(%rsi){1to8}, %zmm0, %zmm0
vmovupd 384(%rdi), %zmm1
vmovupd 320(%rdi), %zmm2
vmovupd 256(%rdi), %zmm3
vmovupd (%rdi), %zmm4
vmovupd 64(%rdi), %zmm5
vmovupd 128(%rdi), %zmm6
vmovupd 192(%rdi), %zmm7
vfmadd132pd 112(%rsi){1to8}, %zmm0, %zmm1
vfmadd231pd 104(%rsi){1to8}, %zmm2, %zmm1
vfmadd231pd 96(%rsi){1to8}, %zmm3, %zmm1
vfmadd231pd 88(%rsi){1to8}, %zmm7, %zmm1
vfmadd231pd 80(%rsi){1to8}, %zmm6, %zmm1
vfmadd231pd 72(%rsi){1to8}, %zmm5, %zmm1
vfmadd231pd 64(%rsi){1to8}, %zmm4, %zmm1
vmovupd %zmm1, 64(%rdx)
It overwrites the columns of A
loaded into zmm0
and zmm1
while calculating the columns of C (which it could easily not do -- plenty of empty registers), and then reloads the remaining 6 columns into the exact same registers zmm2
through zmm6
that already contained that data.
That seems like the obviously wrong thing to do. (Fortran should be assuming C
and A
don't alias, right?)
EDIT: Flang definitely seems to assume the possibility of aliasing there. All the copying goes away when I create a bunch of temporaries for the columns of A or C. But then the compiler generates a bunch of weird code for the copying of data into the temporaries, or for the copying of data from the temporaries back to the matrices.
But, the best time I got so far:
julia> @benchmark gfortran2b!($C, $A, $B)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 14.761 ns (0.00% GC)
median time: 15.009 ns (0.00% GC)
mean time: 15.161 ns (0.00% GC)
maximum time: 41.416 ns (0.00% GC)
--------------
samples: 10000
evals/sample: 998
I tried replacing -Ofast
with -O3
and adding parenthesis. This did cause gfortran to use 7/7 fma instructions, but still took about 20-21 ns. So thinking that it may have broken up the fma chain to break dependency issues, I figured I'd break up the matrix multiplication into blocks that'd hopefully let it stay busy, instead of waiting for one instruction to finish before starting the next.
module mul8mod
! Blocked variant of the explicitly vectorized 8x8 multiply, C = A * B
! (column-major matrices in length-64 arrays). The accumulation is
! interleaved across all eight columns of C: every column of C is first
! initialized from columns 1-2 of A, then each remaining column of A is
! folded into all eight columns of C in turn. Per the surrounding
! discussion, the intent is to break up the fma dependency chains so
! independent work stays in flight for out-of-order execution.
implicit none
contains
subroutine mul8x8(A, B, C)
! A, B: input matrices (column-major, 64 elements each); C: output, C = A * B.
real(8), dimension(64), intent(in) :: A, B
real(8), dimension(64), intent(out) :: C
! initialize all eight columns of C from columns 1-2 of A
C(1:8) = B(1) * A(1:8) + B(2) * A(9:16)
C(9:16) = B(9) * A(1:8) + B(10) * A(9:16)
C(17:24) = B(17) * A(1:8) + B(18) * A(9:16)
C(25:32) = B(25) * A(1:8) + B(26) * A(9:16)
C(33:40) = B(33) * A(1:8) + B(34) * A(9:16)
C(41:48) = B(41) * A(1:8) + B(42) * A(9:16)
C(49:56) = B(49) * A(1:8) + B(50) * A(9:16)
C(57:64) = B(57) * A(1:8) + B(58) * A(9:16)
! fold column 3 of A into every column of C
C(1:8) = C(1:8) + B(3) * A(17:24)
C(9:16) = C(9:16) + B(11) * A(17:24)
C(17:24) = C(17:24) + B(19) * A(17:24)
C(25:32) = C(25:32) + B(27) * A(17:24)
C(33:40) = C(33:40) + B(35) * A(17:24)
C(41:48) = C(41:48) + B(43) * A(17:24)
C(49:56) = C(49:56) + B(51) * A(17:24)
C(57:64) = C(57:64) + B(59) * A(17:24)
! fold column 4 of A into every column of C
C(1:8) = C(1:8) + B(4) * A(25:32)
C(9:16) = C(9:16) + B(12) * A(25:32)
C(17:24) = C(17:24) + B(20) * A(25:32)
C(25:32) = C(25:32) + B(28) * A(25:32)
C(33:40) = C(33:40) + B(36) * A(25:32)
C(41:48) = C(41:48) + B(44) * A(25:32)
C(49:56) = C(49:56) + B(52) * A(25:32)
C(57:64) = C(57:64) + B(60) * A(25:32)
! fold column 5 of A into every column of C
C(1:8) = C(1:8) + B(5) * A(33:40)
C(9:16) = C(9:16) + B(13) * A(33:40)
C(17:24) = C(17:24) + B(21) * A(33:40)
C(25:32) = C(25:32) + B(29) * A(33:40)
C(33:40) = C(33:40) + B(37) * A(33:40)
C(41:48) = C(41:48) + B(45) * A(33:40)
C(49:56) = C(49:56) + B(53) * A(33:40)
C(57:64) = C(57:64) + B(61) * A(33:40)
! fold column 6 of A into every column of C
C(1:8) = C(1:8) + B(6) * A(41:48)
C(9:16) = C(9:16) + B(14) * A(41:48)
C(17:24) = C(17:24) + B(22) * A(41:48)
C(25:32) = C(25:32) + B(30) * A(41:48)
C(33:40) = C(33:40) + B(38) * A(41:48)
C(41:48) = C(41:48) + B(46) * A(41:48)
C(49:56) = C(49:56) + B(54) * A(41:48)
C(57:64) = C(57:64) + B(62) * A(41:48)
! fold column 7 of A into every column of C
C(1:8) = C(1:8) + B(7) * A(49:56)
C(9:16) = C(9:16) + B(15) * A(49:56)
C(17:24) = C(17:24) + B(23) * A(49:56)
C(25:32) = C(25:32) + B(31) * A(49:56)
C(33:40) = C(33:40) + B(39) * A(49:56)
C(41:48) = C(41:48) + B(47) * A(49:56)
C(49:56) = C(49:56) + B(55) * A(49:56)
C(57:64) = C(57:64) + B(63) * A(49:56)
! fold column 8 of A into every column of C
C(1:8) = C(1:8) + B(8) * A(57:64)
C(9:16) = C(9:16) + B(16) * A(57:64)
C(17:24) = C(17:24) + B(24) * A(57:64)
C(25:32) = C(25:32) + B(32) * A(57:64)
C(33:40) = C(33:40) + B(40) * A(57:64)
C(41:48) = C(41:48) + B(48) * A(57:64)
C(49:56) = C(49:56) + B(56) * A(57:64)
C(57:64) = C(57:64) + B(64) * A(57:64)
end subroutine mul8x8
end module mul8mod
This resulted in the 15 ns runtime for gfortran
.
Flang was very slow here, though (>50 ns). It again looks like Flang is assuming aliasing, because it is constantly storing and reloading:
vmovupd (%rdi), %zmm0
vmovupd 64(%rdi), %zmm1
vmulpd 8(%rsi){1to8}, %zmm1, %zmm1
vfmadd231pd (%rsi){1to8}, %zmm0, %zmm1
vmovupd %zmm1, (%rdx)
.loc 1 14 1 # mul8module8block.f90:14:1
vmovupd (%rdi), %zmm0
vmovupd 64(%rdi), %zmm1
vmulpd 72(%rsi){1to8}, %zmm1, %zmm1
vfmadd231pd 64(%rsi){1to8}, %zmm0, %zmm1
vmovupd %zmm1, 64(%rdx)
.loc 1 15 1 # mul8module8block.f90:15:1
vmovupd (%rdi), %zmm0
vmovupd 64(%rdi), %zmm1
vmulpd 136(%rsi){1to8}, %zmm1, %zmm1
vfmadd231pd 128(%rsi){1to8}, %zmm0, %zmm1
vmovupd %zmm1, 128(%rdx)
yikes! Every time it stores into rdx
, it is acting like rdi
could've changed so that it has to reload.
I don't know much about compiler implementations or internals, but isn't this something that can be fixed on the front end? I've seen discussions from the Julia developers about "TBAA" tags used to tell LLVM about aliasing info https://llvm.org/docs/LangRef.html#tbaa-metadata
Please find below the output from perf stat on two very simple test programs. This is on AArch64 (Cavium TX2-T99). gfortran is significantly faster than flang.
program main character (len=26) :: abc = "The Quick Brown Fox " print "(a,$)", "'" print "(a,$)", trim(abc) print "(a,$)", "'" write (*,*) end program main
GCC (gfortran) 7.3 flang 6.0.1
GCC CFLAGS: -O3 -mcpu=thunderx2t99 flang CFLAGS: -O3 -mcpu=thunderx2t99
perf stat -e task-clock,cycles,instructions,branches -d -d -d:
Performance counter stats for 'trim-gcc-73':
Performance counter stats for 'trim-flang':
================================================
program main implicit none
character(len=44) :: s1 character(len=44) :: s2
s1 = "The Quick Brown Fox Jumps Over The Lazy Dog" s2 = s1
print "(a,$)", "'" print "(a,$)", s1 print "(a,$)", "'" write (*,*)
print "(a,$)", "'" print "(a,$)", s2 print "(a,$)", "'" write (*,*)
print "(a,$)", "'" print "(a,$)", trim(s1) print "(a,$)", "'" write (*,*)
print "(a,$)", "'" print "(a,$)", trim(s2) print "(a,$)", "'" write (*,*) end program main
GCC (gfortran) 7.3 flang 6.0.1
GCC CFLAGS: -O3 -mcpu=thunderx2t99 flang CFLAGS: -O3 -mcpu=thunderx2t99
perf stat -e task-clock,cycles,instructions,branches -d -d -d:
Performance counter stats for 'str-gcc-73':
Performance counter stats for 'str-flang':