ExtremeFLOW / neko

/ᐠ. 。.ᐟ\ᵐᵉᵒʷˎˊ˗
https://neko.cfd/
Other
158 stars 27 forks source link

Use standard language constructs to guide vectorisation #1290

Closed njansson closed 1 month ago

njansson commented 1 month ago

Use the standard `do concurrent` construct instead of compiler directives to promote vectorisation of loops that access scratch arrays through pointers.

For example, with the Fujitsu compiler, the optimisation listing before the change shows the following:

         67                         call neko_scratch_registry%request_field(temp1, temp_indices(1))
         68                         call neko_scratch_registry%request_field(temp2, temp_indices(2))
         69                         call neko_scratch_registry%request_field(temp3, temp_indices(3))
         70
                              <<< Loop-information Start >>>
                              <<<  [OPTIMIZATION]
                              <<<    PREFETCH(HARD) Expected by compiler :
                              <<<      (unknown)
                              <<< Loop-information  End >>>
         71     1                   do i = 1, n
         72     1                      temp1%x(i,1,1,1) = ext_coeffs(2) * fx_lag%x(i,1,1,1) + &
         73     1                                         ext_coeffs(3) * fx_laglag%x(i,1,1,1)
         74     1                      temp2%x(i,1,1,1) = ext_coeffs(2) * fy_lag%x(i,1,1,1) + &
         75     1                                         ext_coeffs(3) * fy_laglag%x(i,1,1,1)
         76     1                      temp3%x(i,1,1,1) = ext_coeffs(2) * fz_lag%x(i,1,1,1) + &
         77     1                                         ext_coeffs(3) * fz_laglag%x(i,1,1,1)
         78     1                   end do
         79

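After switching to `do concurrent`, the listing for the same loop reports SIMD vectorisation and software pipelining: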

         67                         call neko_scratch_registry%request_field(temp1, temp_indices(1))
         68                         call neko_scratch_registry%request_field(temp2, temp_indices(2))
         69                         call neko_scratch_registry%request_field(temp3, temp_indices(3))
         70
                              <<< Loop-information Start >>>
                              <<<  [OPTIMIZATION]
                              <<<    SIMD(VL: 8)
                              <<<    SOFTWARE PIPELINING(IPC: 1.30, ITR: 120, MVE: 8, POL: S)
                              <<<    PREFETCH(HARD) Expected by compiler :
                              <<<      (unknown)
                              <<< Loop-information  End >>>
         71     1        v          do concurrent (i = 1:n)
         72     1        v             temp1%x(i,1,1,1) = ext_coeffs(2) * fx_lag%x(i,1,1,1) + &
         73     1                                         ext_coeffs(3) * fx_laglag%x(i,1,1,1)
         74     1        v             temp2%x(i,1,1,1) = ext_coeffs(2) * fy_lag%x(i,1,1,1) + &
         75     1                                         ext_coeffs(3) * fy_laglag%x(i,1,1,1)
         76     1        v             temp3%x(i,1,1,1) = ext_coeffs(2) * fz_lag%x(i,1,1,1) + &
         77     1                                         ext_coeffs(3) * fz_laglag%x(i,1,1,1)
         78     1        v          end do
         79