Use standard language constructs to guide vectorisation

Use the standard `do concurrent` construct instead of compiler directives to promote vectorisation of loops that access scratch arrays through pointers.

For example, with the Fujitsu compiler, the optimisation listing for the original loop reports only prefetching and no SIMD vectorisation:

         67                         call neko_scratch_registry%request_field(temp1, temp_indices(1))
         68                         call neko_scratch_registry%request_field(temp2, temp_indices(2))
         69                         call neko_scratch_registry%request_field(temp3, temp_indices(3))
         70
                              <<< Loop-information Start >>>
                              <<<  [OPTIMIZATION]
                              <<<    PREFETCH(HARD) Expected by compiler :
                              <<<      (unknown)
                              <<< Loop-information  End >>>
         71     1                   do i = 1, n
         72     1                      temp1%x(i,1,1,1) = ext_coeffs(2) * fx_lag%x(i,1,1,1) + &
         73     1                                         ext_coeffs(3) * fx_laglag%x(i,1,1,1)
         74     1                      temp2%x(i,1,1,1) = ext_coeffs(2) * fy_lag%x(i,1,1,1) + &
         75     1                                         ext_coeffs(3) * fy_laglag%x(i,1,1,1)
         76     1                      temp3%x(i,1,1,1) = ext_coeffs(2) * fz_lag%x(i,1,1,1) + &
         77     1                                         ext_coeffs(3) * fz_laglag%x(i,1,1,1)
         78     1                   end do
         79

After switching to `do concurrent`, the same loop is reported as SIMD-vectorised and software-pipelined:

         67                         call neko_scratch_registry%request_field(temp1, temp_indices(1))
         68                         call neko_scratch_registry%request_field(temp2, temp_indices(2))
         69                         call neko_scratch_registry%request_field(temp3, temp_indices(3))
         70
                              <<< Loop-information Start >>>
                              <<<  [OPTIMIZATION]
                              <<<    SIMD(VL: 8)
                              <<<    SOFTWARE PIPELINING(IPC: 1.30, ITR: 120, MVE: 8, POL: S)
                              <<<    PREFETCH(HARD) Expected by compiler :
                              <<<      (unknown)
                              <<< Loop-information  End >>>
         71     1        v          do concurrent (i = 1:n)
         72     1        v             temp1%x(i,1,1,1) = ext_coeffs(2) * fx_lag%x(i,1,1,1) + &
         73     1                                         ext_coeffs(3) * fx_laglag%x(i,1,1,1)
         74     1        v             temp2%x(i,1,1,1) = ext_coeffs(2) * fy_lag%x(i,1,1,1) + &
         75     1                                         ext_coeffs(3) * fy_laglag%x(i,1,1,1)
         76     1        v             temp3%x(i,1,1,1) = ext_coeffs(2) * fz_lag%x(i,1,1,1) + &
         77     1                                         ext_coeffs(3) * fz_laglag%x(i,1,1,1)
         78     1        v          end do
         79

ExtremeFLOW / neko

Use standard language constructs to guide vectorisation #1290