Closed timofeymukha closed 1 year ago
@njansson Starting slow, with just providing a routine for the stress. Here is a first try.
subroutine stress_tensor(sij, u, v, w, coef)
type(field_t), intent(in) :: u, v, w
type(coef_t), intent(in) :: coef
real(kind=rp), intent(out) :: sij(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv, 6)
real(kind=rp) :: dudx(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp) :: dudy(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp) :: dudz(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp) :: dvdx(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
real(kind=rp) :: dvdy(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
real(kind=rp) :: dvdz(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
real(kind=rp) :: dwdx(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
real(kind=rp) :: dwdy(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
real(kind=rp) :: dwdz(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
integer :: i, e
call opgrad(dudx, dudy, dudz, u%x, coef)
call opgrad(dvdx, dvdy, dvdz, v%x, coef)
call opgrad(dwdx, dwdy, dwdz, w%x, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 1) = 2 * dudx(i, 1, 1, e)
sij(i, 1, 1, e, 2) = 2 * dvdy(i, 1, 1, e)
sij(i, 1, 1, e, 3) = 2 * dwdz(i, 1, 1, e)
sij(i, 1, 1, e, 4) = dudy(i, 1, 1, e) + dvdx(i, 1, 1, e)
sij(i, 1, 1, e, 5) = dvdz(i, 1, 1, e) + dwdy(i, 1, 1, e)
sij(i, 1, 1, e, 6) = dudz(i, 1, 1, e) + dwdx(i, 1, 1, e)
enddo
enddo
end subroutine stress_tensor
I guess this eats memory due to computing all the derivatives and storing them. But maybe good in terms of number of times kernels are called? Not sure what one should optimize for nowadays, is memory size an issue?
New version with 1 temprary. Not very pretty maybe, but works?
!> Compute double the strain rate tensor, i.e du_i/dx_j + du_j/dx_i
!! Similar to comp_sij in Nek5000, with the same component ordering
!! Note that here the component is the last index in sij array though.
subroutine strain_rate(sij, u, v, w, coef)
type(field_t), intent(in) :: u, v, w
type(coef_t), intent(in) :: coef
real(kind=rp), intent(out) :: sij(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv, 6)
real(kind=rp) :: du(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
integer :: i, e
! du/dx
call dudxyz (du, u%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 1) = 2 * du(i, 1, 1, e)
enddo
enddo
! du/dy
call dudxyz (du, u%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 4) = du(i, 1, 1, e)
enddo
enddo
! du/dz
call dudxyz (du, u%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
enddo
enddo
! dv/dx
call dudxyz (du, v%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 4) = sij(i, 1, 1, e, 4) + du(i, 1, 1, e)
enddo
enddo
! dv/dy
call dudxyz (du, v%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 2) = 2 * du(i, 1, 1, e)
enddo
enddo
! dv/dz
call dudxyz (du, v%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 5) = du(i, 1, 1, e)
enddo
enddo
! dw/dx
call dudxyz (du, w%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 6) = sij(i, 1, 1, e, 6) + du(i, 1, 1, e)
enddo
enddo
! dw/dy
call dudxyz (du, w%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 5) = sij(i, 1, 1, e, 5) + du(i, 1, 1, e)
enddo
enddo
! dw/dz
call dudxyz (du, w%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
sij(i, 1, 1, e, 3) = 2 * du(i, 1, 1, e)
enddo
enddo
end subroutine strain_rate
From a performance perspective, likely the first one you made was faster as you compute all the derivatives at once, enabling more data locality per element(and I think looks a bit nicer). As you say, the memory footprint is larger, you could do a mix of these two, where you have three arrays that you reuse for opgrad instead. ``
@MartinKarp Yeah, it is a balance... But now I am scared about this weak grad vs strong grad, because I don't know what the difference is :-)).
@njansson Starting slow, with just providing a routine for the stress. Here is a first try.
subroutine stress_tensor(sij, u, v, w, coef) type(field_t), intent(in) :: u, v, w type(coef_t), intent(in) :: coef real(kind=rp), intent(out) :: sij(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv, 6) real(kind=rp) :: dudx(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv) real(kind=rp) :: dudy(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv) real(kind=rp) :: dudz(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv) real(kind=rp) :: dvdx(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv) real(kind=rp) :: dvdy(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv) real(kind=rp) :: dvdz(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv) real(kind=rp) :: dwdx(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv) real(kind=rp) :: dwdy(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv) real(kind=rp) :: dwdz(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv) integer :: i, e call opgrad(dudx, dudy, dudz, u%x, coef) call opgrad(dvdx, dvdy, dvdz, v%x, coef) call opgrad(dwdx, dwdy, dwdz, w%x, coef) do e=1,u%msh%nelv do i=1,u%Xh%lxyz sij(i, 1, 1, e, 1) = 2 * dudx(i, 1, 1, e) sij(i, 1, 1, e, 2) = 2 * dvdy(i, 1, 1, e) sij(i, 1, 1, e, 3) = 2 * dwdz(i, 1, 1, e) sij(i, 1, 1, e, 4) = dudy(i, 1, 1, e) + dvdx(i, 1, 1, e) sij(i, 1, 1, e, 5) = dvdz(i, 1, 1, e) + dwdy(i, 1, 1, e) sij(i, 1, 1, e, 6) = dudz(i, 1, 1, e) + dwdx(i, 1, 1, e) enddo enddo end subroutine stress_tensor
I guess this eats memory due to computing all the derivatives and storing them. But maybe good in terms of number of times kernels are called? Not sure what one should optimize for nowadays, is memory size an issue?
But how would we call this? Can't we require the dud...
dvd...
dwd...
to be called as argument, and let the call site take care of the memory? Then we would at least avoid these large temporaries
Actually, the nicest would be if you stored the stress tensor as 6 different fields, similar to the geometric factors. That way you would require 0 temporaries.
Actually, the nicest would be if you stored the stress tensor as 6 different fields, similar to the geometric factors. That way you would require 0 temporaries.
Exactly, that was what I tried to suggest, leave the data mgmt for the call site / consumer, and keep the kernels free
@njansson @MartinKarp OK. The function will eventually be used in the pressure residual. So I we can deffer the allocation there.
@njansson @MartinKarp OK. The function will eventually be used in the pressure residual. So I we can deffer the allocation there.
Good! 🍻 How about something that inherit from pnpn_prs_res_t
, then we can pick the one with additional fields inside the pnpn_res_fctry
depending on the case + parameters
@njansson @MartinKarp OK. The function will eventually be used in the pressure residual. So I we can deffer the allocation there.
Good! 🍻 How about something that inherit from
pnpn_prs_res_t
, then we can pick the one with additional fields inside thepnpn_res_fctry
depending on the case + parameters
We can add a simple logical stress
to the residual factory, then add all the cases for each backend
OK, something like this. To have 0 temporaries, I guess we need some work arrays as well.
subroutine strain_rate(s11, s22, s33, s12, s13, s23, &
wrk1, wrk2, u, v, w, coef)
type(field_t), intent(in) :: u, v, w
type(coef_t), intent(in) :: coef
real(kind=rp), intent(inout) :: s11(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s22(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s33(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s12(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s23(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s13(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: wrk1(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: wrk2(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
integer :: i, e
call opgrad(s11, s12, s13, u%x, coef)
call opgrad(wrk1, s22, s23, v%x, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
s11(i, 1, 1, e) = 2 * s11(i, 1, 1, e) ! 2* du/dx
s22(i, 1, 1, e) = 2 * s22(i, 1, 1, e) ! 2* dv/dy
s12(i, 1, 1, e) = s12(i, 1, 1, e) + wrk1(i, 1, 1, e) ! du/dy + dv/dx
enddo
enddo
call opgrad(wrk1, wrk2, s33, w%x, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
s33(i, 1, 1, e) = 2 * s33(i, 1, 1, e) ! 2* dw/dz
s13(i, 1, 1, e) = s13(i, 1, 1, e) + wrk1(i, 1, 1, e) ! du/dz + dz/dx
s23(i, 1, 1, e) = s23(i, 1, 1, e) + wrk2(i, 1, 1, e) ! dv/dz + dw/dy
enddo
enddo
end subroutine strain_rate
Or with no work arrays, but no opgrad
!> Compute double the strain rate tensor, i.e du_i/dx_j + du_j/dx_i
!! Similar to comp_sij in Nek5000.
subroutine strain_rate(s11, s22, s33, s12, s13, s23, &
u, v, w, coef)
type(field_t), intent(in) :: u, v, w
type(coef_t), intent(in) :: coef
real(kind=rp), intent(inout) :: s11(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s22(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s33(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s12(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s23(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
real(kind=rp), intent(inout) :: s13(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
integer :: i, e
! we use s11 as a work array here
call dudxyz (s12, u%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
call dudxyz (s11, v%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
s12(i, 1, 1, e) = s12(i, 1, 1, e) + s11(i, 1, 1, e) ! du/dy + dv/dx
enddo
enddo
call dudxyz (s13, u%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
call dudxyz (s11, w%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
s13(i, 1, 1, e) = s13(i, 1, 1, e) + s11(i, 1, 1, e) ! du/dz + dw/dx
enddo
enddo
call dudxyz (s23, v%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
call dudxyz (s11, w%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
s23(i, 1, 1, e) = s23(i, 1, 1, e) + s11(i, 1, 1, e) ! dv/dz + dw/dy
enddo
enddo
call dudxyz (s11, u%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
call dudxyz (s22, v%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
call dudxyz (s33, w%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
do e=1,u%msh%nelv
do i=1,u%Xh%lxyz
s11(i, 1, 1, e) = 2 * s11(i, 1, 1, e) ! 2* du/dx
s22(i, 1, 1, e) = 2 * s22(i, 1, 1, e) ! 2* dv/dy
s33(i, 1, 1, e) = 2 * s33(i, 1, 1, e) ! 2* dv/dy
enddo
enddo
end subroutine strain_rat
This issue got a bit polluted with all the stress operator stuff. I will close it, and open a new one.
I'm going to start looking at this at a somewhat slow pace. So far I tried to determine the scope of the work, based on Nek5000. In
plan4.f
there is the following relevant stuff:crespsp
.cresvsp
orcresvsp_weak
.ophx
callsaxhmsf
insubs1.f
, which computes the coupled Helmholtz matrix-vector products.hmhzsf
called inophinv