ExtremeFLOW / neko

/ᐠ. 。.ᐟ\ᵐᵉᵒʷˎˊ˗
https://neko.cfd/
Other
170 stars 30 forks source link

Stress tensor operator #768

Closed timofeymukha closed 1 year ago

timofeymukha commented 1 year ago

I'm going to start looking at this at a somewhat slow pace. So far I tried to determine the scope of the work, based on Nek5000. In plan4.f there is the following relevant stuff:

timofeymukha commented 1 year ago

@njansson Starting slow, with just providing a routine for the stress. Here is a first try.

subroutine stress_tensor(sij, u, v, w, coef)
    type(field_t), intent(in) :: u, v, w
    type(coef_t), intent(in) :: coef
    real(kind=rp), intent(out) :: sij(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv, 6)
    real(kind=rp) :: dudx(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dudy(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dudz(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dvdx(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dvdy(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dvdz(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dwdx(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dwdy(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dwdz(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)

    integer :: i, e

    call opgrad(dudx, dudy, dudz, u%x, coef)
    call opgrad(dvdx, dvdy, dvdz, v%x, coef)
    call opgrad(dwdx, dwdy, dwdz, w%x, coef)

    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 1) = 2 * dudx(i, 1, 1, e)
        sij(i, 1, 1, e, 2) = 2 * dvdy(i, 1, 1, e)
        sij(i, 1, 1, e, 3) = 2 * dwdz(i, 1, 1, e)
        sij(i, 1, 1, e, 4) = dudy(i, 1, 1, e) + dvdx(i, 1, 1, e)
        sij(i, 1, 1, e, 5) = dvdz(i, 1, 1, e) + dwdy(i, 1, 1, e)
        sij(i, 1, 1, e, 6) = dudz(i, 1, 1, e) + dwdx(i, 1, 1, e)
       enddo
     enddo

  end subroutine stress_tensor

I guess this eats memory due to computing all the derivatives and storing them. But maybe good in terms of number of times kernels are called? Not sure what one should optimize for nowadays, is memory size an issue?

timofeymukha commented 1 year ago

New version with 1 temprary. Not very pretty maybe, but works?

!> Compute double the strain rate tensor, i.e du_i/dx_j + du_j/dx_i
  !! Similar to comp_sij in Nek5000, with the same component ordering
  !! Note that here the component is the last index in sij array though.
  subroutine strain_rate(sij, u, v, w, coef)
    type(field_t), intent(in) :: u, v, w
    type(coef_t), intent(in) :: coef
    real(kind=rp), intent(out) :: sij(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv, 6)
    real(kind=rp) :: du(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)

    integer :: i, e

    ! du/dx
    call dudxyz (du, u%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 1) = 2 * du(i, 1, 1, e)
       enddo
    enddo

    ! du/dy
    call dudxyz (du, u%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 4) = du(i, 1, 1, e)
       enddo
    enddo

    ! du/dz
    call dudxyz (du, u%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
       enddo
    enddo

    ! dv/dx
    call dudxyz (du, v%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 4) = sij(i, 1, 1, e, 4) + du(i, 1, 1, e)
       enddo
    enddo

    ! dv/dy
    call dudxyz (du, v%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 2) = 2 * du(i, 1, 1, e)
       enddo
    enddo

    ! dv/dz
    call dudxyz (du, v%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 5) = du(i, 1, 1, e)
       enddo
    enddo

    ! dw/dx
    call dudxyz (du, w%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 6) = sij(i, 1, 1, e, 6) + du(i, 1, 1, e)
       enddo
    enddo

    ! dw/dy
    call dudxyz (du, w%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 5) = sij(i, 1, 1, e, 5) + du(i, 1, 1, e)
       enddo
    enddo

    ! dw/dz
    call dudxyz (du, w%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 3) = 2 * du(i, 1, 1, e)
       enddo
    enddo

  end subroutine strain_rate
MartinKarp commented 1 year ago

From a performance perspective, likely the first one you made was faster as you compute all the derivatives at once, enabling more data locality per element(and I think looks a bit nicer). As you say, the memory footprint is larger, you could do a mix of these two, where you have three arrays that you reuse for opgrad instead. ``

timofeymukha commented 1 year ago

@MartinKarp Yeah, it is a balance... But now I am scared about this weak grad vs strong grad, because I don't know what the difference is :-)).

njansson commented 1 year ago

@njansson Starting slow, with just providing a routine for the stress. Here is a first try.

subroutine stress_tensor(sij, u, v, w, coef)
    type(field_t), intent(in) :: u, v, w
    type(coef_t), intent(in) :: coef
    real(kind=rp), intent(out) :: sij(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv, 6)
    real(kind=rp) :: dudx(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dudy(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dudz(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dvdx(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dvdy(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dvdz(v%Xh%lx, v%Xh%ly, v%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dwdx(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dwdy(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)
    real(kind=rp) :: dwdz(w%Xh%lx, w%Xh%ly, w%Xh%lz, u%msh%nelv)

    integer :: i, e

    call opgrad(dudx, dudy, dudz, u%x, coef)
    call opgrad(dvdx, dvdy, dvdz, v%x, coef)
    call opgrad(dwdx, dwdy, dwdz, w%x, coef)

    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        sij(i, 1, 1, e, 1) = 2 * dudx(i, 1, 1, e)
        sij(i, 1, 1, e, 2) = 2 * dvdy(i, 1, 1, e)
        sij(i, 1, 1, e, 3) = 2 * dwdz(i, 1, 1, e)
        sij(i, 1, 1, e, 4) = dudy(i, 1, 1, e) + dvdx(i, 1, 1, e)
        sij(i, 1, 1, e, 5) = dvdz(i, 1, 1, e) + dwdy(i, 1, 1, e)
        sij(i, 1, 1, e, 6) = dudz(i, 1, 1, e) + dwdx(i, 1, 1, e)
       enddo
     enddo

  end subroutine stress_tensor

I guess this eats memory due to computing all the derivatives and storing them. But maybe good in terms of number of times kernels are called? Not sure what one should optimize for nowadays, is memory size an issue?

But how would we call this? Can't we require the dud... dvd... dwd... to be called as argument, and let the call site take care of the memory? Then we would at least avoid these large temporaries

MartinKarp commented 1 year ago

Actually, the nicest would be if you stored the stress tensor as 6 different fields, similar to the geometric factors. That way you would require 0 temporaries.

njansson commented 1 year ago

Actually, the nicest would be if you stored the stress tensor as 6 different fields, similar to the geometric factors. That way you would require 0 temporaries.

Exactly, that was what I tried to suggest, leave the data mgmt for the call site / consumer, and keep the kernels free

timofeymukha commented 1 year ago

@njansson @MartinKarp OK. The function will eventually be used in the pressure residual. So I we can deffer the allocation there.

njansson commented 1 year ago

@njansson @MartinKarp OK. The function will eventually be used in the pressure residual. So I we can deffer the allocation there.

Good! 🍻 How about something that inherit from pnpn_prs_res_t, then we can pick the one with additional fields inside the pnpn_res_fctry depending on the case + parameters

njansson commented 1 year ago

@njansson @MartinKarp OK. The function will eventually be used in the pressure residual. So I we can deffer the allocation there.

Good! 🍻 How about something that inherit from pnpn_prs_res_t, then we can pick the one with additional fields inside the pnpn_res_fctry depending on the case + parameters

We can add a simple logical stress to the residual factory, then add all the cases for each backend

timofeymukha commented 1 year ago

OK, something like this. To have 0 temporaries, I guess we need some work arrays as well.

 subroutine strain_rate(s11, s22, s33, s12, s13, s23, &
                         wrk1, wrk2, u, v, w, coef)
    type(field_t), intent(in) :: u, v, w
    type(coef_t), intent(in) :: coef
    real(kind=rp), intent(inout) :: s11(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s22(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s33(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s12(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s23(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s13(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: wrk1(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: wrk2(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)

    integer :: i, e
    call opgrad(s11, s12, s13, u%x, coef)
    call opgrad(wrk1, s22, s23, v%x, coef)

    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        s11(i, 1, 1, e) = 2 * s11(i, 1, 1, e) ! 2* du/dx
        s22(i, 1, 1, e) = 2 * s22(i, 1, 1, e) ! 2* dv/dy
        s12(i, 1, 1, e) = s12(i, 1, 1, e) + wrk1(i, 1, 1, e) ! du/dy + dv/dx
       enddo
    enddo
    call opgrad(wrk1, wrk2, s33, w%x, coef)

    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        s33(i, 1, 1, e) = 2 * s33(i, 1, 1, e) ! 2* dw/dz
        s13(i, 1, 1, e) = s13(i, 1, 1, e) + wrk1(i, 1, 1, e) ! du/dz + dz/dx
        s23(i, 1, 1, e) = s23(i, 1, 1, e) + wrk2(i, 1, 1, e) ! dv/dz + dw/dy
       enddo
    enddo

  end subroutine strain_rate
timofeymukha commented 1 year ago

Or with no work arrays, but no opgrad

  !> Compute double the strain rate tensor, i.e du_i/dx_j + du_j/dx_i
  !! Similar to comp_sij in Nek5000.
  subroutine strain_rate(s11, s22, s33, s12, s13, s23, &
                         u, v, w, coef)
    type(field_t), intent(in) :: u, v, w
    type(coef_t), intent(in) :: coef
    real(kind=rp), intent(inout) :: s11(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s22(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s33(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s12(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s23(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)
    real(kind=rp), intent(inout) :: s13(u%Xh%lx, u%Xh%ly, u%Xh%lz, u%msh%nelv)

    integer :: i, e

    ! we use s11 as a work array here
    call dudxyz (s12, u%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
    call dudxyz (s11, v%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        s12(i, 1, 1, e) = s12(i, 1, 1, e) + s11(i, 1, 1, e) ! du/dy + dv/dx
       enddo
    enddo

    call dudxyz (s13, u%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
    call dudxyz (s11, w%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        s13(i, 1, 1, e) = s13(i, 1, 1, e) + s11(i, 1, 1, e) ! du/dz + dw/dx
       enddo
    enddo

    call dudxyz (s23, v%x, coef%drdz, coef%dsdz, coef%dtdz, coef)
    call dudxyz (s11, w%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        s23(i, 1, 1, e) = s23(i, 1, 1, e) + s11(i, 1, 1, e) ! dv/dz + dw/dy
       enddo
    enddo

    call dudxyz (s11, u%x, coef%drdx, coef%dsdx, coef%dtdx, coef)
    call dudxyz (s22, v%x, coef%drdy, coef%dsdy, coef%dtdy, coef)
    call dudxyz (s33, w%x, coef%drdz, coef%dsdz, coef%dtdz, coef)

    do e=1,u%msh%nelv
      do i=1,u%Xh%lxyz
        s11(i, 1, 1, e) = 2 * s11(i, 1, 1, e) ! 2* du/dx
        s22(i, 1, 1, e) = 2 * s22(i, 1, 1, e) ! 2* dv/dy
        s33(i, 1, 1, e) = 2 * s33(i, 1, 1, e) ! 2* dv/dy
       enddo
    enddo

  end subroutine strain_rat
timofeymukha commented 1 year ago

This issue got a bit polluted with all the stress operator stuff. I will close it, and open a new one.