Closed zerothi closed 5 years ago
This was partially fixed in 563aa76; however, the performance increase is only a factor of slightly above 2?
Either: 1) My initial implementation was really fast, or 2) My Cython implementation is ridiculously slow (it is more or less a copy paste from TranSiesta)
I have just tried to move the calculation of the phases like this:
diff --git a/sisl/physics/_bloch.pyx b/sisl/physics/_bloch.pyx
index 0538e3e9..3227a33d 100644
--- a/sisl/physics/_bloch.pyx
+++ b/sisl/physics/_bloch.pyx
@@ -44,7 +44,7 @@ def bloch_unfold(np.ndarray[np.int32_t, ndim=1, mode='c'] B,
@cython.initializedcheck(False)
cdef void _unfold_M64(const double w,
const Py_ssize_t B0, const Py_ssize_t B1, const Py_ssize_t B2,
- const double k0, const double k1, const double k2,
+ const double k0, const double k1, const double k2, # with 2pi
const Py_ssize_t N1, const Py_ssize_t N2,
const float complex[:, ::1] m,
float complex[:, ::1] M) nogil:
@@ -138,12 +138,14 @@ def _unfold64(const int[::1] B, const double[:, ::1] K,
@cython.initializedcheck(False)
cdef void _unfold_M128(const double w,
const Py_ssize_t B0, const Py_ssize_t B1, const Py_ssize_t B2,
- const double k0, const double k1, const double k2,
+ const double k0, const double k1, const double k2, # with 2pi
const Py_ssize_t N1, const Py_ssize_t N2,
+ double complex[:, :, ::1] phase,
const double complex[:, ::1] m,
double complex[:, ::1] M) nogil:
cdef Py_ssize_t j0, j1, j2 # looping the output rows
+ cdef Py_ssize_t i0, i1, i2 # looping the output rows
cdef Py_ssize_t i, j # looping m[j, i]
cdef Py_ssize_t I, J # looping output M[J, I]
@@ -151,47 +153,75 @@ cdef void _unfold_M128(const double w,
cdef double complex[::1] MJ, mj
# Phase handling variables
- cdef double rph
- cdef double complex aph0, aph1, aph2
- cdef double complex ph, ph0, ph1, ph2
+ cdef double complex ph
- # Construct the phases to be added
- aph0 = aph1 = aph2 = 1.
- if B0 > 1:
- aph0 = cos(k0) + 1j * sin(k0)
- if B1 > 1:
- aph1 = cos(k1) + 1j * sin(k1)
- if B2 > 1:
- aph2 = cos(k2) + 1j * sin(k2)
+ # Setup all phases in the temporary array
+ # These phases are
+ _setup_phase_128(w, B0, B1, B2, k0, k1, k2, phase)
J = 0
for j2 in range(B2):
for j1 in range(B1):
for j0 in range(B0):
- rph = - j0 * k0 - j1 * k1 - j2 * k2
- ph = w * cos(rph) + 1j * (w * sin(rph))
for j in range(N1):
- # Every column starts from scratch
- ph2 = ph
-
# Retrieve sub-arrays that we are to write too
mj = m[j]
MJ = M[J]
I = 0
- for _ in range(B2):
- ph1 = ph2
- for _ in range(B1):
- ph0 = ph1
- for _ in range(B0):
+ for i2 in range(B2):
+ for i1 in range(B1):
+ for i0 in range(B0):
+ ph = phase[i2, i1, i0]
for i in range(N2):
- MJ[I] = MJ[I] + mj[i] * ph0
+ MJ[I] = MJ[I] + mj[i] * ph
I += 1
- ph0 = ph0 * aph0
- ph1 = ph1 * aph1
- ph2 = ph2 * aph2
J += 1
-
+ # Move all phases one step back
+ _add_phase_128(B0, B1, B2, -k0, phase)
+ _add_phase_128(B0, B1, B2, k0 * <double> B0 - k1, phase)
+ _add_phase_128(B0, B1, B2, k1 * <double> B1 - k2, phase)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void _setup_phase_128(const double w,
+ const Py_ssize_t B0, const Py_ssize_t B1, const Py_ssize_t B2,
+ const double k0, const double k1, const double k2,
+ double complex[:, :, ::1] phase) nogil:
+
+ cdef Py_ssize_t i0, i1, i2
+
+ # Phase handling variables
+ cdef double rph2, rph21, rph
+
+ for i2 in range(B2):
+ rph2 = i2 * k2
+ for i1 in range(B1):
+ rph21 = rph2 + i1 * k1
+ for i0 in range(B0):
+ rph = rph21 + i0 * k0
+ phase[i2, i1, i0] = w * cos(rph) + 1j * w * sin(rph)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void _add_phase_128(const Py_ssize_t B0, const Py_ssize_t B1, const Py_ssize_t B2,
+ const double k,
+ double complex[:, :, ::1] phase) nogil:
+
+ cdef Py_ssize_t i0, i1, i2
+
+ # Phase added
+ cdef double complex ph = cos(k) + 1j * sin(k)
+
+ for i2 in range(B2):
+ for i1 in range(B1):
+ for i0 in range(B0):
+ phase[i2, i1, i0] = phase[i2, i1, i0] * ph
+
@cython.cdivision(True)
@cython.boundscheck(False)
@@ -210,6 +240,7 @@ def _unfold128(const int[::1] B, const double[:, ::1] K,
cdef np.ndarray[np.complex128_t, ndim=2, mode='c'] M = np.zeros([N * N1, N * N2], dtype=np.complex128)
# Get view
cdef double complex[:, ::1] MM = M
+ cdef double complex[:, :, ::1] phase = np.empty([B2, B1, B0], dtype=np.complex128)
cdef double pi2 = pi * 2
cdef double k0, k1, k2
@@ -222,7 +253,7 @@ def _unfold128(const int[::1] B, const double[:, ::1] K,
k0 = K[I, 0] * pi2
k1 = K[I, 1] * pi2
k2 = K[I, 2] * pi2
- _unfold_M128(w, B0, B1, B2, k0, k1, k2, N1, N2, m[I], MM)
+ _unfold_M128(w, B0, B1, B2, k0, k1, k2, N1, N2, phase, m[I], MM)
return M
This gave less than a 1% performance increase. The only logical next step would be to branch the calculations as was done in the pure-python code
Branch bloch via this:
diff --git a/sisl/physics/_bloch.pyx b/sisl/physics/_bloch.pyx
index 0538e3e9..f13e9560 100644
--- a/sisl/physics/_bloch.pyx
+++ b/sisl/physics/_bloch.pyx
@@ -136,12 +136,12 @@ def _unfold64(const int[::1] B, const double[:, ::1] K,
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.initializedcheck(False)
-cdef void _unfold_M128(const double w,
- const Py_ssize_t B0, const Py_ssize_t B1, const Py_ssize_t B2,
- const double k0, const double k1, const double k2,
- const Py_ssize_t N1, const Py_ssize_t N2,
- const double complex[:, ::1] m,
- double complex[:, ::1] M) nogil:
+cdef void _unfold3_M128(const double w,
+ const Py_ssize_t B0, const Py_ssize_t B1, const Py_ssize_t B2,
+ const double k0, const double k1, const double k2,
+ const Py_ssize_t N1, const Py_ssize_t N2,
+ const double complex[:, ::1] m,
+ double complex[:, ::1] M) nogil:
cdef Py_ssize_t j0, j1, j2 # looping the output rows
cdef Py_ssize_t i, j # looping m[j, i]
@@ -193,6 +193,108 @@ cdef void _unfold_M128(const double w,
J += 1
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void _unfold2_M128(const double w,
+ const Py_ssize_t B0, const Py_ssize_t B1,
+ const double k0, const double k1,
+ const Py_ssize_t N1, const Py_ssize_t N2,
+ const double complex[:, ::1] m,
+ double complex[:, ::1] M) nogil:
+
+ cdef Py_ssize_t j0, j1 # looping the output rows
+ cdef Py_ssize_t i, j # looping m[j, i]
+ cdef Py_ssize_t I, J # looping output M[J, I]
+
+ # Faster memory views
+ cdef double complex[::1] MJ, mj
+
+ # Phase handling variables
+ cdef double rph
+ cdef double complex aph0, aph1
+ cdef double complex ph, ph0, ph1
+
+ # Construct the phases to be added
+ aph0 = aph1 = 1.
+ if B0 > 1:
+ aph0 = cos(k0) + 1j * sin(k0)
+ if B1 > 1:
+ aph1 = cos(k1) + 1j * sin(k1)
+
+ J = 0
+ for j1 in range(B1):
+ for j0 in range(B0):
+ rph = - j0 * k0 - j1 * k1
+ ph = w * cos(rph) + 1j * (w * sin(rph))
+ for j in range(N1):
+ # Every column starts from scratch
+ ph1 = ph
+
+ # Retrieve sub-arrays that we are to write too
+ mj = m[j]
+ MJ = M[J]
+
+ I = 0
+ for _ in range(B1):
+ ph0 = ph1
+ for _ in range(B0):
+ for i in range(N2):
+ MJ[I] = MJ[I] + mj[i] * ph0
+ I += 1
+ ph0 = ph0 * aph0
+ ph1 = ph1 * aph1
+ J += 1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.initializedcheck(False)
+cdef void _unfold1_M128(const double w,
+ const Py_ssize_t B0,
+ const double k0,
+ const Py_ssize_t N1, const Py_ssize_t N2,
+ const double complex[:, ::1] m,
+ double complex[:, ::1] M) nogil:
+
+ cdef Py_ssize_t j0 # looping the output rows
+ cdef Py_ssize_t i, j # looping m[j, i]
+ cdef Py_ssize_t I, J # looping output M[J, I]
+
+ # Faster memory views
+ cdef double complex[::1] MJ, mj
+
+ # Phase handling variables
+ cdef double rph
+ cdef double complex aph0
+ cdef double complex ph, ph0
+
+ # Construct the phases to be added
+ aph0 = 1.
+ if B0 > 1:
+ aph0 = cos(k0) + 1j * sin(k0)
+
+ J = 0
+ for j0 in range(B0):
+ rph = - j0 * k0
+ ph = w * cos(rph) + 1j * (w * sin(rph))
+ for j in range(N1):
+ # Every column starts from scratch
+ ph0 = ph
+
+ # Retrieve sub-arrays that we are to write too
+ mj = m[j]
+ MJ = M[J]
+
+ I = 0
+ for _ in range(B0):
+ for i in range(N2):
+ MJ[I] = MJ[I] + mj[i] * ph0
+ I += 1
+ ph0 = ph0 * aph0
+ J += 1
+
+
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
@@ -217,12 +319,42 @@ def _unfold128(const int[::1] B, const double[:, ::1] K,
cdef double w = 1. / N
cdef Py_ssize_t I
- # Now perform expansion
- for I in range(N):
- k0 = K[I, 0] * pi2
- k1 = K[I, 1] * pi2
- k2 = K[I, 2] * pi2
- _unfold_M128(w, B0, B1, B2, k0, k1, k2, N1, N2, m[I], MM)
+ if B0 == 1:
+ if B1 == 1:
+ for I in range(N):
+ k2 = K[I, 2] * pi2
+ _unfold1_M128(w, B2, k2, N1, N2, m[I], MM)
+ elif B2 == 1:
+ for I in range(N):
+ k1 = K[I, 1] * pi2
+ _unfold1_M128(w, B1, k1, N1, N2, m[I], MM)
+ else:
+ for I in range(N):
+ k1 = K[I, 1] * pi2
+ k2 = K[I, 2] * pi2
+ _unfold2_M128(w, B1, B2, k1, k2, N1, N2, m[I], MM)
+ elif B1 == 1:
+ if B2 == 1:
+ for I in range(N):
+ k0 = K[I, 0] * pi2
+ _unfold1_M128(w, B0, k0, N1, N2, m[I], MM)
+ else:
+ for I in range(N):
+ k0 = K[I, 0] * pi2
+ k2 = K[I, 2] * pi2
+ _unfold2_M128(w, B0, B2, k0, k2, N1, N2, m[I], MM)
+ elif B2 == 1:
+ # B0 > 1 and B1 > 1
+ for I in range(N):
+ k0 = K[I, 0] * pi2
+ k1 = K[I, 1] * pi2
+ _unfold2_M128(w, B0, B1, k0, k1, N1, N2, m[I], MM)
+ else:
+ for I in range(N):
+ k0 = K[I, 0] * pi2
+ k1 = K[I, 1] * pi2
+ k2 = K[I, 2] * pi2
+ _unfold3_M128(w, B0, B1, B2, k0, k1, k2, N1, N2, m[I], MM)
return M
This gave a slightly larger speed-up than the previous attempt.
Last thing to do, is to do it in fortran (since there OpenMP with collapse statements work!)
I have now also tried fortran, to no avail:
!> Bloch-expansion driver exposed to Python through f2py.
!> Unfolds the nk k-dependent small-cell matrices M into the expanded
!> (n1*nk x n2*nk) matrix A by delegating to bloch_dim_z.
subroutine bloch_z(B, nk, K, n1, n2, M, A)
implicit none
! Double-precision real kind (>= 15 significant digits)
integer, parameter :: dp = selected_real_kind(p=15)
! Input parameters
! B  : Bloch expansion count along each of the 3 lattice directions
integer, intent(in) :: B(3)
! nk : total number of k-points (expected to equal B(1)*B(2)*B(3))
integer, intent(in) :: nk
! K  : k-points in reduced coordinates, one column per k-point
real(dp), intent(in) :: K(3,nk)
! n1, n2 : row/column dimension of each small-cell matrix
integer, intent(in) :: n1, n2
complex(dp), intent(in) :: M(n1,n2,nk)
complex(dp), intent(out) :: A(n1*nk, n2*nk)
! Define f2py intents
!f2py intent(in) :: B
!f2py intent(in) :: nk, K
!f2py intent(in) :: n1, n2
!f2py intent(in) :: M
!f2py intent(out) :: A
! Forward with B unpacked into scalars; K and M are passed by their first
! element and re-interpreted with shapes (3,B1,B2,B3) / (n1,n2,B1,B2,B3)
! inside bloch_dim_z via sequence association
! (assumes nk == B(1)*B(2)*B(3) -- TODO confirm at the Python call site).
call bloch_dim_z(B(1), B(2), B(3), nk, K(1,1), n1, n2, M(1,1,1), A(1,1))
end subroutine bloch_z
!> Bloch-unfold the nk = B1*B2*B3 k-dependent matrices M into A.
!>
!> For every k-point c and every pair of expansion cells i = (i1,i2,i3),
!> j = (j1,j2,j3), the block A(i-block, j-block) accumulates
!>   w * exp(i 2pi k.(r_i - r_j)) * M(:,:,c)   with w = 1/nk.
!>
!> Fixes relative to the previous version:
!>  * tasks are now created per output column block (j1,j2,j3) with the
!>    k-point sum performed inside the task; previously one task per
!>    (c,j) pair let tasks with equal j but different k-point race on
!>    the same elements of A,
!>  * the column-block offset uses the column stride n2 (A has n2*nk
!>    columns); the previous n1 stride was wrong whenever n1 /= n2,
!>  * wq is computed once before the parallel region instead of being
!>    written by every thread inside it.
subroutine bloch_dim_z(B1, B2, B3, nk, K, n1, n2, M, A)
  !$ use omp_lib, only: omp_get_wtime
  implicit none
  integer, parameter :: dp = selected_real_kind(p=15)
  real(dp), parameter :: Pi = 3.14159265358979323846264338327950288419716939937510_dp
  ! Input parameters
  integer, intent(in) :: B1, B2, B3 ! expansion along each lattice direction
  integer, intent(in) :: nk         ! nk == B1*B2*B3
  real(dp), intent(in) :: K(3, B1, B2, B3) ! k-points, reduced coordinates
  integer, intent(in) :: n1, n2
  complex(dp), intent(in) :: M(n1, n2, B1, B2, B3) ! small-cell matrix per k
  complex(dp), intent(out) :: A(n1*nk, n2*nk)      ! expanded output matrix
  ! Local variables
  integer :: c1, c2, c3          ! k-point (summation) indices
  integer :: iA, iM, i1, i2, i3  ! output row / input row / row-block indices
  integer :: jA, jM, j1, j2, j3  ! output col / input col / col-block indices
  ! Phases are calculated using double precision due to phase accumulations
  complex(dp) :: p(3), qPi
  real(dp) :: rPi(3), wq, t

  !$ t = omp_get_wtime()

  ! log(w): folding the weight into the real part of the complex
  ! exponential saves one multiplication per phase.
  wq = log(1._dp / real(nk, dp))

  !$OMP parallel default(shared), &
  !$OMP& private(c1, c2, c3, rPi, qPi, p), &
  !$OMP& private(iA, iM, i1, i2, i3), &
  !$OMP& private(jA, jM, j1, j2, j3)

  !$OMP workshare
  A(:,:) = cmplx(0._dp, 0._dp, dp)
  !$OMP end workshare

  !$OMP single
  ! One task per output column block; the k-point sum is done inside the
  ! task so no two tasks ever write the same element of A (tasking over
  ! the k-points as well would introduce a data race on A).
  do j3 = 1, B3
    do j2 = 1, B2
      do j1 = 1, B1
        !$OMP task firstprivate(j1, j2, j3), &
        !$OMP& private(c1, c2, c3, rPi, qPi, p, iA, jA, iM, jM, i1, i2, i3)
        do c3 = 1, B3
          do c2 = 1, B2
            do c1 = 1, B1
              ! 2*pi*k for this k-point
              rPi(:) = K(:,c1,c2,c3) * Pi * 2._dp
              ! Recurring phase step along the fastest (i1) direction
              qPi = exp(cmplx(0._dp, rPi(1), dp))
              ! w * exp(-i 2pi k . r_j); w enters through wq = log(w)
              p(3) = exp(cmplx(wq, -j1*rPi(1) - j2*rPi(2) - j3*rPi(3), dp))
              ! First column of this (j1,j2,j3) block; the block stride
              ! along columns is n2, not n1
              jA = (( (j3-1)*B2 + (j2-1) ) * B1 + (j1-1)) * n2
              do jM = 1, n2
                jA = jA + 1
                iA = 0
                do i3 = 1, B3
                  p(2) = p(3) * exp(cmplx(0._dp, i3*rPi(3), dp))
                  do i2 = 1, B2
                    p(1) = p(2) * exp(cmplx(0._dp, i2*rPi(2), dp))
                    do i1 = 1, B1
                      ! Accumulate phase instead of recomputing exp()
                      p(1) = p(1) * qPi
                      do iM = 1, n1
                        iA = iA + 1
                        A(iA,jA) = A(iA,jA) + p(1) * M(iM,jM,c1,c2,c3)
                      end do
                    end do
                  end do
                end do
              end do
            end do
          end do
        end do
        !$OMP end task
      end do
    end do
  end do
  !$OMP end single nowait
  !$OMP end parallel
  !$ print*, omp_get_wtime() - t
end subroutine bloch_dim_z
It really seems that the routine needs a closer look in general: a 10x10 expansion
of matrices with 64 orbitals takes around 10 seconds...
Describe the feature: The Bloch expansion has to be done in Cython; it is so immensely slow :)