Open Lancashire3000 opened 2 years ago
Isn't numpy row major? Do you need to flip the indices on the Python version to get the equivalent data access pattern as Fortran?
In any case, I'm afraid that a contracted dimension of 32 is about an order-of-magnitude too small to saturate the compute capability of a modern Intel CPU (K=384 saturates, according to former colleagues in the MKL team), so I'm afraid you are limited by bandwidth limitations and other overheads.
It would be very interesting to perform the CTF comparisons on a very contractions, particularly the more expensive N^6 terms in CCSD. The so-called "four-particle ladder" term (https://github.com/nwchemgit/nwchem/blob/master/src/tce/ccsd/ccsd_t2_8.F) is the most expensive in TCE's CCSD, for a reasonable number of virtual orbitals.
All the terms in CC2 are N^5 (like MP2) and aren't super interesting from a compute perspective, because all the efficient implementations of CC2 and MP2 don't store the two-electron integrals in the fully transformed representation.
Unrelated to CTF, I had some fun porting your Fortran test to use GPUs using NVIDIA StdPar support (maps DO CONCURRENT
and MATMUL
to OpenACC and CUTENSOR, respectively). I'm happy to share and discuss that, but this isn't the right place for it.
Hi, thank you for your comments and suggestions.
I tried to increase the contracted dimension to 384, others 32, the output is
ctf 0.05176 ± 0.00117 s
ctf-symm 0.05084 ± 0.00063 s
I can transpose the array A
before the contraction, the result is similar
ctf 0.0558 ± 0.00089 s
ctf-symm 0.0546 ± 0.00062 s
The revised Python
and Fortran
code (admittedly it is messy) for different dimensions in each index
import numpy as np
import time
import scipy.stats
import warnings
#import timeit
import ctf
warnings.filterwarnings("ignore", category=DeprecationWarning)
# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, var_name, unit, confidence=0.95):
a = 1.0 * np.array(data)
n = len(a)
m, se = np.mean(a), scipy.stats.sem(a)
h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
print(var_name, round(m, 5), "\u00B1", round(h, 5), unit )
print('ab,bcde->acd')
# A[a,b] * B[b,c,d,e] = C[a,c,d,e]
na = nc = nd = ne = 32
nb = 384
#nt = 10
n_times = 128
#print('number of common dimension', dim_comm)
print('number of averaged time', n_times)
A = np.random.random((na,nb))
B = np.random.random((nb,nc,nd,ne))
B_symm= np.zeros((nb,nc,nd,ne))
# symmetrize B
for id in range(nd):
for ie in range(ne):
B_symm[:,:,id,ie] = (B[:,:,id,ie] + B[:,:,ie,id]) * 0.5
#N = dim_comm
C_ref = np.einsum('ab,bcde->acde', A, B_symm)
tA = ctf.tensor([na,nb],sym=[ctf.SYM.NS,ctf.SYM.NS])
tAT = ctf.tensor([nb,na],sym=[ctf.SYM.NS,ctf.SYM.NS])
tB2 = ctf.tensor([nb,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tB2_symm = ctf.tensor([nb,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])
tC = ctf.tensor([na,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tC_symm = ctf.tensor([na,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])
tA = ctf.from_nparray(A)
tAT = ctf.from_nparray(np.transpose(A))
tB2 = ctf.from_nparray(B_symm)
tB2_symm = ctf.from_nparray(B_symm)
list_time = []
for i in range(n_times):
start_time = time.time()
tC = ctf.einsum('ab,bcde->acde', tA, tB2)
finish_time = time.time()
list_time.append(finish_time - start_time)
mean_confidence_interval(list_time, 'ctf', 's' )
list_time = []
for i in range(n_times):
start_time = time.time()
tC_symm = ctf.einsum('ab,bcde->acde', tA, tB2_symm)
finish_time = time.time()
list_time.append(finish_time - start_time)
mean_confidence_interval(list_time, 'ctf-symm', 's' )
tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))
list_time = []
for i in range(n_times):
start_time = time.time()
tC = ctf.einsum('ba,bcde->acde', tAT, tB2)
finish_time = time.time()
list_time.append(finish_time - start_time)
mean_confidence_interval(list_time, 'ctf', 's' )
list_time = []
for i in range(n_times):
start_time = time.time()
tC_symm = ctf.einsum('ba,bcde->acde', tAT, tB2_symm)
finish_time = time.time()
list_time.append(finish_time - start_time)
mean_confidence_interval(list_time, 'ctf-symm', 's' )
tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))
Program test_contraction
!A[a,b] * B[b,c,d,e] = C[a,c,d,e]
integer, parameter :: dp = selected_real_kind(15, 307)
Real( dp ), Dimension( :, : ), Allocatable :: a
Real( dp ), Dimension( :, :, :, : ), Allocatable :: b0, b
Real( dp ), Dimension( :, :, :, : ), Allocatable :: c0, c1, c2
Integer :: na, nb, nc, nd, ne, m, n, m_iter
Integer :: ia, ib, ic, id, ie
Integer :: start, finish, rate
real(dp) :: sum_time
Write( *, * ) 'na, nb, nc, nd, ne ?'
Read( *, * ) na, nb, nc, nd, ne
Allocate( a ( 1:na, 1:nb ) )
Allocate( b0 ( 1:nb, 1:nc, 1:nd, 1:ne ) )
Allocate( b ( 1:nb, 1:nc, 1:nd, 1:ne ) )
Allocate( c0( 1:na, 1:nc, 1:nd, 1:ne ) )
Allocate( c1( 1:na, 1:nc, 1:nd, 1:ne ) )
Allocate( c2( 1:na, 1:nc, 1:nd, 1:ne ) )
Call Random_number( a )
Call Random_number( b0 )
c1 = 0.0_dp
c2 = 0.0_dp
m_iter = 10
write (*,*) 'm_iter average', m_iter
!symmetrize
do id = 1, nd
do ie = 1, ne
b(:, :, id, ie) = ( b0(:, :, id, ie) + b0(:, :, ie, id) )*0.5
end do
end do
! refrence value by nested loops
sum_time = 0.0
do m = 1, m_iter
c0 = 0.0_dp
Call System_clock( start, rate )
do ie = 1, ne
do id = 1, nd
do ic = 1, nc
do ib = 1, nb
do ia = 1, na
c0(ia, ic, id, ie) = c0(ia, ic, id, ie) + a(ia, ib) * b(ib, ic, id, ie)
end do
end do
end do
end do
end do
Call System_clock( finish, rate )
sum_time = sum_time + Real( finish - start, dp ) / rate
end do
Write( *, * ) 'Time for naive loop edcba', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
do ie = 1, ne
do id = 1, nd
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nc, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
b(:,:,id, ie) , Size( b , Dim = 1 ), &
0.0_dp, c1(:,:,id, ie), Size( c1, Dim = 1 ) )
!c5(:,id)
Call System_clock( finish, rate )
sum_time = sum_time + Real( finish - start, dp ) / rate
end do
end do
end do
Write( *, * ) 'Time for loop dgemm-2', sum_time / m_iter
sum_time = 0.0
do m = 1, m_iter
do ie = 1, ne
do id = 1, ie
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, nc, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
b(:,:,id, ie) , Size( b , Dim = 1 ), &
0.0_dp, c2(:,:,id, ie), Size( c2, Dim = 1 ) )
!c5(:,id)
Call System_clock( finish, rate )
sum_time = sum_time + Real( finish - start, dp ) / rate
c2(:,:,ie, id) = c2(:,:,id, ie)
end do
end do
end do
Write( *, * ) 'Time for loop dgemm-symmetry', sum_time / m_iter
do ia = 1, na
do ic = 1, nc
do id = 1, nd
do ie = 1, ne
if ( dabs(c1(ia,ic,id,ie) - c0(ia,ic,id,ie)) > 1.e-6 ) then
write (*,*) '!!! c1', ia,ic,id,ie, c0(ia,ic,id,ie), c1(ia,ic,id,ie)
endif
if ( dabs(c2(ia,ic,id,ie) - c0(ia,ic,id,ie)) > 1.e-6 ) then
write (*,*) '!!! c2', ia,ic,id,ie, c2(ia,ic,id,ie), c1(ia,ic,id,ie)
endif
enddo
enddo
enddo
enddo
end
Hi,
I made a comparison for the contraction
A[a,b] B[b,c,d,e] = C[a,c,d,e]
, whereB[:,:,d,e] = B[:,:,e,d]
. I used Fortran by loop over external indicesd,e
to utilize the symmetry thendgemm
. InCTF
, I tried to setctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])
. Both Fortran (gfortran) andCTF
usedopenblas
library. The loop over indices strategy appears (does not mean first) in tensor contraction engine J. Phys. Chem. A 2003, 107, 9887-9897 and innwchem
http://gitlab.hpcrl.cse.ohio-state.edu/jinsung/nwchem-cogent-master/-/blob/eac3c06962a89c597fab04aa19bcf9c3989b3ae4/nwchem/src/tce/ccsd/cc2_t2.FHere is the Fortran code
Here is the
CTF
called from pythonThe results are, Fortran
Python
It seems I can get ~ 90% speed up by symmetry in Fortran, but ~ 0 in
CTF
. If I use python to do similar loops over external indices, witheinsum
as the Fortran code did, the results will be much slower (>a factor of 10). Is my setting onCTF
correct?From intel people, the efficiency of calling
MKL
from Fotran and C++ is similar https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/performance-of-MKL-by-c-or-fortran/td-p/936921 I suppose it holds foropenblas
. So I think if I compare with C++ with dgemm and python loadedCTF
, the results will be similar.