Comparing Fortran and CTF performance on symmetries in tensor contractions

Lancashire3000 commented 2 years ago

Hi,

I made a comparison for the contraction A[a,b] B[b,c,d,e] = C[a,c,d,e], where B[:,:,d,e] = B[:,:,e,d]. I used Fortran by loop over external indices d,e to utilize the symmetry then dgemm. In CTF, I tried to set ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS]). Both Fortran (gfortran) and CTF used openblas library. The loop over indices strategy appears (does not mean first) in tensor contraction engine J. Phys. Chem. A 2003, 107, 9887-9897 and in nwchem http://gitlab.hpcrl.cse.ohio-state.edu/jinsung/nwchem-cogent-master/-/blob/eac3c06962a89c597fab04aa19bcf9c3989b3ae4/nwchem/src/tce/ccsd/cc2_t2.F

Here is the Fortran code

Program test_contraction
  !A[a,b] * B[b,c,d,e] = C[a,c,d,e]

    integer, parameter :: dp = selected_real_kind(15, 307)

    Real( dp ), Dimension( :, :    ), Allocatable :: a
    Real( dp ), Dimension( :, :, :, : ), Allocatable :: b0, b
    Real( dp ), Dimension( :, :, :, : ), Allocatable :: c0, c1, c2

    Integer :: na, nb, nc, nd, ne, m, n, m_iter
    Integer :: ia, ib, ic, id, ie  
    Integer :: start, finish, rate
    real(dp) :: sum_time

    Write( *, * ) 'na, nb, nc, nd, ne ?'
    Read( *, * ) na, nb, nc, nd, ne

    Allocate( a ( 1:na, 1:nb ) ) 
    Allocate( b0 ( 1:nb, 1:nc, 1:nd, 1:ne ) )
    Allocate( b ( 1:nb, 1:nc, 1:nd, 1:ne ) )

    Allocate( c0( 1:na, 1:nc, 1:nd, 1:ne ) )   
    Allocate( c1( 1:na, 1:nc, 1:nd, 1:ne ) )   
    Allocate( c2( 1:na, 1:nc, 1:nd, 1:ne ) )  

    Call Random_number( a )
    Call Random_number( b0 )
    c1 = 0.0_dp
    c2 = 0.0_dp

    m_iter = 20

    write (*,*) 'm_iter average', m_iter 

    !symmetrize
    do id = 1, nd
      do ie = 1, ne
        b(:, :, id, ie) = ( b0(:, :, id, ie) + b0(:, :, ie, id) )*0.5
      end do  
    end do  

! refrence value by nested loops
  sum_time = 0.0
  do m = 1, m_iter
    c0 = 0.0_dp
    Call System_clock( start, rate )
    do ie = 1, ne 
      do id = 1, nd
        do ic = 1, nc
          do ib = 1, nb
            do ia = 1, na
              c0(ia, ic, id, ie) = c0(ia, ic, id, ie)  + a(ia, ib) * b(ib, ic, id, ie)
            end do  
          end do  
        end do
      end do
    end do  
    Call System_clock( finish, rate )
    sum_time = sum_time + Real( finish - start, dp ) / rate  
  end do
  Write( *, * ) 'Time for naive loop edcba', sum_time / m_iter 

  sum_time = 0.0  
  do m = 1, m_iter     
    do ie = 1, ne 
      do id = 1, nd
        Call System_clock( start, rate )
        Call dgemm( 'N', 'N', na, nb, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
                                            b(:,:,id, ie) , Size( b , Dim = 1 ), &
                                            0.0_dp, c1(:,:,id, ie), Size( c1, Dim = 1 ) )
      !c5(:,id)                                      
        Call System_clock( finish, rate )
        sum_time = sum_time + Real( finish - start, dp ) / rate
      end do    
    end do
  end do  
  Write( *, * ) 'Time for loop dgemm-2', sum_time / m_iter

  sum_time = 0.0  
  do m = 1, m_iter     
    do ie = 1, ne 
      do id = 1, ie
        Call System_clock( start, rate )
        Call dgemm( 'N', 'N', na, nb, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
                                            b(:,:,id, ie) , Size( b , Dim = 1 ), &
                                            0.0_dp, c2(:,:,id, ie), Size( c2, Dim = 1 ) )
      !c5(:,id)                                      
        Call System_clock( finish, rate )
        sum_time = sum_time + Real( finish - start, dp ) / rate

        c2(:,:,ie, id) = c2(:,:,id, ie) 

      end do    
    end do
  end do  
  Write( *, * ) 'Time for loop dgemm-symmetry', sum_time / m_iter

  do ia = 1, na
    do ic = 1, nc
      do id = 1, nd
        do ie = 1, ne

          if ( dabs(c1(ia,ic,id,ie) - c0(ia,ic,id,ie))  > 1.e-6 ) then 
            write (*,*) '!!! c1', ia,ic,id,ie, c0(ia,ic,id,ie), c1(ia,ic,id,ie)
          endif   

          if ( dabs(c2(ia,ic,id,ie) - c0(ia,ic,id,ie))  > 1.e-6 ) then 
            write (*,*) '!!! c2', ia,ic,id,ie, c2(ia,ic,id,ie), c1(ia,ic,id,ie)
          endif      

        enddo 
      enddo
    enddo
  enddo  

end

Here is the CTF called from python

import numpy as np
import time
import scipy.stats
import warnings
#import timeit
import ctf

warnings.filterwarnings("ignore", category=DeprecationWarning)
# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, var_name, unit, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)

    print(var_name, round(m, 5), "\u00B1",  round(h, 5), unit )

print('ab,bcde->acd')
# A[a,b] * B[b,c,d,e] = C[a,c,d,e]
na = nb = nc = nd = ne = dim_comm = 32
#nt = 10
n_times = 64

print('number of common dimension', dim_comm)
print('number of averaged time', n_times)

A = np.random.random((na,nb))
B = np.random.random((nb,nc,nd,ne))
B_symm= np.zeros((na,nc,nd,ne))
# symmetrize B
for id in range(nd):
    for ie in range(ne):            
        B_symm[:,:,id,ie] = (B[:,:,id,ie] + B[:,:,ie,id]) * 0.5

N = dim_comm

C_ref = np.einsum('ab,bcde->acde', A, B_symm)

tA = ctf.tensor([N,N],sym=[ctf.SYM.NS,ctf.SYM.NS])
tB2 = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tB2_symm = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])

tC = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tC_symm = ctf.tensor([N,N,N,N],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])

tA = ctf.from_nparray(A)
tB2 = ctf.from_nparray(B_symm)
tB2_symm = ctf.from_nparray(B_symm)

list_time = []

for i in range(n_times):
    start_time = time.time()
    tC = ctf.einsum('ab,bcde->acde', tA, tB2) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    

mean_confidence_interval(list_time, 'ctf', 's' ) 

list_time = []

for i in range(n_times):
    start_time = time.time()
    tC_symm = ctf.einsum('ab,bcde->acde', tA, tB2_symm) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    

mean_confidence_interval(list_time, 'ctf-symm', 's' ) 

tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))

The results are, Fortran

 na, nb, nc, nd, ne ?
32 32 32 32 32
 m_iter average          20
 Time for naive loop edcba  0.31620000000000004
 Time for loop dgemm-2   6.9000000000000051E-003
 Time for loop dgemm-symmetry   3.5500000000000024E-003

Python

ab,bcde->acd
number of common dimension 32
number of averaged time 64
ctf 0.00809 ± 0.00031 s
ctf-symm 0.00863 ± 0.00054 s
check ctf True
check ctf symm True

It seems I can get ~ 90% speed up by symmetry in Fortran, but ~ 0 in CTF. If I use python to do similar loops over external indices, with einsum as the Fortran code did, the results will be much slower (>a factor of 10). Is my setting on CTF correct?

From intel people, the efficiency of calling MKL from Fotran and C++ is similar https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/performance-of-MKL-by-c-or-fortran/td-p/936921 I suppose it holds for openblas. So I think if I compare with C++ with dgemm and python loaded CTF, the results will be similar.

jeffhammond commented 2 years ago

Isn't numpy row major? Do you need to flip the indices on the Python version to get the equivalent data access pattern as Fortran?

In any case, I'm afraid that a contracted dimension of 32 is about an order-of-magnitude too small to saturate the compute capability of a modern Intel CPU (K=384 saturates, according to former colleagues in the MKL team), so I'm afraid you are limited by bandwidth limitations and other overheads.

It would be very interesting to perform the CTF comparisons on a very contractions, particularly the more expensive N^6 terms in CCSD. The so-called "four-particle ladder" term (https://github.com/nwchemgit/nwchem/blob/master/src/tce/ccsd/ccsd_t2_8.F) is the most expensive in TCE's CCSD, for a reasonable number of virtual orbitals.

All the terms in CC2 are N^5 (like MP2) and aren't super interesting from a compute perspective, because all the efficient implementations of CC2 and MP2 don't store the two-electron integrals in the fully transformed representation.

Unrelated to CTF, I had some fun porting your Fortran test to use GPUs using NVIDIA StdPar support (maps DO CONCURRENT and MATMUL to OpenACC and CUTENSOR, respectively). I'm happy to share and discuss that, but this isn't the right place for it.

Lancashire3000 commented 2 years ago

Hi, thank you for your comments and suggestions.

I tried to increase the contracted dimension to 384, others 32, the output is

ctf 0.05176 ± 0.00117 s
ctf-symm 0.05084 ± 0.00063 s

I can transpose the array A before the contraction, the result is similar

ctf 0.0558 ± 0.00089 s
ctf-symm 0.0546 ± 0.00062 s

The revised Python and Fortran code (admittedly it is messy) for different dimensions in each index

import numpy as np
import time
import scipy.stats
import warnings
#import timeit
import ctf

warnings.filterwarnings("ignore", category=DeprecationWarning)
# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
def mean_confidence_interval(data, var_name, unit, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)

    print(var_name, round(m, 5), "\u00B1",  round(h, 5), unit )

print('ab,bcde->acd')
# A[a,b] * B[b,c,d,e] = C[a,c,d,e]
na = nc = nd = ne = 32
nb = 384
#nt = 10
n_times = 128

#print('number of common dimension', dim_comm)
print('number of averaged time', n_times)

A = np.random.random((na,nb))
B = np.random.random((nb,nc,nd,ne))
B_symm= np.zeros((nb,nc,nd,ne))
# symmetrize B
for id in range(nd):
    for ie in range(ne):            
        B_symm[:,:,id,ie] = (B[:,:,id,ie] + B[:,:,ie,id]) * 0.5

#N = dim_comm

C_ref = np.einsum('ab,bcde->acde', A, B_symm)

tA = ctf.tensor([na,nb],sym=[ctf.SYM.NS,ctf.SYM.NS])
tAT = ctf.tensor([nb,na],sym=[ctf.SYM.NS,ctf.SYM.NS])

tB2 = ctf.tensor([nb,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tB2_symm = ctf.tensor([nb,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])

tC = ctf.tensor([na,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.NS,ctf.SYM.NS])
tC_symm = ctf.tensor([na,nc,nd,ne],sym=[ctf.SYM.NS, ctf.SYM.NS, ctf.SYM.SY,ctf.SYM.NS])

tA = ctf.from_nparray(A)
tAT  = ctf.from_nparray(np.transpose(A))
tB2 = ctf.from_nparray(B_symm)
tB2_symm = ctf.from_nparray(B_symm)

list_time = []

for i in range(n_times):
    start_time = time.time()
    tC = ctf.einsum('ab,bcde->acde', tA, tB2) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    

mean_confidence_interval(list_time, 'ctf', 's' ) 

list_time = []

for i in range(n_times):
    start_time = time.time()
    tC_symm = ctf.einsum('ab,bcde->acde', tA, tB2_symm) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    

mean_confidence_interval(list_time, 'ctf-symm', 's' ) 

tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))

list_time = []

for i in range(n_times):
    start_time = time.time()
    tC = ctf.einsum('ba,bcde->acde', tAT, tB2) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    

mean_confidence_interval(list_time, 'ctf', 's' ) 

list_time = []

for i in range(n_times):
    start_time = time.time()
    tC_symm = ctf.einsum('ba,bcde->acde', tAT, tB2_symm) 
    finish_time = time.time()

    list_time.append(finish_time - start_time)    

mean_confidence_interval(list_time, 'ctf-symm', 's' ) 

tC_temp = ctf.to_nparray(tC)
print('check ctf',np.allclose(C_ref,tC_temp))
tC_symm_temp = ctf.to_nparray(tC_symm)
print('check ctf symm',np.allclose(C_ref,tC_symm_temp))

Program test_contraction
  !A[a,b] * B[b,c,d,e] = C[a,c,d,e]

    integer, parameter :: dp = selected_real_kind(15, 307)

    Real( dp ), Dimension( :, :    ), Allocatable :: a
    Real( dp ), Dimension( :, :, :, : ), Allocatable :: b0, b
    Real( dp ), Dimension( :, :, :, : ), Allocatable :: c0, c1, c2

    Integer :: na, nb, nc, nd, ne, m, n, m_iter
    Integer :: ia, ib, ic, id, ie  
    Integer :: start, finish, rate
    real(dp) :: sum_time

    Write( *, * ) 'na, nb, nc, nd, ne ?'
    Read( *, * ) na, nb, nc, nd, ne

    Allocate( a ( 1:na, 1:nb ) ) 
    Allocate( b0 ( 1:nb, 1:nc, 1:nd, 1:ne ) )
    Allocate( b ( 1:nb, 1:nc, 1:nd, 1:ne ) )

    Allocate( c0( 1:na, 1:nc, 1:nd, 1:ne ) )   
    Allocate( c1( 1:na, 1:nc, 1:nd, 1:ne ) )   
    Allocate( c2( 1:na, 1:nc, 1:nd, 1:ne ) )  

    Call Random_number( a )
    Call Random_number( b0 )
    c1 = 0.0_dp
    c2 = 0.0_dp

    m_iter = 10

    write (*,*) 'm_iter average', m_iter 

    !symmetrize
    do id = 1, nd
      do ie = 1, ne
        b(:, :, id, ie) = ( b0(:, :, id, ie) + b0(:, :, ie, id) )*0.5
      end do  
    end do  

! refrence value by nested loops
  sum_time = 0.0
  do m = 1, m_iter
    c0 = 0.0_dp
    Call System_clock( start, rate )
    do ie = 1, ne 
      do id = 1, nd
        do ic = 1, nc
          do ib = 1, nb
            do ia = 1, na
              c0(ia, ic, id, ie) = c0(ia, ic, id, ie)  + a(ia, ib) * b(ib, ic, id, ie)
            end do  
          end do  
        end do
      end do
    end do  
    Call System_clock( finish, rate )
    sum_time = sum_time + Real( finish - start, dp ) / rate  
  end do
  Write( *, * ) 'Time for naive loop edcba', sum_time / m_iter 

  sum_time = 0.0  
  do m = 1, m_iter     
    do ie = 1, ne 
      do id = 1, nd
        Call System_clock( start, rate )
        Call dgemm( 'N', 'N', na, nc, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
                                            b(:,:,id, ie) , Size( b , Dim = 1 ), &
                                            0.0_dp, c1(:,:,id, ie), Size( c1, Dim = 1 ) )
      !c5(:,id)                                      
        Call System_clock( finish, rate )
        sum_time = sum_time + Real( finish - start, dp ) / rate
      end do    
    end do
  end do  
  Write( *, * ) 'Time for loop dgemm-2', sum_time / m_iter

  sum_time = 0.0  
  do m = 1, m_iter     
    do ie = 1, ne 
      do id = 1, ie
        Call System_clock( start, rate )
        Call dgemm( 'N', 'N', na, nc, nb, 1.0_dp, a , Size( a , Dim = 1 ), &
                                            b(:,:,id, ie) , Size( b , Dim = 1 ), &
                                            0.0_dp, c2(:,:,id, ie), Size( c2, Dim = 1 ) )
      !c5(:,id)                                      
        Call System_clock( finish, rate )
        sum_time = sum_time + Real( finish - start, dp ) / rate

        c2(:,:,ie, id) = c2(:,:,id, ie) 

      end do    
    end do
  end do  
  Write( *, * ) 'Time for loop dgemm-symmetry', sum_time / m_iter

  do ia = 1, na
    do ic = 1, nc
      do id = 1, nd
        do ie = 1, ne

          if ( dabs(c1(ia,ic,id,ie) - c0(ia,ic,id,ie))  > 1.e-6 ) then 
            write (*,*) '!!! c1', ia,ic,id,ie, c0(ia,ic,id,ie), c1(ia,ic,id,ie)
          endif   

          if ( dabs(c2(ia,ic,id,ie) - c0(ia,ic,id,ie))  > 1.e-6 ) then 
            write (*,*) '!!! c2', ia,ic,id,ie, c2(ia,ic,id,ie), c1(ia,ic,id,ie)
          endif      

        enddo 
      enddo
    enddo
  enddo  

end

cyclops-community / ctf

Comparing Fortran and CTF performance on symmetries in tensor contractions #136