JeffersonLab / qphix-codegen

Code Generator for the QPhiX library, Wilson Fermions
http://jeffersonlab.github.io/qphix-codegen/
1 stars 4 forks source link

Twisted bc face packers cleanup #9

Closed kostrzewa closed 7 years ago

kostrzewa commented 7 years ago

This completes the implementation of twisted boundary conditions. @bjoo: Personally, I'm not quite sure whether we got the declarations quite right. Essentially, the following instructions are emitted:

__m256d tbc_phase_re = _mm256_setzero_pd();                                                                                                        
__m256d tbc_phase_im = _mm256_setzero_pd();

also in those kernels which don't do twisted boundary conditions (such as dslash_plus_body_double_double_v4_s2_12_ssss). If this is a problem, I can try to find a better solution.

kostrzewa commented 7 years ago

An example kernel (dslash_plus_body_double_double_v4_s2_12_ssst) with TBC in the time direction looks as follows:

__m256d b_S0_C0_RE = _mm256_setzero_pd();
__m256d b_S0_C0_IM = _mm256_setzero_pd();
__m256d b_S0_C1_RE = _mm256_setzero_pd();
__m256d b_S0_C1_IM = _mm256_setzero_pd();
__m256d b_S0_C2_RE = _mm256_setzero_pd();
__m256d b_S0_C2_IM = _mm256_setzero_pd();
__m256d b_S1_C0_RE = _mm256_setzero_pd();
__m256d b_S1_C0_IM = _mm256_setzero_pd();
__m256d b_S1_C1_RE = _mm256_setzero_pd();
__m256d b_S1_C1_IM = _mm256_setzero_pd();
__m256d b_S1_C2_RE = _mm256_setzero_pd();
__m256d b_S1_C2_IM = _mm256_setzero_pd();
__m256d ub_S0_C0_RE = _mm256_setzero_pd();
__m256d ub_S0_C0_IM = _mm256_setzero_pd();
__m256d ub_S0_C1_RE = _mm256_setzero_pd();
__m256d ub_S0_C1_IM = _mm256_setzero_pd();
__m256d ub_S0_C2_RE = _mm256_setzero_pd();
__m256d ub_S0_C2_IM = _mm256_setzero_pd();
__m256d ub_S1_C0_RE = _mm256_setzero_pd();
__m256d ub_S1_C0_IM = _mm256_setzero_pd();
__m256d ub_S1_C1_RE = _mm256_setzero_pd();
__m256d ub_S1_C1_IM = _mm256_setzero_pd();
__m256d ub_S1_C2_RE = _mm256_setzero_pd();
__m256d ub_S1_C2_IM = _mm256_setzero_pd();
__m256d u_00_re = _mm256_setzero_pd();
__m256d u_00_im = _mm256_setzero_pd();
__m256d u_01_re = _mm256_setzero_pd();
__m256d u_01_im = _mm256_setzero_pd();
__m256d u_02_re = _mm256_setzero_pd();
__m256d u_02_im = _mm256_setzero_pd();
__m256d u_10_re = _mm256_setzero_pd();
__m256d u_10_im = _mm256_setzero_pd();
__m256d u_11_re = _mm256_setzero_pd();
__m256d u_11_im = _mm256_setzero_pd();
__m256d u_12_re = _mm256_setzero_pd();
__m256d u_12_im = _mm256_setzero_pd();
__m256d u_20_re = _mm256_setzero_pd();
__m256d u_20_im = _mm256_setzero_pd();
__m256d u_21_re = _mm256_setzero_pd();
__m256d u_21_im = _mm256_setzero_pd();
__m256d u_22_re = _mm256_setzero_pd();
__m256d u_22_im = _mm256_setzero_pd();
__m256d psi_S0_RE = _mm256_setzero_pd();
__m256d psi_S0_IM = _mm256_setzero_pd();
__m256d psi_S1_RE = _mm256_setzero_pd();
__m256d psi_S1_IM = _mm256_setzero_pd();
__m256d tmp_1_re = _mm256_setzero_pd();
__m256d tmp_1_im = _mm256_setzero_pd();
__m256d tmp_2_re = _mm256_setzero_pd();
__m256d tmp_2_im = _mm256_setzero_pd();
__m256d tmp_3_re = _mm256_setzero_pd();
__m256d tmp_3_im = _mm256_setzero_pd();
__m256d tmp_4_re = _mm256_setzero_pd();
__m256d tmp_4_im = _mm256_setzero_pd();
__m256d zero = _mm256_setzero_pd();
zero = _mm256_setzero_pd(); 
__m256d out_S0_C0_RE = _mm256_setzero_pd();
__m256d out_S0_C0_IM = _mm256_setzero_pd();
__m256d out_S0_C1_RE = _mm256_setzero_pd();
__m256d out_S0_C1_IM = _mm256_setzero_pd();
__m256d out_S0_C2_RE = _mm256_setzero_pd();
__m256d out_S0_C2_IM = _mm256_setzero_pd();
__m256d out_S1_C0_RE = _mm256_setzero_pd();
__m256d out_S1_C0_IM = _mm256_setzero_pd();
__m256d out_S1_C1_RE = _mm256_setzero_pd();
__m256d out_S1_C1_IM = _mm256_setzero_pd();
__m256d out_S1_C2_RE = _mm256_setzero_pd();
__m256d out_S1_C2_IM = _mm256_setzero_pd();
__m256d out_S2_C0_RE = _mm256_setzero_pd();
__m256d out_S2_C0_IM = _mm256_setzero_pd();
__m256d out_S2_C1_RE = _mm256_setzero_pd();
__m256d out_S2_C1_IM = _mm256_setzero_pd();
__m256d out_S2_C2_RE = _mm256_setzero_pd();
__m256d out_S2_C2_IM = _mm256_setzero_pd();
__m256d out_S3_C0_RE = _mm256_setzero_pd();
__m256d out_S3_C0_IM = _mm256_setzero_pd();
__m256d out_S3_C1_RE = _mm256_setzero_pd();
__m256d out_S3_C1_IM = _mm256_setzero_pd();
__m256d out_S3_C2_RE = _mm256_setzero_pd();
__m256d out_S3_C2_IM = _mm256_setzero_pd();
out_S0_C0_RE = _mm256_setzero_pd(); 
out_S0_C0_IM = _mm256_setzero_pd(); 
out_S0_C1_RE = _mm256_setzero_pd(); 
out_S0_C1_IM = _mm256_setzero_pd(); 
out_S0_C2_RE = _mm256_setzero_pd(); 
out_S0_C2_IM = _mm256_setzero_pd(); 
out_S1_C0_RE = _mm256_setzero_pd(); 
out_S1_C0_IM = _mm256_setzero_pd(); 
out_S1_C1_RE = _mm256_setzero_pd(); 
out_S1_C1_IM = _mm256_setzero_pd(); 
out_S1_C2_RE = _mm256_setzero_pd(); 
out_S1_C2_IM = _mm256_setzero_pd(); 
out_S2_C0_RE = _mm256_setzero_pd(); 
out_S2_C0_IM = _mm256_setzero_pd(); 
out_S2_C1_RE = _mm256_setzero_pd(); 
out_S2_C1_IM = _mm256_setzero_pd(); 
out_S2_C2_RE = _mm256_setzero_pd(); 
out_S2_C2_IM = _mm256_setzero_pd(); 
out_S3_C0_RE = _mm256_setzero_pd(); 
out_S3_C0_IM = _mm256_setzero_pd(); 
out_S3_C1_RE = _mm256_setzero_pd(); 
out_S3_C1_IM = _mm256_setzero_pd(); 
out_S3_C2_RE = _mm256_setzero_pd(); 
out_S3_C2_IM = _mm256_setzero_pd(); 
__m256d tbc_phase_re = _mm256_setzero_pd();
__m256d tbc_phase_im = _mm256_setzero_pd();
 if ( accumulate[0] ) { 
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_s));

 if ((accumulate[0] & 0xF) == 0xF) { 
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][0][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][0][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][0][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][0][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][3][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][3][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][3][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][3][1] + xbOffs[2])), 1), 1);

b_S0_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][0][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][0][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][0][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][0][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][3][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][3][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][3][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][3][1] + xbOffs[2])), 1), 1);

b_S0_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][0][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][0][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][0][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][0][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][3][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][3][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][3][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][3][1] + xbOffs[2])), 1), 1);

b_S0_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
u_00_re = _mm256_load_pd((*gBase)[0][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[0][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[0][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[0][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[0][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[0][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[0][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[0][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[0][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[0][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[0][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[0][1][2][1]);

u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_01_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_02_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S0_C2_RE , ub_S0_C0_IM );
ub_S0_C1_RE = _mm256_mul_pd( u_10_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_10_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_10_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_12_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S0_C2_RE , ub_S0_C1_IM );
ub_S0_C2_RE = _mm256_mul_pd( u_20_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_20_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_20_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_21_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
out_S3_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][1][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][1][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][1][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][1][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][2][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][2][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][2][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][2][1] + xbOffs[2])), 1), 1);

b_S1_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][1][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][1][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][1][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][1][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][2][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][2][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][2][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][2][1] + xbOffs[2])), 1), 1);

b_S1_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][1][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][1][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][1][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][1][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][2][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][2][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][2][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][2][1] + xbOffs[2])), 1), 1);

b_S1_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_01_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_02_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S1_C2_RE , ub_S1_C0_IM );
ub_S1_C1_RE = _mm256_mul_pd( u_10_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_10_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_10_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_12_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S1_C2_RE , ub_S1_C1_IM );
ub_S1_C2_RE = _mm256_mul_pd( u_20_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_20_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_20_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_21_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
out_S2_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_IM );
} else {
__m256d accMask;

accMask = _mm256_int2mask_pd(accumulate[0]);

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][0][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][0][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][0][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][0][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][0][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][3][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][3][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][3][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][3][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][3][1] + xbOffs[2])), 1), 1);

b_S0_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][0][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][0][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][0][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][0][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][0][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][3][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][3][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][3][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][3][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][3][1] + xbOffs[2])), 1), 1);

b_S0_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][0][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][0][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][0][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][0][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][0][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][3][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][3][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][3][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][3][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][3][1] + xbOffs[2])), 1), 1);

b_S0_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
u_00_re = _mm256_load_pd((*gBase)[0][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[0][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[0][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[0][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[0][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[0][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[0][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[0][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[0][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[0][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[0][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[0][1][2][1]);

u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// Complex 3x3 matrix-vector product for the S0 half-spinor using the
// conjugated link entries:  ub_S0_Ci = sum_j conj(u_ij) * b_S0_Cj.
// Per term:  RE += u_re*b_RE + u_im*b_IM   (fmadd)
//            IM += u_re*b_IM - u_im*b_RE   (fnmadd on the u_im part)
// The first product of each row uses a plain mul to initialise the accumulator.
// Row 0: u_00, u_01, u_02
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_01_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_02_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S0_C2_RE , ub_S0_C0_IM );
// Row 1: u_10, u_11, u_12
ub_S0_C1_RE = _mm256_mul_pd( u_10_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_10_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_10_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_12_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S0_C2_RE , ub_S0_C1_IM );
// Row 2: u_20, u_21, u_22
ub_S0_C2_RE = _mm256_mul_pd( u_20_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_20_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_20_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_21_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// Masked accumulation of beta_vec * ub_S0 into the output registers.
// _mm256_blendv_pd keeps the old `out` value in a lane unless the sign bit of
// the matching accMask lane is set, in which case the fused result is taken.
// (beta_vec and accMask are set up before this span.)
//   out_S0 += beta * ub_S0
out_S0_C0_RE =_mm256_blendv_pd(out_S0_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE ), accMask);
out_S0_C0_IM =_mm256_blendv_pd(out_S0_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM ), accMask);
out_S0_C1_RE =_mm256_blendv_pd(out_S0_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE ), accMask);
out_S0_C1_IM =_mm256_blendv_pd(out_S0_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM ), accMask);
out_S0_C2_RE =_mm256_blendv_pd(out_S0_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE ), accMask);
out_S0_C2_IM =_mm256_blendv_pd(out_S0_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM ), accMask);
//   out_S3 += beta * (-i * ub_S0):  RE receives +beta*IM, IM receives -beta*RE
out_S3_C0_RE =_mm256_blendv_pd(out_S3_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_RE ), accMask);
out_S3_C0_IM = _mm256_blendv_pd(out_S3_C0_IM, _mm256_fnmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_IM ), accMask);
out_S3_C1_RE =_mm256_blendv_pd(out_S3_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_RE ), accMask);
out_S3_C1_IM = _mm256_blendv_pd(out_S3_C1_IM, _mm256_fnmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_IM ), accMask);
out_S3_C2_RE =_mm256_blendv_pd(out_S3_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_RE ), accMask);
out_S3_C2_IM = _mm256_blendv_pd(out_S3_C2_IM, _mm256_fnmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_IM ), accMask);
// Build the half-spinor b_S1_C{0,1,2} from data addressed through xbOffs.
//
// Gather scheme (p is the pointer taken from *xyBase): each 128-bit half is
//   blend(loadu(p + xbOffs[k+1] - 1), loaddup(p + xbOffs[k]), 1)
// Blend mask 1 takes lane 0 from the loaddup and lane 1 from the unaligned
// load, so the half holds { p[xbOffs[k]], p[xbOffs[k+1]] }.  The "-1" shifts
// the unaligned load so the wanted element lands in lane 1; the extra element
// it reads (p[xbOffs[k+1] - 1]) is discarded by the blend.
// k = 0 fills the low 128 bits, k = 2 the high 128 bits.
//
// psi_S0_* is read from second index 1 of *xyBase, psi_S1_* from second
// index 2; the last index 0/1 feeds the RE/IM register.
// NOTE(review): (color, spin, re/im) meaning of the three indices is inferred
// from the register names - confirm against the declaration of xyBase.
//
// Combination:  b_S1 = psi_S0 + i * psi_S1
//   RE = psi_S0_RE - psi_S1_IM,   IM = psi_S0_IM + psi_S1_RE

// First index 0 -> b_S1_C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][1][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][1][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][1][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][1][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][1][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][2][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][2][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[0][2][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[0][2][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[0][2][1] + xbOffs[2])), 1), 1);

b_S1_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
// First index 1 -> b_S1_C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][1][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][1][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][1][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][1][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][1][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][2][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][2][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[1][2][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[1][2][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[1][2][1] + xbOffs[2])), 1), 1);

b_S1_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
// First index 2 -> b_S1_C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][1][0] + xbOffs[0])), 1), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][1][0] + xbOffs[2])), 1), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][1][1] + xbOffs[0])), 1), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][1][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][1][1] + xbOffs[2])), 1), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][0] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][2][0] + xbOffs[0])), 1), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][0] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][2][0] + xbOffs[2])), 1), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][1] + xbOffs[1]))-1), _mm_loaddup_pd(((*xyBase)[2][2][1] + xbOffs[0])), 1), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd((((*xyBase)[2][2][1] + xbOffs[3]))-1), _mm_loaddup_pd(((*xyBase)[2][2][1] + xbOffs[2])), 1), 1);

b_S1_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
// Complex 3x3 matrix-vector product for the S1 half-spinor using the
// conjugated link entries:  ub_S1_Ci = sum_j conj(u_ij) * b_S1_Cj.
// Per term:  RE += u_re*b_RE + u_im*b_IM   (fmadd)
//            IM += u_re*b_IM - u_im*b_RE   (fnmadd on the u_im part)
// The u_* registers are reused unchanged; only the b_S1_* inputs differ.
// Row 0: u_00, u_01, u_02
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_01_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_02_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S1_C2_RE , ub_S1_C0_IM );
// Row 1: u_10, u_11, u_12
ub_S1_C1_RE = _mm256_mul_pd( u_10_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_10_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_10_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_12_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S1_C2_RE , ub_S1_C1_IM );
// Row 2: u_20, u_21, u_22
ub_S1_C2_RE = _mm256_mul_pd( u_20_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_20_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_20_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_21_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// Masked accumulation of beta_vec * ub_S1 into the output registers.
// _mm256_blendv_pd keeps the old `out` value in a lane unless the sign bit of
// the matching accMask lane is set, in which case the fused result is taken.
// (beta_vec and accMask are set up before this span.)
//   out_S1 += beta * ub_S1
out_S1_C0_RE =_mm256_blendv_pd(out_S1_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE ), accMask);
out_S1_C0_IM =_mm256_blendv_pd(out_S1_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM ), accMask);
out_S1_C1_RE =_mm256_blendv_pd(out_S1_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE ), accMask);
out_S1_C1_IM =_mm256_blendv_pd(out_S1_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM ), accMask);
out_S1_C2_RE =_mm256_blendv_pd(out_S1_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE ), accMask);
out_S1_C2_IM =_mm256_blendv_pd(out_S1_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM ), accMask);
//   out_S2 += beta * (-i * ub_S1):  RE receives +beta*IM, IM receives -beta*RE
out_S2_C0_RE =_mm256_blendv_pd(out_S2_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_RE ), accMask);
out_S2_C0_IM = _mm256_blendv_pd(out_S2_C0_IM, _mm256_fnmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_IM ), accMask);
out_S2_C1_RE =_mm256_blendv_pd(out_S2_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_RE ), accMask);
out_S2_C1_IM = _mm256_blendv_pd(out_S2_C1_IM, _mm256_fnmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_IM ), accMask);
out_S2_C2_RE =_mm256_blendv_pd(out_S2_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_RE ), accMask);
out_S2_C2_IM = _mm256_blendv_pd(out_S2_C2_IM, _mm256_fnmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_IM ), accMask);
}
}
 if ( accumulate[1] ) { 
// Scaling factor for this branch: the scalar coeff_s replicated into all four
// double lanes. The zero initialisation is immediately overwritten.
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_s));

 if ((accumulate[1] & 0xF) == 0xF) { 
// Build the half-spinor b_S0_C{0,1,2} from data addressed through xfOffs.
//
// Gather scheme (p is the pointer taken from *xyBase): each 128-bit half is
//   blend(loadu(p + xfOffs[k]), loaddup(p + xfOffs[k+1]), 2)
// Blend mask 2 takes lane 0 from the unaligned load and lane 1 from the
// loaddup, so the half holds { p[xfOffs[k]], p[xfOffs[k+1]] }.  The second
// element read by the unaligned load (p[xfOffs[k] + 1]) is discarded.
// k = 0 fills the low 128 bits, k = 2 the high 128 bits.
//
// psi_S0_* is read from second index 0 of *xyBase, psi_S1_* from second
// index 3; the last index 0/1 feeds the RE/IM register.
// NOTE(review): (color, spin, re/im) meaning of the three indices is inferred
// from the register names - confirm against the declaration of xyBase.
//
// Combination:  b_S0 = psi_S0 - i * psi_S1
//   RE = psi_S0_RE + psi_S1_IM,   IM = psi_S0_IM - psi_S1_RE

// First index 0 -> b_S0_C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][0][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][0][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][0][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][0][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][3][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][3][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][3][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][3][1] + xfOffs[3])), 2), 1);

b_S0_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// First index 1 -> b_S0_C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][0][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][0][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][0][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][0][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][3][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][3][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][3][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][3][1] + xfOffs[3])), 2), 1);

b_S0_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// First index 2 -> b_S0_C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][0][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][0][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][0][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][0][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][3][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][3][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][3][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][3][1] + xfOffs[3])), 2), 1);

b_S0_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// Load link-matrix entries u_0c and u_1c (c = 0..2) from (*gBase)[1].
// Index pattern is (*gBase)[1][row][col][0 = re, 1 = im]; only rows 0 and 1
// are read here, the u_2c registers are not loaded in this span.
// _mm256_load_pd is the aligned load, so each address must be 32-byte aligned.
u_00_re = _mm256_load_pd((*gBase)[1][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[1][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[1][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[1][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[1][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[1][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[1][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[1][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[1][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[1][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[1][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[1][1][2][1]);

// Compute the third link row from the two loaded rows as the complex
// conjugate of their cross product:
//   u_20 = conj(u_01*u_12 - u_02*u_11)
//   u_21 = conj(u_02*u_10 - u_00*u_12)
//   u_22 = conj(u_00*u_11 - u_01*u_10)
// The conjugation shows up as the swapped operand order / signs in the _im
// lines relative to the _re lines.
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// Complex 3x3 matrix-vector product for the S0 half-spinor, without
// conjugation and with the link indices transposed:
//   ub_S0_Cj = sum_i u_ij * b_S0_Ci
// Per term:  RE += u_re*b_RE - u_im*b_IM   (fnmadd on the u_im part)
//            IM += u_re*b_IM + u_im*b_RE   (fmadd)
// The first product of each output uses a plain mul to initialise the accumulator.
// Output C0: u_00, u_10, u_20
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_10_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_20_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_im , b_S0_C2_RE , ub_S0_C0_IM );
// Output C1: u_01, u_11, u_21
ub_S0_C1_RE = _mm256_mul_pd( u_01_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_01_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_01_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_21_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_im , b_S0_C2_RE , ub_S0_C1_IM );
// Output C2: u_02, u_12, u_22
ub_S0_C2_RE = _mm256_mul_pd( u_02_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_02_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_02_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_12_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// Unmasked accumulation of beta_vec * ub_S0 into the output registers
// (all four lanes are updated).
//   out_S0 += beta * ub_S0
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
//   out_S3 += beta * (i * ub_S0):  RE receives -beta*IM, IM receives +beta*RE
out_S3_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_IM );
// Build the half-spinor b_S1_C{0,1,2} from data addressed through xfOffs.
//
// Gather scheme (p is the pointer taken from *xyBase): each 128-bit half is
//   blend(loadu(p + xfOffs[k]), loaddup(p + xfOffs[k+1]), 2)
// Blend mask 2 takes lane 0 from the unaligned load and lane 1 from the
// loaddup, so the half holds { p[xfOffs[k]], p[xfOffs[k+1]] }.  The second
// element read by the unaligned load (p[xfOffs[k] + 1]) is discarded.
// k = 0 fills the low 128 bits, k = 2 the high 128 bits.
//
// psi_S0_* is read from second index 1 of *xyBase, psi_S1_* from second
// index 2; the last index 0/1 feeds the RE/IM register.
// NOTE(review): (color, spin, re/im) meaning of the three indices is inferred
// from the register names - confirm against the declaration of xyBase.
//
// Combination:  b_S1 = psi_S0 - i * psi_S1
//   RE = psi_S0_RE + psi_S1_IM,   IM = psi_S0_IM - psi_S1_RE

// First index 0 -> b_S1_C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][1][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][1][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][1][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][1][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][2][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][2][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][2][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][2][1] + xfOffs[3])), 2), 1);

b_S1_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// First index 1 -> b_S1_C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][1][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][1][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][1][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][1][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][2][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][2][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][2][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][2][1] + xfOffs[3])), 2), 1);

b_S1_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// First index 2 -> b_S1_C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][1][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][1][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][1][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][1][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][2][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][2][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][2][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][2][1] + xfOffs[3])), 2), 1);

b_S1_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// Complex 3x3 matrix-vector product for the S1 half-spinor, without
// conjugation and with the link indices transposed:
//   ub_S1_Cj = sum_i u_ij * b_S1_Ci
// Per term:  RE += u_re*b_RE - u_im*b_IM   (fnmadd on the u_im part)
//            IM += u_re*b_IM + u_im*b_RE   (fmadd)
// The u_* registers are reused unchanged; only the b_S1_* inputs differ.
// Output C0: u_00, u_10, u_20
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_10_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_20_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_im , b_S1_C2_RE , ub_S1_C0_IM );
// Output C1: u_01, u_11, u_21
ub_S1_C1_RE = _mm256_mul_pd( u_01_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_01_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_01_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_21_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_im , b_S1_C2_RE , ub_S1_C1_IM );
// Output C2: u_02, u_12, u_22
ub_S1_C2_RE = _mm256_mul_pd( u_02_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_02_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_02_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_12_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// Unmasked accumulation of beta_vec * ub_S1 into the output registers
// (all four lanes are updated).
//   out_S1 += beta * ub_S1
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
//   out_S2 += beta * (i * ub_S1):  RE receives -beta*IM, IM receives +beta*RE
out_S2_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_IM );
} else {
// Reached when the low four bits of accumulate[1] are not all set.
// NOTE(review): _mm256_int2mask_pd is not a standard Intel intrinsic; it is
// presumably a project helper that expands the integer bit pattern into a
// per-lane mask register - confirm against its definition.
__m256d accMask;

accMask = _mm256_int2mask_pd(accumulate[1]);

// Build the half-spinor b_S0_C{0,1,2} from data addressed through xfOffs.
//
// Gather scheme (p is the pointer taken from *xyBase): each 128-bit half is
//   blend(loadu(p + xfOffs[k]), loaddup(p + xfOffs[k+1]), 2)
// Blend mask 2 takes lane 0 from the unaligned load and lane 1 from the
// loaddup, so the half holds { p[xfOffs[k]], p[xfOffs[k+1]] }.  The second
// element read by the unaligned load (p[xfOffs[k] + 1]) is discarded.
// k = 0 fills the low 128 bits, k = 2 the high 128 bits.
// All four lanes are loaded here regardless of any lane mask.
//
// psi_S0_* is read from second index 0 of *xyBase, psi_S1_* from second
// index 3; the last index 0/1 feeds the RE/IM register.
// NOTE(review): (color, spin, re/im) meaning of the three indices is inferred
// from the register names - confirm against the declaration of xyBase.
//
// Combination:  b_S0 = psi_S0 - i * psi_S1
//   RE = psi_S0_RE + psi_S1_IM,   IM = psi_S0_IM - psi_S1_RE

// First index 0 -> b_S0_C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][0][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][0][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][0][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][0][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][0][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][3][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][3][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][3][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][3][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][3][1] + xfOffs[3])), 2), 1);

b_S0_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// First index 1 -> b_S0_C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][0][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][0][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][0][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][0][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][0][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][3][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][3][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][3][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][3][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][3][1] + xfOffs[3])), 2), 1);

b_S0_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// First index 2 -> b_S0_C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][0][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][0][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][0][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][0][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][0][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][3][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][3][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][3][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][3][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][3][1] + xfOffs[3])), 2), 1);

b_S0_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// Load the six complex entries u_00..u_12 (re and im separately) from (*gBase)[1].
// _mm256_load_pd is the aligned load, so each address must be 32-byte aligned.
u_00_re = _mm256_load_pd((*gBase)[1][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[1][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[1][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[1][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[1][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[1][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[1][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[1][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[1][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[1][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[1][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[1][1][2][1]);

// The remaining three entries are not loaded but computed from the six above:
//   u_20 = conj(u_01*u_12 - u_02*u_11)
//   u_21 = conj(u_02*u_10 - u_00*u_12)
//   u_22 = conj(u_00*u_11 - u_01*u_10)
// i.e. the complex-conjugated cross product of the two loaded triples.
// NOTE(review): presumably the two-row ("12") SU(3) link compression named in the
// kernel — confirm against the generator.
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
// Imaginary part carries the overall minus sign from the conjugation.
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// Complex 3x3 times 3-vector product for the first half-spinor, without conjugation:
//   ub_S0_Cc = u_0c*b_S0_C0 + u_1c*b_S0_C1 + u_2c*b_S0_C2      (c = 0, 1, 2)
// Each complex multiply-add is four FMAs: the real part subtracts im*im (fnmadd),
// the imaginary part adds re*im and im*re (fmadd).
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_10_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_20_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_im , b_S0_C2_RE , ub_S0_C0_IM );
// c = 1
ub_S0_C1_RE = _mm256_mul_pd( u_01_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_01_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_01_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_21_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_im , b_S0_C2_RE , ub_S0_C1_IM );
// c = 2
ub_S0_C2_RE = _mm256_mul_pd( u_02_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_02_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_02_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_12_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// Masked accumulation of the first half-spinor into the output.
// _mm256_blendv_pd keeps the old out_* lane unless the sign bit of the matching accMask
// lane is set, so only the selected lanes are updated:
//   out_S0_Cc += beta_vec * ub_S0_Cc
//   out_S3_Cc += i * beta_vec * ub_S0_Cc     (RE -= beta*IM, IM += beta*RE)
// beta_vec and accMask are expected to be set earlier in the enclosing block.
out_S0_C0_RE =_mm256_blendv_pd(out_S0_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE ), accMask);
out_S0_C0_IM =_mm256_blendv_pd(out_S0_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM ), accMask);
out_S0_C1_RE =_mm256_blendv_pd(out_S0_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE ), accMask);
out_S0_C1_IM =_mm256_blendv_pd(out_S0_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM ), accMask);
out_S0_C2_RE =_mm256_blendv_pd(out_S0_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE ), accMask);
out_S0_C2_IM =_mm256_blendv_pd(out_S0_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM ), accMask);
// Reconstruction of the partner component: multiply by i, hence the RE/IM swap and the fnmadd.
out_S3_C0_RE = _mm256_blendv_pd(out_S3_C0_RE, _mm256_fnmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_RE ), accMask);
out_S3_C0_IM =_mm256_blendv_pd(out_S3_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_IM ), accMask);
out_S3_C1_RE = _mm256_blendv_pd(out_S3_C1_RE, _mm256_fnmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_RE ), accMask);
out_S3_C1_IM =_mm256_blendv_pd(out_S3_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_IM ), accMask);
out_S3_C2_RE = _mm256_blendv_pd(out_S3_C2_RE, _mm256_fnmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_RE ), accMask);
out_S3_C2_IM =_mm256_blendv_pd(out_S3_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_IM ), accMask);
// Spin projection of the second half-spinor (b_S1_C0..b_S1_C2).
// The psi_S0_*/psi_S1_* temporaries are reused: here they hold second index 1 and 2
// of (*xyBase) respectively. Lane k of each vector is the first double at
// (*xyBase)[c][s][r] + xfOffs[k] (blend immediate 2 pairs two scalars, insertf128
// places the pair in the low or high half).
// NOTE(review): the indices look like [colour][spin][re/im] — confirm against the layout.
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][1][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][1][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][1][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][1][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][1][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][2][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][2][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[0][2][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[0][2][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[0][2][1] + xfOffs[3])), 2), 1);

// b_S1_C0 = psi[0][1] - i * psi[0][2]  (re: a_re + b_im, im: a_im - b_re).
b_S1_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][1][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][1][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][1][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][1][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][1][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][2][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][2][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[1][2][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[1][2][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[1][2][1] + xfOffs[3])), 2), 1);

// b_S1_C1 = psi[1][1] - i * psi[1][2].
b_S1_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][1][0] + xfOffs[1])), 2), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][1][0] + xfOffs[3])), 2), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][1][1] + xfOffs[1])), 2), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][1][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][1][1] + xfOffs[3])), 2), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][0] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][2][0] + xfOffs[1])), 2), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][0] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][2][0] + xfOffs[3])), 2), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][1] + xfOffs[0])), _mm_loaddup_pd(((*xyBase)[2][2][1] + xfOffs[1])), 2), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_blend_pd(_mm_loadu_pd(((*xyBase)[2][2][1] + xfOffs[2])), _mm_loaddup_pd(((*xyBase)[2][2][1] + xfOffs[3])), 2), 1);

// b_S1_C2 = psi[2][1] - i * psi[2][2].
b_S1_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// Complex 3x3 times 3-vector product for the second half-spinor, without conjugation:
//   ub_S1_Cc = u_0c*b_S1_C0 + u_1c*b_S1_C1 + u_2c*b_S1_C2      (c = 0, 1, 2)
// The u_* values are reused as-is; they are not reloaded for this half-spinor.
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_10_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_20_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_im , b_S1_C2_RE , ub_S1_C0_IM );
// c = 1
ub_S1_C1_RE = _mm256_mul_pd( u_01_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_01_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_01_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_21_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_im , b_S1_C2_RE , ub_S1_C1_IM );
// c = 2
ub_S1_C2_RE = _mm256_mul_pd( u_02_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_02_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_02_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_12_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// Masked accumulation of the second half-spinor into the output.
// Only lanes whose accMask sign bit is set are replaced by the updated value:
//   out_S1_Cc += beta_vec * ub_S1_Cc
//   out_S2_Cc += i * beta_vec * ub_S1_Cc     (RE -= beta*IM, IM += beta*RE)
// beta_vec and accMask are expected to be set earlier in the enclosing block.
out_S1_C0_RE =_mm256_blendv_pd(out_S1_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE ), accMask);
out_S1_C0_IM =_mm256_blendv_pd(out_S1_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM ), accMask);
out_S1_C1_RE =_mm256_blendv_pd(out_S1_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE ), accMask);
out_S1_C1_IM =_mm256_blendv_pd(out_S1_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM ), accMask);
out_S1_C2_RE =_mm256_blendv_pd(out_S1_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE ), accMask);
out_S1_C2_IM =_mm256_blendv_pd(out_S1_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM ), accMask);
// Partner component: multiply by i, hence the RE/IM swap and the fnmadd on the real part.
out_S2_C0_RE = _mm256_blendv_pd(out_S2_C0_RE, _mm256_fnmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_RE ), accMask);
out_S2_C0_IM =_mm256_blendv_pd(out_S2_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_IM ), accMask);
out_S2_C1_RE = _mm256_blendv_pd(out_S2_C1_RE, _mm256_fnmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_RE ), accMask);
out_S2_C1_IM =_mm256_blendv_pd(out_S2_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_IM ), accMask);
out_S2_C2_RE = _mm256_blendv_pd(out_S2_C2_RE, _mm256_fnmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_RE ), accMask);
out_S2_C2_IM =_mm256_blendv_pd(out_S2_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_IM ), accMask);
}
}
 // Contribution gated on accumulate[2]: the whole block is skipped when it is zero.
 if ( accumulate[2] ) { 
// Broadcast the scalar coeff_s into all four lanes; the zero initialisation is
// overwritten immediately.
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_s));

 // All four low bits set: every lane accumulates, so the unmasked path is taken.
 if ((accumulate[2] & 0xF) == 0xF) { 
// Spin projection of the first half-spinor (b_S0_C0..b_S0_C2) for this contribution.
// Each vector is assembled from two aligned 128-bit loads: the low half is the pair of
// doubles at (*xyBase)[c][s][r] + ybOffs[0], the high half the pair at ... + ybOffs[2].
// ybOffs[1] and ybOffs[3] are not read here; _mm_load_pd needs 16-byte aligned addresses.
// psi_S1_* is a scratch name: it is filled from second index 3.
// NOTE(review): the indices look like [colour][spin][re/im] — confirm against the layout.
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + ybOffs[2])), 1);

// b_S0_C0 = psi[0][0] - psi[0][3]  (plain component-wise difference, no factor of i).
b_S0_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + ybOffs[2])), 1);

// b_S0_C1 = psi[1][0] - psi[1][3].
b_S0_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + ybOffs[2])), 1);

// b_S0_C2 = psi[2][0] - psi[2][3].
b_S0_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
// Load the six complex entries u_00..u_12 (re and im separately) from (*gBase)[2].
// _mm256_load_pd is the aligned load, so each address must be 32-byte aligned.
u_00_re = _mm256_load_pd((*gBase)[2][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[2][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[2][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[2][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[2][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[2][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[2][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[2][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[2][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[2][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[2][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[2][1][2][1]);

// The remaining three entries are computed from the six loaded ones:
//   u_20 = conj(u_01*u_12 - u_02*u_11)
//   u_21 = conj(u_02*u_10 - u_00*u_12)
//   u_22 = conj(u_00*u_11 - u_01*u_10)
// i.e. the complex-conjugated cross product of the two loaded triples.
// NOTE(review): presumably the two-row ("12") SU(3) link compression named in the
// kernel — confirm against the generator.
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
// Imaginary part carries the overall minus sign from the conjugation.
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// Conjugated 3x3 times 3-vector product for the first half-spinor:
//   ub_S0_Cc = conj(u_c0)*b_S0_C0 + conj(u_c1)*b_S0_C1 + conj(u_c2)*b_S0_C2   (c = 0, 1, 2)
// The conjugation shows up as flipped FMA signs: the real part adds im*im (fmadd),
// the imaginary part subtracts im*re (fnmadd). The u_* index order is also swapped
// relative to an unconjugated u_kc*b_k product, so this applies the conjugate transpose.
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_01_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_02_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S0_C2_RE , ub_S0_C0_IM );
// c = 1
ub_S0_C1_RE = _mm256_mul_pd( u_10_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_10_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_10_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_12_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S0_C2_RE , ub_S0_C1_IM );
// c = 2 (uses the reconstructed entries u_20..u_22)
ub_S0_C2_RE = _mm256_mul_pd( u_20_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_20_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_20_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_21_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// Unmasked accumulation of the first half-spinor: all four lanes are updated.
//   out_S0_Cc += beta_vec * ub_S0_Cc
//   out_S3_Cc -= beta_vec * ub_S0_Cc
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
// fnmadd computes -(a*b) + c, giving the subtraction for the partner component.
out_S3_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_IM );
// Spin projection of the second half-spinor (b_S1_C0..b_S1_C2).
// The psi_S0_*/psi_S1_* temporaries are reused: here they hold second index 1 and 2
// of (*xyBase). Each vector is built from two aligned 128-bit loads, at
// base + ybOffs[0] (low half) and base + ybOffs[2] (high half).
// NOTE(review): the indices look like [colour][spin][re/im] — confirm against the layout.
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + ybOffs[2])), 1);

// b_S1_C0 = psi[0][1] + psi[0][2]  (plain component-wise sum, no factor of i).
b_S1_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + ybOffs[2])), 1);

// b_S1_C1 = psi[1][1] + psi[1][2].
b_S1_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + ybOffs[2])), 1);

// b_S1_C2 = psi[2][1] + psi[2][2].
b_S1_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// Conjugated 3x3 times 3-vector product for the second half-spinor:
//   ub_S1_Cc = conj(u_c0)*b_S1_C0 + conj(u_c1)*b_S1_C1 + conj(u_c2)*b_S1_C2   (c = 0, 1, 2)
// Real parts add im*im (fmadd), imaginary parts subtract im*re (fnmadd).
// The u_* values are reused as-is; they are not reloaded for this half-spinor.
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_01_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_02_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S1_C2_RE , ub_S1_C0_IM );
// c = 1
ub_S1_C1_RE = _mm256_mul_pd( u_10_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_10_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_10_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_12_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S1_C2_RE , ub_S1_C1_IM );
// c = 2 (uses the reconstructed entries u_20..u_22)
ub_S1_C2_RE = _mm256_mul_pd( u_20_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_20_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_20_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_21_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// Unmasked accumulation of the second half-spinor: all four lanes are updated.
//   out_S1_Cc += beta_vec * ub_S1_Cc
//   out_S2_Cc += beta_vec * ub_S1_Cc
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
// The partner component receives the same term with the same sign.
out_S2_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_IM );
} else {
// Partial-mask path: not all of the four low bits of accumulate[2] are set.
// _mm256_int2mask_pd is not a standard Intel intrinsic; it is defined elsewhere.
// NOTE(review): presumably it expands the bits of accumulate[2] into a per-lane
// vector mask — confirm its definition.
__m256d accMask;

accMask = _mm256_int2mask_pd(accumulate[2]);

// Spin projection of the first half-spinor (b_S0_C0..b_S0_C2).
// Each vector is built from two aligned 128-bit loads, at base + ybOffs[0] (low half)
// and base + ybOffs[2] (high half). psi_S1_* is filled from second index 3.
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + ybOffs[2])), 1);

// b_S0_C0 = psi[0][0] - psi[0][3]  (plain component-wise difference).
b_S0_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + ybOffs[2])), 1);

// b_S0_C1 = psi[1][0] - psi[1][3].
b_S0_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + ybOffs[2])), 1);

// b_S0_C2 = psi[2][0] - psi[2][3].
b_S0_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
// Load the first two rows (u_0x and u_1x) of the 3x3 complex matrix stored at
// (*gBase)[2]; the index order is [2][row][column][0 = re, 1 = im].
u_00_re = _mm256_load_pd((*gBase)[2][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[2][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[2][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[2][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[2][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[2][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[2][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[2][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[2][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[2][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[2][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[2][1][2][1]);

// The third row is not loaded but computed as the complex conjugate of the
// cross product of the two loaded rows (presumably the two-row SU(3)
// compression -- TODO confirm against the generator):
//   u_20 = conj(u_01*u_12 - u_02*u_11)
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
//   u_21 = conj(u_02*u_10 - u_00*u_12)
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
//   u_22 = conj(u_00*u_11 - u_01*u_10)
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// Multiply b_S0 by the matrix with every element conjugated:
//   ub_S0_Ci = sum over j of conj(u_ij) * b_S0_Cj
// expanded in real arithmetic as
//   RE: u_re*b_re + u_im*b_im      IM: u_re*b_im - u_im*b_re
// i = 0 (row u_00, u_01, u_02)
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_01_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_02_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S0_C2_RE , ub_S0_C0_IM );
// i = 1 (row u_10, u_11, u_12)
ub_S0_C1_RE = _mm256_mul_pd( u_10_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_10_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_10_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_12_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S0_C2_RE , ub_S0_C1_IM );
// i = 2 (row u_20, u_21, u_22)
ub_S0_C2_RE = _mm256_mul_pd( u_20_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_20_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_20_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_21_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// Masked accumulate: in lanes selected by accMask the new value is taken,
// all other lanes keep the previous out_* value.
//   out_S0 += beta_vec * ub_S0
out_S0_C0_RE =_mm256_blendv_pd(out_S0_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE ), accMask);
out_S0_C0_IM =_mm256_blendv_pd(out_S0_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM ), accMask);
out_S0_C1_RE =_mm256_blendv_pd(out_S0_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE ), accMask);
out_S0_C1_IM =_mm256_blendv_pd(out_S0_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM ), accMask);
out_S0_C2_RE =_mm256_blendv_pd(out_S0_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE ), accMask);
out_S0_C2_IM =_mm256_blendv_pd(out_S0_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM ), accMask);
//   out_S3 -= beta_vec * ub_S0   (fnmadd negates the product)
out_S3_C0_RE = _mm256_blendv_pd(out_S3_C0_RE, _mm256_fnmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_RE ), accMask);
out_S3_C0_IM = _mm256_blendv_pd(out_S3_C0_IM, _mm256_fnmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_IM ), accMask);
out_S3_C1_RE = _mm256_blendv_pd(out_S3_C1_RE, _mm256_fnmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_RE ), accMask);
out_S3_C1_IM = _mm256_blendv_pd(out_S3_C1_IM, _mm256_fnmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_IM ), accMask);
out_S3_C2_RE = _mm256_blendv_pd(out_S3_C2_RE, _mm256_fnmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_RE ), accMask);
out_S3_C2_IM = _mm256_blendv_pd(out_S3_C2_IM, _mm256_fnmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_IM ), accMask);
// Second combined component for the accumulate[2] section:
//   b_S1_Cc = psi[c][1] + psi[c][2]   for c = 0, 1, 2.
// Loads follow the usual pattern: low 128 bits from offset ybOffs[0], high
// 128 bits from offset ybOffs[2]; last index 0 feeds _RE, 1 feeds _IM.
// Note the scratch registers are still named psi_S0_* / psi_S1_* although
// they now hold middle indices 1 and 2.
// C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + ybOffs[2])), 1);

b_S1_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + ybOffs[2])), 1);

b_S1_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + ybOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + ybOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + ybOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + ybOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + ybOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + ybOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + ybOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + ybOffs[2])), 1);

b_S1_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// Multiply b_S1 by the matrix with every element conjugated, reusing the
// u_* registers already in place:
//   ub_S1_Ci = sum over j of conj(u_ij) * b_S1_Cj
//   RE: u_re*b_re + u_im*b_im      IM: u_re*b_im - u_im*b_re
// i = 0
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_01_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_02_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S1_C2_RE , ub_S1_C0_IM );
// i = 1
ub_S1_C1_RE = _mm256_mul_pd( u_10_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_10_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_10_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_12_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S1_C2_RE , ub_S1_C1_IM );
// i = 2
ub_S1_C2_RE = _mm256_mul_pd( u_20_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_20_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_20_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_21_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// Masked accumulate: lanes selected by accMask are updated, the others keep
// their previous out_* value.
//   out_S1 += beta_vec * ub_S1
out_S1_C0_RE =_mm256_blendv_pd(out_S1_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE ), accMask);
out_S1_C0_IM =_mm256_blendv_pd(out_S1_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM ), accMask);
out_S1_C1_RE =_mm256_blendv_pd(out_S1_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE ), accMask);
out_S1_C1_IM =_mm256_blendv_pd(out_S1_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM ), accMask);
out_S1_C2_RE =_mm256_blendv_pd(out_S1_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE ), accMask);
out_S1_C2_IM =_mm256_blendv_pd(out_S1_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM ), accMask);
//   out_S2 += beta_vec * ub_S1   (same sign as out_S1 in this section)
out_S2_C0_RE =_mm256_blendv_pd(out_S2_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_RE ), accMask);
out_S2_C0_IM =_mm256_blendv_pd(out_S2_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_IM ), accMask);
out_S2_C1_RE =_mm256_blendv_pd(out_S2_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_RE ), accMask);
out_S2_C1_IM =_mm256_blendv_pd(out_S2_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_IM ), accMask);
out_S2_C2_RE =_mm256_blendv_pd(out_S2_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_RE ), accMask);
out_S2_C2_IM =_mm256_blendv_pd(out_S2_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_IM ), accMask);
}
}
 if ( accumulate[3] ) { 
// Scale factor for the accumulate[3] section: the scalar coeff_s replicated
// into every lane. The zero initialisation is overwritten straight away.
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_s));

 if ((accumulate[3] & 0xF) == 0xF) { 
// Unmasked variant of the accumulate[3] section (taken when the low four bits
// of accumulate[3] are all set), step 1: build
//   b_S0_Cc = psi[c][0] + psi[c][3]   for c = 0, 1, 2.
// Each psi_* register is zeroed and then filled from two 128-bit loads: the
// low half from offset yfOffs[0], the high half from offset yfOffs[2].
// In (*xyBase)[a][b][c] the first index follows the C<n> suffix of the result
// and the last index is 0 for _RE, 1 for _IM; the middle index is presumably
// the spin component.
// C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + yfOffs[2])), 1);

b_S0_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + yfOffs[2])), 1);

b_S0_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + yfOffs[2])), 1);

b_S0_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// Load the first two rows (u_0x and u_1x) of the 3x3 complex matrix stored at
// (*gBase)[3]; the index order is [3][row][column][0 = re, 1 = im].
u_00_re = _mm256_load_pd((*gBase)[3][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[3][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[3][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[3][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[3][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[3][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[3][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[3][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[3][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[3][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[3][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[3][1][2][1]);

// The third row is not loaded but computed as the complex conjugate of the
// cross product of the two loaded rows (presumably the two-row SU(3)
// compression -- TODO confirm against the generator):
//   u_20 = conj(u_01*u_12 - u_02*u_11)
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
//   u_21 = conj(u_02*u_10 - u_00*u_12)
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
//   u_22 = conj(u_00*u_11 - u_01*u_10)
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// Multiply b_S0 by the matrix taken column-wise and without conjugation:
//   ub_S0_Ci = sum over j of u_ji * b_S0_Cj
// expanded in real arithmetic as
//   RE: u_re*b_re - u_im*b_im      IM: u_re*b_im + u_im*b_re
// i = 0 (column u_00, u_10, u_20)
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_10_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_20_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_im , b_S0_C2_RE , ub_S0_C0_IM );
// i = 1 (column u_01, u_11, u_21)
ub_S0_C1_RE = _mm256_mul_pd( u_01_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_01_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_01_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_21_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_im , b_S0_C2_RE , ub_S0_C1_IM );
// i = 2 (column u_02, u_12, u_22)
ub_S0_C2_RE = _mm256_mul_pd( u_02_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_02_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_02_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_12_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// Unmasked accumulate, applied to every lane:
//   out_S0 += beta_vec * ub_S0
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
//   out_S3 += beta_vec * ub_S0   (same sign as out_S0 in this section)
out_S3_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_IM );
// Second combined component for the accumulate[3] section:
//   b_S1_Cc = psi[c][1] - psi[c][2]   for c = 0, 1, 2.
// Loads follow the usual pattern: low 128 bits from offset yfOffs[0], high
// 128 bits from offset yfOffs[2]; last index 0 feeds _RE, 1 feeds _IM.
// Note the scratch registers are still named psi_S0_* / psi_S1_* although
// they now hold middle indices 1 and 2.
// C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + yfOffs[2])), 1);

b_S1_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
// C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + yfOffs[2])), 1);

b_S1_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
// C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + yfOffs[2])), 1);

b_S1_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
// Multiply b_S1 by the matrix taken column-wise and without conjugation,
// reusing the u_* registers already in place:
//   ub_S1_Ci = sum over j of u_ji * b_S1_Cj
//   RE: u_re*b_re - u_im*b_im      IM: u_re*b_im + u_im*b_re
// i = 0
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_10_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_20_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_im , b_S1_C2_RE , ub_S1_C0_IM );
// i = 1
ub_S1_C1_RE = _mm256_mul_pd( u_01_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_01_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_01_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_21_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_im , b_S1_C2_RE , ub_S1_C1_IM );
// i = 2
ub_S1_C2_RE = _mm256_mul_pd( u_02_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_02_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_02_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_12_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// Unmasked accumulate, applied to every lane:
//   out_S1 += beta_vec * ub_S1
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
//   out_S2 -= beta_vec * ub_S1   (fnmadd negates the product)
out_S2_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_IM );
} else {
// Masked variant of the accumulate[3] section (reached when the low four bits
// of accumulate[3] are not all set), step 1: build the lane mask and
//   b_S0_Cc = psi[c][0] + psi[c][3]   for c = 0, 1, 2.
// accMask comes from accumulate[3] via _mm256_int2mask_pd, a helper that is
// not defined in this excerpt (presumably one mask lane per bit -- TODO confirm).
__m256d accMask;

accMask = _mm256_int2mask_pd(accumulate[3]);

// Each psi_* register is zeroed and then filled from two 128-bit loads: the
// low half from offset yfOffs[0], the high half from offset yfOffs[2].
// C0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][0][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][0][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][3][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][3][1] + yfOffs[2])), 1);

b_S0_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// C1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][0][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][0][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][3][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][3][1] + yfOffs[2])), 1);

b_S0_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// C2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][0][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][0][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][3][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][3][1] + yfOffs[2])), 1);

b_S0_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );
// Load the first two rows (u_0x and u_1x) of the 3x3 complex matrix stored at
// (*gBase)[3]; the index order is [3][row][column][0 = re, 1 = im].
u_00_re = _mm256_load_pd((*gBase)[3][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[3][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[3][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[3][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[3][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[3][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[3][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[3][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[3][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[3][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[3][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[3][1][2][1]);

// The third row is not loaded but computed as the complex conjugate of the
// cross product of the two loaded rows (presumably the two-row SU(3)
// compression -- TODO confirm against the generator):
//   u_20 = conj(u_01*u_12 - u_02*u_11)
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
//   u_21 = conj(u_02*u_10 - u_00*u_12)
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
//   u_22 = conj(u_00*u_11 - u_01*u_10)
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_10_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_20_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_im , b_S0_C2_RE , ub_S0_C0_IM );
ub_S0_C1_RE = _mm256_mul_pd( u_01_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_01_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_01_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_21_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_im , b_S0_C2_RE , ub_S0_C1_IM );
ub_S0_C2_RE = _mm256_mul_pd( u_02_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_02_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_02_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_12_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
out_S0_C0_RE =_mm256_blendv_pd(out_S0_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE ), accMask);
out_S0_C0_IM =_mm256_blendv_pd(out_S0_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM ), accMask);
out_S0_C1_RE =_mm256_blendv_pd(out_S0_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE ), accMask);
out_S0_C1_IM =_mm256_blendv_pd(out_S0_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM ), accMask);
out_S0_C2_RE =_mm256_blendv_pd(out_S0_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE ), accMask);
out_S0_C2_IM =_mm256_blendv_pd(out_S0_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM ), accMask);
out_S3_C0_RE =_mm256_blendv_pd(out_S3_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S3_C0_RE ), accMask);
out_S3_C0_IM =_mm256_blendv_pd(out_S3_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S3_C0_IM ), accMask);
out_S3_C1_RE =_mm256_blendv_pd(out_S3_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S3_C1_RE ), accMask);
out_S3_C1_IM =_mm256_blendv_pd(out_S3_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S3_C1_IM ), accMask);
out_S3_C2_RE =_mm256_blendv_pd(out_S3_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S3_C2_RE ), accMask);
out_S3_C2_IM =_mm256_blendv_pd(out_S3_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S3_C2_IM ), accMask);
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[0][1][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[0][1][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[0][2][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[0][2][1] + yfOffs[2])), 1);

b_S1_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[1][1][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[1][1][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[1][2][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[1][2][1] + yfOffs[2])), 1);

b_S1_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + yfOffs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*xyBase)[2][1][0] + yfOffs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + yfOffs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*xyBase)[2][1][1] + yfOffs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + yfOffs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*xyBase)[2][2][0] + yfOffs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + yfOffs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*xyBase)[2][2][1] + yfOffs[2])), 1);

b_S1_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_10_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_20_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_im , b_S1_C2_RE , ub_S1_C0_IM );
ub_S1_C1_RE = _mm256_mul_pd( u_01_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_01_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_01_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_21_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_im , b_S1_C2_RE , ub_S1_C1_IM );
ub_S1_C2_RE = _mm256_mul_pd( u_02_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_02_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_02_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_12_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
out_S1_C0_RE =_mm256_blendv_pd(out_S1_C0_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE ), accMask);
out_S1_C0_IM =_mm256_blendv_pd(out_S1_C0_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM ), accMask);
out_S1_C1_RE =_mm256_blendv_pd(out_S1_C1_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE ), accMask);
out_S1_C1_IM =_mm256_blendv_pd(out_S1_C1_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM ), accMask);
out_S1_C2_RE =_mm256_blendv_pd(out_S1_C2_RE, _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE ), accMask);
out_S1_C2_IM =_mm256_blendv_pd(out_S1_C2_IM, _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM ), accMask);
out_S2_C0_RE = _mm256_blendv_pd(out_S2_C0_RE, _mm256_fnmadd_pd( beta_vec , ub_S1_C0_RE , out_S2_C0_RE ), accMask);
out_S2_C0_IM = _mm256_blendv_pd(out_S2_C0_IM, _mm256_fnmadd_pd( beta_vec , ub_S1_C0_IM , out_S2_C0_IM ), accMask);
out_S2_C1_RE = _mm256_blendv_pd(out_S2_C1_RE, _mm256_fnmadd_pd( beta_vec , ub_S1_C1_RE , out_S2_C1_RE ), accMask);
out_S2_C1_IM = _mm256_blendv_pd(out_S2_C1_IM, _mm256_fnmadd_pd( beta_vec , ub_S1_C1_IM , out_S2_C1_IM ), accMask);
out_S2_C2_RE = _mm256_blendv_pd(out_S2_C2_RE, _mm256_fnmadd_pd( beta_vec , ub_S1_C2_RE , out_S2_C2_RE ), accMask);
out_S2_C2_IM = _mm256_blendv_pd(out_S2_C2_IM, _mm256_fnmadd_pd( beta_vec , ub_S1_C2_IM , out_S2_C2_IM ), accMask);
}
}
 // Direction slot 4: when accumulate[4] is set, form two projected 3-component
 // complex vectors from the data at zbBase, multiply each by the element-wise
 // complex conjugate of the 3x3 matrix taken from (*gBase)[4], and add the result,
 // scaled by coeff_s, into out_S0..out_S3.
 // NOTE(review): names suggest S = spin, C = colour, zb = "z backward" neighbour and
 // coeff_s = spatial hopping coefficient (Wilson dslash) -- inferred from naming only,
 // confirm against the generator.
 if ( accumulate[4] ) { 
// coeff_s replicated into every lane; the zero initialiser is overwritten immediately.
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_s));

// ---- First half: b_S0_Cc = psi[c][0] + i * psi[c][2] for c = 0..2 ----
// psi[c][s] stands for (*zbBase)[c][s][0|1] (last index: 0 = real, 1 = imaginary).
// Every 256-bit operand is assembled from two aligned 128-bit loads: the two doubles
// starting at offs[0] fill the low half, the two starting at offs[2] fill the high half.
// offs[1] and offs[3] are not read in this block.
// c = 0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[0][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[0][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[0][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[0][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[0][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[0][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[0][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[0][2][1] + offs[2])), 1);

// psi_S0 + i*psi_S1: real part loses psi_S1's imaginary part, imaginary part gains its real part.
b_S0_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
// c = 1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[1][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[1][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[1][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[1][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[1][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[1][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[1][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[1][2][1] + offs[2])), 1);

b_S0_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
// c = 2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[2][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[2][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[2][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[2][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[2][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[2][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[2][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[2][2][1] + offs[2])), 1);

b_S0_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S0_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
// ---- Matrix load: u_ab <- (*gBase)[4][a][b][0|1] for a = 0..1, b = 0..2 ----
// Only the first two index-a slices are read from memory (aligned 256-bit loads).
u_00_re = _mm256_load_pd((*gBase)[4][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[4][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[4][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[4][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[4][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[4][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[4][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[4][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[4][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[4][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[4][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[4][1][2][1]);

// ---- Third slice computed rather than loaded ----
//   u_20 = conj(u_01*u_12 - u_02*u_11)
//   u_21 = conj(u_02*u_10 - u_00*u_12)
//   u_22 = conj(u_00*u_11 - u_01*u_10)
// i.e. the complex-conjugated cross product of the two loaded slices.
// NOTE(review): this reproduces a stored third row only if the matrix is special
// unitary -- presumably the 12-real-number compressed gauge format ("_12" in the
// kernel names); confirm.
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
// ---- ub_S0_Ca = sum over b of conj(u_ab) * b_S0_Cb, a = 0..2 ----
// Conjugation shows up in the signs: RE accumulates +u_im*b_im, IM accumulates -u_im*b_re.
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_01_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_02_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S0_C2_RE , ub_S0_C0_IM );
ub_S0_C1_RE = _mm256_mul_pd( u_10_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_10_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_10_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_12_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S0_C2_RE , ub_S0_C1_IM );
ub_S0_C2_RE = _mm256_mul_pd( u_20_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_20_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_20_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_21_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
// out_S0 += beta * ub_S0
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
// out_S2 += -i * beta * ub_S0  (real part gains +beta*ub_IM, imaginary part gains -beta*ub_RE)
out_S2_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_RE , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_RE , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_RE , out_S2_C2_IM );
// ---- Second half: b_S1_Cc = psi[c][1] - i * psi[c][3] for c = 0..2 ----
// The psi_S0_*/psi_S1_* temporaries are reused; the u_* values computed above are reused too.
// c = 0
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[0][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[0][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[0][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[0][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[0][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[0][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[0][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[0][3][1] + offs[2])), 1);

// psi_S0 - i*psi_S1: real part gains psi_S1's imaginary part, imaginary part loses its real part.
b_S1_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// c = 1
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[1][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[1][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[1][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[1][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[1][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[1][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[1][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[1][3][1] + offs[2])), 1);

b_S1_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// c = 2
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[2][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zbBase)[2][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[2][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zbBase)[2][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[2][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zbBase)[2][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[2][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zbBase)[2][3][1] + offs[2])), 1);

b_S1_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S1_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
// ---- ub_S1_Ca = sum over b of conj(u_ab) * b_S1_Cb, a = 0..2 (same sign pattern as ub_S0) ----
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_01_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_02_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S1_C2_RE , ub_S1_C0_IM );
ub_S1_C1_RE = _mm256_mul_pd( u_10_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_10_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_10_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_12_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S1_C2_RE , ub_S1_C1_IM );
ub_S1_C2_RE = _mm256_mul_pd( u_20_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_20_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_20_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_21_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
// out_S1 += beta * ub_S1
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
// out_S3 += +i * beta * ub_S1  (real part gains -beta*ub_IM, imaginary part gains +beta*ub_RE)
out_S3_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_IM , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_IM , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_IM , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S3_C2_IM );
}
 if ( accumulate[5] ) { 
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_s));

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[0][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[0][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[0][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[0][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[0][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[0][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[0][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[0][2][1] + offs[2])), 1);

b_S0_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[1][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[1][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[1][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[1][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[1][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[1][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[1][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[1][2][1] + offs[2])), 1);

b_S0_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[2][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[2][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[2][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[2][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[2][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[2][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[2][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[2][2][1] + offs[2])), 1);

b_S0_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_IM );
b_S0_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_RE );
u_00_re = _mm256_load_pd((*gBase)[5][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[5][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[5][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[5][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[5][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[5][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[5][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[5][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[5][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[5][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[5][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[5][1][2][1]);

u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_10_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_20_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_im , b_S0_C2_RE , ub_S0_C0_IM );
ub_S0_C1_RE = _mm256_mul_pd( u_01_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_01_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_01_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_21_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_im , b_S0_C2_RE , ub_S0_C1_IM );
ub_S0_C2_RE = _mm256_mul_pd( u_02_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_02_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_02_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_12_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
out_S2_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_IM , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_IM , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_IM , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S2_C2_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[0][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[0][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[0][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[0][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[0][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[0][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[0][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[0][3][1] + offs[2])), 1);

b_S1_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[1][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[1][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[1][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[1][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[1][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[1][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[1][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[1][3][1] + offs[2])), 1);

b_S1_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[2][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*zfBase)[2][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[2][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*zfBase)[2][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[2][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*zfBase)[2][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[2][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*zfBase)[2][3][1] + offs[2])), 1);

b_S1_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_IM );
b_S1_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_RE );
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_10_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_20_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_im , b_S1_C2_RE , ub_S1_C0_IM );
ub_S1_C1_RE = _mm256_mul_pd( u_01_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_01_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_01_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_21_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_im , b_S1_C2_RE , ub_S1_C1_IM );
ub_S1_C2_RE = _mm256_mul_pd( u_02_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_02_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_02_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_12_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
out_S3_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_RE , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_RE , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_RE , out_S3_C2_IM );
}
// Splat the two components of tbc_phases[3] across all four double lanes
// so the following direction blocks can apply the twisted-boundary phase.
// NOTE(review): index 3 is presumably the t direction -- confirm.
tbc_phase_re = _mm256_broadcast_sd(&tbc_phases[3][0]);
tbc_phase_im = _mm256_broadcast_sd(&tbc_phases[3][1]);

 if ( accumulate[6] ) {
// Contribution of neighbour 6 (presumably the backward-t neighbour, going by
// the names tbBase / coeff_t_b -- confirm).  Steps, done once per half spinor:
//   1. project:      b   = psi(spin s) + psi(spin s+2)
//   2. colour mult.: ub_Ci = sum_j conj(u_ij) * b_Cj
//   3. twist:        ub  = conj(tbc_phase) * ub
//   4. accumulate:   out(spin s) += beta * ub,  out(spin s+2) += beta * ub
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_t_b));

// ---- Step 1, first half spinor: spin 0 + spin 2, for the three first-index
// values 0..2 of tbBase.  Each 256-bit register is assembled from two 128-bit
// loads: offs[0] fills the low lane, offs[2] the high lane.
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[0][0][0] + offs[0])), 0);
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[0][0][0] + offs[2])), 1);
psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[0][0][1] + offs[0])), 0);
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[0][0][1] + offs[2])), 1);
psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[0][2][0] + offs[0])), 0);
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[0][2][0] + offs[2])), 1);
psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[0][2][1] + offs[0])), 0);
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[0][2][1] + offs[2])), 1);
b_S0_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[1][0][0] + offs[0])), 0);
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[1][0][0] + offs[2])), 1);
psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[1][0][1] + offs[0])), 0);
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[1][0][1] + offs[2])), 1);
psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[1][2][0] + offs[0])), 0);
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[1][2][0] + offs[2])), 1);
psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[1][2][1] + offs[0])), 0);
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[1][2][1] + offs[2])), 1);
b_S0_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[2][0][0] + offs[0])), 0);
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[2][0][0] + offs[2])), 1);
psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[2][0][1] + offs[0])), 0);
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[2][0][1] + offs[2])), 1);
psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[2][2][0] + offs[0])), 0);
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[2][2][0] + offs[2])), 1);
psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[2][2][1] + offs[0])), 0);
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[2][2][1] + offs[2])), 1);
b_S0_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S0_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );

// ---- Gauge link 6: only the u_0x and u_1x entries are read from memory.
u_00_re = _mm256_load_pd((*gBase)[6][0][0][0]);
u_00_im = _mm256_load_pd((*gBase)[6][0][0][1]);
u_01_re = _mm256_load_pd((*gBase)[6][0][1][0]);
u_01_im = _mm256_load_pd((*gBase)[6][0][1][1]);
u_02_re = _mm256_load_pd((*gBase)[6][0][2][0]);
u_02_im = _mm256_load_pd((*gBase)[6][0][2][1]);
u_10_re = _mm256_load_pd((*gBase)[6][1][0][0]);
u_10_im = _mm256_load_pd((*gBase)[6][1][0][1]);
u_11_re = _mm256_load_pd((*gBase)[6][1][1][0]);
u_11_im = _mm256_load_pd((*gBase)[6][1][1][1]);
u_12_re = _mm256_load_pd((*gBase)[6][1][2][0]);
u_12_im = _mm256_load_pd((*gBase)[6][1][2][1]);

// ---- The u_2x entries are rebuilt from the loaded ones as the
// complex-conjugated cross product:
//   u_20 = conj(u_01*u_12 - u_02*u_11)
//   u_21 = conj(u_02*u_10 - u_00*u_12)
//   u_22 = conj(u_00*u_11 - u_01*u_10)
u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );

// ---- Step 2, first half spinor: ub_S0_Ci = sum_j conj(u_ij) * b_S0_Cj.
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_01_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_01_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fmadd_pd( u_02_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_02_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S0_C2_RE , ub_S0_C0_IM );
ub_S0_C1_RE = _mm256_mul_pd( u_10_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_10_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_10_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fmadd_pd( u_12_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_12_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S0_C2_RE , ub_S0_C1_IM );
ub_S0_C2_RE = _mm256_mul_pd( u_20_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_20_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_20_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_21_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_21_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );

// ---- Step 3, first half spinor: ub_S0 <- conj(tbc_phase) * ub_S0.
// The generated code also rotated ub_S1_C* at this point.  Those registers
// still held stale data and are all reassigned further down before being
// read, so that dead rotation has been dropped.
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C0_RE , tmp_1_im );
ub_S0_C0_RE = tmp_1_re;
ub_S0_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C1_RE , tmp_1_im );
ub_S0_C1_RE = tmp_1_re;
ub_S0_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C2_RE , tmp_1_im );
ub_S0_C2_RE = tmp_1_re;
ub_S0_C2_IM = tmp_1_im;

// ---- Step 4, first half spinor: the same ub_S0 is added to spins 0 and 2.
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
out_S2_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S2_C2_IM );

// ---- Step 1, second half spinor: spin 1 + spin 3, same load pattern.
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[0][1][0] + offs[0])), 0);
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[0][1][0] + offs[2])), 1);
psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[0][1][1] + offs[0])), 0);
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[0][1][1] + offs[2])), 1);
psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[0][3][0] + offs[0])), 0);
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[0][3][0] + offs[2])), 1);
psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[0][3][1] + offs[0])), 0);
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[0][3][1] + offs[2])), 1);
b_S1_C0_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C0_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[1][1][0] + offs[0])), 0);
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[1][1][0] + offs[2])), 1);
psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[1][1][1] + offs[0])), 0);
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[1][1][1] + offs[2])), 1);
psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[1][3][0] + offs[0])), 0);
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[1][3][0] + offs[2])), 1);
psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[1][3][1] + offs[0])), 0);
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[1][3][1] + offs[2])), 1);
b_S1_C1_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C1_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[2][1][0] + offs[0])), 0);
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tbBase)[2][1][0] + offs[2])), 1);
psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[2][1][1] + offs[0])), 0);
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tbBase)[2][1][1] + offs[2])), 1);
psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[2][3][0] + offs[0])), 0);
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tbBase)[2][3][0] + offs[2])), 1);
psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[2][3][1] + offs[0])), 0);
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tbBase)[2][3][1] + offs[2])), 1);
b_S1_C2_RE = _mm256_add_pd( psi_S0_RE , psi_S1_RE );
b_S1_C2_IM = _mm256_add_pd( psi_S0_IM , psi_S1_IM );

// ---- Step 2, second half spinor: ub_S1_Ci = sum_j conj(u_ij) * b_S1_Cj.
// Every ub_S1 component starts from a fresh _mm256_mul_pd here.
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_01_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_01_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_01_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fmadd_pd( u_02_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_02_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fnmadd_pd( u_02_im , b_S1_C2_RE , ub_S1_C0_IM );
ub_S1_C1_RE = _mm256_mul_pd( u_10_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_10_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_10_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_10_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fmadd_pd( u_12_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_12_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fnmadd_pd( u_12_im , b_S1_C2_RE , ub_S1_C1_IM );
ub_S1_C2_RE = _mm256_mul_pd( u_20_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_20_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_20_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_20_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_21_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_21_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_21_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fnmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );

// NOTE(review): ub_S0_C* is rotated by the phase a second time here although
// out_S0/out_S2 were already updated above and nothing below in this block
// reads ub_S0.  It looks like generator redundancy, but it is kept because it
// changes the value ub_S0_C* holds on leaving the block -- confirm that no
// later code depends on that before deleting it.
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C0_RE , tmp_1_im );
ub_S0_C0_RE = tmp_1_re;
ub_S0_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C1_RE , tmp_1_im );
ub_S0_C1_RE = tmp_1_re;
ub_S0_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C2_RE , tmp_1_im );
ub_S0_C2_RE = tmp_1_re;
ub_S0_C2_IM = tmp_1_im;

// ---- Step 3, second half spinor: ub_S1 <- conj(tbc_phase) * ub_S1.
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C0_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C0_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C0_RE , tmp_1_im );
ub_S1_C0_RE = tmp_1_re;
ub_S1_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C1_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C1_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C1_RE , tmp_1_im );
ub_S1_C1_RE = tmp_1_re;
ub_S1_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C2_RE );
tmp_1_re = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C2_IM );
tmp_1_im = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C2_RE , tmp_1_im );
ub_S1_C2_RE = tmp_1_re;
ub_S1_C2_IM = tmp_1_im;

// ---- Step 4, second half spinor: the same ub_S1 is added to spins 1 and 3.
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
out_S3_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S3_C2_IM );
}
 if ( accumulate[7] ) { 
__m256d beta_vec = _mm256_setzero_pd();
beta_vec = _mm256_broadcast_sd((&coeff_t_f));

psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[0][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[0][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[0][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[0][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[0][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[0][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[0][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[0][2][1] + offs[2])), 1);

b_S0_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[1][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[1][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[1][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[1][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[1][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[1][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[1][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[1][2][1] + offs[2])), 1);

b_S0_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[2][0][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[2][0][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[2][0][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[2][0][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[2][2][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[2][2][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[2][2][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[2][2][1] + offs[2])), 1);

b_S0_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S0_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
u_00_re = _mm256_load_pd((*gBase)[7][0][0][0]);

u_00_im = _mm256_load_pd((*gBase)[7][0][0][1]);

u_01_re = _mm256_load_pd((*gBase)[7][0][1][0]);

u_01_im = _mm256_load_pd((*gBase)[7][0][1][1]);

u_02_re = _mm256_load_pd((*gBase)[7][0][2][0]);

u_02_im = _mm256_load_pd((*gBase)[7][0][2][1]);

u_10_re = _mm256_load_pd((*gBase)[7][1][0][0]);

u_10_im = _mm256_load_pd((*gBase)[7][1][0][1]);

u_11_re = _mm256_load_pd((*gBase)[7][1][1][0]);

u_11_im = _mm256_load_pd((*gBase)[7][1][1][1]);

u_12_re = _mm256_load_pd((*gBase)[7][1][2][0]);

u_12_im = _mm256_load_pd((*gBase)[7][1][2][1]);

u_20_re = _mm256_mul_pd( u_01_re , u_12_re );
u_20_re = _mm256_fnmadd_pd( u_01_im , u_12_im , u_20_re );
u_20_re = _mm256_fnmadd_pd( u_02_re , u_11_re , u_20_re );
u_20_re = _mm256_fmadd_pd( u_02_im , u_11_im , u_20_re );
u_20_im = _mm256_mul_pd( u_02_re , u_11_im );
u_20_im = _mm256_fmadd_pd( u_02_im , u_11_re , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_re , u_12_im , u_20_im );
u_20_im = _mm256_fnmadd_pd( u_01_im , u_12_re , u_20_im );
u_21_re = _mm256_mul_pd( u_02_re , u_10_re );
u_21_re = _mm256_fnmadd_pd( u_02_im , u_10_im , u_21_re );
u_21_re = _mm256_fnmadd_pd( u_00_re , u_12_re , u_21_re );
u_21_re = _mm256_fmadd_pd( u_00_im , u_12_im , u_21_re );
u_21_im = _mm256_mul_pd( u_00_re , u_12_im );
u_21_im = _mm256_fmadd_pd( u_00_im , u_12_re , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_re , u_10_im , u_21_im );
u_21_im = _mm256_fnmadd_pd( u_02_im , u_10_re , u_21_im );
u_22_re = _mm256_mul_pd( u_00_re , u_11_re );
u_22_re = _mm256_fnmadd_pd( u_00_im , u_11_im , u_22_re );
u_22_re = _mm256_fnmadd_pd( u_01_re , u_10_re , u_22_re );
u_22_re = _mm256_fmadd_pd( u_01_im , u_10_im , u_22_re );
u_22_im = _mm256_mul_pd( u_01_re , u_10_im );
u_22_im = _mm256_fmadd_pd( u_01_im , u_10_re , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_re , u_11_im , u_22_im );
u_22_im = _mm256_fnmadd_pd( u_00_im , u_11_re , u_22_im );
ub_S0_C0_RE = _mm256_mul_pd( u_00_re , b_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S0_C0_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_mul_pd( u_00_re , b_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_00_im , b_S0_C0_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_10_re , b_S0_C1_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S0_C1_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_re , b_S0_C1_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_10_im , b_S0_C1_RE , ub_S0_C0_IM );
ub_S0_C0_RE = _mm256_fmadd_pd( u_20_re , b_S0_C2_RE , ub_S0_C0_RE );
ub_S0_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S0_C2_IM , ub_S0_C0_RE );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_re , b_S0_C2_IM , ub_S0_C0_IM );
ub_S0_C0_IM = _mm256_fmadd_pd( u_20_im , b_S0_C2_RE , ub_S0_C0_IM );
ub_S0_C1_RE = _mm256_mul_pd( u_01_re , b_S0_C0_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S0_C0_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_mul_pd( u_01_re , b_S0_C0_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_01_im , b_S0_C0_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_11_re , b_S0_C1_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S0_C1_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_re , b_S0_C1_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_11_im , b_S0_C1_RE , ub_S0_C1_IM );
ub_S0_C1_RE = _mm256_fmadd_pd( u_21_re , b_S0_C2_RE , ub_S0_C1_RE );
ub_S0_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S0_C2_IM , ub_S0_C1_RE );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_re , b_S0_C2_IM , ub_S0_C1_IM );
ub_S0_C1_IM = _mm256_fmadd_pd( u_21_im , b_S0_C2_RE , ub_S0_C1_IM );
ub_S0_C2_RE = _mm256_mul_pd( u_02_re , b_S0_C0_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S0_C0_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_mul_pd( u_02_re , b_S0_C0_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_02_im , b_S0_C0_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_12_re , b_S0_C1_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S0_C1_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_re , b_S0_C1_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_12_im , b_S0_C1_RE , ub_S0_C2_IM );
ub_S0_C2_RE = _mm256_fmadd_pd( u_22_re , b_S0_C2_RE , ub_S0_C2_RE );
ub_S0_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S0_C2_IM , ub_S0_C2_RE );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_re , b_S0_C2_IM , ub_S0_C2_IM );
ub_S0_C2_IM = _mm256_fmadd_pd( u_22_im , b_S0_C2_RE , ub_S0_C2_IM );
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C0_RE , tmp_1_im );
ub_S0_C0_RE = tmp_1_re;
ub_S0_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C1_RE , tmp_1_im );
ub_S0_C1_RE = tmp_1_re;
ub_S0_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C2_RE , tmp_1_im );
ub_S0_C2_RE = tmp_1_re;
ub_S0_C2_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C0_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C0_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C0_RE , tmp_1_im );
ub_S1_C0_RE = tmp_1_re;
ub_S1_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C1_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C1_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C1_RE , tmp_1_im );
ub_S1_C1_RE = tmp_1_re;
ub_S1_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C2_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C2_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C2_RE , tmp_1_im );
ub_S1_C2_RE = tmp_1_re;
ub_S1_C2_IM = tmp_1_im;
out_S0_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C0_RE , out_S0_C0_RE );
out_S0_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C0_IM , out_S0_C0_IM );
out_S0_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C1_RE , out_S0_C1_RE );
out_S0_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C1_IM , out_S0_C1_IM );
out_S0_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S0_C2_RE , out_S0_C2_RE );
out_S0_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S0_C2_IM , out_S0_C2_IM );
out_S2_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_RE , out_S2_C0_RE );
out_S2_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C0_IM , out_S2_C0_IM );
out_S2_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_RE , out_S2_C1_RE );
out_S2_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C1_IM , out_S2_C1_IM );
out_S2_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_RE , out_S2_C2_RE );
out_S2_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S0_C2_IM , out_S2_C2_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[0][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[0][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[0][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[0][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[0][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[0][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[0][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[0][3][1] + offs[2])), 1);

b_S1_C0_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C0_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[1][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[1][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[1][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[1][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[1][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[1][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[1][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[1][3][1] + offs[2])), 1);

b_S1_C1_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C1_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
psi_S0_RE = _mm256_setzero_pd();
psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[2][1][0] + offs[0])), 0);

psi_S0_RE =  _mm256_insertf128_pd(psi_S0_RE, _mm_load_pd(((*tfBase)[2][1][0] + offs[2])), 1);

psi_S0_IM = _mm256_setzero_pd();
psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[2][1][1] + offs[0])), 0);

psi_S0_IM =  _mm256_insertf128_pd(psi_S0_IM, _mm_load_pd(((*tfBase)[2][1][1] + offs[2])), 1);

psi_S1_RE = _mm256_setzero_pd();
psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[2][3][0] + offs[0])), 0);

psi_S1_RE =  _mm256_insertf128_pd(psi_S1_RE, _mm_load_pd(((*tfBase)[2][3][0] + offs[2])), 1);

psi_S1_IM = _mm256_setzero_pd();
psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[2][3][1] + offs[0])), 0);

psi_S1_IM =  _mm256_insertf128_pd(psi_S1_IM, _mm_load_pd(((*tfBase)[2][3][1] + offs[2])), 1);

b_S1_C2_RE = _mm256_sub_pd( psi_S0_RE , psi_S1_RE );
b_S1_C2_IM = _mm256_sub_pd( psi_S0_IM , psi_S1_IM );
ub_S1_C0_RE = _mm256_mul_pd( u_00_re , b_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_00_im , b_S1_C0_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_mul_pd( u_00_re , b_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_00_im , b_S1_C0_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_10_re , b_S1_C1_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_10_im , b_S1_C1_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_re , b_S1_C1_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_10_im , b_S1_C1_RE , ub_S1_C0_IM );
ub_S1_C0_RE = _mm256_fmadd_pd( u_20_re , b_S1_C2_RE , ub_S1_C0_RE );
ub_S1_C0_RE = _mm256_fnmadd_pd( u_20_im , b_S1_C2_IM , ub_S1_C0_RE );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_re , b_S1_C2_IM , ub_S1_C0_IM );
ub_S1_C0_IM = _mm256_fmadd_pd( u_20_im , b_S1_C2_RE , ub_S1_C0_IM );
ub_S1_C1_RE = _mm256_mul_pd( u_01_re , b_S1_C0_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_01_im , b_S1_C0_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_mul_pd( u_01_re , b_S1_C0_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_01_im , b_S1_C0_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_11_re , b_S1_C1_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_11_im , b_S1_C1_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_re , b_S1_C1_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_11_im , b_S1_C1_RE , ub_S1_C1_IM );
ub_S1_C1_RE = _mm256_fmadd_pd( u_21_re , b_S1_C2_RE , ub_S1_C1_RE );
ub_S1_C1_RE = _mm256_fnmadd_pd( u_21_im , b_S1_C2_IM , ub_S1_C1_RE );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_re , b_S1_C2_IM , ub_S1_C1_IM );
ub_S1_C1_IM = _mm256_fmadd_pd( u_21_im , b_S1_C2_RE , ub_S1_C1_IM );
ub_S1_C2_RE = _mm256_mul_pd( u_02_re , b_S1_C0_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_02_im , b_S1_C0_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_mul_pd( u_02_re , b_S1_C0_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_02_im , b_S1_C0_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_12_re , b_S1_C1_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_12_im , b_S1_C1_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_re , b_S1_C1_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_12_im , b_S1_C1_RE , ub_S1_C2_IM );
ub_S1_C2_RE = _mm256_fmadd_pd( u_22_re , b_S1_C2_RE , ub_S1_C2_RE );
ub_S1_C2_RE = _mm256_fnmadd_pd( u_22_im , b_S1_C2_IM , ub_S1_C2_RE );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_re , b_S1_C2_IM , ub_S1_C2_IM );
ub_S1_C2_IM = _mm256_fmadd_pd( u_22_im , b_S1_C2_RE , ub_S1_C2_IM );
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C0_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C0_RE , tmp_1_im );
ub_S0_C0_RE = tmp_1_re;
ub_S0_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C1_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C1_RE , tmp_1_im );
ub_S0_C1_RE = tmp_1_re;
ub_S0_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S0_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S0_C2_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S0_C2_RE , tmp_1_im );
ub_S0_C2_RE = tmp_1_re;
ub_S0_C2_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C0_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C0_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C0_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C0_RE , tmp_1_im );
ub_S1_C0_RE = tmp_1_re;
ub_S1_C0_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C1_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C1_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C1_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C1_RE , tmp_1_im );
ub_S1_C1_RE = tmp_1_re;
ub_S1_C1_IM = tmp_1_im;
tmp_1_re = _mm256_mul_pd( tbc_phase_re , ub_S1_C2_RE );
tmp_1_re = _mm256_fnmadd_pd( tbc_phase_im , ub_S1_C2_IM , tmp_1_re );
tmp_1_im = _mm256_mul_pd( tbc_phase_re , ub_S1_C2_IM );
tmp_1_im = _mm256_fmadd_pd( tbc_phase_im , ub_S1_C2_RE , tmp_1_im );
ub_S1_C2_RE = tmp_1_re;
ub_S1_C2_IM = tmp_1_im;
out_S1_C0_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C0_RE , out_S1_C0_RE );
out_S1_C0_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C0_IM , out_S1_C0_IM );
out_S1_C1_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C1_RE , out_S1_C1_RE );
out_S1_C1_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C1_IM , out_S1_C1_IM );
out_S1_C2_RE = _mm256_fmadd_pd( beta_vec , ub_S1_C2_RE , out_S1_C2_RE );
out_S1_C2_IM = _mm256_fmadd_pd( beta_vec , ub_S1_C2_IM , out_S1_C2_IM );
out_S3_C0_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_RE , out_S3_C0_RE );
out_S3_C0_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C0_IM , out_S3_C0_IM );
out_S3_C1_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_RE , out_S3_C1_RE );
out_S3_C1_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C1_IM , out_S3_C1_IM );
out_S3_C2_RE = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_RE , out_S3_C2_RE );
out_S3_C2_IM = _mm256_fnmadd_pd( beta_vec , ub_S1_C2_IM , out_S3_C2_IM );
}
// Store the twelve out_S{0..3}_C{0..2} RE/IM vector pairs through oBase.
//
// Each pair is regrouped before it is written:
//   selector 32 (0x20) -> { low  128 bits of RE, low  128 bits of IM }
//   selector 49 (0x31) -> { high 128 bits of RE, high 128 bits of IM }
// The low-half vector is written relative to offs[0] and the high-half vector
// relative to offs[2], both at the same displacement. The displacement grows
// by 4 per S index and by 16 per C index (0, 4, ..., 44).
//
// _mm256_stream_pd is a non-temporal store and needs a 32-byte-aligned address.
// NOTE(review): presumably every (C, S) slot of the output holds two real
// parts followed by two imaginary parts (SOA length 2) -- confirm against the
// declaration of oBase, which is not visible here.

// C0: S0..S3 at displacements 0, 4, 8, 12
tmp_1_re = _mm256_permute2f128_pd(out_S0_C0_RE, out_S0_C0_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S0_C0_RE, out_S0_C0_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+0), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+0), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S1_C0_RE, out_S1_C0_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S1_C0_RE, out_S1_C0_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+4), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+4), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S2_C0_RE, out_S2_C0_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S2_C0_RE, out_S2_C0_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+8), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+8), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S3_C0_RE, out_S3_C0_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S3_C0_RE, out_S3_C0_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+12), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+12), tmp_2_re);

// C1: S0..S3 at displacements 16, 20, 24, 28
tmp_1_re = _mm256_permute2f128_pd(out_S0_C1_RE, out_S0_C1_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S0_C1_RE, out_S0_C1_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+16), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+16), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S1_C1_RE, out_S1_C1_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S1_C1_RE, out_S1_C1_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+20), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+20), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S2_C1_RE, out_S2_C1_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S2_C1_RE, out_S2_C1_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+24), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+24), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S3_C1_RE, out_S3_C1_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S3_C1_RE, out_S3_C1_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+28), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+28), tmp_2_re);

// C2: S0..S3 at displacements 32, 36, 40, 44
tmp_1_re = _mm256_permute2f128_pd(out_S0_C2_RE, out_S0_C2_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S0_C2_RE, out_S0_C2_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+32), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+32), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S1_C2_RE, out_S1_C2_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S1_C2_RE, out_S1_C2_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+36), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+36), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S2_C2_RE, out_S2_C2_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S2_C2_RE, out_S2_C2_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+40), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+40), tmp_2_re);

tmp_1_re = _mm256_permute2f128_pd(out_S3_C2_RE, out_S3_C2_IM, 32);
tmp_2_re = _mm256_permute2f128_pd(out_S3_C2_RE, out_S3_C2_IM, 49);
_mm256_stream_pd((((*oBase)[0][0][0] + offs[0])+44), tmp_1_re);

_mm256_stream_pd((((*oBase)[0][0][0] + offs[2])+44), tmp_2_re);
kostrzewa commented 7 years ago

There are still some issues with passing compiler flags properly to the subcommand. Looking into it.

kostrzewa commented 7 years ago

> There are still some issues with passing compiler flags properly to the subcommand. Looking into it.

I take it back, it seems to have been a problem with cached values. Sorry!

martin-ueding commented 7 years ago

I think that this looks good, I have merged everything in.

bjoo commented 7 years ago

This is right. It is entirely possible that a different compiler is needed to compile the generator than to compile the generated code. Within the Intel family they could be the same, but for a KNL build, for example, you may be building on a regular Xeon, so you may not want any arch flags for building the code generator, while you may want -xMIC-AVX512 for compiling the target code.

Best, B

On Jun 20, 2017, at 9:43 AM, Martin Ueding notifications@github.com wrote:

@martin-ueding commented on this pull request.

In CMakeLists.txt:

This custom target executes

add_custom_target( codegen_lib ALL

-  COMMAND CXX=${target_cxx} CXXFLAGS=${target_cxxflags} cmake ${CMAKE_CURRENT_BINARY_DIR}/generated/${isa}
+  COMMAND cmake ${CMAKE_CURRENT_BINARY_DIR}/generated/${isa}

The kernels should be compiled with the target (= production) compiler. Only the code generator needs to be compiled with the host compiler.

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub, or mute the thread.


Dr Balint Joo High Performance Computational Scientist Jefferson Lab 12000 Jefferson Ave, Suite 3, MS 12B2, Room F217, Newport News, VA 23606, USA Tel: +1-757-269-5339, Fax: +1-757-269-5427 email: bjoo@jlab.org