Improve performance in matmul with transposed arrays

While benchmarking, we noticed that the matrix multiplication of two transposed arrays is really inefficient, as can be seen in the plots. In-memory:

On-disk:

We should somehow improve this performance.

The code to reproduce it is:

import iarray as ia
import numpy as np
import os

def iarray_matmul(a, b):
    """Return the product of ``a`` and ``b`` as computed by ``ia.matmul``."""
    product = ia.matmul(a, b)
    return product

def iarray_transpose(array):
    """Return whatever ``array.transpose()`` yields for the given array."""
    transposed = array.transpose()
    return transposed

# Make float64 the default dtype in iarray's configuration.
ia.set_config_defaults(dtype=np.float64)

# When set to "transpose", the operands are created with swapped axes and
# transposed back right before the multiplication.
func = "transpose"
# (M, K, N): the product computed is (M, K) @ (K, N).
shape = (100_000, 25000, 1000)
amshape = (shape[0], shape[1])
bmshape = (shape[1], shape[2])

# Obtain optimal chunk and block shapes
mparams = ia.matmul_params(amshape, bmshape)
amchunks, amblocks, bmchunks, bmblocks = mparams

if func == "transpose":
    # The operands are stored with their axes swapped, so the per-axis chunk
    # and block shapes must be swapped as well.
    # NOTE: np.array(seq).transpose() on a 1-D array returns it unchanged,
    # so the sequences are reversed explicitly instead.
    amshape = (shape[1], shape[0])
    bmshape = (shape[2], shape[1])
    amchunks = tuple(reversed(amchunks))
    amblocks = tuple(reversed(amblocks))
    bmchunks = tuple(reversed(bmchunks))
    bmblocks = tuple(reversed(bmblocks))

# Backing file for the left operand; it is only generated when missing, so
# later runs reuse the array already stored at this path.
filename = func + "arr-gemm.iarr"
if not os.path.exists(filename):
    ia.set_config_defaults(btune=False)
    am = ia.random.normal(
        amshape,
        3,
        2,
        chunks=amchunks,
        blocks=amblocks,
        urlpath=filename,
        fp_mantissa_bits=20,
    )

# NOTE(review): presumably evicts the file from the OS page cache so the
# load below starts cold -- confirm against the vmtouch manual.
cmd = "vmtouch -e " + filename
os.system(cmd)

# (Re)open the left operand from its file.
am = ia.load(filename)

# Right operand: an all-ones NumPy array converted to an iarray container
# (no urlpath is passed for this one).
w = np.ones(bmshape)
bm = ia.numpy2iarray(w, chunks=bmchunks, blocks=bmblocks)
print(bm.info)

# Transpose both operands, then multiply the transposed results.
a_opt = iarray_transpose(am)
b_opt = iarray_transpose(bm)
res = iarray_matmul(a_opt, b_opt)

inaos / iron-array

Improve performance in matmul with transposed arrays #603