caporaso-lab / sourcetracker2

SourceTracker2

speed improvement with dictionary indexing #49

Closed: wdwvt1 closed this issue 7 years ago

wdwvt1 commented 8 years ago

The innermost loop of the gibbs function must access different slices of an array millions of times during a single run. These slices are columns; their order of selection is random (each pass of the loop selects a different column), and they can only be selected one at a time.
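To make the access pattern concrete, here is a minimal sketch (the table shape and names are illustrative stand-ins, not the actual gibbs internals):

import numpy as np

# Illustrative stand-in for the table the gibbs inner loop draws from.
table = np.random.randint(0, 10, size=(100, 2000))
for _ in range(5):
    col_idx = np.random.randint(0, table.shape[1])  # random column each pass
    column = table[:, col_idx]                      # one column slice at a time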

If we could increase the speed at which numpy returns these slices, we might be able to save some time. As a test of this idea I used the following code to simulate drawing from a table with 100 sinks and 2000 features, with the number of inner-loop passes (slices taken) ranging from 2**5 to 2**20 (32 to 1,048,576).

The three schemes I tried are: an input array in row-major format (C-order), an input array in column-major format (F-order), and a dictionary with column indices as keys and 1D arrays of the row values as values.

[figure_1: mean time per batch of slices for the C-order, F-order, and dictionary schemes]
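For context on why the memory layout matters here: a column slice of an F-order array is contiguous in memory, while a column slice of a C-order array is strided. A quick check with standard numpy:

import numpy as np

a = np.arange(6).reshape(2, 3)          # C-order (row-major) by default
a_f = np.asfortranarray(a)              # F-order (column-major) copy
print(a[:, 0].flags['C_CONTIGUOUS'])    # False: column slice is strided
print(a_f[:, 0].flags['C_CONTIGUOUS'])  # True: column slice is contiguous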

The results show that the dictionary indexing method takes approximately 90% of the time of the F-order method (the ratios below are dict mean time / F-order mean time):

_dict[:, 0] / forder[:, 0]
array([ 0.84039466,  1.01139224,  0.92655346,  0.88919508,  0.83832869,
        0.86242302,  0.87799022,  0.92113187,  0.84468985,  0.89854044,
        0.90343308,  0.89311444,  0.89205119,  0.89685498,  0.9020272 ,
        0.90483933])

This approach seems promising to me. Pending new profiling results, we should decide whether rewriting the internals to save ~10% on indexing is worth it.
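If we go this route, the change amounts to a one-time conversion of the table before the inner loop starts. A minimal sketch with hypothetical names (table, n_passes); this is not the actual gibbs code:

import numpy as np

table = np.random.randint(0, 10, size=(100, 2000))  # stand-in for the real table
n_passes = 1000  # hypothetical number of inner-loop passes

# One-time conversion: column index -> contiguous 1D copy of that column.
col_cache = {j: table[:, j].copy() for j in range(table.shape[1])}

for _ in range(n_passes):
    j = np.random.randint(0, table.shape[1])
    column = col_cache[j]  # dict lookup replaces the per-pass array slice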

wdwvt1 commented 8 years ago
import numpy as np
import matplotlib.pyplot as plt
import time

rows = 100   # sinks
cols = 2000  # features

# Random integer table shared by all three schemes.
arr_data = np.random.randint(0, rows, size=rows*cols).reshape(rows, cols)

# Column-major (F-order) copy.
arr_f = np.empty((rows, cols), dtype=np.int64, order='F')
arr_f[:, :] = arr_data[:, :]

# Row-major (C-order) copy.
arr_c = np.empty((rows, cols), dtype=np.int64, order='C')
arr_c[:, :] = arr_data[:, :]

# Dictionary: column index -> 1D array of that column's values.
arr_d = {i: arr_data[:, i].astype(np.int64) for i in range(cols)}

def f_arr(arr, cols, iters):
    # Take `iters` random column slices from a 2D array.
    for _ in range(iters):
        col_idx = np.random.randint(0, cols)
        x = arr[:, col_idx] * 4

def f_dict(arr_d, cols, iters):
    # Take `iters` random column lookups from the dict of columns.
    for _ in range(iters):
        col_idx = np.random.randint(0, cols)
        x = arr_d[col_idx] * 4

# Slice counts per trial: 2**5 .. 2**20.
loops = (2**np.arange(5, 21)).astype(np.int64)

forder = []
corder = []
_dict = []

# For each slice count, run each scheme 100 times; record mean and std.
for niter in loops:
    print(niter)
    tmp = []
    for _ in range(100):
        start = time.time()
        f_arr(arr_f, cols, niter)
        stop = time.time()
        tmp.append(stop - start)
    forder.append([np.array(tmp).mean(), np.array(tmp).std()])

    tmp = []
    for _ in range(100):
        start = time.time()
        f_arr(arr_c, cols, niter)
        stop = time.time()
        tmp.append(stop - start)
    corder.append([np.array(tmp).mean(), np.array(tmp).std()])

    tmp = []
    for _ in range(100):
        start = time.time()
        f_dict(arr_d, cols, niter)
        stop = time.time()
        tmp.append(stop - start)
    _dict.append([np.array(tmp).mean(), np.array(tmp).std()])

forder = np.array(forder)
corder = np.array(corder)
_dict = np.array(_dict)

xs = np.arange(5, 21)
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(xs, forder[:, 0], marker='o', lw=3, alpha=.5, color='blue',
        label='F-order')
ax.plot(xs, corder[:, 0], marker='o', lw=3, alpha=.5, color='green',
        label='C-order')
ax.plot(xs, _dict[:, 0], marker='o', lw=3, alpha=.5, color='red',
        label='Dict')
ax.set_xlim(4, 21)
ax.set_xticks(xs)
ax.set_xlabel('Slices taken (2^x)')
ax.set_ylabel('Mean seconds per 2^x slices')
plt.legend(loc=2)
plt.show()
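As an aside, the same measurements could also be taken with timeit.repeat, which handles the timer calls for us; a sketch using the functions defined above for one cell of the grid:

import timeit

# 100 repeats of 2**10 slices each, mirroring one entry of `loops` above.
times = timeit.repeat(lambda: f_dict(arr_d, cols, 2**10), number=1, repeat=100)
print(np.mean(times), np.std(times))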
lkursell commented 8 years ago

If upcoming benchmarking tests reveal that the most robust parameters return better answers, we might revisit whether speed can be increased, should it prove to be a limiting factor for users.


wdwvt1 commented 7 years ago

With #80 we likely don't need to explore this, as it would require a lot of refactoring.