Closed drasmuss closed 4 years ago
Note for posterity: the `transform=None` changes caused performance to decrease on the integrator benchmark, but after looking into this for a while I am relatively convinced that this is just a quirk of that particular model rather than a general issue. Basically, the removal of some of the unnecessary `x*1` `ElementwiseInc` operators means that one of the remaining `ElementwiseInc` operators ends up writing to a partial signal block rather than the whole signal block, which is less efficient. But that is just a quirk of how the operators and signals are ordered/merged in that particular model, not an effect we'd expect to see in general. Larger-scale tests (e.g. on the Spaun model) show a general speedup with the `transform=None` changes.
In case it is useful in the future, here is a benchmark script I made to test the cost of different read/write types:
from collections import defaultdict
import timeit
import numpy as np
import tensorflow as tf
from tensorflow.python.eager import profiler
from tensorflow.python.ops.gen_state_ops import (
TemporaryVariable,
DestroyTemporaryVariable,
)
# Disable TF's meta-optimizer (Grappler) so the ops below execute as written,
# rather than being fused or eliminated before we can time them.
tf.config.optimizer.set_experimental_options({"disable_meta_optimizer": True})

# Benchmark parameters.
minibatch_size = 64
base_shape = (minibatch_size, 16384)  # shape of the full "signal" tensor
read_write_size = 4096  # number of elements read/written per op
reps = 1000  # copies of each op chained together per benchmark

with tf.Graph().as_default() as graph:
    # Maps benchmark name -> list of ops forming a control-dependency chain.
    results = defaultdict(list)

    # Random column indices into `base` (uniform floats cast to int32 by the
    # constant's dtype; may contain duplicate indices).
    idxs = tf.constant(
        np.random.uniform(0, base_shape[1], size=read_write_size), dtype=tf.int32
    )
    # Full (row, col) index pairs for the scatter_nd-based ops, shape
    # (minibatch_size, read_write_size, 2).
    idxs_nd = tf.stack(
        tf.meshgrid(tf.range(minibatch_size, dtype=tf.int32), idxs, indexing="ij",),
        axis=-1,
    )

    # `base` is the large tensor being read from / written to; `read_identity`
    # is a pre-sized tensor used as the write payload and as a no-op baseline.
    base = tf.compat.v1.placeholder(shape=base_shape, dtype=tf.float32)
    read_identity = tf.compat.v1.placeholder(
        shape=(minibatch_size, read_write_size), dtype=tf.float32
    )

    # Build `reps` copies of each read/write style.  Each iteration takes a
    # control dependency on the previous iteration's op for the same key, so
    # the chain executes sequentially and total runtime scales with `reps`.
    for i in range(reps):
        # Baseline: pass the placeholder straight through (no read op at all).
        with tf.control_dependencies(results["read_identity"]):
            results["read_identity"] = [read_identity]

        # Read one contiguous slice from the start of `base`.
        with tf.control_dependencies(results["read_slice"]):
            results["read_slice"] = [
                tf.strided_slice(base, [0, 0], [minibatch_size, read_write_size])
            ]

        # Read scattered columns via gather.
        with tf.control_dependencies(results["read_gather"]):
            results["read_gather"] = [tf.gather(base, idxs, axis=1)]

        # Read two disjoint slices (front and back of `base`) and concatenate.
        with tf.control_dependencies(results["read_slice_concat"]):
            results["read_slice_concat"] = [
                tf.concat(
                    [
                        tf.strided_slice(
                            base, [0, 0], [minibatch_size, read_write_size // 2]
                        ),
                        tf.strided_slice(
                            base,
                            [0, base_shape[1] - read_write_size // 2],
                            [minibatch_size, base_shape[1]],
                        ),
                    ],
                    axis=1,
                )
            ]

        # "Write" by plain reassignment (no actual write op; write baseline).
        with tf.control_dependencies(results["write_assign"]):
            results["write_assign"] = [read_identity]

        # Write by accumulating onto the previous iteration's result.
        with tf.control_dependencies(results["write_assign_add"]):
            if i == 0:
                results["write_assign_add"] = [read_identity]
            else:
                results["write_assign_add"] = [
                    results["write_assign_add"][0] + read_identity
                ]

        # Scatter-add producing a new tensor from `base`.
        with tf.control_dependencies(results["write_scatter_add"]):
            results["write_scatter_add"] = [
                tf.tensor_scatter_nd_add(base, idxs_nd, read_identity)
            ]

        # Scatter-update producing a new tensor from `base`.
        with tf.control_dependencies(results["write_scatter_update"]):
            results["write_scatter_update"] = [
                tf.tensor_scatter_nd_update(base, idxs_nd, read_identity)
            ]

        # Scatter-add via a temporary mutable variable: copy `base` into the
        # temp var, mutate it with a ref-based scatter op, then destroy it
        # (DestroyTemporaryVariable returns the final value).
        with tf.control_dependencies(results["write_temp_var_add"]):
            var = TemporaryVariable(shape=base.shape, dtype=base.dtype)
            var_name = var.op.name
            var = tf.compat.v1.assign(var, base)
            var = tf.compat.v1.scatter_nd_add(var, idxs_nd, read_identity)
            results["write_temp_var_add"] = [
                DestroyTemporaryVariable(ref=var, var_name=var_name)
            ]

        # Same as above, but with scatter-update instead of scatter-add.
        with tf.control_dependencies(results["write_temp_var_update"]):
            var = TemporaryVariable(shape=base.shape, dtype=base.dtype)
            var_name = var.op.name
            var = tf.compat.v1.assign(var, base)
            var = tf.compat.v1.scatter_nd_update(var, idxs_nd, read_identity)
            results["write_temp_var_update"] = [
                DestroyTemporaryVariable(ref=var, var_name=var_name)
            ]

    # change all the results to the same output, to remove i/o discrepancies
    for k, v in results.items():
        with tf.control_dependencies(v):
            results[k] = tf.constant(1)
with tf.compat.v1.Session(graph=graph) as sess:
    # Random data for the two placeholders; the values themselves are
    # irrelevant to the timing.
    feed_dict = {
        base: np.random.uniform(size=base_shape),
        read_identity: np.random.uniform(size=(minibatch_size, read_write_size)),
    }

    # Warm-up run over everything (uncomment to capture a profile instead).
    # profiler.start()
    sess.run(results, feed_dict=feed_dict)
    # profiler.save("tmp2_profile", profiler.stop())

    # Report the best-of-50 wall-clock time for each benchmark chain.
    for key, vals in results.items():
        print(key)
        durations = []
        for _ in range(50):
            tick = timeit.default_timer()
            sess.run(
                vals, feed_dict=feed_dict,
            )
            durations.append(timeit.default_timer() - tick)
        print(min(durations))
I'm getting the following warnings in `master`:
/home/ehunsber/workspace/nengo-dl/nengo_dl/converter.py:1097: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
broadcast_scale[slices] = scale[i]
/home/ehunsber/workspace/nengo-dl/nengo_dl/converter.py:1098: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
broadcast_bias[slices] = bias[i]
Might want to fix these here. I think it's just a matter of using `tuple(slices)` in these lines.
That's done in this commit https://github.com/nengo/nengo-dl/pull/126/commits/c03bd5882408b1f11c470b5487508cb2832333e2, and some other instances in this commit https://github.com/nengo/nengo-dl/pull/119/commits/bab395085159e1e97c86cb5de1c3587bd4bacbf9 (I just stuck them in with a larger commit as they came up since the change was so minor).
Fixing some issues found in converter for more complex models, and improving performance of converted networks.