After a few tests, the following proc can grow up to 2 GB before the GC fails to allocate/collect on my machine, at the zeros_like step:
proc softmax_cross_entropy_backward1[T](
       gradient: Tensor[T] or T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  let batch_size = cached_tensor.shape[1]

  when gradient is T:
    let grad = gradient
  elif gradient is Tensor:
    let grad = gradient.data[gradient.offset]

  echo "cached_tensor shape:"
  echo $cached_tensor.shape

  result = zeros_like(cached_tensor)

  for i in 0||(batch_size-1): # Can't use OpenMP - SIGSEGV Illegal Address
    let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp
    var res_slice = result[_, i]
    apply3_inline(res_slice, cached_tensor[_,i], target[_,i]):
      grad * (stable_softmax(y, max, sumexp) - z) / T(batch_size)
After discussion with Araq: Nim has a new allocator with a max chunk size of 2 GB.
Tests show that the proc fails after 13 allocations: 100000 * 200 * 8 bytes * 13 allocations = 2.08 GB. So this is an upstream issue.
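For reference, a back-of-the-envelope check of that figure (a sketch using the benchmark sizes from the code below):

let
  nb_classes = 100_000
  batch_size = 200
  bytes_per_elem = 8  # float64
let per_alloc = nb_classes * batch_size * bytes_per_elem  # 160_000_000 bytes = 160 MB per zeros_like
echo per_alloc * 13  # 2_080_000_000 bytes ≈ 2.08 GB, hitting the 2 GB chunk limit quoted above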
Further investigation shows that apply3_inline and map3_inline are suspicious. Changing them to map2 in the following code prevents the RAM rampage:
proc softmax_cross_entropy_backward1[T](
       gradient: Tensor[T] or T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  let batch_size = cached_tensor.shape[1]

  when gradient is T:
    let grad = gradient
  elif gradient is Tensor:
    let grad = gradient.data[gradient.offset]

  result = zeros_like(cached_tensor)

  for i in 0||(batch_size-1):
    let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp
    var res_slice = result[_, i]
    apply3_inline(res_slice, cached_tensor[_,i], target[_,i]):
      grad * (stable_softmax(y, max, sumexp) - z) / T(batch_size)
############ New optimized
proc softmax_cross_entropy_backward2[T](
       gradient: Tensor[T] or T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  let batch_size = cached_tensor.shape[1]

  when gradient is T:
    let grad = gradient
  elif gradient is Tensor:
    let grad = gradient.data[gradient.offset]

  let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape)
  # let axis_max_sumexp = cached_tensor.classic_max_sumexp(axis = 1).broadcast(cached_tensor.shape)

  result = map3_inline(cached_tensor, target, axis_max_sumexp):
    grad * (stable_softmax(x, z.max, z.sumexp) - y) / T(batch_size)
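For context on what the broadcast buys here (a sketch; shapes follow the [nb_classes, batch_size] layout of the benchmark below):

let ms = cached_tensor.streaming_max_sumexp(axis = 1)  # one (max, sumexp) tuple per column, unsqueezed to [1, batch_size]
let bc = ms.broadcast(cached_tensor.shape)             # viewed as [nb_classes, batch_size], no copy
# map3_inline can then fuse the whole backward pass into a single
# element-wise loop, with no per-column slice temporaries in the loop body.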
Furthermore, in this complete example, exchanging the order of softmax_cross_entropy_backward1 and softmax_cross_entropy_backward2 prevents the bug from happening:
import ../src/tensor/tensor,
       math, times

proc streaming_max_sumexp[T](t: Tensor[T]): tuple[max:T, sumexp: T] {.noSideEffect, inline.}=
  # One pass but with branching
  result.max = -Inf.T  # will store the streaming max of the tensor
  result.sumexp = 0.T  # will store the streaming sum of exp of the tensor

  for x in t:
    if x <= result.max:
      result.sumexp += exp(x - result.max)
    else:
      result.sumexp *= exp(result.max - x)
      result.sumexp += 1
      result.max = x
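# Why one pass works: after processing x_1..x_k the invariants are
#   result.max    == max(x_1..x_k)
#   result.sumexp == sum_j exp(x_j - result.max)
# When a new max arrives, the accumulated sum is rescaled by
# exp(old_max - new_max), so logsumexp == max + ln(sumexp) at the end.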
proc classic_max_sumexp[T](t: Tensor[T]): tuple[max:T, sumexp: T] =
  # 2 passes, no branching
  result.max = t.max
  result.sumexp = t.fold_inline() do:
    x = exp(y - result.max)
  do:
    x += exp(y - result.max)
  do: x += y

proc streaming_max_sumexp[T](t: Tensor[T], axis: int): Tensor[tuple[max:T, sumexp: T]] {.noInit.}=
  # Only 2D tensors are supported for now (i.e. no 3D softmax)
  assert axis in {0, 1}

  result = newTensorUninit[tuple[max:T, sumexp: T]](t.shape[axis])

  for i in `||`(0, t.shape[axis]-1, "simd"):
    result.data[i] = t.atAxisIndex(axis, i).streaming_max_sumexp

  # Reexpand the tensor to be consistent with fold_axis/reduce_axis
  if axis == 0:
    result = result.unsqueeze(1)
  else:
    result = result.unsqueeze(0)

proc classic_max_sumexp[T](t: Tensor[T], axis: int): Tensor[tuple[max:T, sumexp: T]] {.noInit.}=
  # Only 2D tensors are supported for now (i.e. no 3D softmax)
  assert axis in {0, 1}

  result = newTensorUninit[tuple[max:T, sumexp: T]](t.shape[axis])

  for i in `||`(0, t.shape[axis]-1, "simd"):
    result.data[i] = t.atAxisIndex(axis, i).classic_max_sumexp

  # Reexpand the tensor to be consistent with fold_axis/reduce_axis
  if axis == 0:
    result = result.unsqueeze(1)
  else:
    result = result.unsqueeze(0)

proc classic_logsumexp_v2[T: SomeReal](t: Tensor[T]): T =
  # 2 passes, no branching
  let (max, sumexp) = t.classic_max_sumexp
  result = max + ln(sumexp)

proc softmax_cross_entropy1[T](input, target: Tensor[T]): T =
  var sample_softmax_xentropy = zeros[T](1, input.shape[1])
  var i = 0
  for sample_input, sample_target in zipAxis(input, target, 1):
    let lse = sample_input.classic_logsumexp_v2 # y.streaming_log_sumexp
    sample_softmax_xentropy[0, i] = sum:
      map2_inline(sample_input, sample_target):
        y * (lse - x)
    inc i
  result = sample_softmax_xentropy.mean
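# For reference, the per-sample loss computed above is
#   sum_c target_c * (logsumexp(input) - input_c)
#     = -sum_c target_c * ln(softmax(input)_c)
# since ln(softmax(x)_c) = x_c - logsumexp(x);
# the mean over the batch is taken at the end.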
###### Backprop bench
proc softmax_cross_entropy_backward1[T](
       gradient: Tensor[T] or T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  let batch_size = cached_tensor.shape[1]

  when gradient is T:
    let grad = gradient
  elif gradient is Tensor:
    let grad = gradient.data[gradient.offset]

  result = zeros_like(cached_tensor)

  for i in 0||(batch_size-1):
    let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp
    var res_slice = result[_, i]
    apply3_inline(res_slice, cached_tensor[_,i], target[_,i]):
      grad * ((y + max + sumexp) - z) / T(batch_size)
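# Note: relative to the first snippet above, stable_softmax is replaced here
# by plain arithmetic on (max, sumexp); the minimal test case below shows the
# problem reproduces without any real softmax math in the kernel.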
############ New optimized
proc softmax_cross_entropy_backward2[T](
       gradient: Tensor[T] or T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  let batch_size = cached_tensor.shape[1]

  when gradient is T:
    let grad = gradient
  elif gradient is Tensor:
    let grad = gradient.data[gradient.offset]

  let axis_max_sumexp = cached_tensor.streaming_max_sumexp(axis = 1).broadcast(cached_tensor.shape)
  # let axis_max_sumexp = cached_tensor.classic_max_sumexp(axis = 1).broadcast(cached_tensor.shape)

  result = map3_inline(cached_tensor, target, axis_max_sumexp):
    grad * ((x + z.max + z.sumexp) - y) / T(batch_size)
####### Bench:
let batch_size = 200
let nb_classes = 100000

# Create a sparse label tensor of shape: [batch_size]
let sparse_labels = randomTensor(batch_size, nb_classes)

# Create the corresponding dense label tensor of shape [nb_classes, batch_size]
var labels = zeros[float64](nb_classes, batch_size)

# Fill in the non-zero values
for sample_id, nonzero_idx in enumerate(sparse_labels):
  labels[nonzero_idx, sample_id] = 1

# Create a random tensor with predictions:
let pred = randomTensor(nb_classes, batch_size, -1.0..1.0)

echo "### Reference"
let sce_loss = softmax_cross_entropy1(pred, labels)
echo sce_loss

var start = epochTime()

echo "###### Backpropagation"
start = epochTime()
for i in 0..<20:
  discard softmax_cross_entropy_backward1(sce_loss, pred, labels)
echo "Softmax xentropy backward: ", epochTime() - start

start = epochTime()
for i in 0..<20:
  discard softmax_cross_entropy_backward2(sce_loss, pred, labels)
echo "Backprop SCE optimized: ", epochTime() - start
Could it be that this part is not properly collected?

  let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp
  var res_slice = result[_, i]
Confirmed, the culprit is let (max, sumexp) = cached_tensor[_,i].streaming_max_sumexp; it is not collected properly.
Or not, there seem to be very subtle and necessary conditions to trigger the bug.
Necessary:
- let .. = cached_tensor[0, i].streaming_max_sumexp, even if not used
- map3_inline, even if it's just to put zeros

Minimum test case as of commit https://github.com/mratsim/Arraymancer/commit/8d81857081724bc81625232b1457c06294541d62
Note: randomTensor still uses the old deprecated random function as well.
import ../src/tensor/tensor,
       math

proc streaming_max_sumexp[T](t: Tensor[T]): T {.noSideEffect, inline.}=
  return -Inf.T # Plot twist, a function is needed

proc softmax_cross_entropy1*[T](input, target: Tensor[T]): T =
  var sample_softmax_xentropy = zeros[T](1, input.shape[1])
  var i = 0
  for sample_input, sample_target in zipAxis(input, target, 1):
    let lse = 10.T
    sample_softmax_xentropy[0, i] = sum: # Plot twist, this is needed as well
      map2_inline(sample_input, sample_target):
        y * (lse - x)
    inc i
  result = sample_softmax_xentropy.mean

###### Backprop bench
proc softmax_cross_entropy_backward1[T](
       gradient: T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  let batch_size = cached_tensor.shape[1]
  result = zeros_like(cached_tensor)

  for i in 0..<batch_size:
    let test = cached_tensor[_,i].streaming_max_sumexp
    var res_slice = result[_, i]
    apply3_inline(res_slice, cached_tensor[_,i], target[_,i]):
      gradient * ((y + y + y) - z) / T(batch_size) # Plot twist: y + y + y needed, / batch_size needed

############ New optimized
proc softmax_cross_entropy_backward2[T](
       gradient: Tensor[T] or T,
       cached_tensor: Tensor[T],
       target: Tensor[T]
       ): Tensor[T] {.noInit.}=
  result = map3_inline(cached_tensor, target, cached_tensor): # Plot twist: this is also needed
    0.T

####### Bench:
let batch_size = 200
let nb_classes = 100000

# Create a sparse label tensor of shape: [batch_size]
let sparse_labels = randomTensor(batch_size, nb_classes)

# Create the corresponding dense label tensor of shape [nb_classes, batch_size]
var labels = zeros[float64](nb_classes, batch_size)

# Fill in the non-zero values
for sample_id, nonzero_idx in enumerate(sparse_labels):
  labels[nonzero_idx, sample_id] = 1

# Create a random tensor with predictions:
let pred = randomTensor(nb_classes, batch_size, -1.0..1.0)

echo "### Reference"
let sce_loss = softmax_cross_entropy1(pred, labels)
echo sce_loss

import times # Plot twist - this is needed
var start = epochTime()

echo "###### Backpropagation"
for i in 0..<20:
  discard softmax_cross_entropy_backward1(sce_loss, pred, labels)
echo "Softmax xentropy backward: ", epochTime() - start # Plot twist - this is needed

for i in 0..<20:
  discard softmax_cross_entropy_backward2(sce_loss, pred, labels)
echo "Backprop SCE optimized: ", epochTime() - start
Investigating further is not worth it as this is such a rare combination; we can just avoid using the offending alternative in the library.
Edit, summary: I needed to keep some very rare conditions in my code, otherwise the bug just disappeared.
Removing the low-priority tag and changing to showstopper: this also happens reliably on the MNIST example.
import ../src/arraymancer

# This is an early minimum viable example of handwritten digits recognition.
# It uses convolutional neural networks to achieve high accuracy.
#
# Data files (MNIST) can be downloaded here http://yann.lecun.com/exdb/mnist/
# and must be decompressed in "./bin/" (or change the path "bin/..." below)
#
# Note:
# In the future, model, weights and optimizer definition will be streamlined.
# Also, currently this only works on Nim 0.17.2
# until I debug the new Nim allocator introduced in November 2017 in devel branch.

let
  ctx = newContext Tensor[float32] # Autograd/neural network graph
  n = 32                           # Batch size

let
  # Training data is 60k 28x28 greyscale images from 0-255,
  # neural net prefers input rescaled to [0, 1] or [-1, 1]
  x_train = read_mnist_images("bin/train-images.idx3-ubyte").astype(float32) / 255'f32

  # Change shape from [N, H, W] to [N, C, H, W], with C = 1 (unsqueeze). Convolutions expect 4d tensors
  # And store in the context to track operations applied and build a NN graph
  X_train = ctx.variable x_train.unsqueeze(1)

  # Labels are uint8, we must convert them to int
  y_train = read_mnist_labels("bin/train-labels.idx1-ubyte").astype(int)

  # Idem for testing data
  x_test = read_mnist_images("bin/t10k-images.idx3-ubyte").astype(float32) / 255'f32
  X_test = ctx.variable x_test.unsqueeze(1)
  y_test = read_mnist_labels("bin/t10k-labels.idx1-ubyte").astype(int)

# Config
let
  # We randomly initialize all weights and biases between [-0.5, 0.5]
  cv1_w = ctx.variable randomTensor[float32](20, 1, 5, 5, 1'f32) .- 0.5'f32   # Weight of 1st convolution
  cv1_b = ctx.variable randomTensor[float32](20, 1, 1, 1'f32) .- 0.5'f32      # Bias of 1st convolution
  cv2_w = ctx.variable randomTensor[float32](50, 20, 5, 5, 1'f32) .- 0.5'f32  # Weight of 2nd convolution
  cv2_b = ctx.variable randomTensor[float32](50, 1, 1, 1'f32) .- 0.5'f32      # Bias of 2nd convolution
  fc3 = ctx.variable randomTensor[float32](500, 800, 1'f32) .- 0.5'f32        # Fully connected: 800 in, 500 out
  classifier = ctx.variable randomTensor[float32](10, 500, 1'f32) .- 0.5'f32  # Fully connected: 500 in, 10 classes out

proc model[TT](x: Variable[TT]): Variable[TT] =
  # The formula for the output size of convolutions and maxpools is:
  #   H_out = (H_in + (2*padding.height) - kernel.height) / stride.height + 1
  #   W_out = (W_in + (2*padding.width) - kernel.width) / stride.width + 1
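  # Worked example: Conv1 below gives H_out = (28 + 2*0 - 5)/1 + 1 = 24,
  # then Maxpool1 gives H_out = (24 + 2*0 - 2)/2 + 1 = 12.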
  let cv1 = x.conv2d(cv1_w, cv1_b).relu()      # Conv1: [N, 1, 28, 28] --> [N, 20, 24, 24] (kernel: 5, padding: 0, strides: 1)
  let mp1 = cv1.maxpool2D((2,2), (0,0), (2,2)) # Maxpool1: [N, 20, 24, 24] --> [N, 20, 12, 12] (kernel: 2, padding: 0, strides: 2)
  let cv2 = mp1.conv2d(cv2_w, cv2_b).relu()    # Conv2: [N, 20, 12, 12] --> [N, 50, 8, 8] (kernel: 5, padding: 0, strides: 1)
  let mp2 = cv2.maxpool2D((2,2), (0,0), (2,2)) # Maxpool2: [N, 50, 8, 8] --> [N, 50, 4, 4] (kernel: 2, padding: 0, strides: 2)
  let f = mp2.flatten                          # [N, 50, 4, 4] -> [N, 800]
  let hidden = f.linear(fc3).relu              # [N, 800] -> [N, 500]
  result = hidden.linear(classifier)           # [N, 500] -> [N, 10]

# Stochastic Gradient Descent (API will change)
let optim = newSGD[float32](
  cv1_w, cv1_b, cv2_w, cv2_b, fc3, classifier, 0.01f # 0.01 is the learning rate
)

# Learning loop
for epoch in 0..1:
  for batch_id in 0 ..< 1: # X_train.value.shape[0] div n: # some at the end may be missing, oh well ...
    # minibatch offset in the Tensor
    let offset = batch_id * n
    let x = X_train[offset ..< offset + n, _]
    let target = y_train[offset ..< offset + n]

    # Running through the network and computing loss
    let clf = x.model
    let loss = clf.sparse_softmax_cross_entropy(target)

    echo "Epoch is:" & $epoch
    echo "Batch id:" & $batch_id
    echo "Loss is:" & $loss.value.data[0]

    # Compute the gradient (i.e. contribution of each parameter to the loss)
    loss.backprop()

    # Correct the weights now that we have the gradient information
    optim.update()

  echo "\nEpoch #" & $epoch & " done. Testing accuracy"
  let y_pred = X_test.model.value.softmax.argmax(axis = 1).indices
  echo accuracy_score(y_test, y_pred)
  echo "\n"
$ ./bin/mnist_example
Epoch is:0
Batch id:0
Loss is:213.9321136474609
Epoch #0 done. Testing accuracy
Traceback (most recent call last)
mnist_example.nim(92) mnist_example
mnist_example.nim(60) model
relu.nim(51) relu
ag_data_structure.nim(62) forward
relu.nim(27) forward
init_cpu.nim(132) zeros_like
init_cpu.nim(123) zeros
data_structure.nim(104) data=
assign.nim(155) genericSeqAssign
assign.nim(118) genericAssign
assign.nim(65) genericAssignAux
mmdisp.nim(568) nimNewSeqOfCap
gc.nim(492) newObjNoInit
alloc.nim(682) rawAlloc
alloc.nim(589) getBigChunk
alloc.nim(567) splitChunk
alloc.nim(215) addChunkToMatrix
SIGSEGV: Illegal storage access. (Attempt to read from nil?)
From IRC discussion with Araq: he knows about one bug with the cycle collector.
Using GC_disableMarkAndSweep() to remove the cycle collector prevents the crash in the MNIST example.
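For anyone reproducing this, the workaround is a one-liner at program start (GC_disableMarkAndSweep is part of Nim's standard GC API):

# Keep the default reference-counting GC, but turn off the
# mark-and-sweep (cycle) collector suspected above.
GC_disableMarkAndSweep()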
I think there is another bug with setters, as GC_disable... does nothing for the softmax_cross_entropy benchmark. However, removing all usage of this setter and using the field directly prevents all crashes:
proc `data=`*[T](t: var Tensor[T], s: seq[T]) {.inline, noSideEffect.}=
  # Set tensor raw data
  # This is intended for library writers
  t.storage.Fdata = s
https://github.com/mratsim/Arraymancer/commit/a75647e1db531f8543b378d8c164b37bace76503 (not using the data= setter) has completely removed the issue in the softmax benchmark. GC_disableMarkAndSweep() does not do anything there, as we simply run out of memory.
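A sketch of what the commit changes at the call sites (the Fdata field name comes from the setter shown above; result, size and the newSeq call are simplified stand-ins, not the exact code):

# Before (suspect path): assigning through the `data=` setter
result.data = newSeq[T](size)
# After (workaround): write the storage field directly, bypassing the setter
result.storage.Fdata = newSeq[T](size)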
Further improvements would be: conv2d().relu, mrelu, etc.

Another fix for allocator issues in the pipe! https://github.com/nim-lang/Nim/pull/7338
While running the softmax benchmark:
https://github.com/mratsim/Arraymancer/blob/8d81857081724bc81625232b1457c06294541d62/benchmarks/implementation/softmax_cross_entropy_bench.nim#L165-L177