pfnet-research / Deep_visuo-tactile_learning_ICRA2019


A question about the network architecture #1

Closed toshiyuki123 closed 5 years ago

toshiyuki123 commented 5 years ago

Thank you for your great work. I'm very interested in it, and I'm trying to reproduce the deep visuo-tactile learning model (the encoder-decoder network), but I haven't succeeded yet.

I have a question about the architecture. Based on your paper, I implemented the network and checked the output shape for a given input using Chainer 6.0.0. As far as I understand, for an input of shape (sample_number, 1 (in_channel), 200, 200), the output shape should be (sample_number, 3 (axes), 1, 16 (taxel_number), 90 (timesteps)). However, my code produces (1, 3, 6, 6, 90), which doesn't match (1, 3, 1, 16, 90). My code is as follows.

I would appreciate it if you could give me some advice. Thank you.

import numpy as np
import chainer.functions as F
import chainer.links as L
from chainer import Chain, optimizers

# Dummy grayscale input: (sample_number, in_channel, height, width)
x_train = np.arange(1*1*200*200, dtype=np.float32).reshape(1, 1, 200, 200)

class MyModel(Chain):
    def __init__(self):
        super().__init__()
        with self.init_scope():
            self.conv1=L.Convolution2D(1,32,ksize=(8,8),stride=(2,2),pad=(0,0))
            self.conv2=L.Convolution2D(32,32,ksize=(8,8),stride=(2,2),pad=(0,0))
            self.conv3=L.Convolution2D(32,32,ksize=(4,4),stride=(2,2),pad=(0,0))
            self.conv4=L.Convolution2D(32,32,ksize=(4,4),stride=(2,2),pad=(0,0))
            self.l1=L.Linear(9*9*32,4) 
            self.l2=L.Linear(4,160)
            self.dconv1=L.Deconvolution3D(1,32,ksize=(1,1,3),stride=(1,1,1),pad=(0,0,0))
            self.dconv2=L.Deconvolution3D(32,32,ksize=(1,1,3),stride=(1,1,2),pad=(0,0,0))
            self.dconv3=L.Deconvolution3D(32,32,ksize=(2,2,4),stride=(1,1,2),pad=(0,0,3))
            self.dconv4=L.Deconvolution3D(32,3,ksize=(2,2,4),stride=(1,1,2),pad=(0,0,2))
            self.bnorm1=L.BatchNormalization(32)
            self.bnorm2=L.BatchNormalization(4)
            self.bnorm3=L.BatchNormalization(160)

    def forward(self,x):
        h = F.relu(self.bnorm1(self.conv1(x)))
        h = F.relu(self.bnorm1(self.conv2(h)))
        h = F.relu(self.bnorm1(self.conv3(h)))
        h = F.tanh(self.bnorm1(self.conv4(h)))
        h = F.relu(self.bnorm2(self.l1(h)))
        h = F.relu(self.bnorm3(self.l2(h)))
        h = F.reshape(h,(-1,1,4,4,10))
        h = F.relu(self.bnorm1(self.dconv1(h)))
        h = F.relu(self.bnorm1(self.dconv2(h)))
        h = F.relu(self.bnorm1(self.dconv3(h)))
        h = F.tanh(self.dconv4(h))
        return h

model = MyModel()
optimizer = optimizers.Adam(alpha=0.001)
optimizer.setup(model)
output = model(x_train)
print("output.shape:", output.shape)
output.shape: (1, 3, 6, 6, 90)
kuniyuki-takahashi commented 5 years ago

Dear toshiyuki123, I found a mistake in the padding values in our article. The line

self.dconv4=L.Deconvolution3D(32,3,ksize=(2,2,4),stride=(1,1,2),pad=(0,0,2))

should be

self.dconv4=L.Deconvolution3D(32,3,ksize=(2,2,4),stride=(1,1,2),pad=(1,1,2))

Sorry for the inconvenience.
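For reference, a transposed convolution's output length (with Chainer's default outsize) is L_out = stride * (L_in - 1) + ksize - 2 * pad. A minimal sketch, assuming the layer parameters above, traces the time axis through the four deconvolution layers and shows how the corrected network yields the 90 timesteps:

# Output length of a transposed convolution (no output padding):
#   L_out = stride * (L_in - 1) + ksize - 2 * pad
def deconv_out(l_in, ksize, stride, pad):
    return stride * (l_in - 1) + ksize - 2 * pad

# Time axis through the decoder, starting from the reshaped (4, 4, 10) volume:
t = 10
for ksize, stride, pad in [(3, 1, 0), (3, 2, 0), (4, 2, 3), (4, 2, 2)]:
    t = deconv_out(t, ksize, stride, pad)
    print(t)  # 12, 25, 46, 90

The same formula applied to the first two axes explains the reported (1, 3, 6, 6, 90): with pad 0, the 5x5 map coming out of dconv3 grows to 6x6 in dconv4, while pad 1 brings it back to 4x4.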

kuniyuki-takahashi commented 5 years ago

The modified version of arXiv article is published. https://arxiv.org/abs/1803.03435

toshiyuki123 commented 5 years ago

Dear kuniyuki-takahashi, Thank you for your quick response. I get the matching shape now.

Just in case, I would like to confirm how batch normalization is applied. Your paper says:

For all layers except last layer in the network, we make use of batch normalization.

Does this mean that you apply batch normalization to all layers in the encoder, the decoder, and the hidden layers, except for the last layer of the decoder, as my code above shows?

kuniyuki-takahashi commented 5 years ago

Yes, we apply batch normalization to all layers in the encoder, the decoder, and the hidden layers, except for the last layer of the decoder.

The following code is our model:

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import Variable

latent_variable = 4   # size of the latent vector
mid_size_x = 4        # spatial width of the reshaped decoder input
mid_size_y = 4        # spatial height of the reshaped decoder input
mid_size_z = 10       # initial length of the time axis
TIME_WINDOW = 5
print_flag = False

class V2T(chainer.Chain):
    def __init__(self, inchannel_image=1, inchannel_sensor=3):
        super(V2T, self).__init__(  
            conv1=L.ConvolutionND(ndim=2, in_channels=inchannel_image, out_channels=32, ksize=(8, 8), stride=(2, 2), pad=(0, 0)),
            bn1 = L.BatchNormalization(32),
            conv2=L.ConvolutionND(ndim=2, in_channels=32, out_channels=32, ksize=(8, 8), stride=(2, 2), pad=(0, 0)),
            bn2 = L.BatchNormalization(32),
            conv3=L.ConvolutionND(ndim=2, in_channels=32, out_channels=32, ksize=(4, 4), stride=(2, 2), pad=(0, 0)),
            bn3 = L.BatchNormalization(32),
            conv4=L.ConvolutionND(ndim=2, in_channels=32, out_channels=32, ksize=(4, 4), stride=(2, 2), pad=(0, 0)),
            bn4 = L.BatchNormalization(32),

            l1 = L.Linear(None, latent_variable),
            lbn1 = L.BatchNormalization(latent_variable),
            l2 = L.Linear(None, mid_size_x*mid_size_y*mid_size_z),
            lbn2 = L.BatchNormalization(mid_size_x*mid_size_y*mid_size_z),

            dconv1 = L.DeconvolutionND(ndim=3, in_channels=1, out_channels=32, ksize=(1, 1, TIME_WINDOW-2), stride=(1, 1, 1), pad=0),
            dbn1 = L.BatchNormalization(32),
            dconv2 = L.DeconvolutionND(ndim=3, in_channels=32, out_channels=32, ksize=(1, 1, TIME_WINDOW-2), stride=(1, 1, 2), pad=(0, 0, TIME_WINDOW-5)),
            dbn2 = L.BatchNormalization(32),
            dconv3 = L.DeconvolutionND(ndim=3, in_channels=32, out_channels=32, ksize=(2, 2, TIME_WINDOW-1), stride=(1, 1, 2), pad=(0, 0, 3)),
            dbn3 = L.BatchNormalization(32),
            dconv4 = L.DeconvolutionND(ndim=3, in_channels=32, out_channels=inchannel_sensor, ksize=(2, 2, TIME_WINDOW-1), stride=(1, 1, 2), pad=(1, 1, TIME_WINDOW-3)),
        )

    def encode(self, x):
        hid = F.relu(self.bn1(self.conv1(x)))
        hid = F.relu(self.bn2(self.conv2(hid)))
        hid = F.relu(self.bn3(self.conv3(hid)))
        hid = F.relu(self.bn4(self.conv4(hid)))
        hid = F.tanh(self.lbn1(self.l1(hid)))

        return hid

    def decode(self, x):
        hid = F.reshape(F.relu(self.lbn2(self.l2(x))), (x.data.shape[0], 1, mid_size_x, mid_size_y, mid_size_z))
        hid = F.relu(self.dbn1(self.dconv1(hid)))
        hid = F.relu(self.dbn2(self.dconv2(hid)))
        hid = F.relu(self.dbn3(self.dconv3(hid)))
        out = F.tanh(self.dconv4(hid))

        return out

    def forward(self, args, batch_data_in, batch_data_out):
        x = Variable(batch_data_out)
        hid = self.encode(batch_data_in)
        y = self.decode(hid)

        return F.mean_squared_error(x, y)
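
As a minimal usage sketch (the dummy input and the use of test-mode batch normalization are assumptions for illustration, not part of the snippet above):

# Run a dummy grayscale image through the encoder and decoder.
import numpy as np

model = V2T()
x = np.zeros((1, 1, 200, 200), dtype=np.float32)
with chainer.using_config('train', False):  # batch normalization in test mode
    y = model.decode(model.encode(x))
print(y.shape)  # (1, 3, 4, 4, 90) with mid_size_x = mid_size_y = 4
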
toshiyuki123 commented 5 years ago

Thank you for your detailed explanation :)

Benjizhang commented 12 months ago

Hi, when I run your code for the v2t model, I found that since mid_size_x=4 and mid_size_y=4, the output shape from decode is [sample_number, 3, 4, 4, 90] rather than [sample_number, 3, 1, 16, 90].

If I set mid_size_x=1 and mid_size_y=16, then I get the expected shape [sample_number, 3, 1, 16, 90].

Could you explain more about this? Which setting is correct? Thanks a lot.
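
For what it's worth, the output-size formula L_out = stride * (L_in - 1) + ksize - 2 * pad suggests why both settings "work": the spatial deconvolutions preserve whichever layout they are given. A small sketch, assuming the layer definitions in the snippet above:

# Trace the two spatial axes through dconv3 (ksize 2, stride 1, pad 0)
# and dconv4 (ksize 2, stride 1, pad 1):
def deconv_out(l_in, ksize, stride, pad):
    return stride * (l_in - 1) + ksize - 2 * pad

for x, y in [(4, 4), (1, 16)]:
    x_out = deconv_out(deconv_out(x, 2, 1, 0), 2, 1, 1)
    y_out = deconv_out(deconv_out(y, 2, 1, 0), 2, 1, 1)
    print((x, y), "->", (x_out, y_out))
# (4, 4) -> (4, 4)
# (1, 16) -> (1, 16)

So both settings are internally consistent; the difference is only whether the 16 taxels are arranged as a 4x4 grid or a 1x16 row.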