apache / mxnet

Lightweight, Portable, Flexible Distributed/Mobile Deep Learning with Dynamic, Mutation-aware Dataflow Dep Scheduler; for Python, R, Julia, Scala, Go, Javascript and more
https://mxnet.apache.org
Apache License 2.0

Does memonger work for gluon to save memory? #10382

Closed xinedison closed 6 years ago

xinedison commented 6 years ago

Description

I want to reduce GPU memory usage when using Gluon. I tried MXNet memonger, but it did not work for me. After that I set os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1', but that did not work for me either.

Environment info (Required)

----------Python Info----------
('Version      :', '2.7.5')
('Compiler     :', 'GCC 4.8.5 20150623 (Red Hat 4.8.5-11)')
('Build        :', ('default', 'Nov  6 2016 00:28:07'))
('Arch         :', ('64bit', 'ELF'))
------------Pip Info-----------
('Version      :', '9.0.1')
('Directory    :', '/usr/lib/python2.7/site-packages/pip')
----------MXNet Info-----------
('Version      :', '1.2.0')
('Directory    :', '/home/yinghuang/incubator-mxnet-newest/python/mxnet')
Hashtag not found. Not installed from pre-built package.
----------System Info----------
('Platform     :', 'Linux-3.10.0-327.22.2.el7.x86_64-x86_64-with-centos-7.2.1511-Core')
('system       :', 'Linux')
('node         :', 'gz-open-gpu-c117')
('release      :', '3.10.0-327.22.2.el7.x86_64')
('version      :', '#1 SMP Thu Jun 23 17:05:11 UTC 2016')
----------Hardware Info----------
('machine      :', 'x86_64')
('processor    :', 'x86_64')
Architecture:          x86_64
CPU op-mode(s):        32-bit, 64-bit
Byte Order:            Little Endian
CPU(s):                32
On-line CPU(s) list:   0-31
Thread(s) per core:    2
Core(s) per socket:    8
Socket(s):             2
NUMA node(s):          2
Vendor ID:             GenuineIntel
CPU family:            6
Model:                 62
Model name:            Genuine Intel(R) CPU  @ 2.80GHz
Stepping:              2
CPU MHz:               1706.250
BogoMIPS:              5617.25
Virtualization:        VT-x
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              25600K
NUMA node0 CPU(s):     0-7,16-23
NUMA node1 CPU(s):     8-15,24-31
----------Network Test----------
Setting timeout: 10
Timing for MXNet: https://github.com/apache/incubator-mxnet, DNS: 0.0234 sec, LOAD: 1.4669 sec.
Timing for PYPI: https://pypi.python.org/pypi/pip, DNS: 0.0253 sec, LOAD: 0.4839 sec.
Timing for FashionMNIST: https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz, DNS: 0.3580 sec, LOAD: 2.5292 sec.
Timing for Conda: https://repo.continuum.io/pkgs/free/, DNS: 0.0229 sec, LOAD: 0.8054 sec.
Timing for Gluon Tutorial(en): http://gluon.mxnet.io, DNS: 0.7395 sec, LOAD: 1.3768 sec.
Timing for Gluon Tutorial(cn): https://zh.gluon.ai, DNS: 0.0230 sec, LOAD: 3.4856 sec.

Question

I am using Python 2 and the newest MXNet, with Gluon 3D convolutions, for video action recognition. I want GPU memory to be used efficiently: 1) I tried [memonger] to optimize the symbol generated by the HybridBlock, but it had no effect despite considerable effort; 2) I then tried to save memory by only setting the environment variable MXNET_BACKWARD_DO_MIRROR to '1', but memory usage was not reduced either. Can someone show me what I have missed for memory saving?
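
For reference, this is roughly how I set the mirror flag (a minimal sketch with a hypothetical toy network, not the DenseNet below; I am assuming the variable has to be in the environment before the graph is built so the backend can read it):

import os
# Assumption: the flag must be visible before mxnet builds the backward graph,
# so it is exported before the import.
os.environ['MXNET_BACKWARD_DO_MIRROR'] = '1'

import mxnet as mx
from mxnet import autograd
from mxnet.gluon import nn

# Hypothetical toy 3D conv network, only to show the flag being set.
net = nn.HybridSequential()
net.add(nn.Conv3D(channels=64, kernel_size=3, padding=1),
        nn.Activation('relu'),
        nn.GlobalAvgPool3D(),
        nn.Dense(101))
net.initialize(ctx=mx.gpu(0))
net.hybridize()

x = mx.nd.ones((8, 3, 16, 112, 112), ctx=mx.gpu(0))
with autograd.record():
    y = net(x)
y.backward()
mx.nd.waitall()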

Steps to reproduce

Here is the code for my network:


import sys
import os

mxnet_path = os.path.expanduser('~') + '/incubator-mxnet-newest'
sys.path.insert(0, os.path.abspath(os.path.join(mxnet_path, "python")))
import mxnet as mx
from mxnet.gluon import nn
from mxnet import nd
from mxnet.gluon.block import _flatten,_regroup

import memonger

def bn_relu_conv(ks, nout, stride, pad,  name=None):
    layer = nn.HybridSequential()
    layer.add(nn.BatchNorm())
    layer.add(nn.Activation('relu'))
    layer.add(nn.Conv3D(channels=nout, kernel_size=ks, padding=pad, strides=stride))
    return layer

def bn_relu_block(growth_rate):
    layer = nn.HybridSequential()
    layer.add(bn_relu_conv(1, nout=growth_rate, stride=1, pad=0))
    layer.add(bn_relu_conv(3, nout=growth_rate, stride=1, pad=1))
    return layer

def conv_act_layer(channels, kernel=(1,1,1) , pad=(0,0,0), stride=(1,1,1), act_type="relu", use_batchnorm=False):
    layer = nn.HybridSequential()
    layer.add(nn.Conv3D(channels=channels, kernel_size=kernel, padding=pad, strides=stride))
    layer.add(nn.BatchNorm())
    layer.add(nn.Activation(act_type))
    return layer

def transition(channels):
    transition_layer = nn.HybridSequential()
    transition_layer.add(bn_relu_conv(ks=1, nout=channels, stride=1, pad=0))
    transition_layer.add(nn.MaxPool3D(pool_size=2, strides=2))
    return transition_layer

def transition_w_o_pooling(channels):
    layer = bn_relu_conv(ks=1, nout=channels, stride=1, pad=0)
    return layer

class DsodBlock(nn.HybridBlock):
    def __init__(self, layers, growth_rate, use_memonger=False, **kwargs):
        super(DsodBlock, self).__init__(**kwargs)
        self.use_memonger = use_memonger
        self.net = nn.HybridSequential()
        for i in range(layers):
            lay = bn_relu_block(growth_rate)
            self.net.add(lay)

    def hybrid_forward(self, F, x):
        for idx, layer in enumerate(self.net):
            out = layer(x)
            x = F.concat(x, out, dim=1)
            if self.use_memonger and (idx % 2 == 0):
                #print("use memonger true")
                x._set_attr(mirror_stage='True')

        return x

class DenseNet(nn.HybridBlock):
    def __init__(self, net_def, num_classes, growth_rate, use_memonger=False, batch_size=32, input_depth=16, input_size=112, **kwargs):
        super(DenseNet, self).__init__(**kwargs)
        channels = 128
        self.use_memonger = use_memonger
        self.batch_size = batch_size
        self.input_depth = input_depth
        self.input_size = input_size

        #assert self.use_memonger
        with self.name_scope():
            self.features = nn.HybridSequential(prefix='')
            self.features.add(conv_act_layer(64, kernel=3, pad=1, stride=2, act_type="relu", use_batchnorm=True))
            self.features.add(conv_act_layer(64, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
            self.features.add(conv_act_layer(128, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))

            self.features.add(nn.MaxPool3D(pool_size=2, strides=2, padding=1))

            for i,(dense_layers,transition_fun) in enumerate(net_def):
                self.features.add(DsodBlock(layers=dense_layers, growth_rate=growth_rate, use_memonger=use_memonger))
                channels += growth_rate*dense_layers
                self.features.add(transition_fun(channels))

            self.features.add(nn.BatchNorm())
            self.features.add(nn.Activation('relu'))
            self.features.add(nn.GlobalAvgPool3D())
            self.features.add(nn.Flatten())

            self.output = nn.Dense(num_classes, in_units=channels)

    def _get_graph(self, *args):
        #assert False
        if not self._cached_graph:
            args, self._in_format = _flatten(args)
            if len(args) > 1:
                inputs = [mx.symbol.var('data%d'%i) for i in range(len(args))]
            else:
                inputs = [mx.symbol.var('data')]
            grouped_inputs = _regroup(inputs, self._in_format)[0]

            params = {i: j.var() for i, j in self._reg_params.items()}
            with self.name_scope():
                out = self.hybrid_forward(mx.symbol, *grouped_inputs, **params)  # pylint: disable=no-value-for-parameter
            out, self._out_format = _flatten(out)

            assert len(out) == 1
            if self.use_memonger:
                assert len(inputs) == 1
                out = memonger.search_plan(out[0], data=(self.batch_size, 3, self.input_depth, self.input_size, self.input_size))
                out = [out]

            self._cached_graph = inputs, out[0] #mx.symbol.Group(out)

        return self._cached_graph

    def hybrid_forward(self, F, x):
        x = self.features(x)
        x = self.output(x)
        return x

def dsod_net(net_def, num_classes, growth_rate=64):
    growth_rate = growth_rate
    channels = 128
    net = nn.HybridSequential()
    with net.name_scope():
        ## dsod backbone
        net.add(conv_act_layer(64, kernel=3, pad=1, stride=2, act_type="relu", use_batchnorm=True))
        net.add(conv_act_layer(64, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))
        net.add(conv_act_layer(128, kernel=3, pad=1, stride=1, act_type="relu", use_batchnorm=True))

        net.add(nn.MaxPool3D(pool_size=2, strides=2, padding=1))

        for i,(dense_layers,transition_fun) in enumerate(net_def):
            net.add(DsodBlock(layers=dense_layers, growth_rate=growth_rate))
            channels += growth_rate*dense_layers
            net.add(transition_fun(channels))

        classifier = nn.HybridSequential()
        classifier.add(nn.BatchNorm())
        classifier.add(nn.Activation('relu'))
        classifier.add(nn.GlobalAvgPool3D())
        classifier.add(nn.Flatten())
        classifier.add(nn.Dense(num_classes))

        net.add(classifier)

    return net

def dsod_net_v2(net_def, num_classes, growth_rate=64, use_memonger=False, **kwargs):
    net = DenseNet(net_def, num_classes, growth_rate, use_memonger, **kwargs)
    return net

def get_net(net_depth, num_classes, hybridize=True, growth_rate=64, **kwargs):
    densenet_spec = {30:[(6,transition), (8,transition), (8,transition_w_o_pooling), (8,transition_w_o_pooling)],
                     22:[(6,transition), (8,transition), (8,transition_w_o_pooling)]}
    net_def =  densenet_spec[net_depth]
    #net = dsod_net(net_def, num_classes, growth_rate)
    net = dsod_net_v2(net_def, num_classes, growth_rate, False, **kwargs)

    if hybridize:
        net.hybridize()
    return net

if __name__ == '__main__':
    dsod = get_net(22, 101, True, 32)
    #print dsod
    dsod.initialize(ctx=mx.gpu(4))
    x = mx.nd.ones((32,3,16,112,112), ctx=mx.gpu(4))
    res = dsod(x)
    #print res.shape
    #print res    
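
For comparison, this is the plain-symbol usage of memonger I was trying to reproduce inside _get_graph above (a sketch following the memonger README; the small fully connected symbol here is a hypothetical example, not my network):

import mxnet as mx
import memonger

# Hypothetical toy symbol, just to show where mirror_stage and
# search_plan are applied in the non-Gluon workflow.
data = mx.sym.var('data')
net = data
for i in range(4):
    net = mx.sym.FullyConnected(net, num_hidden=512, name='fc%d' % i)
    net = mx.sym.Activation(net, act_type='relu', name='relu%d' % i)
    # Mark activations that may be recomputed in the backward pass
    # instead of being kept in memory.
    net._set_attr(mirror_stage='True')
net = mx.sym.SoftmaxOutput(net, name='softmax')

# search_plan returns a new symbol annotated with the memory-saving plan
# for the given input shape.
planned = memonger.search_plan(net, data=(32, 1024))
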
lanking520 commented 6 years ago

@nswamy please add the 'Python' and 'Performance' labels to this topic

eric-haibin-lin commented 6 years ago

We did some memory optimization recently. You might want to check it out: https://github.com/apache/incubator-mxnet/pull/10847

sandeep-krishnamurthy commented 6 years ago

@xinedison - Memonger is not yet supported in Gluon, as far as I know.

There is a new environment variable, MXNET_GPU_MEM_POOL_TYPE, introduced for the memory pool strategy in #11041; it significantly reduces memory usage. You might want to check it out.
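
For example, the pool type can be selected through the environment before mxnet is imported (a minimal sketch; 'Round' is the pooled strategy added in that PR):

import os
# Use the rounded GPU memory pool introduced in #11041
# instead of the default 'Naive' pool.
os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'

import mxnet as mx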

Created a feature-tracking issue for evaluating Memonger support in Gluon: https://github.com/apache/incubator-mxnet/issues/12226

Resolving it here. Please reopen if closed in error.