apache / mxnet

Lightweight, Portable, Flexible Distributed/Mobile Deep Learning with Dynamic, Mutation-aware Dataflow Dep Scheduler; for Python, R, Julia, Scala, Go, Javascript and more
https://mxnet.apache.org
Apache License 2.0
20.76k stars 6.8k forks source link

why is it so slow (MXNet 0.12) even with an NVIDIA V100 GPU? #9026

Closed dbsxdbsx closed 6 years ago

dbsxdbsx commented 6 years ago

I tested my script on an AWS EC2 p3.2xlarge (GPU: V100), AMI: ami-77eb3a0f, Python version: 2.7. The .py file is as follows:

'''Train one network to read captchas of variable length (4 to 10 digits).

The DataIter therefore has to emit captcha examples with different digit
counts, even within a single batch (labels are padded to a fixed width).
'''

font_name = 'segoeuib.ttf'  # TrueType font ImageCaptcha renders digits with; must be locatable at runtime

import sys

sys.path.insert(0, "../../python")
sys.path.append('../')
sys.path.append('../../')
import mxnet as mx
import numpy as np
import cv2, random
from captcha.image import ImageCaptcha

class OCRBatch(object):
    """Minimal data batch: named data/label arrays plus the (name, shape)
    descriptors that mx.mod.Module expects from a batch object."""

    def __init__(self, data_names, data, label_names, label):
        self.data_names = data_names
        self.label_names = label_names
        self.data = data
        self.label = label

    @property
    def provide_data(self):
        """(name, shape) pair for every data array in the batch."""
        return [(name, arr.shape) for name, arr in zip(self.data_names, self.data)]

    @property
    def provide_label(self):
        """(name, shape) pair for every label array in the batch."""
        return [(name, arr.shape) for name, arr in zip(self.label_names, self.label)]

def gen_rand(capt_num):
    """Return a random string of `capt_num` decimal digits."""
    return "".join(str(random.randint(0, 9)) for _ in range(capt_num))

def get_label(capt_str, capt_max_num):
    """Convert a digit string into a fixed-length numpy label vector.

    Positions beyond len(capt_str) are padded with the sentinel class 11.
    NOTE(review): the original comment calls this filler "-1" while the
    value written is 11; with 10-way softmax heads, class 11 is out of
    range — confirm the intended sentinel.
    """
    label = [int(ch) for ch in capt_str]
    label.extend([11] * (capt_max_num - len(label)))
    return np.array(label)

def gen_sample(captcha, width, height, capt_num):
    """Generate one captcha training sample.

    Parameters: `captcha` is an ImageCaptcha instance, `width`/`height`
    the target image size, `capt_num` the number of digits to render.
    Returns (digit_string, image) where image is a float CHW array in [0, 1].
    """
    num = gen_rand(capt_num)
    img = captcha.generate(num)
    # np.frombuffer replaces np.fromstring, which is deprecated (and later
    # removed) for decoding raw bytes; behavior is identical here.
    img = np.frombuffer(img.getvalue(), dtype='uint8')
    img = cv2.imdecode(img, cv2.IMREAD_COLOR)
    img = cv2.resize(img, (width, height))
    # Scale pixels to [0, 1] and reorder HWC -> CHW for the network input.
    img = np.multiply(img, 1 / 255.0)
    img = img.transpose(2, 0, 1)
    return (num, img)

class OCRIter(mx.io.DataIter):
    """DataIter that synthesizes captcha batches on the fly.

    NOTE(review): generation runs synchronously on the CPU inside the
    training loop, which is the likely cause of the low GPU utilization
    discussed in this issue.
    """

    def __init__(self, count, batch_size, height, width):
        super(OCRIter, self).__init__()
        self.captcha = ImageCaptcha(fonts=[font_name])

        self.batch_size = batch_size
        self.count = count
        self.height = height
        self.width = width
        self.provide_data = [('data', (batch_size, 3, height, width))]
        # Labels are padded to a fixed maximum of 10 digits per captcha.
        self.capt_max_num = 10
        self.provide_label = [('softmax_label', (self.batch_size, self.capt_max_num))]

    def __iter__(self):
        # Floor division: identical to "/" for Python 2 ints, but also
        # valid on Python 3, where "/" would yield a float and break range().
        for _ in range(self.count // self.batch_size):
            data = []
            label = []
            for _ in range(self.batch_size):
                # int(...) yields a plain scalar instead of a size-1 numpy
                # array; upper bound 10 is exclusive, so 4..9 digits.
                capt_num_for_1_example = int(np.random.randint(4, 10))
                capt_num, img = gen_sample(self.captcha, self.width, self.height, capt_num_for_1_example)
                data.append(img)
                label.append(get_label(capt_num, self.capt_max_num))

            data_all = [mx.nd.array(data)]
            label_all = [mx.nd.array(label)]
            data_names = ['data']
            label_names = ['softmax_label']

            yield OCRBatch(data_names, data_all, label_names, label_all)

    def reset(self):
        """Nothing to rewind: every pass generates fresh data."""
        pass

def get_ocrnet():
    """Build the captcha CNN symbol: four conv blocks, one shared FC layer,
    and ten 10-way digit heads concatenated along the batch axis."""
    data = mx.symbol.Variable('data')
    label = mx.symbol.Variable('softmax_label')

    def conv_block(x, kernel, pool_type):
        # conv -> pool (2x2 kernel, stride 1) -> relu, 32 filters throughout
        conv = mx.symbol.Convolution(data=x, kernel=kernel, num_filter=32)
        pool = mx.symbol.Pooling(data=conv, pool_type=pool_type, kernel=(2, 2), stride=(1, 1))
        return mx.symbol.Activation(data=pool, act_type="relu")

    net = conv_block(data, (5, 5), "max")
    net = conv_block(net, (5, 5), "avg")
    net = conv_block(net, (3, 3), "avg")
    net = conv_block(net, (3, 3), "avg")

    flatten = mx.symbol.Flatten(data=net)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=256)
    # One 10-way classification head per captcha position (max 10 digits).
    heads = [mx.symbol.FullyConnected(data=fc1, num_hidden=10) for _ in range(10)]
    fc2 = mx.symbol.Concat(*heads, dim=0)

    # Transpose + flatten the (batch, 10) label so it lines up with the
    # position-major row layout produced by Concat(dim=0).
    label = mx.symbol.transpose(data=label)
    label = mx.symbol.Reshape(data=label, target_shape=(0,))
    return mx.symbol.SoftmaxOutput(data=fc2, label=label, name="softmax")

def Accuracy(label, pred):
    """Per-example exact-match accuracy (original version).

    Assumes pred rows are grouped per example: rows i*capt_num ..
    i*capt_num+capt_num-1 belong to example i, compared against the
    transposed, flattened label. Reads the module-level global `capt_num`.
    NOTE(review): the original author suspected this pairing is wrong for
    the Concat(dim=0) layout; see Accuracy2 for the alternative indexing.
    """
    label = label.T.reshape((-1,))
    hit = 0
    total = 0
    # Floor division keeps this working on Python 3, where "/" on ints
    # yields a float and range(float) raises TypeError.
    for i in range(pred.shape[0] // capt_num):
        ok = True
        for j in range(capt_num):
            k = i * capt_num + j
            if np.argmax(pred[k]) != int(label[k]):
                ok = False
                break
        if ok:
            hit += 1
        total += 1
    return 1.0 * hit / total

def Accuracy2(label, pred):
    """Per-example exact-match accuracy for the Concat(dim=0) layout.

    Row j*batch_size + i of pred is the j-th digit prediction of example i,
    compared against label[i, j]. Reads the module-level global `capt_num`.
    """
    hit = 0
    total = 0
    # Floor division keeps this working on Python 3, where "/" on ints
    # yields a float and range(float) raises TypeError.
    batch_size = pred.shape[0] // capt_num
    for i in range(batch_size):
        ok = True
        for j in range(capt_num):
            k = j * batch_size + i
            if np.argmax(pred[k]) != int(label[i, j]):
                ok = False
                break
        if ok:
            hit += 1
        total += 1
    return 1.0 * hit / total

import argparse

def parse_args(description):
    """Build and parse the training CLI: batch size, sample counts,
    epoch count, GPU count, and learning rate — all with defaults."""
    parser = argparse.ArgumentParser(description=description)
    options = [
        ('--batch_size', 'batch_size', int, 8),
        ('--train_exp_num', 'train_exp_num', int, 2000),
        ('--epoch_num', 'epoch_num', int, 50),
        ('--gpu_num', 'gpu_num', int, 1),
        ('--lr', 'lr', float, 0.00075),
    ]
    for flag, dest, arg_type, default in options:
        parser.add_argument(flag, dest=dest, type=arg_type, default=default)
    return parser.parse_args()

if __name__ == '__main__':
    import logging

    # Timestamped log lines for Speedometer throughput and eval metrics.
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    args = parse_args('train 4 to 10 capt in 1 net')
    network = get_ocrnet()
    batch_size = args.batch_size  # 50
    train_exp_num = args.train_exp_num  # 10000  # 50000
    test_exp_num = 1000
    epoch_num = args.epoch_num  # 2000
    gpu_num = args.gpu_num
    lr = args.lr
    # One context per requested GPU; Module data-parallelizes across them.
    devs = [mx.gpu(i) for i in range(gpu_num)]
    # model = mx.model.FeedForward(ctx=devs,
    #                              symbol=network,
    #                              num_epoch=epoch_num,
    #                              learning_rate=lr,
    #                              wd=0.00001,
    #                              initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
    #                              momentum=0.9)
    # NOTE(review): `global` at module scope is a no-op; capt_num is simply
    # the module-level global read by the Accuracy/Accuracy2 metrics.
    global capt_num
    capt_num = 10
    # 30x80 captcha images synthesized on the fly each batch; this synchronous
    # CPU-side generation is the suspected cause of the low GPU utilization.
    data_train = OCRIter(train_exp_num, batch_size, 30, 80)
    data_test = OCRIter(test_exp_num, batch_size, 30, 80)

    # model.fit(X=data_train,
    #           eval_data=data_test,
    #           eval_metric=[Accuracy, Accuracy2],
    #           batch_end_callback=mx.callback.Speedometer(batch_size, 50),
    #           epoch_end_callback=mx.callback.do_checkpoint(prefix='param', period=2))

    # new version
    lenet_model = mx.mod.Module(symbol=network, context=devs)
    # train with the same
    lenet_model.fit(data_train,
                    eval_data=data_test,
                    optimizer='sgd',
                    optimizer_params={'learning_rate': lr, 'momentum': 0.9, 'wd': 0.00001},
                    eval_metric=[Accuracy, Accuracy2],
                    initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
                    batch_end_callback=mx.callback.Speedometer(batch_size, 50),
                    epoch_end_callback=mx.callback.do_checkpoint(prefix='param', period=2),
                    num_epoch=epoch_num)

    # model.save("cnn-ocr")

On my own host (Win10, MXNet 0.12, GPU: 940M) I got nearly 110 samples/second with default params, but surprisingly, on p3.2xlarge I got only 170 samples/second. In detail, with `watch -n 1 nvidia-smi`, I found the volatile GPU utilization is always near 0%, up to 4%. WHY??? Is that just because I use a custom DataIter?

KellenSunderland commented 6 years ago

Hi @dbsxdbsx, thanks for posting demo code. Looking quickly at your code I would guess you're correct, it's probably data iterator related. If you run htop while training do you see one thread with near 100% cpu usage?

dbsxdbsx commented 6 years ago

@KellenSunderland, thanks for your answer. This time, I switched to another GPU EC2 instance, p2.xlarge. Anyway, this change should not make a big difference.

the cpu usage pic

the gpu usage pic

As you can see , there are 6 threads running this script, one shows 101---I guess it means over 100% usage? And without exception, the gpu usage is still low.

and on contrary, running the mnist example below would give over 50% of GPU usage, and it really depends on batch_size:

import mxnet as mx

import  argparse
def parse_args(description):
    """Parse command-line options for the MNIST benchmark script;
    every option falls back to a sane default."""
    parser = argparse.ArgumentParser(description=description)
    for flag, arg_type, default in (
            ('--batch_size', int, 8),
            ('--train_exp_num', int, 2000),
            ('--epoch_num', int, 50),
            ('--gpu_num', int, 1),
            ('--lr', float, 0.00075)):
        parser.add_argument(flag, dest=flag.lstrip('-'), type=arg_type, default=default)
    return parser.parse_args()

if __name__ == '__main__':
    args=parse_args('for mnist')
    # Pre-downloaded MNIST arrays: train/test images and labels.
    mnist = mx.test_utils.get_mnist()

    batch_size =args.batch_size #100
    # In-memory iterators — no per-batch generation cost, unlike the
    # captcha script above, which is why this one keeps the GPU busy.
    train_iter = mx.io.NDArrayIter(mnist['train_data'], mnist['train_label'], batch_size, shuffle=True)
    val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size)

    data = mx.sym.var('data')
    # first conv layer
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
    tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
    pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
    tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
    pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # first fullc layer
    flatten = mx.sym.flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
    tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
    # second fullc
    fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10)
    # softmax loss (no explicit label: binds to the implicit 'softmax_label')
    lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax')

    import logging

    logging.getLogger().setLevel(logging.DEBUG)  # logging to stdout
    # create a trainable module on GPU 0
    lenet_model = mx.mod.Module(symbol=lenet, context=mx.gpu())
    # train with the same
    lenet_model.fit(train_iter,
                    eval_data=val_iter,
                    optimizer='sgd',
                    optimizer_params={'learning_rate': 0.1},
                    eval_metric='acc',
                    batch_end_callback=mx.callback.Speedometer(batch_size, 100),
                    num_epoch=10)

So I wonder how to make full use of the GPU. As far as I know, increasing batch_size helps — but not for my script. And the GPU usage often fluctuates from 0 to 19 in no time. I wonder whether it is because my DataIter creates the data for every batch in real time? If this is the problem, how do I fix it?

jiayiliu commented 6 years ago

Could we switch to some faster way for data feeding? I didn't see any way to do multi-thread with DataIter. I don't think PrefetchingIter works here because the data is already in memory.

Any suggestions are welcome. Thank you!

ThomasDelteil commented 6 years ago

@dbsxdbsx I suggest that you switch to Gluon and use a DataLoader with num_workers>1. The bottleneck seems to be your image generation that is currently done synchronously rather than asynchronously. With Gluon you could simply subclass the Dataset class to generate your captcha asynchronously using a DataLoader. That should solve your I/O issue and you should witness an increase in the GPU utilization.

dbsxdbsx commented 6 years ago

@ThomasDelteil ,thanks for your answer. I think the problem here is that the dataset of captcha is generated online while training. Therefore, the gpu has to wait until cpu generates new batches of data. I guess this is the main issue, and I guess this is what you mean, right? And another problem is that I didn't see any example about using gluon Dataloader with data produced online---Could you show one? I DO WANT TO make everything with Dataloader.

safrooze commented 6 years ago

@dbsxdbsx It's quite easy to use DataLoader with a custom DataSet object. Your custom DataSet class needs to implement only two functions: __len__() and __getitem__(). You can then easily use DataLoader with num_workers>1. Here is a dummy example of a custom dataset class that contains 1000 elements, each one the index plus some random noise:

class MyRandomDataset(object):
    """Toy dataset of fixed length 1000: item i is nd.array([i]) plus
    one sample of Gaussian noise (generated fresh on every access)."""

    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        noise = nd.random.normal()
        return nd.array([idx]) + noise
dbsxdbsx commented 6 years ago

@safrooze Thanks