intel-analytics / analytics-zoo

Distributed Tensorflow, Keras and PyTorch on Apache Spark/Flink & Ray
https://analytics-zoo.readthedocs.io/
Apache License 2.0
16 stars 3 forks source link

[TorchNet] Memory optimization for torchnet #964

Open hhbyyh opened 5 years ago

hhbyyh commented 5 years ago

Use this issue to track progress on memory optimization for TorchNet.

hhbyyh commented 5 years ago

TorchScript module traced and run from the Python API: memory increases during the first 3 iterations.

import os

import psutil
import torch
import torchvision

# Restrict PyTorch to a single intra-op thread for the whole run.
torch.set_num_threads(1)

# Trace a pretrained ResNet-101 using one random 3x224x224 example input.
traced_script_module = torch.jit.trace(
    torchvision.models.resnet101(pretrained=True),
    torch.rand(1, 3, 224, 224),
)

for step in range(100):
    # A fresh random batch of 32 images is fed through the traced module.
    batch = torch.rand(32, 3, 224, 224)
    outputs = traced_script_module(batch)
    print(outputs)
    # Report this process's resident set size, scaled by 1e9.
    this_process = psutil.Process(os.getpid())
    rss_bytes = this_process.memory_info().rss
    print(rss_bytes / 1e9, "G memory")

output:

tensor([[-2.2903e+00, -1.6855e+00,  3.8102e-02,  ..., -9.4584e-01,
          2.3668e+00, -3.5562e-01],
        [ 4.3640e-01, -3.0016e-01, -3.0599e-01,  ...,  1.1968e+00,
          1.4018e+00, -7.9118e-01],
        [-1.7140e+00,  2.7694e-03, -4.4360e-01,  ..., -1.3820e+00,
          2.4074e+00, -9.4551e-01],
        ...,
        [ 1.1089e+00,  6.9741e-01, -4.8336e-01,  ..., -7.7349e-01,
          1.4667e+00,  1.4881e+00],
        [-1.8032e+00, -1.1997e+00, -6.4645e-01,  ..., -2.4350e-01,
          4.4409e-01,  3.7147e+00],
        [-1.3564e+00,  3.7601e-01, -1.5289e+00,  ...,  2.5337e-01,
          3.0679e+00,  8.1390e-01]], grad_fn=<DifferentiableGraphBackward>)
5.928787968 G memory
tensor([[-1.3370, -2.2020, -3.9834,  ..., -0.8153,  1.4579,  0.1649],
        [-1.4750, -1.0122,  0.2388,  ..., -0.0842,  2.4480,  0.6924],
        [-2.3939, -1.4854,  1.9183,  ...,  0.0355,  0.3399,  1.6879],
        ...,
        [-0.2749,  1.6111, -0.7473,  ...,  0.5937,  1.2901, -0.4303],
        [-0.7707, -1.3135, -0.2625,  ...,  0.1321,  0.9864, -0.5495],
        [-0.2005, -1.6247,  1.1925,  ...,  0.5141,  0.9492, -1.1067]],
       grad_fn=<DifferentiableGraphBackward>)
8.865099776 G memory
tensor([[-1.2737, -0.9936, -0.1794,  ..., -0.9375,  0.8533,  0.8077],
        [-0.8139, -0.5009, -0.2187,  ..., -0.7565,  1.1524, -0.5535],
        [ 0.6839, -0.2231,  0.2724,  ...,  0.4830,  1.4113, -0.2519],
        ...,
        [ 0.0329,  2.2376,  1.1090,  ...,  0.0342,  1.0230,  0.8690],
        [ 0.4129, -0.4162,  0.0832,  ...,  2.2181,  2.3926,  0.4473],
        [-1.2382,  0.4778, -0.7004,  ...,  1.0786,  0.9393,  1.3353]],
       grad_fn=<DifferentiableGraphBackward>)
8.954937344 G memory
tensor([[ 2.9310e+00,  1.7859e+00, -1.0819e+00,  ..., -6.1685e-03,
         -5.7516e-01, -4.9102e-01],
        [ 1.6096e-03, -1.1933e+00,  4.7245e-02,  ..., -1.3567e-01,
         -3.8962e-01,  3.3101e-01],
        [ 1.2337e+00,  2.9382e+00, -9.1299e-01,  ..., -2.5608e-01,
          1.6244e+00, -1.1455e+00],
        ...,
        [ 8.9977e-02, -1.9731e+00, -7.0138e-01,  ..., -1.8564e-01,
          2.4577e+00,  1.2462e-01],
        [-1.1238e+00, -9.2590e-02, -9.8799e-01,  ..., -9.1702e-01,
          5.3392e-01,  1.5519e+00],
        [-1.9508e+00, -1.3581e+00, -2.5739e+00,  ..., -1.8627e+00,
          2.8877e+00,  1.7870e+00]], grad_fn=<DifferentiableGraphBackward>)
10.509094912 G memory
tensor([[-2.6716, -1.2049, -1.4170,  ..., -2.0941,  2.4951,  2.7314],
        [-1.6459, -0.7796, -2.1277,  ..., -0.2482,  2.4392, -0.2142],
        [ 0.0723,  0.1851, -1.0794,  ..., -0.1078,  0.6993,  0.5189],
        ...,
        [-1.1382,  0.8323, -2.3813,  ...,  0.2738,  1.2351, -0.9120],
        [ 0.4644, -1.4338,  2.6241,  ..., -0.0459,  1.0317,  2.3192],
        [-0.4867, -0.5519, -0.2944,  ..., -0.8683,  2.9462,  1.0247]],
       grad_fn=<DifferentiableGraphBackward>)
10.521800704 G memory
hhbyyh commented 5 years ago

torch script from cpp:

#include <cstring>
#include <stdio.h>
#include <stdlib.h>
#include <cstdio>
#include <cstdint>
#include <cstdlib>
#include <cassert>
#include <stdexcept>
#include <sstream>
#include <string>
#include <iostream>
#include <torch/script.h>
#include <memory>
#include <torch/torch.h>

#include <cstddef>
#include <cstdio>
#include <iostream>
#include <string>
#include <vector>
#include <typeinfo>

auto main() -> int {

    auto p_model_path = "/home/yuhao/PycharmProjects/pytorch_test/pts/resNet50.pt";
    std::shared_ptr<torch::jit::script::Module> model_ptr = torch::jit::load(p_model_path);

    for (int ii = 0; ii < 500; ii++) {
        auto x = torch::rand({64, 3, 224, 224});
        std::vector<torch::jit::IValue> modelInputs;
        modelInputs.push_back(x);
        auto output = model_ptr->forward(modelInputs);
        std::cout << ii << "\n";
    }
}

Memory consumption bounces between 1.5 GB and 5 GB consistently on every iteration.

hhbyyh commented 5 years ago

TorchNet python inference:

from optparse import OptionParser

import torch
from torchvision import datasets, models, transforms
from zoo.common.nncontext import init_nncontext
from zoo.feature.common import *
from zoo.feature.image import *
from zoo.pipeline.api.net.torch_net import TorchNet

from bigdl.nn.layer import Model
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

from zoo.common.nncontext import *
from zoo.feature.image import *
from zoo.pipeline.nnframes import *

def inference(image_path, sc):
    """Run a TorchNet-wrapped pretrained ResNet-18 over the images at ``image_path``.

    The images are read into a DataFrame, preprocessed (resize to 256x256,
    center-crop to 224x224, channel normalization, conversion to tensor) and
    classified in batches of 4 through an ``NNClassifierModel``.

    :param image_path: location passed to ``NNImageReader.readImages``.
    :param sc: context object passed to ``NNImageReader.readImages``.
    :return: the prediction DataFrame with an extra ``name`` column holding
        the first field of each ``image`` value (presumably the image
        origin/URI -- confirm against the image schema).
    """
    # eval() switches the torchvision model to inference mode before wrapping.
    resnet = models.resnet18(pretrained=True).eval()
    torch_net = TorchNet.from_pytorch(resnet, [1, 3, 224, 224])

    images = NNImageReader.readImages(
        image_path, sc, resizeH=300, resizeW=300, image_codec=1)
    extract_name = udf(lambda row: row[0], StringType())

    preprocessing_steps = [
        RowToImageFeature(),
        ImageResize(256, 256),
        ImageCenterCrop(224, 224),
        ImageChannelNormalize(123.0, 117.0, 104.0, 255.0, 255.0, 255.0),
        ImageMatToTensor(),
        ImageFeatureToTensor(),
    ]
    preprocessing = ChainedPreprocessing(preprocessing_steps)

    classifier = NNClassifierModel(torch_net, preprocessing)
    classifier = classifier.setFeaturesCol("image")
    classifier = classifier.setBatchSize(4)

    predictions = classifier.transform(images)
    return predictions.withColumn("name", extract_name(col("image")))

if __name__ == "__main__":
    # Imported explicitly so the script does not rely on `sys` leaking in
    # through one of the wildcard imports at the top of the file.
    import sys

    # Exactly one positional argument is expected: the image path. The usage
    # message now matches what the check below actually requires.
    if len(sys.argv) != 2:
        print("Need parameters: <imagePath>")
        # sys.exit is used instead of the site-provided `exit` helper, which
        # is not guaranteed to exist in every interpreter configuration.
        sys.exit(-1)

    # Single-core local Spark master with a 20g driver memory setting.
    sparkConf = (
        init_spark_conf()
        .setAppName("testNNClassifer")
        .setMaster('local[1]')
        .set('spark.driver.memory', '20g')
    )
    sc = init_nncontext(sparkConf)

    image_path = sys.argv[1]

    predictionDF = inference(image_path, sc)
    # Show the first 20 predictions ordered by name, without truncating cells.
    predictionDF.select("name", "prediction").orderBy("name").show(20, False)

Memory usage increases from 2.4 GB to 7.4 GB over about 50 batches, and stays stable after that.