ray-project / ray

Ray is an AI compute engine. Ray consists of a core distributed runtime and a set of AI Libraries for accelerating ML workloads.
https://ray.io
Apache License 2.0
34.14k stars 5.8k forks source link

limiting tensorflow memory failed in actor or function #6633

Open JaeLiiin opened 4 years ago

JaeLiiin commented 4 years ago

What is your question?

Limiting TensorFlow GPU memory fails inside a Ray actor or remote function.

Ray version and other system information (Python version, TensorFlow version, OS): Ray 0.8.0, TensorFlow 2.0.0-dev20191002, Python 3.6, OS: Ubuntu 18.04.3 LTS. I want to limit the GPU memory usage in TensorFlow.

import ray
import numpy as np
from tensorflow.keras import layers
# import tensorflow as tf

def create_keras_model():
    """Build and compile a small dense Keras classifier.

    TensorFlow is imported inside the function, so the import happens in
    whichever process calls it. If TensorFlow reports at least one GPU, the
    first one is made the only visible GPU and is switched to on-demand
    memory growth. If no GPU is reported, the GPU configuration is skipped
    (the unguarded ``gpus[0]`` lookup would otherwise raise ``IndexError``)
    and the model is built on whatever device TensorFlow picks.

    Returns:
        A compiled ``tf.keras.Sequential`` model that takes 32 input
        features and ends in a 10-unit softmax layer.
    """
    import tensorflow as tf

    gpus = tf.config.experimental.list_physical_devices('GPU')
    gpu_id = 0
    if gpus:
        selected_gpu = gpus[gpu_id]
        tf.config.experimental.set_visible_devices(selected_gpu, "GPU")
        # NOTE(review): per the TensorFlow GPU guide, memory growth only makes
        # allocation incremental; it is not a hard per-process cap -- confirm
        # whether a fixed memory limit is what is actually wanted here.
        tf.config.experimental.set_memory_growth(selected_gpu, True)

    model = tf.keras.Sequential()
    # Adds a densely-connected layer with 64 units to the model:
    model.add(layers.Dense(64, activation="relu", input_shape=(32, )))
    # Add another:
    model.add(layers.Dense(64, activation="relu"))
    # Add a softmax layer with 10 output units:
    model.add(layers.Dense(10, activation="softmax"))

    model.compile(
        optimizer=tf.keras.optimizers.RMSprop(0.01),
        loss=tf.keras.losses.categorical_crossentropy,
        metrics=[tf.keras.metrics.categorical_accuracy])
    return model

# Start Ray with its default settings before any actors are created.
ray.init()

def random_one_hot_labels(shape):
    """Return a random one-hot label matrix.

    Args:
        shape: A pair ``(n, n_class)`` giving the number of rows and the
            number of classes.

    Returns:
        A float array of shape ``(n, n_class)`` in which every row holds a
        single 1 at a uniformly drawn class index and 0 everywhere else.
    """
    num_rows, num_classes = shape
    picked = np.random.randint(0, num_classes, num_rows)
    # Compare each drawn class (as a column) against the row of all class
    # indices; the boolean match table is the one-hot encoding.
    matches = picked[:, None] == np.arange(num_classes)
    return matches.astype(float)

# Use GPU with
# @ray.remote(num_gpus=1)
@ray.remote(num_gpus=0.5)
class Network(object):
    """Ray actor that owns a Keras model and a random training set.

    The decorator asks Ray for half a GPU per actor. Each instance builds
    its own model via ``create_keras_model`` and generates 1000 random
    samples with 32 features and matching one-hot labels over 10 classes.
    """

    def __init__(self):
        self.model = create_keras_model()
        self.dataset = np.random.random((1000, 32))
        self.labels = random_one_hot_labels((1000, 10))

    def train(self):
        """Fit the model on the stored data and return the Keras history dict."""
        fit_result = self.model.fit(self.dataset, self.labels, verbose=1)
        return fit_result.history

    def get_weights(self):
        """Return the model's current weights."""
        return self.model.get_weights()

    def set_weights(self, weights):
        """Overwrite the model's weights with ``weights``."""
        # Note that for simplicity this does not handle the optimizer state.
        self.model.set_weights(weights)

    def evaluate(self):
        """Evaluate on the stored data and return ``(loss, accuracy)``."""
        loss, accuracy = self.model.evaluate(
            self.dataset, self.labels, verbose=0)
        return loss, accuracy
# Create the first actor and block until its training pass has finished, so
# any error raised while constructing or training it surfaces here.
NetworkActor = Network.remote()
result_object_id = NetworkActor.train.remote()
ray.get(result_object_id)

# Same for the second actor. Waiting on the training result (instead of
# discarding it) keeps a failed training run from going unnoticed.
NetworkActor2 = Network.remote()
result_object_id2 = NetworkActor2.train.remote()
ray.get(result_object_id2)

# Fetch both actors' weights; each entry is one actor's list of weight arrays.
weights = ray.get(
    [NetworkActor.get_weights.remote(),
     NetworkActor2.get_weights.remote()])

# Element-wise mean of the two models, layer by layer.
averaged_weights = [(layer1 + layer2) / 2
                    for layer1, layer2 in zip(weights[0], weights[1])]

# Put the averaged weights in the object store once and hand the same
# reference to both actors.
weight_id = ray.put(averaged_weights)

# Wait for both updates to complete; without this the script could exit
# before the weights are applied and any failure would be silently dropped.
ray.get([
    actor.set_weights.remote(weight_id)
    for actor in (NetworkActor, NetworkActor2)
])

When I run the Python script, I get:

2019-12-30 02:45:49,367 INFO resource_spec.py:216 -- Starting Ray with 37.01 GiB memory available for workers and up to 18.51 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
Traceback (most recent call last):
  File "testgpu.py", line 60, in <module>
    ray.get(result_object_id)
  File "/usr/local/lib/python3.6/dist-packages/ray/worker.py", line 1457, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AttributeError): ray::Network.__init__() (pid=7743, ip=172.17.0.2)
  File "python/ray/_raylet.pyx", line 626, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 633, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 634, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 619, in ray._raylet.execute_task.function_executor
  File "testgpu.py", line 41, in __init__
    self.model = create_keras_model()
  File "testgpu.py", line 9, in create_keras_model
    gpus = tf.config.experimental.list_physical_devices('GPU')
AttributeError: module 'tensorflow' has no attribute 'config'

How can I limit the GPU memory usage in TensorFlow so that I can run multiple processes on a single GPU?

fmxFranky commented 4 years ago

Hi @JaeLiiin @ericl. Is there any progress on this issue? I am running into the same problem. Thanks a lot!