jjbrosnan commented 2 years ago


deephaven.learn, when used in conjunction with PyTorch/TensorFlow, causes SIGSEGV crashes. These crashes are SEGV_MAPERR according to the logs. This occurs with our supported version of PyTorch (1.10.2) that we use in the server-pytorch image. This also occurs with the latest TensorFlow (2.8.0), and I am going to test with the supported version of TensorFlow from the server-tensorflow image shortly.

Steps to reproduce

From a Python session:

import os
os.system("pip install torch==1.10.2")

That takes a couple of minutes. Then run this:

# Deephaven imports
from deephaven import DynamicTableWriter
from deephaven import dtypes as dht
from deephaven.learn import gather
from deephaven.csv import read
from deephaven import learn

# Machine learning imports
import torch
import torch.nn as nn
import torch.nn.functional as F

# Python imports
import numpy as np, random, threading, time

# Read and quantize the Iris dataset
iris_raw = read("")

classes = {}
num_classes = 0


def get_class_number(c):
    """Return the integer id assigned to class label ``c``.

    Ids are handed out in first-seen order starting at 0. The mapping lives
    in the module-level ``classes`` dict, and ``num_classes`` counts how many
    distinct labels have been seen so far.
    """
    global classes, num_classes
    known_id = classes.get(c)
    if known_id is not None:
        return known_id
    # First time this label is seen: give it the next free id.
    assigned = num_classes
    classes[c] = assigned
    num_classes = assigned + 1
    return assigned

iris = iris_raw.update(formulas = ["Class = (int)(byte)get_class_number(Class)"])

# Our neural network class
class IrisANN(nn.Module):
    """Fully connected classifier: 4 inputs -> 16 -> 12 -> 3 outputs."""

    def __init__(self):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call the nn.Linear assignments below raise
        # AttributeError ("cannot assign module before Module.__init__() call").
        super().__init__()
        self.fc1 = nn.Linear(in_features = 4, out_features = 16)
        self.fc2 = nn.Linear(in_features = 16, out_features = 12)
        self.output = nn.Linear(in_features = 12, out_features = 3)

    def forward(self, x):
        """Run ``x`` through both ReLU hidden layers and the output layer.

        The input is cast to float32 first, so double tensors are accepted.
        The returned values are the raw outputs of the last linear layer
        (no softmax is applied here).
        """
        x = x.float()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.output(x)
        return x

# Create the neural network
model = IrisANN()

# A function that trains the model
def train_model(X_train, Y_train):
    """Train the module-level ``model`` on one batch for 100 epochs.

    Uses Adam (lr 0.01) with cross-entropy loss and prints the loss every
    tenth epoch. ``Y_train`` is converted with ``.long()`` before being
    passed to the loss. Returns nothing; the global model is updated in place.
    """
    # Set training parameters
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
    epochs = 100

    for i in range(epochs):
        # Call the module itself rather than .forward() so hooks are honoured.
        Y_hat = model(X_train)
        loss = criterion(Y_hat, Y_train.long())

        if i % 10 == 0:
            print(f'Epoch: {i} Loss: {loss}')

        # Without these three steps the loss is computed but the weights
        # never change, i.e. no training happens.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


# A function to gather data from table columns into a torch tensor of doubles
def table_to_tensor_double(rows, cols):
    """Gather ``rows``/``cols`` via ``gather.table_to_numpy_2d`` with
    ``np_type = np.double`` and wrap the resulting array in a torch tensor."""
    as_array = gather.table_to_numpy_2d(rows, cols, np_type = np.double)
    return torch.from_numpy(as_array)

# A function to gather data from table columns into a torch tensor of integers
def table_to_tensor_int(rows, cols):
    """Gather ``rows``/``cols`` via ``gather.table_to_numpy_2d`` with
    ``np_type = np.intc``, drop length-1 axes, and wrap the result in a
    torch tensor."""
    as_array = gather.table_to_numpy_2d(rows, cols, np_type = np.intc)
    squeezed = np.squeeze(as_array)
    return torch.from_numpy(squeezed)

# A function to extract a prediction and cast the value to an integer
def get_predicted_class(data, idx):
    """Return element ``idx`` of ``data`` converted to a Python ``int``."""
    selected = data[idx]
    return int(selected)

That code should run without issue. This next code block will result in the error:

    # Train the network through deephaven.learn; the keyword arguments below
    # had lost their enclosing call and were not valid on their own.
    learn.learn(
        table = iris,
        model_func = train_model,
        inputs = [learn.Input(["SepalLengthCM", "SepalWidthCM", "PetalLengthCM", "PetalWidthCM"], table_to_tensor_double), learn.Input("Class", table_to_tensor_int)],
        outputs = None,
        batch_size = 150
    )

Expected results

The Torch model should train. There should be some printouts to the console window as well.

Actual results

The table viewer window in the console complains for a second, then the screen just goes blank. TRANSIENT_FAILURE messages appear in the terminal window, and a file is generated: hs_err_pid1.log (among other files).

Additional details and attachments

This file is from one of the TensorFlow crashes.


I will include the TensorFlow code blocks in a comment.


jjbrosnan commented 2 years ago

TensorFlow code (note that I don't enforce a version in the pip install, so that could be from a version mismatch. I will be performing more rigorous testing of that now).

import os
os.system("pip install tensorflow")
# Deephaven imports
from deephaven import DynamicTableWriter
from deephaven import dtypes as dht
from deephaven.learn import gather
from deephaven.csv import read
from deephaven import learn

# Machine learning imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Python imports
import numpy as np, random, threading, time

iris_raw = read("")

classes = {}
num_classes = 0


def get_class_number(c):
    """Map class label ``c`` to a stable integer id.

    The first unseen label gets 0, the next 1, and so on. Assignments are
    remembered in the module-level ``classes`` dict, and ``num_classes`` is
    incremented each time a new label is recorded.
    """
    global classes, num_classes
    try:
        return classes[c]
    except KeyError:
        # Unseen label: record it under the next free id.
        classes[c] = num_classes
        num_classes += 1
        return classes[c]

iris = iris_raw.update(formulas = ["Class = (int)(byte)get_class_number(Class)"])

# Our neural network
# Three dense layers: two ReLU hidden layers (16 and 12 units) followed by a
# 3-unit softmax output layer.
model = Sequential([
    Dense(16, activation = tf.nn.relu),
    Dense(12, activation = tf.nn.relu),
    Dense(3, activation = tf.nn.softmax),
])

# A function that trains the model
def train_model(X_train, Y_train):
    """Compile the module-level Keras ``model`` and fit it for 100 epochs.

    Compiles with Adam (learning rate 0.01), sparse categorical cross-entropy
    and an accuracy metric, then fits on ``X_train``/``Y_train``.
    """
    # NOTE(review): the loss is built with from_logits = True while the model
    # defined in this snippet ends in a softmax layer — confirm this is intended.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01), loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), metrics = ["accuracy"])
    # The pasted line had lost the start of this call ("model.fit(x").
    model.fit(x = X_train, y = Y_train, epochs = 100)

# A function that gets the model's predictions on input data
def predict_with_model(features):
    """Predict with the module-level ``model`` and return the arg-max index
    of each prediction as an ``np.intc`` array.

    A 1-D ``features`` array is first given a leading axis of length 1.
    """
    batch = np.expand_dims(features, 0) if features.ndim == 1 else features
    predictions = model.predict(batch)
    winners = list(map(np.argmax, predictions))
    return np.array(winners, dtype = np.intc)

# A function to gather data from table columns into a NumPy array of doubles
def table_to_array_double(rows, cols):
    """Return ``gather.table_to_numpy_2d(rows, cols)`` requested with
    ``np_type = np.double``."""
    as_doubles = gather.table_to_numpy_2d(rows, cols, np_type = np.double)
    return as_doubles

# A function to gather data from table columns into a NumPy array of integers
def table_to_array_int(rows, cols):
    """Return ``gather.table_to_numpy_2d(rows, cols)`` requested with
    ``np_type = np.intc``."""
    as_ints = gather.table_to_numpy_2d(rows, cols, np_type = np.intc)
    return as_ints

# A function to extract a list element and cast to an integer
def get_predicted_class(data, idx):
    """Look up ``data[idx]`` and return it as a Python ``int``."""
    value = data[idx]
    as_int = int(value)
    return as_int
# Use the learn function to train our neural network
# (the enclosing learn.learn(...) call had been lost from the pasted snippet)
learn.learn(
    table = iris,
    model_func = train_model,
    inputs = [learn.Input(["SepalLengthCM", "SepalWidthCM", "PetalLengthCM", "PetalWidthCM"], table_to_array_double), learn.Input(["Class"], table_to_array_int)],
    outputs = None,
    batch_size = 150
)
niloc132 commented 2 years ago

I'm not able to reproduce this in docker (though on host OS of linux), going to need some more info or to debug on your machine when you are available.

jjbrosnan commented 2 years ago

Worth noting: I get the same SIGSEGV core dump if I just convert the table data to NumPy directly and attempt to train the neural network model

niloc132 commented 2 years ago

OK, so we can't rule out DH breaking this yet (though, on a call, we did test removing the JVM's signal handlers, just in case these libraries use SIGSEGV like the JVM does and the two are fighting in some way), but at least we know the problem isn't that the engine is providing the data.

Next steps: use the modified steps to repro (i.e. just NumPy data) and try to train on older DH releases. If we can identify the last version where it works and the first version where it fails, collect pip list from both and compare to see what might have changed.

Assigned to @jjbrosnan for triage.

jjbrosnan commented 2 years ago

This is the Python code that works outside of a Deephaven console. The module versions I have installed outside of Deephaven are numpy 1.21.5 and tensorflow 2.8.0. The code does not use anything from Deephaven, but results in a SIGSEGV core dump when called from within DH.

import pandas as pd
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Read and quantize the dataset
iris = pd.read_csv("")
# Map each species name to an integer class id, then replace the "Class"
# column with those ids (an unknown name raises KeyError).
iris_mappings = {
    "Iris-setosa" : 0,
    "Iris-virginica" : 1,
    "Iris-versicolor" : 2
}
iris["Class"] = iris["Class"].apply(lambda x: iris_mappings[x])

# Split the DataFrame into training and testing sets
# Shuffle all rows, then cut at the 75% mark: rows before the cut train the
# model, rows from the cut onwards test it.
iris_shuffled = iris.sample(frac = 1)
train_size = int(0.75 * len(iris_shuffled))
train_set, test_set = iris_shuffled[:train_size], iris_shuffled[train_size:]

# Features (X) are every column except "Class"; targets (Y) are the "Class" column
X_train, Y_train = train_set.drop("Class", axis = 1).values, train_set["Class"].values
X_test, Y_test = test_set.drop("Class", axis = 1).values, test_set["Class"].values

# Create the ANN
# 4 input features -> 16 ReLU units -> 12 ReLU units -> 3 softmax outputs
model = Sequential([
    Dense(16, input_shape = (4,), activation = tf.nn.relu),
    Dense(12, activation = tf.nn.relu),
    Dense(3, activation = tf.nn.softmax),
])

# Compile, fit, and evaluate the predictions of the ANN
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01), loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), metrics = ["accuracy"])
# The pasted line had lost the start of this call ("model.fit(x").
model.fit(x = X_train, y = Y_train, epochs = 100)
model.evaluate(X_test, Y_test)
jjbrosnan commented 2 years ago

The code in the comment just above this one works just fine in deephaven v0.9. Here are the results from pip list:

deephaven-deployment-server-1      | 2022-04-01T20:03:03.616Z | r-Scheduler-Serial-1 |  INFO | .u.PythonDeephavenSession | Evaluating command: os.system("pip list")
deephaven-deployment-server-1      | Package                 Version
deephaven-deployment-server-1      | ----------------------- ---------
deephaven-deployment-server-1      | absl-py                 0.15.0
deephaven-deployment-server-1      | astunparse              1.6.3
deephaven-deployment-server-1      | cached-property         1.5.2
deephaven-deployment-server-1      | cachetools              4.2.4
deephaven-deployment-server-1      | certifi                 2021.10.8
deephaven-deployment-server-1      | charset-normalizer      2.0.12
deephaven-deployment-server-1      | clang                   5.0
deephaven-deployment-server-1      | deephaven               0.9.0
deephaven-deployment-server-1      | deephaven-jpy           0.9.0
deephaven-deployment-server-1      | deephaven2              0.9.0
deephaven-deployment-server-1      | dill                    0.3.4
deephaven-deployment-server-1      | flatbuffers             1.12
deephaven-deployment-server-1      | gast                    0.4.0
deephaven-deployment-server-1      | google-auth             1.35.0
deephaven-deployment-server-1      | google-auth-oauthlib    0.4.6
deephaven-deployment-server-1      | google-pasta            0.2.0
deephaven-deployment-server-1      | grpcio                  1.44.0
deephaven-deployment-server-1      | h5py                    3.1.0
deephaven-deployment-server-1      | idna                    3.3
deephaven-deployment-server-1      | importlib-metadata      4.11.3
deephaven-deployment-server-1      | keras                   2.6.0
deephaven-deployment-server-1      | Keras-Preprocessing     1.1.2
deephaven-deployment-server-1      | llvmlite                0.38.0
deephaven-deployment-server-1      | Markdown                3.3.6
deephaven-deployment-server-1      | numba                   0.55.0
deephaven-deployment-server-1      | numpy                   1.19.5
deephaven-deployment-server-1      | oauthlib                3.2.0
deephaven-deployment-server-1      | opt-einsum              3.3.0
deephaven-deployment-server-1      | pandas                  1.3.5
deephaven-deployment-server-1      | pip                     21.3.1
deephaven-deployment-server-1      | pkg_resources           0.0.0
deephaven-deployment-server-1      | protobuf                3.20.0
deephaven-deployment-server-1      | pyasn1                  0.4.8
deephaven-deployment-server-1      | pyasn1-modules          0.2.8
deephaven-deployment-server-1      | python-dateutil         2.8.2
deephaven-deployment-server-1      | pytz                    2021.3
deephaven-deployment-server-1      | requests                2.27.1
deephaven-deployment-server-1      | requests-oauthlib       1.3.1
deephaven-deployment-server-1      | rsa                     4.8
deephaven-deployment-server-1      | setuptools              60.5.0
deephaven-deployment-server-1      | six                     1.15.0
deephaven-deployment-server-1      | tensorboard             2.6.0
deephaven-deployment-server-1      | tensorboard-data-server 0.6.1
deephaven-deployment-server-1      | tensorboard-plugin-wit  1.8.1
deephaven-deployment-server-1      | tensorflow              2.6.3
deephaven-deployment-server-1      | tensorflow-estimator    2.6.0
deephaven-deployment-server-1      | termcolor               1.1.0
deephaven-deployment-server-1      | typing-extensions
deephaven-deployment-server-1      | urllib3                 1.26.9
deephaven-deployment-server-1      | Werkzeug                2.1.1
deephaven-deployment-server-1      | wheel                   0.37.1
deephaven-deployment-server-1      | wrapt                   1.12.1
deephaven-deployment-server-1      | zipp                    3.7.0
deephaven-deployment-server-1      | WARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.
deephaven-deployment-server-1      | You should consider upgrading via the '/opt/deephaven-venv/bin/python -m pip install --upgrade pip' command.
jjbrosnan commented 2 years ago

The code works in deephaven v0.10 when doing a build from pre-built images. Here's a diff of the pip list outputs between v0.10 and v0.9:

< deephaven-deployment-server-1      | deephaven               0.10.0
< deephaven-deployment-server-1      | deephaven-jpy           0.10.0
< deephaven-deployment-server-1      | deephaven2              0.10.0
> deephaven-deployment-server-1      | deephaven               0.9.0
> deephaven-deployment-server-1      | deephaven-jpy           0.9.0
> deephaven-deployment-server-1      | deephaven2              0.9.0
< deephaven-deployment-server-1      | numba                   0.55.1
> deephaven-deployment-server-1      | numba                   0.55.0
< deephaven-deployment-server-1      | pip                     22.0.3
> deephaven-deployment-server-1      | pip                     21.3.1
< deephaven-deployment-server-1      | setuptools              60.9.1
> deephaven-deployment-server-1      | setuptools              60.5.0
< deephaven-deployment-server-1      | WARNING: You are using pip version 22.0.3; however, version 22.0.4 is available.
> deephaven-deployment-server-1      | WARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.
jjbrosnan commented 2 years ago

I re-tested tensorflow version 2.7.1 when building from both source and images alone, and am getting segfaults when using both. The segfaults are also occurring in version 2.6.3.

jjbrosnan commented 2 years ago

Yesterday, I tested locally with the server-tensorflow image using a little stress tester. It ran the TensorFlow code that has been shown to crash with the SIGSEGV 100 times. Each time the code ran, it was from a new set of docker containers. The output was tracked, and if the string Epoch 100/100 appeared in a log file, it was considered a success (the crash would always happen after Epoch 1/100 was printed).

The tests were successful 100 out of 100 times. I was unable to replicate the SIGSEGV crashes yesterday using the v0.11 release with the server-tensorflow image. That image, as of v0.11, has TensorFlow version 2.6.3.

Jeremiahcheng1206 commented 2 years ago

Hello, I was trying to run my Python code using DHaaL on my local machine. The script is below:

from deephaven_server import Server
s = Server(port=8080, jvm_args=["-Xmx4g"])
from deephaven import ugp
ugp.auto_locking = True

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
# Print the name and device type of every GPU TensorFlow reports.
for gpu in gpus:
    # The name argument was missing (",," is a syntax error).
    print("Name:", gpu.name, "  Type:", gpu.device_type)
### Deephaven imports
from deephaven import DynamicTableWriter
from deephaven import dtypes as dht
from deephaven.learn import gather
from deephaven import read_csv
from deephaven import learn
from deephaven import new_table
from deephaven.column import string_col, double_col,int_col

### Python imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import numpy as np
import threading
import time
# Read the iris csv file and change its format to deephaven table, along with some data pre-processing
iris_data = read_csv("")
# Lookup table pairing each species name with an integer class id.
table2= new_table([
    string_col("Class", ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]),
    int_col("class_1", [0, 1, 2])
])
iris=iris_data.exact_join(table=table2, on=["Class"]).drop_columns(cols=["Class"]).rename_columns(cols=["Class=class_1"])

# Create our neural network
# 4 input features -> 16 ReLU units -> 12 ReLU units -> 3 softmax outputs
model = Sequential([
    Dense(16, input_shape=(4,), activation=tf.nn.relu),
    Dense(12, activation=tf.nn.relu),
    Dense(3, activation=tf.nn.softmax),
])

# A function to train the model
def train_model(features, targets):
    """Compile the module-level Keras ``model`` and fit it for 5 epochs.

    Compiles with Adam (learning rate 0.001), sparse categorical
    cross-entropy and an accuracy metric, then fits on ``features``/``targets``.
    """
    # The pasted body had lost "model.compile(optimizer=" and "model.fit(x=features",
    # leaving only dangling arguments.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"])
    model.fit(x=features, y=targets, epochs=5)

# Make predictions with the trained model
def predict_with_model(features):
    """Predict with the module-level ``model`` and return a list holding the
    arg-max index of each prediction."""
    predictions = model.predict(features)
    return list(map(np.argmax, predictions))

# A function to gather data from table columns into a NumPy array of doubles
def table_to_array_double(rows, cols):
    """Return ``gather.table_to_numpy_2d(rows, cols)`` requested with
    ``np_type=np.double``."""
    doubles = gather.table_to_numpy_2d(rows, cols, np_type=np.double)
    return doubles

# A function to gather data from table columns into a NumPy array of integers
def table_to_array_int(rows, cols):
    """Return ``gather.table_to_numpy_2d(rows, cols)`` requested with
    ``np_type=np.intc``."""
    ints = gather.table_to_numpy_2d(rows, cols, np_type=np.intc)
    return ints

# A function to extract a list element at a given index
def get_predicted_class(data, idx):
    """Return the element of ``data`` at position ``idx``, unchanged."""
    element = data[idx]
    return element

# The four features of the data set
inps=["SepalLengthCM", "SepalWidthCM", "PetalLengthCM", "PetalWidthCM"]
# Train the model
# NOTE(review): only the inputs= line survived in the paste; table, model_func,
# outputs and batch_size mirror the earlier training snippets in this thread —
# confirm against the original script.
learn.learn(
    table=iris,
    model_func=train_model,
    inputs=[learn.Input(inps, table_to_array_double), learn.Input(["Class"], table_to_array_int)],
    outputs=None,
    batch_size=150
)
# Apply the trained model to the data set
# NOTE(review): table, model_func and batch_size were missing from the pasted
# call and the call was never closed; the values below are presumed from the
# surrounding script (predict_with_model is the prediction function defined
# in this snippet) — confirm against the original script.
iris_predicted_static = learn.learn(
    table=iris,
    model_func=predict_with_model,
    inputs=[learn.Input(inps, table_to_array_double)],
    outputs=[learn.Output("PredictedClass", get_predicted_class, "int")],
    batch_size=150
)

