tensorflow / neural-structured-learning

Training neural models with structured signals.
https://www.tensorflow.org/neural_structured_learning
Apache License 2.0

NSL, errors when trying to build a mlp classifier with float features #31

Closed ggerogiokas closed 4 years ago

ggerogiokas commented 4 years ago

Hi all,

I am trying to run NSL on some of my own data, but I am running into issues reading the training data into the MLP model in my code.

The code is basically a combination of the first two tutorials.

I have my own embedding values and use those to define a graph with your build_graph tool, running:

python nsl_repo/neural_structured_learning/tools/build_graph.py train.tfr train_graph.tsv --similarity_threshold 0.90 -v 1
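
I then pack each training example together with its graph neighbors using the pack_nbrs tool from the same directory. I don't have the exact command to hand, but it was roughly the following, with --max_nbrs matching the num_neighbors hyperparameter used later and nsl_train_data.tfr being the packed file that the model code below reads:

python nsl_repo/neural_structured_learning/tools/pack_nbrs.py train.tfr '' train_graph.tsv nsl_train_data.tfr --max_nbrs=3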

I am using the 128-dimensional embedding float values as features, and each example also has an integer label plus an ID stored as a bytes value. I then want to classify 11 node types using a sequential MLP as the model.

But I keep getting the following error:

InvalidArgumentError                      Traceback (most recent call last)
~/OneDrive - AZCollaboration/projects/nsl_onBIKG/train_with_NSL.py in <module>
    161     loss='sparse_categorical_crossentropy',
    162     metrics=['accuracy'])

--> 163 base_model.fit(train_dataset, epochs=HPARAMS.train_epochs, verbose=1)
    164
    165 # def make_feed_forward_model():

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    732         max_queue_size=max_queue_size,
    733         workers=workers,
--> 734         use_multiprocessing=use_multiprocessing)
    735
    736   def evaluate(self,

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    322                 mode=ModeKeys.TRAIN,
    323                 training_context=training_context,
--> 324                 total_epochs=epochs)
    325             cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN)
    326

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2.py in run_one_epoch(model, iterator, execution_function, dataset_size, batch_size, strategy, steps_per_epoch, num_samples, mode, training_context, total_epochs)
    121         step=step, mode=mode, size=current_batch_size) as batch_logs:
    122       try:
--> 123         batch_outs = execution_function(iterator)
    124       except (StopIteration, errors.OutOfRangeError):
    125         # TODO(kaftan): File bug about tf function and errors.OutOfRangeError?

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py in execution_function(input_fn)

     84     # `numpy` translates Tensors to values in Eager mode.
     85     return nest.map_structure(_non_none_constant_value,
---> 86                               distributed_function(input_fn))
     87
     88   return execution_function

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
    437         # Lifting succeeded, so variables are initialized and we can run the
    438         # stateless function.
--> 439         return self._stateless_fn(*args, **kwds)
    440     else:
    441       canon_args, canon_kwds = \

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in __call__(self, *args, **kwargs)
   1820     """Calls a graph function specialized to the inputs."""
   1821     graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 1822     return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
   1823
   1824   @property

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _filtered_call(self, args, kwargs)
   1139          if isinstance(t, (ops.Tensor,
   1140                            resource_variable_ops.BaseResourceVariable))),
-> 1141         self.captured_inputs)
   1142
   1143   def _call_flat(self, args, captured_inputs, cancellation_manager=None):

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
   1222     if executing_eagerly:
   1223       flat_outputs = forward_function.call(
-> 1224           ctx, args, cancellation_manager=cancellation_manager)
   1225     else:
   1226       gradient_name = self._delayed_rewrite_functions.register()

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/function.py in call(self, ctx, args, cancellation_manager)
    509               inputs=args,
    510               attrs=("executor_type", executor_type, "config_proto", config),
--> 511               ctx=ctx)
    512         else:
    513           outputs = execute.execute_with_cancellation(

/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
     65     else:
     66       message = e.message
---> 67     six.raise_from(core._status_to_exception(e.code, message), None)
     68   except TypeError as e:
     69     keras_symbolic_tensors = [

/anaconda3/lib/python3.7/site-packages/six.py in raise_from(value, from_value)

InvalidArgumentError:   Feature: NL_nbr_0_embedding (data type: float) is required but could not be found.
         [[{{node ParseSingleExample/ParseSingleExample}}]]
         [[IteratorGetNext]] [Op:__inference_distributed_function_57927]

Function call stack:
distributed_function -> distributed_function

The following code generates the training and test TFRecord files:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import time

from absl import app
from absl import flags
from absl import logging
import neural_structured_learning as nsl
from neural_structured_learning.tools import graph_utils
from neural_structured_learning.tools import build_graph
from neural_structured_learning.tools import pack_nbrs
import six
import tensorflow as tf
import sys
import pandas as pd
import numpy as np
import os

# try:
#     node_f = sys.argv[1]
#     edge_f = sys.argv[2]
# except:
#     print ('.py [input_file] ')
#     sys.argv[1]

def _int64_feature(*value):
  """Returns int64 tf.train.Feature from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(value)))

def _bytes_feature(value):
  """Returns bytes tf.train.Feature."""
  return tf.train.Feature(
      bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))

def _float_feature(value):
  """Returns float tf.train.Feature."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=value.tolist()))

#def parse_bikg_content(in_file, train_percentage):

def parse_generate_bikg_training_data():
  """Converts the bikg content (in tsv) to `tf.train.Example` instances.

  This function parses bikg content (in tsv), converts gene-disease edges into
  positive examples and other edges as negative.  This is setup for a link
  prediction task.

  randomly splits the data into training and test sets, and returns
  the training and test sets as outputs.

  Args:
    in_file: A string indicating the input file path.
    train_percentage: A float indicating the percentage of training examples
      over the dataset.

  Returns:
    train_examples: A dict with keys being example IDs (string) and values being
    `tf.train.Example` instances.
    test_examples: A dict with keys being example IDs (string) and values being
    `tf.train.Example` instances.
  """

# Fixes the random seed so the train/test split can be reproduced.
random.seed(1)
train_examples = {}
test_examples = {}

train_percentage = 0.7

df = pd.read_csv('result_embeddings.tsv', sep='\t', header=None)

# need to find translation for the labels to node types
# try to predict gene -> disease edges for this examples
node_df = pd.read_csv( 'node_data.csv', sep='\t')

id_to_type = { row[1]:row[3]  for row in node_df.values }

#use types as labels in this case
df['type'] = df[0].apply(lambda x: id_to_type[x])

#codify unique label types to index
label_to_index = {typ:i for i,typ in  enumerate(df['type'].unique()) }

for i, entry in enumerate( df.values ):
  # entries contains [ID, embedding]
  #embedding = map(float, entry[1:-1])
  features = {
      'embedding': _float_feature(np.asarray(entry[1:-1])),
      'id': _bytes_feature(str(i)),
      'label': _int64_feature(np.asarray( [ label_to_index[entry[-1]] ] ) )
  }

  #print (words)
  #print (entries)

  example_features = tf.train.Example(
      features=tf.train.Features(feature=features))
  example_id = entry[0]
  if random.uniform(0, 1) <= train_percentage:  # for train/test split.
    train_examples[example_id] = example_features
  else:
    test_examples[example_id] = example_features

# Writes 'train_examples'
with tf.io.TFRecordWriter('train.tfr') as writer:
    for example in six.itervalues(train_examples):
        writer.write(example.SerializeToString())

# Writes 'test_examples'
with tf.io.TFRecordWriter('test.tfr') as writer:
    for example in six.itervalues(test_examples):
        writer.write(example.SerializeToString())
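
For what it's worth, here is a small debugging snippet (not part of my pipeline, just for inspection) that prints which feature keys actually end up in the serialized examples, e.g. 'embedding', 'id' and 'label' for train.tfr:

import tensorflow as tf

# Print the feature keys of the first few serialized examples.
for raw_record in tf.data.TFRecordDataset(['train.tfr']).take(3):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(sorted(example.features.feature.keys()))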

The following code defines and trains the NN model:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import time

from absl import app
from absl import flags
from absl import logging
import neural_structured_learning as nsl
from neural_structured_learning.tools import graph_utils
from neural_structured_learning.tools import pack_nbrs
import six
import tensorflow as tf
import sys
import pandas as pd

NBR_FEATURE_PREFIX = 'NL_nbr_'
NBR_WEIGHT_SUFFIX = '_weight'

class HParams(object):
  """Hyperparameters used for training."""
  def __init__(self):
    ### dataset parameters
    self.num_classes = 11
    #self.vocab_size = 128
    self.embedding_size = 128

    ### neural graph learning parameters
    self.distance_type = nsl.configs.DistanceType.L2
    self.graph_regularization_multiplier = 0.1
    self.num_neighbors = 3
    ### model architecture
    self.num_embedding_dims = 16
    self.num_lstm_dims = 64
    self.num_fc_units = 64
    ### training parameters
    self.train_epochs = 10
    self.batch_size = 128
    self.dropout_rate = 0.5
    ### eval parameters
    self.eval_steps = None  # All instances in the test set are evaluated.

def parse_example(example_proto):
  """Extracts relevant fields from the `example_proto`.

  Args:
    example_proto: An instance of `tf.train.Example`.

  Returns:
    A pair whose first value is a dictionary containing relevant features
    and whose second value contains the ground truth labels.
  """

  # The 'embedding' feature is a fixed-length float vector of length 128.
  feature_spec = {
      'embedding': tf.io.FixedLenFeature([HPARAMS.embedding_size], tf.float32,
                        default_value=tf.constant(
                        0, dtype=tf.float32, shape=[HPARAMS.embedding_size])),
      'label': tf.io.FixedLenFeature((), tf.int64, default_value=-1),
  }
  # We also extract corresponding neighbor features in a similar manner to
  # the features above.
  for i in range(HPARAMS.num_neighbors):
    nbr_feature_key = '{}{}_{}'.format(NBR_FEATURE_PREFIX, i, 'embedding')
    nbr_weight_key = '{}{}{}'.format(NBR_FEATURE_PREFIX, i, NBR_WEIGHT_SUFFIX)
    feature_spec[nbr_feature_key] = tf.io.FixedLenFeature([HPARAMS.embedding_size], tf.float32)

    # We assign a default value of 0.0 for the neighbor weight so that
    # graph regularization is done on samples based on their exact number
    # of neighbors. In other words, non-existent neighbors are discounted.
    feature_spec[nbr_weight_key] = tf.io.FixedLenFeature(
        [1], tf.float32, default_value=tf.constant([0.0]))

  features = tf.io.parse_single_example(example_proto, feature_spec)

  #print ('features: ', features)

  labels = features.pop('label')
  return features, labels

def make_dataset(file_path, training=False):
  """Creates a `tf.data.TFRecordDataset`.

  Args:
    file_path: Name of the file in the `.tfrecord` format containing
      `tf.train.Example` objects.
    training: Boolean indicating if we are in training mode.

  Returns:
    An instance of `tf.data.TFRecordDataset` containing the `tf.train.Example`
    objects.
  """
  dataset = tf.data.TFRecordDataset([file_path])
  if training:
    dataset = dataset.shuffle(10000)
  dataset = dataset.map(parse_example)
  dataset = dataset.batch(HPARAMS.batch_size)
  return dataset

HPARAMS = HParams()

training_samples_count = sum(1 for record in tf.data.TFRecordDataset([
                            'nsl_train_data.tfr']))

train_dataset = make_dataset('./nsl_train_data.tfr', training=True)
test_dataset = make_dataset('./test_data.tfr')

def make_mlp_sequential_model(hparams):
  """Creates a sequential multi-layer perceptron model."""
  model = tf.keras.Sequential()
  model.add(
      tf.keras.layers.InputLayer(
          input_shape=(hparams.embedding_size,), name='embedding'))
  # The embedding features are already floats; the cast below just makes the
  # dtype explicitly float32.
  model.add(
      tf.keras.layers.Lambda(lambda x: tf.keras.backend.cast(x, tf.float32)))
  # Two fully-connected hidden layers, each with `num_fc_units` units.
  for num_units in [hparams.num_fc_units, hparams.num_fc_units]:
    model.add(tf.keras.layers.Dense(num_units, activation='relu'))
    # For sequential models, by default, Keras ensures that the 'dropout' layer
    # is invoked only during training.
    model.add(tf.keras.layers.Dropout(hparams.dropout_rate))
  model.add(tf.keras.layers.Dense(hparams.num_classes, activation='softmax'))
  return model

base_model_tag, base_model = 'SEQUENTIAL', make_mlp_sequential_model(HPARAMS)
base_model.summary()

# Compile and train the base MLP model
base_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])
base_model.fit(train_dataset, epochs=HPARAMS.train_epochs, verbose=1)
arjung commented 4 years ago

Thanks @ggerogiokas for your question.

Since some examples may not have any neighbors (based on the similarity threshold chosen), you'd want to assign default values for all 'neighbor features' in your feature spec before invoking tf.io.parse_single_example(). The default value can be the same as the one used for the sample's own 'embedding' feature.

So, you'd change your example as follows (notice the default value for feature_spec[nbr_feature_key]):

for i in range(HPARAMS.num_neighbors):
    nbr_feature_key = '{}{}_{}'.format(NBR_FEATURE_PREFIX, i, 'embedding')
    nbr_weight_key = '{}{}{}'.format(NBR_FEATURE_PREFIX, i, NBR_WEIGHT_SUFFIX)
    feature_spec[nbr_feature_key] = tf.io.FixedLenFeature(
        [HPARAMS.embedding_size], tf.float32,
        default_value=tf.constant(
            0, dtype=tf.float32, shape=[HPARAMS.embedding_size]))
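
And in case it helps with the step after this one: once the base model trains cleanly, the graph-regularized variant from the tutorial should follow directly. A rough sketch, reusing the HParams fields and make_mlp_sequential_model from your script (untested against your data):

# Wrap the base MLP with graph regularization; the neighbor features packed
# by pack_nbrs are consumed according to the config below.
graph_reg_config = nsl.configs.make_graph_reg_config(
    max_neighbors=HPARAMS.num_neighbors,
    multiplier=HPARAMS.graph_regularization_multiplier,
    distance_type=HPARAMS.distance_type,
    sum_over_axis=-1)
graph_reg_model = nsl.keras.GraphRegularization(
    make_mlp_sequential_model(HPARAMS), graph_reg_config)
graph_reg_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])
graph_reg_model.fit(train_dataset, epochs=HPARAMS.train_epochs, verbose=1)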

Please let us know if you still run into issues.

ggerogiokas commented 4 years ago

Thanks, that worked like a charm. I forgot to set up the default values!