Training issues - Githubissues

Hi,

I am trying to use your code in order to reproduce the results, but I hit kind of a brick wall. I managed to deploy the application on a CPU-only system, where I get around 2 to 3 it/s.

Now I am trying to use a GPU (Quadro P4000), but I do not get any speed-up at all: I remain at 1 it/s while the GPU is running at full power. After a while, I also get an out-of-memory error (8 GB).

Is this something you also encountered and fixed?

As a caveat: I am using nvidia-docker:

docker run --runtime=nvidia --rm reinoldus/ontoemma:latest bash /ontoemma/run_emma.sh cuda

The docker-repo is here: https://github.com/reinoldus/ontoemma

The config I am using to train is attached below.
Data is downloaded from here: https://github.com/reinoldus/ontoemma/blob/master/download-data.sh
Dockerfile: https://github.com/reinoldus/ontoemma/blob/master/Dockerfile
- The Dockerfile attached to the build is actually: https://raw.githubusercontent.com/reinoldus/ontoemma/96c902bbefa63d2754466decb57bac41fb7eb193/Dockerfile
- I am currently building one with the cuda8 base image; maybe that helps.

{
  "dataset_reader": {
    "type": "ontology_matcher",
    "name_token_indexer": {
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": true
      },
      "token_characters": {
        "type": "characters"
      }
    },
    "token_only_indexer": {
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": true
      }
    }
  },
  "train_data_path": "/ontoemma/data/ontoemma.context.train",
  "validation_data_path": "/ontoemma/data/ontoemma.context.dev",
  "model": {
    "type": "ontoemmaNN",
    "name_embedder": {
      "tokens": {
        "type": "embedding",
        "pretrained_file": "/ontoemma/data/weights100.txt.gz",
        "embedding_dim": 100,
        "trainable": false
      },
      "token_characters": {
        "type": "character_encoding",
        "embedding": {
          "embedding_dim": 100
        },
        "encoder": {
          "type": "cnn",
          "embedding_dim": 100,
          "num_filters": 50,
          "ngram_filter_sizes": [4, 5]
        },
        "dropout": 0.2
      }
    },
    "definition_embedder": {
      "tokens": {
        "type": "embedding",
        "pretrained_file": "/ontoemma/data/weights100.txt.gz",
        "embedding_dim": 100,
        "trainable": false
      }
    },
    "name_encoder": {
      "type": "lstm",
      "input_size": 200,
      "hidden_size": 100,
      "num_layers": 2,
      "dropout": 0.2,
      "bidirectional": true
    },
    "definition_encoder": {
      "type": "lstm",
      "input_size": 100,
      "hidden_size": 100,
      "num_layers": 2,
      "dropout": 0.2,
      "bidirectional": true
    },
    "siamese_feedforward": {
      "input_dim": 600,
      "num_layers": 2,
      "hidden_dims": 100,
      "activations": "relu",
      "dropout": 0.2
    },
    "decision_feedforward": {
      "input_dim": 232,
      "num_layers": 2,
      "hidden_dims": [232, 1],
      "activations": ["relu", "linear"],
      "dropout": [0.2, 0.0]
    },
    "initializer": [
      [".*linear_layers.*weight", {"type": "xavier_normal"}],
      [".*encoder.*module.*weight.*", {"type": "orthogonal"}]
     ]
  },
  "iterator": {
    "type": "bucket",
    "sorting_keys": [
                    ["s_ent_def", "num_tokens"],
                    ["t_ent_def", "num_tokens"],
                    ["s_ent_alias", "list_num_tokens"],
                    ["t_ent_alias", "list_num_tokens"],
                    ["s_ent_name", "num_tokens"],
                    ["t_ent_name", "num_tokens"]
    ],
    "batch_size": 32
  },
  "trainer": {
    "optimizer": "adam",
    "num_epochs": 50,
    "patience": 10,
    "validation_metric": "+f1_score",
    "cuda_device": 0
  }
}

allenai / ontoemma

Training issues #21