This problem occurs when running the script: allennlp.common.checks.ConfigurationError: key 'data_loader' is required

Traceback (most recent call last):
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/common/params.py", line 239, in pop
    value = self.params.pop(key)
KeyError: 'data_loader'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/.virtualenvs/bort_test/bin/allennlp", line 9, in <module>
    sys.exit(run())
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/__main__.py", line 34, in run
    main(prog="allennlp")
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/commands/__init__.py", line 94, in main
    args.func(args)
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/commands/train.py", line 112, in train_model_from_args
    file_friendly_logging=args.file_friendly_logging,
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/commands/train.py", line 171, in train_model_from_file
    file_friendly_logging=file_friendly_logging,
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/commands/train.py", line 232, in train_model
    file_friendly_logging=file_friendly_logging,
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/commands/train.py", line 423, in _train_worker
    params=params, serialization_dir=serialization_dir, local_rank=process_rank,
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/common/from_params.py", line 583, in from_params
    **extras,
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/common/from_params.py", line 612, in from_params
    kwargs = create_kwargs(constructor_to_inspect, cls, params, **extras)
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/common/from_params.py", line 182, in create_kwargs
    cls.__name__, param_name, annotation, param.default, params, **extras
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/common/from_params.py", line 283, in pop_and_construct_arg
    popped_params = params.pop(name, default) if default != _NO_DEFAULT else params.pop(name)
  File "/.virtualenvs/bort_test/lib/python3.6/site-packages/allennlp/common/params.py", line 247, in pop
    raise ConfigurationError(msg)
allennlp.common.checks.ConfigurationError: key "data_loader" is required
Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/administrator/dont-stop-pretraining/scripts/train.py", line 145, in <module>
    main()
  File "/home/administrator/dont-stop-pretraining/scripts/train.py", line 141, in main
    subprocess.run(" ".join(allennlp_command), shell=True, check=True)
  File "/usr/lib/python3.6/subprocess.py", line 438, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command 'allennlp train --include-package dont_stop_pretraining training_config/classifier.jsonnet -s model_logs/citation_intent_base' returned non-zero exit status 1.

The script is:

python -m scripts.train \
        --config training_config/classifier.jsonnet \
        --serialization_dir model_logs/citation_intent_base \
        --hyperparameters ROBERTA_CLASSIFIER_SMALL \
        --dataset citation_intent \
        --model roberta-base \
        --device 0 \
        --perf +f1 \
        --evaluate_on_test

I am reopening this ticket because I am currently running into the same issue. My config (training_config/classifier.jsonnet, with the default settings from the latest-allennlp branch of the dont-stop-pretraining repo) is shown below:

// ----------------------------
// EXTERNAL VARIABLES
// ----------------------------
// Every setting below is read with std.extVar. Values wrapped in
// std.parseInt are converted to integers; flags written as
// `std.parseInt(...) == 1` become booleans that are true only when the
// external variable parses to 1.

// data directory
local DATA_DIR = std.extVar("DATA_DIR");
// whether or not to evaluate on test
local EVALUATE_ON_TEST = std.parseInt(std.extVar("EVALUATE_ON_TEST")) == 1; 
// size of dataset
local DATASET_SIZE = std.parseInt(std.extVar("DATASET_SIZE"));
// learning rate (used as returned by std.extVar; not parsed here)
local LEARNING_RATE = std.extVar("LEARNING_RATE");
// dropout (used as returned by std.extVar; not parsed here)
local DROPOUT = std.extVar("DROPOUT");
// seed
local SEED = std.parseInt(std.extVar("SEED"));
// number of epochs
local NUM_EPOCHS = std.parseInt(std.extVar("NUM_EPOCHS"));
// lazy mode
local LAZY = std.parseInt(std.extVar("LAZY")) == 1;
// batch size
local BATCH_SIZE = std.parseInt(std.extVar("BATCH_SIZE"));
// will sample this amount of training data, if set
// (the dataset reader only receives "sample" when this is greater than -1)
local TRAIN_THROTTLE = std.parseInt(std.extVar("TRAIN_THROTTLE"));
// number of gradient accumulation steps
local GRAD_ACC = std.parseInt(std.extVar("NUM_GRAD_ACC_STEPS"));
// whether to attach the learning rate scheduler to the trainer
local LR_SCHEDULE = std.parseInt(std.extVar("LR_SCHEDULE")) == 1;

// skip early stopping? turning this on will prevent dev eval at each epoch.
local SKIP_EARLY_STOPPING = std.parseInt(std.extVar("SKIP_EARLY_STOPPING")) == 1;
// skip training? turning this on switches the trainer type to "no_op".
local SKIP_TRAINING = std.parseInt(std.extVar("SKIP_TRAINING")) == 1;
// are we jackknifing? only for hyperpartisan.
local JACKKNIFE = std.parseInt(std.extVar("JACKKNIFE")) == 1;
// jackknife file extension. only for hyperpartisan.
local JACKKNIFE_EXT = std.extVar("JACKKNIFE_EXT");
// embedding to use
local EMBEDDING = std.extVar("EMBEDDING");
// width multiplier on hidden size of feedforward network
local FEEDFORWARD_WIDTH_MULTIPLIER = std.parseInt(std.extVar("FEEDFORWARD_WIDTH_MULTIPLIER"));
// number of feedforward layers
local NUM_FEEDFORWARD_LAYERS = std.parseInt(std.extVar("NUM_FEEDFORWARD_LAYERS"));
// early stopping patience
local PATIENCE = std.parseInt(std.extVar("PATIENCE"));

// Dataset file paths, built by plain string concatenation onto DATA_DIR.
// With JACKKNIFE on, files are read from the "jackknife/" subfolder and are
// named "<split>.<JACKKNIFE_EXT>"; otherwise the "<split>.jsonl" files are used.
// NOTE(review): DATA_DIR is concatenated without adding a separator, so it
// presumably has to end with "/" — confirm against the launcher script.
local TRAIN_DATA_PATH = if JACKKNIFE then DATA_DIR + "jackknife/" + "train." + JACKKNIFE_EXT else DATA_DIR  + "train.jsonl";
local DEV_DATA_PATH = if JACKKNIFE then DATA_DIR + "jackknife/" + "dev." + JACKKNIFE_EXT else DATA_DIR  + "dev.jsonl";
// NOTE(review): in jackknife mode the test path points at the "dev." file,
// same as DEV_DATA_PATH — confirm this is intentional.
local TEST_DATA_PATH = if JACKKNIFE then DATA_DIR + "jackknife/" + "dev." + JACKKNIFE_EXT else DATA_DIR  + "test.jsonl";

// ----------------------------
// INPUT EMBEDDING
// ----------------------------

// Returns one object bundling every RoBERTa-related config fragment used by
// the final config: token indexer, text field embedder, tokenizer, optimizer,
// learning rate scheduler, checkpointer, and the embedding dimension.
// Callers select a fragment by key, e.g. PRETRAINED_ROBERTA_FIELDS(x)['tokenizer'].
// NOTE(review): the TRAINABLE parameter is never referenced in this body, so
// the returned object is the same for any argument.
local PRETRAINED_ROBERTA_FIELDS(TRAINABLE) = {
  // token indexer for the dataset readers
  "indexer": {
    "tokens": {
        "type": "pretrained_transformer",
        "model_name": "roberta-base",
        "max_length": 512
    }
  },
  // text field embedder for the model
  "embedder": {
    "token_embedders": {
      "tokens":{
        "type": "pretrained_transformer",
        // NOTE(review): MODEL_NAME is not declared anywhere in the config as
        // shown here (the indexer and tokenizer hard-code "roberta-base").
        // Confirm that a `local MODEL_NAME = ...` exists in the real file.
        "model_name": MODEL_NAME,
        "max_length": 512
      }
    }
  },
  // tokenizer for the dataset readers
  "tokenizer": {
    "type": "pretrained_transformer",
    "model_name": "roberta-base",
    "max_length": 512
    },

  // optimizer for the trainer; the learning rate comes from LEARNING_RATE
  "optimizer": {
        "type": "huggingface_adamw_str_lr",
        "lr": LEARNING_RATE,
        "betas": [0.9, 0.98],
        "eps": 1e-6,
         "weight_decay": 0.1,
         // overrides weight_decay to 0.0 for the listed parameter-name patterns
         // NOTE(review): each group here has three elements (patterns,
         // overrides, and an empty list) — confirm the registered optimizer
         // expects that shape.
         "parameter_groups": [
         [["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"], {"weight_decay": 0.0}, []],
     ]
  },
  // learning rate scheduler; only attached to the trainer when LR_SCHEDULE is on
  "scheduler": {
      "type": "slanted_triangular",
      "cut_frac": 0.06
    },
  // checkpointer for the trainer
  "checkpointer": {
    "type": "roberta_default",
    "num_epochs": NUM_EPOCHS,
    "skip_early_stopping": SKIP_EARLY_STOPPING
  },
  // read by the final config to size the pooler and the feedforward layer
  "embedding_dim": 768
};

// CLS pooler fields: returns a "cls_pooler" encoder config whose
// "embedding_dim" is the value passed in.
local CLS_FIELDS(embedding_dim) = {
    "type": "cls_pooler",
    "embedding_dim": embedding_dim
};

// Argument passed to every PRETRAINED_ROBERTA_FIELDS(...) call below.
local ROBERTA_TRAINABLE = true;

// Both dimensions are read from the same 'embedding_dim' entry, so they are
// always equal.
local ROBERTA_EMBEDDING_DIM = PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['embedding_dim'];
local ENCODER_OUTPUT_DIM = PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['embedding_dim'];

// ----------------------------
// FINAL CONFIG
// ----------------------------
// The object the config file evaluates to, assembled from the locals above.
{
    // all three seeds share the same SEED value
    "numpy_seed": SEED,
    "pytorch_seed": SEED,
    "random_seed": SEED, 
    // training reader; "sample" is merged in only when TRAIN_THROTTLE > -1
    "dataset_reader": {
        "type": "text_classification_json_with_sampling",
        "lazy": LAZY,
        "tokenizer": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['tokenizer'],
        "max_sequence_length": 512,
        "token_indexers": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['indexer'],
    } + if TRAIN_THROTTLE > -1 then {"sample": TRAIN_THROTTLE} else {},
    // validation reader: same settings as the training reader, never sampled
    "validation_dataset_reader": {
        "type": "text_classification_json_with_sampling",
        "lazy": LAZY,
        "tokenizer": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['tokenizer'],
        "max_sequence_length": 512,
        "token_indexers": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['indexer'],
    },
    "train_data_path": TRAIN_DATA_PATH,
    // no validation data when early stopping is skipped
    "validation_data_path": if SKIP_EARLY_STOPPING then null else DEV_DATA_PATH,
    // test split when EVALUATE_ON_TEST is on; otherwise the dev split when
    // early stopping is skipped; otherwise null
    "test_data_path": if EVALUATE_ON_TEST then TEST_DATA_PATH else if SKIP_EARLY_STOPPING then DEV_DATA_PATH else null,
    // true exactly when EVALUATE_ON_TEST or SKIP_EARLY_STOPPING is true
    "evaluate_on_test" : if EVALUATE_ON_TEST then EVALUATE_ON_TEST else if SKIP_EARLY_STOPPING then SKIP_EARLY_STOPPING else false,
    "model": {
        "type": "basic_classifier_with_f1",
        "text_field_embedder": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['embedder'],
        "seq2vec_encoder": CLS_FIELDS(ROBERTA_EMBEDDING_DIM),
        "feedforward_layer": {
            "input_dim": ENCODER_OUTPUT_DIM,
            // hidden size is the embedding dimension scaled by the width multiplier
            "hidden_dims": ROBERTA_EMBEDDING_DIM * FEEDFORWARD_WIDTH_MULTIPLIER,
            "num_layers": NUM_FEEDFORWARD_LAYERS,
            "activations": "tanh"
        },
        "dropout": DROPOUT
    },
    // the key named in the reported 'key "data_loader" is required' error
    "data_loader": {
        "batch_sampler": {
            "type": "bucket",
            "batch_size": BATCH_SIZE
        }
    },
    // base trainer settings; the two conditionals after the closing brace
    // merge in "type": "no_op" when SKIP_TRAINING is on and the
    // "learning_rate_scheduler" when LR_SCHEDULE is on
    "trainer": {
        "num_epochs": NUM_EPOCHS,
        "patience": PATIENCE,
        "cuda_device": std.parseInt(std.extVar("CUDA_DEVICE")),
        "validation_metric": "+f1",
        "checkpointer": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['checkpointer'],
        "optimizer": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['optimizer'],
        "num_gradient_accumulation_steps": GRAD_ACC
    } + if SKIP_TRAINING then {"type": "no_op"} else {}
      + if LR_SCHEDULE then {"learning_rate_scheduler": PRETRAINED_ROBERTA_FIELDS(ROBERTA_TRAINABLE)['scheduler']} else {}
}

For more context, the error message received when running the following command:

python -m scripts.train \
        --config training_config/classifier.jsonnet \
        --serialization_dir model_logs/citation-intent-base \
        --hyperparameters ROBERTA_CLASSIFIER_SMALL \
        --dataset citation_intent \
        --model roberta-base \
        --device 0 \
        --evaluate_on_test

is:

Traceback (most recent call last):
  File "/usr/local/envs/domains/bin/allennlp", line 8, in <module>
    sys.exit(run())
  File "/usr/local/envs/domains/lib/python3.7/site-packages/allennlp/__main__.py", line 34, in run
    main(prog="allennlp")
  File "/usr/local/envs/domains/lib/python3.7/site-packages/allennlp/commands/__init__.py", line 118, in main
    args.func(args)
  File "/usr/local/envs/domains/lib/python3.7/site-packages/allennlp/commands/train.py", line 119, in train_model_from_args
    file_friendly_logging=args.file_friendly_logging,
  File "/usr/local/envs/domains/lib/python3.7/site-packages/allennlp/commands/train.py", line 178, in train_model_from_file
    file_friendly_logging=file_friendly_logging,
  File "/usr/local/envs/domains/lib/python3.7/site-packages/allennlp/commands/train.py", line 226, in train_model
    training_util.create_serialization_dir(params, serialization_dir, recover, force)
  File "/usr/local/envs/domains/lib/python3.7/site-packages/allennlp/training/util.py", line 212, in create_serialization_dir
    f"Serialization directory ({serialization_dir}) already exists and is "
allennlp.common.checks.ConfigurationError: Serialization directory (model_logs/citation-intent-base) already exists and is not empty. Specify --recover to recover from an existing output folder.
Traceback (most recent call last):
  File "/usr/local/envs/domains/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/local/envs/domains/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/content/dont-stop-pretraining/scripts/train.py", line 133, in <module>
    main()
  File "/content/dont-stop-pretraining/scripts/train.py", line 129, in main
    subprocess.run(" ".join(allennlp_command), shell=True, check=True)
  File "/usr/local/envs/domains/lib/python3.7/subprocess.py", line 512, in run
    output=stdout, stderr=stderr)
subprocess.CalledProcessError: Command 'allennlp train --include-package dont_stop_pretraining training_config/classifier.jsonnet -s model_logs/citation-intent-base' returned non-zero exit status 1.

Thanks in advance!

allenai / allennlp

This problem occurs when running the script: allennlp.common.checks.ConfigurationError: key 'data_loader' is required #4925