bigcode-project / bigcode-evaluation-harness

A framework for the evaluation of autoregressive code generation language models.
Apache License 2.0

Recode: perturbed human-eval #127

Closed RaymondLi0 closed 1 year ago

RaymondLi0 commented 1 year ago

Tested with CodeGen-2B-mono and CodeGen-16B-mono. Results are within 3% of the paper's results (Table 13), obtained with --tasks perturbed-humaneval-func_name-num_seeds_5 --max_length_generation 1024 --n_samples 1 --do_sample False --batch_size 1. The two dumps below are the 2B and 16B results, respectively.
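For reference, a small script along these lines can diff such a metrics dump against the paper's Table 13 numbers. This is a hypothetical helper, not part of the harness: the metrics-file path is whatever JSON file the run wrote, and the reference values have to be filled in from the ReCode paper.

import json

# Hypothetical helper (not part of the harness): compare a metrics dump
# like the ones below against reference numbers from the ReCode paper.
TASK = "perturbed-humaneval-func_name-num_seeds_5"
REFERENCE = {
    # e.g. "FuncRenameSynonymSub": <value from Table 13>,
}

# Path of the JSON file written by the evaluation run (adjust as needed).
with open("evaluation_results.json") as f:
    results = json.load(f)[TASK]

for perturbation, metrics in results.items():
    ours = metrics["robust-pass-at-1"]
    ref = REFERENCE.get(perturbation)
    line = f"{perturbation}: ours={ours:.4f}"
    if ref is not None:
        line += f"  paper={ref:.4f}  |diff|={abs(ours - ref):.4f}"
    print(line)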

"perturbed-humaneval-func_name-num_seeds_5": {
    "FuncRenameInflectionalVariation": {
      "robust-pass-at-1": 0.21951219512195122
    },
    "FuncRenameSynonymSub": {
      "robust-pass-at-1": 0.22560975609756098
    },
    "FuncRenameSwapChar": {
      "robust-pass-at-1": 0.25609756097560976
    },
    "FuncRenameButterFinger": {
      "robust-pass-at-1": 0.21341463414634146
    },
    "FuncRenameChangeChar": {
      "robust-pass-at-1": 0.21341463414634146
    },
    "FuncRenameCamelCase": {
      "robust-pass-at-1": 0.25
    }
  },
  "config": {
    "model": "Salesforce/codegen-2B-mono",
    "revision": null,
    "temperature": 0.2,
    "n_samples": 1
  }
}

{
  "perturbed-humaneval-func_name-num_seeds_5": {
    "FuncRenameInflectionalVariation": {
      "robust-pass-at-1": 0.27439024390243905
    },
    "FuncRenameSynonymSub": {
      "robust-pass-at-1": 0.2682926829268293
    },
    "FuncRenameSwapChar": {
      "robust-pass-at-1": 0.27439024390243905
    },
    "FuncRenameButterFinger": {
      "robust-pass-at-1": 0.27439024390243905
    },
    "FuncRenameChangeChar": {
      "robust-pass-at-1": 0.23780487804878048
    },
    "FuncRenameCamelCase": {
      "robust-pass-at-1": 0.27439024390243905
    }
  },
  "config": {
    "model": "Salesforce/codegen-16B-mono",
    "revision": null,
    "temperature": 0.2,
    "n_samples": 1
  }
}
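For context on the metric: robust-pass-at-1 is ReCode's worst-case measure, and with greedy decoding (n_samples=1) my understanding is that a problem only counts as solved if the single completion passes the tests under every one of the 5 perturbation seeds. A rough sketch of that aggregation (not the harness implementation; the function and data layout are made up):

from typing import Dict, List

def robust_pass_at_1(passed: Dict[str, List[bool]]) -> float:
    """Worst-case pass@1 over perturbation seeds.

    `passed` maps each problem id to one boolean per perturbation seed,
    saying whether the single greedy completion for that perturbed prompt
    passed the unit tests. A problem is robustly solved only if it passed
    under every seed.
    """
    solved = sum(all(seed_results) for seed_results in passed.values())
    return solved / len(passed)

# Toy example: 2 problems, 5 seeds each -> robust pass@1 = 0.5
print(robust_pass_at_1({
    "HumanEval/0": [True, True, True, True, True],
    "HumanEval/1": [True, False, True, True, True],
}))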
loubnabnl commented 1 year ago

Thanks! Feel free to merge.

EDIT: Merging, because I need unstripped HumanEval for some evals haha