WebNLG / GenerationEval

WebNLG+ Challenge 2020: Scripts to evaluate the RDF-to-text task with automatic metrics (BLEU, METEOR, chrF++, TER and BERT-Score)
MIT License

Small problem with eval.py #4

KnightZhang625 opened this issue 3 years ago

KnightZhang625 commented 3 years ago

Hello, maybe comment out line 54 of eval.py, "sys.argv = sys.argv[:1]", so that the command-line arguments can be parsed successfully. Thanks!
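For context, here is a minimal, self-contained sketch (hypothetical, not taken from eval.py; the -R/--reference flag is illustrative) of why that line breaks argument parsing: argparse reads sys.argv by default, so truncating the list to just the program name discards every flag passed on the command line.

import argparse
import sys

# Truncate sys.argv to the program name only, as eval.py does on line 54.
# Everything after the script name is thrown away before parsing.
sys.argv = sys.argv[:1]

parser = argparse.ArgumentParser()
parser.add_argument('-R', '--reference', help='path to the reference file')
args = parser.parse_args()

# Even if the script was invoked with -R some_file, the flag is gone:
print(args.reference)  # prints None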

pdognin commented 2 years ago

For the evaluation to work, you need to update the code in a few places. Here is the patch to apply (e.g. with git apply):

diff --git a/eval.py b/eval.py
index def929e..169214f 100644
--- a/eval.py
+++ b/eval.py
@@ -51,7 +51,7 @@ import re
 from bert_score import score
 from metrics.chrF import computeChrF
 from metrics.bleurt.bleurt import score as bleurt_score
-sys.argv = sys.argv[:1]
+# sys.argv = sys.argv[:1]
 from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
 from razdel import tokenize
 from tabulate import tabulate
@@ -68,7 +68,7 @@ def parse(refs_path, hyps_path, num_refs, lng='en'):
     for i in range(num_refs):
         fname = refs_path + str(i) if num_refs > 1 else refs_path
         with codecs.open(fname, 'r', 'utf-8') as f:
-            texts = f.read().split('\n')
+            texts = f.readlines() # f.read().split('\n')
             for j, text in enumerate(texts):
                 if len(references) <= j:
                     references.append([text])
@@ -85,7 +85,7 @@ def parse(refs_path, hyps_path, num_refs, lng='en'):

     # hypothesis
     with codecs.open(hyps_path, 'r', 'utf-8') as f:
-        hypothesis = f.read().split('\n')
+        hypothesis = f.readlines() # f.read().split('\n')

     # hypothesis tokenized
     hypothesis_tok = copy.copy(hypothesis)

The f.readlines() change makes sure the references and hypotheses have the same number of entries: with f.read().split('\n'), a trailing newline in a file adds a spurious empty final entry, so the two lists can end up with different lengths and METEOR will just fail on the mismatch.
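As a quick illustration of the difference (plain Python, independent of the repo): read().split('\n') yields a trailing empty entry whenever the file ends with a newline, while readlines() does not.

import io

text = 'first reference\nsecond reference\n'  # file content ending in a newline

print(text.split('\n'))               # ['first reference', 'second reference', ''] -- spurious empty entry
print(io.StringIO(text).readlines())  # ['first reference\n', 'second reference\n'] -- two entries, as expected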