Reference implementation of code generation projects from Facebook AI Research. General toolkit to apply machine learning to code, from dataset creation to model training and evaluation. Comes with pretrained models.
MIT License
710
stars
144
forks
source link
cannot import name 'tokenize_v14_international' from 'sacrebleu' #71
I am trying to run the preprocessing.py file with the following arguments:
run codegen_sources/preprocessing/new_preprocess.py data/test_dataset --mode obfuscation --langs python --mode obfuscation --train_splits 7 --job_mem 250 --tokenization_timeout 400 --bpe_timeout 220 --train_bpe_timeout 400 --bpe_mode fast --fastbpe_use_vocab False --fastbpe_vocab_path CodeGen/data/newtest_dataset --keep_comments False --fastbpe_code_path C:/Users/sushantk/Anaconda3/CodeGen/codegen_sources/model/tools/fastBPE --ncodes 40000 --percent_test_valid 2
and I am getting the following error
`ImportError Traceback (most recent call last)
~\Anaconda3\CodeGen\codegen_sources\preprocessing\new_preprocess.py in <module>
13 from codegen_sources.preprocessing.bpe_modes.fast_bpe_mode import FastBPEMode
14 from codegen_sources.preprocessing.bpe_modes.roberta_bpe_mode import RobertaBPEMode
---> 15 from codegen_sources.preprocessing.dataset_modes.monolingual_functions_mode import (
16 MonolingualFunctionsMode,
17 )
~\Anaconda3\CodeGen\codegen_sources\preprocessing\dataset_modes\monolingual_functions_mode.py in <module>
12
13 import submitit
---> 14 from codegen_sources.preprocessing.dataset_modes.dataset_mode import DatasetMode
15 from codegen_sources.preprocessing.lang_processors.lang_processor import LangProcessor
16 from codegen_sources.preprocessing.obfuscation.utils_deobfuscation import REPLACE_DICT
~\Anaconda3\CodeGen\codegen_sources\preprocessing\dataset_modes\dataset_mode.py in <module>
27 from codegen_sources.preprocessing.bpe_modes.bpe_mode import BPEMode
28 from codegen_sources.preprocessing.obfuscation.utils_deobfuscation import SEPARATOR
---> 29 from codegen_sources.preprocessing.lang_processors.cpp_processor import CppProcessor
30 from codegen_sources.preprocessing.lang_processors.java_processor import JavaProcessor
31 from codegen_sources.preprocessing.lang_processors.python_processor import (
~\Anaconda3\CodeGen\codegen_sources\preprocessing\lang_processors\cpp_processor.py in <module>
5 # LICENSE file in the root directory of this source tree.
6 #
----> 7 from codegen_sources.preprocessing.lang_processors.tree_sitter_processor import (
8 TreeSitterLangProcessor,
9 NEW_LINE,
~\Anaconda3\CodeGen\codegen_sources\preprocessing\lang_processors\tree_sitter_processor.py in <module>
6 #
7 from codegen_sources.preprocessing.lang_processors.lang_processor import LangProcessor
----> 8 from codegen_sources.preprocessing.lang_processors.tokenization_utils import (
9 process_string,
10 replace_tokens,
~\Anaconda3\CodeGen\codegen_sources\preprocessing\lang_processors\tokenization_utils.py in <module>
6 #
7 import re
----> 8 from sacrebleu import tokenize_v14_international
9
10 # IMPORTED
ImportError: cannot import name 'tokenize_v14_international' from 'sacrebleu' (C:\Users\sushantk\Anaconda3\lib\site-packages\sacrebleu\__init__.py)
`
I am trying to run the preprocessing.py file with the following arguments:
run codegen_sources/preprocessing/new_preprocess.py data/test_dataset --mode obfuscation --langs python --mode obfuscation --train_splits 7 --job_mem 250 --tokenization_timeout 400 --bpe_timeout 220 --train_bpe_timeout 400 --bpe_mode fast --fastbpe_use_vocab False --fastbpe_vocab_path CodeGen/data/newtest_dataset --keep_comments False --fastbpe_code_path C:/Users/sushantk/Anaconda3/CodeGen/codegen_sources/model/tools/fastBPE --ncodes 40000 --percent_test_valid 2
and I am getting the following error: `ImportError Traceback (most recent call last) ~\Anaconda3\CodeGen\codegen_sources\preprocessing\new_preprocess.py in <module>
13 from codegen_sources.preprocessing.bpe_modes.fast_bpe_mode import FastBPEMode
14 from codegen_sources.preprocessing.bpe_modes.roberta_bpe_mode import RobertaBPEMode
---> 15 from codegen_sources.preprocessing.dataset_modes.monolingual_functions_mode import (
16 MonolingualFunctionsMode,
17 )
~\Anaconda3\CodeGen\codegen_sources\preprocessing\dataset_modes\monolingual_functions_mode.py in
12
13 import submitit
---> 14 from codegen_sources.preprocessing.dataset_modes.dataset_mode import DatasetMode
15 from codegen_sources.preprocessing.lang_processors.lang_processor import LangProcessor
16 from codegen_sources.preprocessing.obfuscation.utils_deobfuscation import REPLACE_DICT
~\Anaconda3\CodeGen\codegen_sources\preprocessing\dataset_modes\dataset_mode.py in
27 from codegen_sources.preprocessing.bpe_modes.bpe_mode import BPEMode
28 from codegen_sources.preprocessing.obfuscation.utils_deobfuscation import SEPARATOR
---> 29 from codegen_sources.preprocessing.lang_processors.cpp_processor import CppProcessor
30 from codegen_sources.preprocessing.lang_processors.java_processor import JavaProcessor
31 from codegen_sources.preprocessing.lang_processors.python_processor import (
~\Anaconda3\CodeGen\codegen_sources\preprocessing\lang_processors\cpp_processor.py in
5 # LICENSE file in the root directory of this source tree.
6 #
----> 7 from codegen_sources.preprocessing.lang_processors.tree_sitter_processor import (
8 TreeSitterLangProcessor,
9 NEW_LINE,
~\Anaconda3\CodeGen\codegen_sources\preprocessing\lang_processors\tree_sitter_processor.py in
6 #
7 from codegen_sources.preprocessing.lang_processors.lang_processor import LangProcessor
----> 8 from codegen_sources.preprocessing.lang_processors.tokenization_utils import (
9 process_string,
10 replace_tokens,
~\Anaconda3\CodeGen\codegen_sources\preprocessing\lang_processors\tokenization_utils.py in
6 #
7 import re
----> 8 from sacrebleu import tokenize_v14_international
9
10 # IMPORTED
ImportError: cannot import name 'tokenize_v14_international' from 'sacrebleu' (C:\Users\sushantk\Anaconda3\lib\site-packages\sacrebleu\__init__.py) `