ghchen18 / leca

Code for Lexical-Constraint-Aware Neural Machine Translation via Data Augmentation

about readme #4

Closed: Haojin-Hu closed this 3 years ago

Haojin-Hu commented 3 years ago

Hi ghchen, could you give me detailed instructions on how to run this code?

ghchen18 commented 3 years ago

Hi, Haojin,

The code is similar to the official fairseq code; you can find more tutorials and documentation in the fairseq GitHub repository at https://github.com/pytorch/fairseq/tree/v0.6.1. As the README says, you can find more details in the scripts/run.sh file.
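For reference, here is a minimal sketch of a training invocation, reconstructed from the argument dump later in this thread. The exact flag names are inferred, so treat scripts/run.sh as the authoritative version; the paths are placeholders:

```bash
# Sketch of a LECA training run. Flags are inferred from the Namespace dump
# below ('--consnmt' enables constraint-aware training, '--use-ptrnet' the
# pointer network); consult scripts/run.sh for the authors' exact command.
python train.py /path/to/processed_data \
    --arch transformer_wmt_en_de --task translation \
    --source-lang en --target-lang de \
    --consnmt --use-ptrnet \
    --optimizer adam --lr 0.0005 --lr-scheduler inverse_sqrt \
    --warmup-updates 16000 --warmup-init-lr 1e-07 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 4000 --update-freq 8 --share-all-embeddings \
    --save-dir /path/to/models
```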

One difference is that the processed source sentence can contain constraints. At test time, a source sentence after BPE looks like x1 x2 x3 ... xn &lt;sep&gt; constraint1 &lt;sep&gt; constraint2 &lt;sep&gt; constraint3 &lt;sep&gt; constraint4, where &lt;sep&gt; is a special separator token, and the trained model then translates with the given constraints. During training, the code handles this automatically if you set the '--consnmt' flag.
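For example, a hypothetical constrained en-de source line might look like this (the tokens are made up for illustration; &lt;sep&gt; is the separator token described above):

```
this is an ex@@ ample sentence <sep> Beispiel@@ satz <sep> Übersetzung
```

With the pointer network enabled ('--use-ptrnet'), the decoder can copy the appended constraint tokens into the translation.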

Thanks for your question.

Haojin-Hu commented 3 years ago

Hi ghchen, I preprocessed the en-de data just using fairseq, and I am ready to run the code with that data, but I always hit the same problem. Can you help me? Here is the full log:

```
start to train leca_ptrnet NMT model ...
train leca_ptrnet NMT on en-to-de ...
Namespace(adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, arch='transformer_wmt_en_de', attention_dropout=0.0, beam=2, bucket_cap_mb=25, clip_norm=25, consnmt=True, cpu=False, criterion='label_smoothed_cross_entropy', curriculum=0, data=['/public/home/zhchynnu/perl5/panda/nlp/leca/processed_data'], ddp_backend='c10d', decoder_attention_heads=8, decoder_embed_dim=512, decoder_embed_path=None, decoder_ffn_embed_dim=2048, decoder_input_dim=512, decoder_layers=6, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=512, decoding_path=None, device_id=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, distributed_rank=0, distributed_world_size=1, diverse_beam_groups=-1, diverse_beam_strength=0.5, dropout=0.3, encoder_attention_heads=8, encoder_embed_dim=512, encoder_embed_path=None, encoder_ffn_embed_dim=2048, encoder_layers=6, encoder_learned_pos=False, encoder_normalize_before=False, exp_name='leca_ptrnet-en2de', fix_batches_to_gpus=False, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, keep_interval_updates=-1, keep_last_epochs=80, label_smoothing=0.1, lazy_load=False, left_pad_source='False', left_pad_target='False', lenpen=1, log_format=None, log_interval=1000, lr=[0.0005], lr_scheduler='inverse_sqrt', lr_shrink=0.1, match_source_len=False, max_constraints_number=50, max_epoch=0, max_len_a=2, max_len_b=200, max_sentences=None, max_sentences_valid=None, max_source_positions=1024, max_target_positions=1024, max_tokens=4000, max_update=60000, memory_efficient_fp16=False, min_len=1, min_loss_scale=0.0001, min_lr=1e-09, model_overrides='{}', momentum=0.99, moses_multi_bleu_path=None, nbest=1, no_beamable_mm=False, no_early_stop=False, no_epoch_checkpoints=False, no_progress_bar=False, no_repeat_ngram_size=0, no_save=False, no_token_positional_embeddings=False, num_workers=0, optimizer='adam', optimizer_overrides='{}', path=None, prefix_size=0, print_alignment=False, quiet=False, raw_text=False, relu_dropout=0.0, remove_bpe=None, replace_unk=None, reset_lr_scheduler=False, reset_optimizer=False, restore_file='checkpoint_last.pt', sacrebleu=False, sampling=False, sampling_temperature=1, sampling_topk=-1, save_dir='/public/home/zhchynnu/perl5/panda/nlp/leca/models', save_interval=1, save_interval_updates=0, score_reference=False, seed=1, sentence_avg=False, share_all_embeddings=True, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=False, source_lang='en', target_lang='de', task='translation', tensorboard_logdir='/public/home/zhchynnu/perl5/panda/nlp/leca/models/tensorboard', test_seed=1, testclean=False, threshold_loss_scale=None, train_subset='train', unkpen=0, unnormalized=False, update_freq=[8], upsample_primary=1, use_ptrnet=True, user_dir=None, valid_subset='valid', validate_interval=1, validate_interval_updates=1000, warmup_init_lr=1e-07, warmup_updates=16000, weight_decay=0.0001)
N special symbols: ['', '', '', '', '']
N special symbols: ['', '', '', '', '']
| [en] dictionary: 6633 types
| [de] dictionary: 8849 types
Traceback (most recent call last):
  File "train.py", line 434, in <module>
    cli_main()
  File "train.py", line 430, in cli_main
    main(args)
  File "train.py", line 43, in main
    load_dataset_splits(task, ['train', 'valid'])
  File "train.py", line 386, in load_dataset_splits
    task.load_dataset(split, combine=True)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/tasks/translation.py", line 168, in load_dataset
    src_datasets.append(indexed_dataset(prefix + src, self.src_dict, tgt_dict=self.tgt_dict))
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/tasks/translation.py", line 145, in indexed_dataset
    return IndexedCachedDataset(path, fix_lua_indexing=True)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/data/indexed_dataset.py", line 118, in __init__
    super().__init__(path, fix_lua_indexing=fix_lua_indexing)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/data/indexed_dataset.py", line 56, in __init__
    self.read_index(path)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/data/indexed_dataset.py", line 63, in read_index
    assert magic == b'TNTIDX\x00\x00'
AssertionError
(1) Test on constraint-free test set
test on clean dataset...
Namespace(beam=5, consnmt=True, cpu=False, data=['/public/home/zhchynnu/perl5/panda/nlp/leca/processed_data'], decoding_path='/public/home/zhchynnu/perl5/panda/nlp/leca/result', diverse_beam_groups=-1, diverse_beam_strength=0.5, exp_name='wmt16-en2de', fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_source='False', left_pad_target='False', lenpen=1, log_format=None, log_interval=1000, match_source_len=False, max_len_a=2, max_len_b=200, max_sentences=20, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=1, min_loss_scale=0.0001, model_overrides="{'beam':10}", moses_multi_bleu_path=None, nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, path='/public/home/zhchynnu/perl5/panda/nlp/leca/models/checkpoint_best_bleu.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=False, source_lang='en', target_lang='de', task='translation', tensorboard_logdir='', test_seed=1, testclean=True, threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, use_ptrnet=True, user_dir=None)
N special symbols: ['', '', '', '', '']
N special symbols: ['', '', '', '', '']
| [en] dictionary: 6633 types
| [de] dictionary: 8849 types
Traceback (most recent call last):
  File "generate.py", line 225, in <module>
    cli_main()
  File "generate.py", line 221, in cli_main
    main(args)
  File "generate.py", line 38, in main
    task.load_dataset(args.gen_subset)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/tasks/translation.py", line 168, in load_dataset
    src_datasets.append(indexed_dataset(prefix + src, self.src_dict, tgt_dict=self.tgt_dict))
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/tasks/translation.py", line 145, in indexed_dataset
    return IndexedCachedDataset(path, fix_lua_indexing=True)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/data/indexed_dataset.py", line 118, in __init__
    super().__init__(path, fix_lua_indexing=fix_lua_indexing)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/data/indexed_dataset.py", line 56, in __init__
    self.read_index(path)
  File "/public/home/zhchynnu/perl5/panda/nlp/leca/fairseq/data/indexed_dataset.py", line 63, in read_index
    assert magic == b'TNTIDX\x00\x00'
AssertionError
```
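For context, the failing assertion checks the magic header of the binarized index files: the fairseq v0.6.1 code bundled in this repo (IndexedCachedDataset) expects the old TNTIDX binary format, while newer fairseq releases binarize to an mmap-based format by default, which fails this check. If the data was binarized with a newer fairseq, a likely fix is to re-run preprocessing with this repo's own preprocess.py. A minimal sketch, with placeholder paths:

```bash
# Sketch: re-binarize the BPE'd data with the preprocess.py shipped in this
# repo (fairseq v0.6.1-based) so the .idx files carry the TNTIDX header
# that IndexedCachedDataset expects. Paths are placeholders.
python preprocess.py --source-lang en --target-lang de \
    --trainpref data/train --validpref data/valid --testpref data/test \
    --destdir processed_data --joined-dictionary
```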