fastnlp / fastNLP

fastNLP: A Modularized and Extensible NLP Framework. Currently still in incubation.
https://gitee.com/fastnlp/fastNLP
Apache License 2.0

Official example throws an error #363

Closed hengee closed 3 years ago

hengee commented 3 years ago

Ref: https://github.com/fastnlp/fastNLP/issues/275

I ran into this as well. I re-downloaded the latest fastNLP (from GitHub) and uninstalled the pip version, but that didn't help either. I am using a custom dataset; my code is below, with the output included as comments. I tried two different training modes, commented as case 1 and case 2 in my code below:

  1. Using Trainer (Docs » Quick Start » Text Classification, source: https://fastnlp.readthedocs.io/zh/latest/tutorials/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.html)
  2. Writing the training loop myself with DataSetIter (source: https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_6_datasetiter.html?highlight=model#id3)

Hope this helps in reproducing my problem. Overall, I find fastNLP well documented and the overall pipeline good, so I would like to give it a try. Thanks!

import time
import torch
from torch.optim import Adam
from fastNLP import (Vocabulary, Trainer, Tester, CrossEntropyLoss,
                     AccuracyMetric, BucketSampler, DataSetIter)
from fastNLP.io import CSVLoader, DataBundle
from fastNLP.embeddings import StaticEmbedding
from fastNLP.models import CNNText

train0 = CSVLoader().load('train0')
test0 = CSVLoader().load('test0')

train0.apply(lambda ins: ins['text'].split(), new_field_name='words')
test0.apply(lambda ins: ins['text'].split(), new_field_name='words')
# In total 1 datasets:
#   train has 548 instances.

train0.datasets['train']
"""
+---------------------------+--------+---------------------------+
| text                      | target | words                     |
+---------------------------+--------+---------------------------+
|  USER ' so much chuffe... | 0      | ['USER', "'", 'so', 'm... |
|  heard you stealing cl... | 0      | ['heard', 'you', 'stea... |
|  USER seriously if won... | 1      | ['USER', 'seriously', ... |
| ...                       | ...    | ...                       |
+---------------------------+--------+---------------------------+
"""
train_0 = train0.datasets['train']
test_0 = test0.datasets['train']

# Build the word vocabulary from the training set only; words seen only in the
# test set are registered via no_create_entry_dataset, so StaticEmbedding gives
# them their own vector only if the pretrained file contains one.
vocab = Vocabulary()
vocab.from_dataset(train_0, field_name='words', no_create_entry_dataset=[test_0])
vocab.index_dataset(train_0, field_name='words')

# The label vocabulary needs no padding or unknown tokens.
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(train_0, field_name='target', no_create_entry_dataset=[test_0])
target_vocab.index_dataset(train_0, field_name='target')

data_bundle = DataBundle()

data_bundle.set_dataset(train_0, 'train')
data_bundle.set_dataset(test_0, 'test')

data_bundle.set_vocab(vocab, 'vocab')
data_bundle.set_vocab(target_vocab, 'target_vocab')

# add_seq_len records each instance's length; the model uses it to build the
# padding mask.
data_bundle.datasets['train'].add_seq_len('words', new_field_name='seq_len')
data_bundle.datasets['test'].add_seq_len('words', new_field_name='seq_len')

# set_input marks the fields passed to the model's forward()/predict() as
# batch_x; set_target marks the fields handed to the loss and metrics as batch_y.
data_bundle.datasets['train'].set_input('words', 'seq_len')
data_bundle.datasets['test'].set_input('words', 'seq_len')

data_bundle.datasets['train'].set_target('target')

device = 0 if torch.cuda.is_available() else 'cpu'

# case 1
vocab = data_bundle.vocabs['vocab']
embedding = StaticEmbedding(vocab, model_dir_or_name='.vector_cache/glove.6B.300d.txt')
model = CNNText(embedding, len(data_bundle.get_vocab('target_vocab')), dropout=0.1)
loss = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
metric = AccuracyMetric()
trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
                  optimizer=optimizer, batch_size=5, dev_data=data_bundle.get_dataset('test'),
                  metrics=metric, device=device)
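# Note: Trainer runs a _check_code pre-check during construction, evaluating a
# few dev batches with a Tester, so the traceback below is raised before
# trainer.train() is even reached.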

"""
Found 90623 out of 375077 words in the pre-training embedding.
input fields after batch(if batch size is 2):
    words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 9587]) 
    seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 
target fields after batch(if batch size is 2):
    target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) 

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-49-1ad8aa4147cf> in <module>
     10     trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
     11                   optimizer=optimizer, batch_size=5, dev_data=data_bundle.get_dataset('test'),
---> 12                   metrics=metric, device=device)
     13     ls.append(trainer.train())
     14     del trainer

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/core/trainer.py in __init__(self, train_data, model, optimizer, loss, batch_size, sampler, drop_last, update_every, num_workers, n_epochs, print_every, dev_data, metrics, metric_key, validate_every, save_path, use_tqdm, device, callbacks, check_code_level, fp16, **kwargs)
    555             _check_code(dataset=train_data, model=self.model, losser=losser, forward_func=self._forward_func, metrics=metrics,
    556                         dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level,
--> 557                         batch_size=check_batch_size)
    558 
    559         self.train_data = train_data

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/core/trainer.py in _check_code(dataset, model, losser, metrics, forward_func, batch_size, dev_data, metric_key, check_level)
    999         tester = Tester(data=dev_data[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics,
   1000                         batch_size=batch_size, verbose=-1, use_tqdm=False)
-> 1001         evaluate_results = tester.test()
   1002         _check_eval_results(metrics=evaluate_results, metric_key=metric_key, metric_list=metrics)
   1003 

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/core/tester.py in test(self)
    182                         _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
    183                         with self.auto_cast():
--> 184                             pred_dict = self._data_forward(self._predict_func, batch_x)
    185                             if not isinstance(pred_dict, dict):
    186                                 raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} "

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/core/tester.py in _data_forward(self, func, x)
    231         x = _build_args(func, **x)
--> 232         y = self._predict_func_wrapper(**x)
    233         return y
    234 

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/models/cnn_text_classification.py in predict(self, words, seq_len)

---> 74         output = self(words, seq_len)
     75         _, predict = output[C.OUTPUT].max(dim=1)
     76         return {C.OUTPUT: predict}

~/anaconda3/envs/venv/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/models/cnn_text_classification.py in forward(self, words, seq_len)

---> 57         x = self.embed(words)  # [N,L] -> [N,L,C]
     58         if seq_len is not None:
     59             mask = seq_len_to_mask(seq_len)

~/anaconda3/envs/venv/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/embeddings/embedding.py in forward(self, words)
     71             mask = torch.bernoulli(mask).eq(1)  # the larger dropout_word is, the more positions are 1
     72             words = words.masked_fill(mask, self.unk_index)
---> 73         words = self.embed(words)
     74         return self.dropout(words)
     75 

~/anaconda3/envs/venv/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/embeddings/static_embedding.py in forward(self, words)

    332         if hasattr(self, 'words_to_words'):
--> 333             words = self.words_to_words[words]
    334         words = self.drop_word(words)
    335         words = self.embedding(words)

IndexError: too many indices for tensor of dimension 1

"""

# case 2
def train(epoch, data, devdata):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    lossfunc = torch.nn.CrossEntropyLoss()
    batch_size = 8

    # Define a batch iterator: pass in the DataSet and specify the batch_size
    # and the batching strategy: sequential (Sequential), random (Random), or
    # grouping instances of similar length into one batch (Bucket).
    train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
    train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)

    start_time = time.time()
    print("-"*5+"start training"+"-"*5)
    for i in range(epoch):
        loss_list = []
        for batch_x, batch_y in train_batch:
            optimizer.zero_grad()
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optimizer.step()
            loss_list.append(loss.item())

        # With verbose=0, Tester.test() prints nothing and only returns the
        # evaluation results; with verbose=1 it also prints the validation results.
        # After test() has run, _format_eval_results(res) renders the results
        # as a structured string.
        tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)
        res = tester_tmp.test()

        print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)), end=" ")
        print(tester_tmp._format_eval_results(res), end=" ")
        print('{:d}ms'.format(round((time.time() - start_time) * 1000)))
        loss_list.clear()

vocab = data_bundle.vocabs['vocab']
embedding = StaticEmbedding(vocab, model_dir_or_name='.vector_cache/glove.6B.300d.txt')
model = CNNText(embedding, len(data_bundle.get_vocab('target_vocab')), dropout=0.1)

train(3, data_bundle.get_dataset('train'), data_bundle.get_dataset('test'))
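# The first epoch's batch loop completes on the (indexed) training data; the
# traceback below is raised in the epoch-end Tester.test() call on the dev data.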

"""
Found 90623 out of 375077 words in the pre-training embedding.
-----start training-----
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-...> in <module>
----> 8 train(3, data_bundle.get_dataset('train'), data_bundle.get_dataset('test'))

<ipython-input-35-2386220b3070> in train(epoch, data, devdata)
     24         #在调用过Tester对象的test()函数后,调用其_format_eval_results(res)函数,结构化输出验证结果
     25         tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)
---> 26         res=tester_tmp.test()
     27 
     28         print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=" ")

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/core/tester.py in test(self)
    182                         _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
    183                         with self.auto_cast():
--> 184                             pred_dict = self._data_forward(self._predict_func, batch_x)
    185                             if not isinstance(pred_dict, dict):
    186                                 raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} "

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/core/tester.py in _data_forward(self, func, x)
    231         x = _build_args(func, **x)
--> 232         y = self._predict_func_wrapper(**x)
    233         return y
    234 

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/models/cnn_text_classification.py in predict(self, words, seq_len)

---> 74         output = self(words, seq_len)
     75         _, predict = output[C.OUTPUT].max(dim=1)
     76         return {C.OUTPUT: predict}

~/anaconda3/envs/venv/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/models/cnn_text_classification.py in forward(self, words, seq_len)

---> 57         x = self.embed(words)  # [N,L] -> [N,L,C]
     58         if seq_len is not None:
     59             mask = seq_len_to_mask(seq_len)

~/anaconda3/envs/venv/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/embeddings/embedding.py in forward(self, words)
     71             mask = torch.bernoulli(mask).eq(1)  # the larger dropout_word is, the more positions are 1
     72             words = words.masked_fill(mask, self.unk_index)
---> 73         words = self.embed(words)
     74         return self.dropout(words)
     75 

~/anaconda3/envs/venv/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

~/anaconda3/envs/venv/lib/python3.6/site-packages/fastNLP/embeddings/static_embedding.py in forward(self, words)

    332         if hasattr(self, 'words_to_words'):
--> 333             words = self.words_to_words[words]
    334         words = self.drop_word(words)
    335         words = self.embedding(words)

IndexError: too many indices for tensor of dimension 1

"""
yhcc commented 3 years ago

That is because your test dataset never calls

vocab.index_dataset(test_0, field_name='words')

to perform the indexing step.
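For completeness, here is a minimal sketch of that fix against the snippet above, reusing the vocab, target_vocab, train_0, and test_0 objects defined earlier (the second line assumes the test CSV also carries a target column):

# Index both splits with the vocabularies built from the training set, so the
# 'words' and 'target' fields hold integer ids instead of raw strings before
# they are batched into tensors.
vocab.index_dataset(train_0, test_0, field_name='words')
target_vocab.index_dataset(train_0, test_0, field_name='target')

With the test set indexed, the words_to_words lookup inside StaticEmbedding receives an integer tensor, and the IndexError in both case 1 and case 2 should disappear.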