!pip install mxnet
!pip install gluonnlp==0.8.0
!pip install tqdm pandas
!pip install torch
!pip install sentencepiece
!pip install transformers
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
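# Quick sanity check (a minimal sketch, not part of the original post; the sample
# sentence is only an illustration): confirm the tokenizer, model and vocab loaded.
print(tokenizer.tokenize('한국어 모델을 테스트합니다.'))
print(tokenizer.encode('한국어 모델을 테스트합니다.'))
print(vocab.cls_token, vocab.sep_token, vocab[vocab.padding_token])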
class BERTSentenceTransform:
    r"""BERT style data transformation.
Parameters
----------
tokenizer : BERTTokenizer.
Tokenizer for the sentences.
max_seq_length : int.
Maximum sequence length of the sentences.
pad : bool, default True
Whether to pad the sentences to maximum length.
pair : bool, default True
Whether to transform sentences or sentence pairs.
"""
def __init__(self, tokenizer, max_seq_length,vocab, pad=True, pair=True):
self._tokenizer = tokenizer
self._max_seq_length = max_seq_length
self._pad = pad
self._pair = pair
self._vocab = vocab
def __call__(self, line):
"""Perform transformation for sequence pairs or single sequences.
The transformation is processed in the following steps:
- tokenize the input sequences
- insert [CLS], [SEP] as necessary
- generate type ids to indicate whether a token belongs to the first
sequence or the second sequence.
- generate valid length
For sequence pairs, the input is a tuple of 2 strings:
text_a, text_b.
Inputs:
text_a: 'is this jacksonville ?'
text_b: 'no it is not'
Tokenization:
text_a: 'is this jack ##son ##ville ?'
text_b: 'no it is not .'
Processed:
tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
valid_length: 14
For single sequences, the input is a tuple of single string:
text_a.
Inputs:
text_a: 'the dog is hairy .'
Tokenization:
text_a: 'the dog is hairy .'
Processed:
text_a: '[CLS] the dog is hairy . [SEP]'
type_ids: 0 0 0 0 0 0 0
valid_length: 7
Parameters
----------
line: tuple of str
Input strings. For sequence pairs, the input is a tuple of 2 strings:
(text_a, text_b). For single sequences, the input is a tuple of single
string: (text_a,).
Returns
-------
np.array: input token ids in 'int32', shape (batch_size, seq_length)
np.array: valid length in 'int32', shape (batch_size,)
np.array: input token type ids in 'int32', shape (batch_size, seq_length)
"""
# convert to unicode
text_a = line[0]
if self._pair:
assert len(line) == 2
text_b = line[1]
tokens_a = self._tokenizer.tokenize(text_a)
tokens_b = None
        if self._pair:
            # use .tokenize() here as well: calling the HF tokenizer object directly
            # returns an encoding dict rather than the token list expected below
            tokens_b = self._tokenizer.tokenize(text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
self._truncate_seq_pair(tokens_a, tokens_b,
self._max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > self._max_seq_length - 2:
tokens_a = tokens_a[0:(self._max_seq_length - 2)]
# The embedding vectors for `type=0` and `type=1` were learned during
# pre-training and are added to the wordpiece embedding vector
# (and position vector). This is not *strictly* necessary since
# the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
#vocab = self._tokenizer.vocab
vocab = self._vocab
tokens = []
tokens.append(vocab.cls_token)
tokens.extend(tokens_a)
tokens.append(vocab.sep_token)
segment_ids = [0] * len(tokens)
if tokens_b:
tokens.extend(tokens_b)
tokens.append(vocab.sep_token)
segment_ids.extend([1] * (len(tokens) - len(segment_ids)))
input_ids = self._tokenizer.convert_tokens_to_ids(tokens)
# The valid length of sentences. Only real tokens are attended to.
valid_length = len(input_ids)
if self._pad:
# Zero-pad up to the sequence length.
padding_length = self._max_seq_length - valid_length
# use padding tokens for the rest
input_ids.extend([vocab[vocab.padding_token]] * padding_length)
segment_ids.extend([0] * padding_length)
return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
np.array(segment_ids, dtype='int32')
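    # NOTE: __call__ above calls self._truncate_seq_pair, which is not included in
    # this snippet. If you use pair=True you also need that helper; a minimal sketch
    # of the standard BERT pair-truncation logic (an assumption, not from the
    # original post) is:
    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a token pair in place so the combined length fits max_length."""
        while len(tokens_a) + len(tokens_b) > max_length:
            # drop tokens from the end of whichever sequence is currently longer
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()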
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
        # use the tokenizer that was passed in rather than the global one
        transform = BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
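# Usage sketch (an assumption, not part of the original post): wrap (sentence, label)
# rows in BERTDataset and feed them to a DataLoader. The sample rows, max_len and
# batch_size below are only illustrative.
sample_rows = [['이 영화 정말 재미있어요', '1'], ['시간 낭비였습니다', '0']]
train_dataset = BERTDataset(sample_rows, 0, 1, tokenizer, vocab, max_len=64, pad=True, pair=False)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)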
class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size=768, num_classes=6, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)
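# Training-setup sketch (an assumption, not part of the original post): the
# hyperparameters are only illustrative; train_loader is the loader built above.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# AdamW without weight decay on bias/LayerNorm parameters
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

num_epochs = 1  # illustrative
t_total = len(train_loader) * num_epochs
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total * 0.1), num_training_steps=t_total)

for epoch in range(num_epochs):
    model.train()
    for token_ids, valid_length, segment_ids, label in train_loader:
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()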
Give this a try.
@kibeomi I tried this, but it still doesn't work ㅜㅜ I searched and tried a ton of things on Google... Does this actually run for you on Colab?
from kobert import get_pytorch_kobert_model -> this is the line that fails ㅜㅜ Thanks for checking it anyway.
Since kobert won't import, install the tokenizer package with
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
and then use
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel
instead. That should work.
🐛 Bug
No module named 'kobert'
No module named 'gluonnlp'
Colab: the code no longer runs after the Colab update. This is code that previously worked without any problems.
To Reproduce
I followed the workaround from the earlier issue, but it still does not work.
Expected behavior
The errors are an import gluonnlp error and an import kobert error.
Environment
Colab -> I tried Python 3.6, 3.7, and 3.8.
Additional context
If anyone has gotten this to run on Colab after May 8th, could you please share how you install and import everything? It's really frustrating that code which used to work suddenly doesn't...

The error is still occurring on Colab right now. Could I get help based on the current environment? I trained a model on Colab through the end of April, and when I came back to it after other work the import errors started appearing. If anyone has it working, could you let me know how? I would really like to understand why this is happening.