Closed issue — opened by ytgui, closed 4 years ago.
class Dictionary(object):
    """Bidirectional word <-> index vocabulary mapping."""

    def __init__(self):
        # word -> integer id
        self.word2idx = {}
        # position in this list is the id for that word
        self.idx2word = []

    def add_word(self, word):
        """Register *word* if unseen; return its integer index either way."""
        try:
            return self.word2idx[word]
        except KeyError:
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
            return idx

    def __len__(self):
        # Vocabulary size.
        return len(self.idx2word)
class Corpus(object):
    """Builds a shared vocabulary and tokenized id tensors for the
    train/valid/test splits found under *path*."""

    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # First pass: register every token (plus a sentence terminator)
        # in the shared vocabulary.
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                for token in line.split() + ['<eos>']:
                    self.dictionary.add_word(token)
        # Second pass: map each line to a 1-D int64 tensor of word ids,
        # then concatenate all lines into one flat stream.
        with open(path, 'r', encoding="utf8") as f:
            pieces = [
                torch.tensor(
                    [self.dictionary.word2idx[tok]
                     for tok in line.split() + ['<eos>']]
                ).type(torch.int64)
                for line in f
            ]
        return torch.cat(pieces)
def batchify(data, bsz):
    """Reshape a flat 1-D token stream into bsz parallel columns.

    Returns a (nbatch, bsz) tensor on the module-level `device`;
    dim 0 walks time (sequence-first), dim 1 walks batch members.
    """
    # Number of full batches the stream can supply.
    nbatch = data.size(0) // bsz
    # Drop the trailing remainder so the reshape below is exact.
    trimmed = data.narrow(0, 0, nbatch * bsz)
    # Sequence-first layout: view as (bsz, nbatch) then transpose.
    batched = trimmed.view(bsz, -1).t().contiguous()
    return batched.to(device)
def get_batch(source, i):
    """Slice one (input, target) pair out of `source` starting at row i.

    The target is the input shifted forward by one step, flattened to 1-D.
    Depends on the module-level `args.bptt` for the window length.
    """
    # Clamp at the end of the stream so the final batch may be shorter.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target
def repackage_hidden(h):
    """Detach hidden state(s) from the autograd graph.

    A lone tensor is detached directly; anything iterable (e.g. the
    (h, c) pair of an LSTM) is handled recursively, tuple by tuple.
    """
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(item) for item in h)
References:
- https://www.jianshu.com/p/e59dba1fc4b0
- https://www.jianshu.com/p/160c4800b9b5
- https://v2ex.com/t/642320#reply20
- https://www.jianshu.com/p/0d7b5c226f39
- https://github.com/graykode/nlp-tutorial
- https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
- https://blog.csdn.net/tMb8Z9Vdm66wH68VX1/article/details/80174586
- https://dumps.wikimedia.org/zhwiki/
- https://github.com/OYE93/Chinese-NLP-Corpus
- https://github.com/crownpku/awesome-chinese-nlp
- https://wakespace.lib.wfu.edu/handle/10339/39379
- https://github.com/niderhoff/nlp-datasets
- https://www.ldc.upenn.edu/collaborations/current-projects/bolt/annotation/treebank
- https://github.com/propbank