ytgui / temp

learning nlp #108

Closed by ytgui 4 years ago

ytgui commented 4 years ago

https://www.jianshu.com/p/e59dba1fc4b0
https://www.jianshu.com/p/160c4800b9b5
https://v2ex.com/t/642320#reply20
https://www.jianshu.com/p/0d7b5c226f39
https://github.com/graykode/nlp-tutorial
https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
https://blog.csdn.net/tMb8Z9Vdm66wH68VX1/article/details/80174586
https://dumps.wikimedia.org/zhwiki/
https://github.com/OYE93/Chinese-NLP-Corpus
https://github.com/crownpku/awesome-chinese-nlp
https://wakespace.lib.wfu.edu/handle/10339/39379
https://github.com/niderhoff/nlp-datasets
https://www.ldc.upenn.edu/collaborations/current-projects/bolt/annotation/treebank
https://github.com/propbank

ytgui commented 4 years ago

examples/word_language_model

data loader and corpus


import os

import torch


class Dictionary(object):
    """Two-way mapping between words and integer ids."""
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

class Corpus(object):
    """Builds a Dictionary and tokenizes the train/valid/test splits."""
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r', encoding="utf8") as f:
            idss = []
            for line in f:
                words = line.split() + ['<eos>']
                ids = []
                for word in words:
                    ids.append(self.dictionary.word2idx[word])
                idss.append(torch.tensor(ids).type(torch.int64))
            ids = torch.cat(idss)

        return ids
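
A minimal usage sketch for reference (the `./data/wikitext-2` path is an assumption; any directory with train.txt / valid.txt / test.txt works):

corpus = Corpus('./data/wikitext-2')
print(len(corpus.dictionary))   # vocabulary size
print(corpus.train.shape)       # 1-D int64 tensor of token ids, lines joined with <eos>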

data format: sequence-first, batch-second

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    # note: view + transpose lays the data out sequence-first, batch-second
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)
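
A quick toy check of the layout (hypothetical values; the example script defines `device` elsewhere, so it is set to CPU here):

device = torch.device('cpu')
data = torch.arange(10)          # 10 tokens with ids 0..9
batched = batchify(data, 2)      # shape (5, 2): 5 time steps, 2 columns
# column 0 is [0, 1, 2, 3, 4], column 1 is [5, 6, 7, 8, 9]
# each column is one contiguous token stream; rows index time steps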

training and hidden_state

def get_batch(source, i):
    # data: (seq_len, batch); target is the same slice shifted by one token
    # and flattened to (seq_len * batch) for the loss
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
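
A rough sketch of how these pieces fit into the training loop (simplified; `model`, `criterion`, `args`, and `ntokens` are assumed to be set up as in the example's main.py, and the manual SGD update mirrors the example):

def train_one_epoch(model, train_data, criterion, lr, ntokens):
    model.train()
    hidden = model.init_hidden(train_data.size(1))        # batch size = second dim
    for i in range(0, train_data.size(0) - 1, args.bptt):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)                  # cut the graph between batches
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)                 # plain SGD step, as in the example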