timoschick / form-context-model

This repository contains the code for the Form-Context Model and its Attentive Mimicking variant.
Apache License 2.0
31 stars 3 forks source link

Name Typing Task (4.3 in Attentive Mimicking Paper) #9

Open rajicon opened 3 years ago

rajicon commented 3 years ago

I was trying to replicate the Name Typing Task, and I was wondering if you had a script for it. If not, I have some questions:

(1) Is it a multilabel problem? It seems to be, given the dataset. In that case, how are the F1 score and accuracy calculated? Is accuracy based on exact matching, and is F1 macro or micro?

(2) How many epochs was the logistic regression trained for? Also, because it is multilabel, is the output activation sigmoid?

timoschick commented 3 years ago

Hi @rajicon, you can find the script we've used for calculating the scores below. It's not exactly beautifully written but I hope it answers your questions. If it does not, let me know :)

import torch
import torch.nn as nn
import torch.optim as optim
import io
import random
import numpy as np
from embeddings import load_embeddings
from vocab import load_vocab
import torch.nn.functional as F

# Seed PyTorch's and Python's `random` generators with fixed values so that
# repeated runs draw the same pseudo-random numbers from these two sources
# (e.g. the `random.shuffle` call in `train`).
torch.manual_seed(42)
random.seed(42)

class Eval:
    """Container for the results of one evaluation run.

    Stores the accuracy handed in by the caller together with the raw
    confusion counts, and derives the F1 score from those counts.

    Args:
        acc: Accuracy value; stored unchanged.
        tp: Number of true positives.
        fp: Number of false positives.
        tn: Number of true negatives.
        fn: Number of false negatives.

    Attributes:
        f1: ``2 * tp / (2 * tp + fn + fp)``, or ``0.0`` when that denominator
            is zero.
    """

    def __init__(self, acc, tp, fp, tn, fn):
        self.acc = acc
        self.tp = tp
        self.fp = fp
        self.tn = tn
        self.fn = fn

        # The denominator is zero when there is neither a positive label nor
        # a positive prediction (e.g. no test instances at all). Report 0.0
        # in that case instead of raising ZeroDivisionError.
        denominator = 2 * tp + fn + fp
        self.f1 = (2 * tp) / denominator if denominator else 0.0

class NameTypingClassifier(nn.Module):
    """A single linear layer followed by an element-wise sigmoid.

    Args:
        embedding_dim: Size of the input word vectors.
        num_classes: Number of outputs produced per word vector.
    """

    def __init__(self, embedding_dim, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_features=embedding_dim, out_features=num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, word_vectors):
        """Return the sigmoid of the linear projection of ``word_vectors``."""
        projected = self.linear(word_vectors)
        scores = self.sigmoid(projected)
        return scores

def test(model, test_data):
    """Evaluate ``model`` on ``test_data`` and return the scores as an ``Eval``.

    Every model output is thresholded at 0.5 to obtain a 0/1 prediction per
    label. Accuracy is the fraction of instances whose whole prediction
    vector equals the label vector (exact match). The tp/tn/fp/fn counts are
    pooled over all labels of all instances before ``Eval`` derives F1 from
    them.

    Args:
        model: Callable module; it is switched to eval mode and applied to
            one float32 word vector at a time.
        test_data: Sequence of instances whose element 0 is the word vector
            and element 1 is the 0/1 label vector.

    Returns:
        An ``Eval`` holding the accuracy and the pooled confusion counts.
        Accuracy is 0.0 when ``test_data`` is empty.
    """
    model.eval()

    # Count the labels that are actually present rather than assuming a
    # fixed number of labels per instance.
    num_labels = sum(len(test_inst[1]) for test_inst in test_data)

    print('Nr of test instances: {}'.format(len(test_data)))
    print('Nr of labels: {}'.format(num_labels))

    exact_matches, total = 0.0, 0.0
    tp, tn, fp, fn = 0.0, 0.0, 0.0, 0.0

    with torch.no_grad():
        for test_inst in test_data:
            word_vector = torch.from_numpy(np.array(test_inst[0], dtype=np.float32))
            label = np.asarray(test_inst[1])

            scores = model(word_vector).detach().numpy()
            prediction = np.where(scores >= 0.5, 1, 0)

            predicted_pos = prediction == 1
            label_pos = label == 1
            label_neg = label == 0

            # Plain Python floats keep the arithmetic in Eval identical to
            # the scalar counters used for accumulation.
            inst_tp = float(np.count_nonzero(predicted_pos & label_pos))
            inst_tn = float(np.count_nonzero(~predicted_pos & label_neg))
            inst_fp = float(np.count_nonzero(predicted_pos & label_neg))

            tp += inst_tp
            tn += inst_tn
            fp += inst_fp
            # Everything that is not tp, tn or fp counts as a false negative.
            fn += prediction.size - inst_tp - inst_tn - inst_fp

            if np.array_equal(prediction, label):
                exact_matches += 1

            total += 1

    # Avoid dividing by zero when no instance was evaluated.
    accuracy = exact_matches / total if total else 0.0
    return Eval(acc=accuracy, tp=tp, fp=fp, tn=tn, fn=fn)

def load_data(path, label_dict, embeddings, embedding_dim, skip_unk=True,
              min_freq=-1, max_freq=-1, vocab=None, averaging_embeddings=None):
    """Read a tab-separated file into ``(word_vector, label_vector)`` pairs.

    Every non-blank line must have the form ``word<TAB>label label ...``;
    blank lines are skipped.

    Args:
        path: File to read (UTF-8).
        label_dict: Mapping from label name to its index in the label vector.
        embeddings: Mapping from word to vector; must support ``in`` and
            ``[]``.
        embedding_dim: Length of the zero vector used for words without an
            embedding.
        skip_unk: If true, words missing from ``embeddings`` are dropped;
            otherwise they get a zero vector (or their averaging embedding,
            see below).
        min_freq: With a non-empty ``vocab``, drop words whose count is below
            this value.
        max_freq: With a non-empty ``vocab`` and ``max_freq >= 0``, drop
            words whose count is above this value.
        vocab: Optional mapping from word to count. Words missing from it
            count as 0.
        averaging_embeddings: Optional second word-to-vector mapping. When a
            word is in both mappings the result is
            ``w * embeddings[word] + (1 - w) * averaging_embeddings[word]``
            with ``w = 1 - min(32, count) / 32``; when it is in only one of
            them that vector is used unchanged.

    Returns:
        A list of ``(word_vector, label_vector)`` tuples, where
        ``label_vector`` is a NumPy array with a 1 at every index listed for
        the word and 0 elsewhere.

    Raises:
        KeyError: If a line names a label that is not in ``label_dict``.
        ValueError: If a non-blank line contains no tab.
    """
    # Count at (and above) which the weight of `embeddings` reaches zero.
    max_averaging_count = 32

    data = []
    num_labels = len(label_dict)

    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f.read().splitlines():
            # A blank (e.g. trailing) line would otherwise break the unpacking below.
            if not line.strip():
                continue

            word, labels = line.split('\t', 1)

            label_ids = {label_dict[label] for label in labels.split()}
            label_vector = np.array([1 if i in label_ids else 0 for i in range(num_labels)])

            # Without a vocabulary every word counts as unseen; this also
            # keeps the averaging branch below from calling .get on None.
            word_count = vocab.get(word, 0) if vocab else 0

            if vocab and word_count < min_freq:
                continue

            if vocab and word_count > max_freq >= 0:
                continue

            has_embedding = word in embeddings
            if not has_embedding and skip_unk:
                continue

            word_vector = embeddings[word] if has_embedding else np.zeros(embedding_dim)

            if averaging_embeddings:
                # Weight of `embeddings` falls linearly from 1 (count 0) to 0
                # (count >= max_averaging_count).
                w = 1 - min(max_averaging_count, word_count) / max_averaging_count

                has_averaging = word in averaging_embeddings
                if has_embedding and has_averaging:
                    word_vector = w * embeddings[word] + (1 - w) * averaging_embeddings[word]
                elif has_embedding:
                    word_vector = embeddings[word]
                elif has_averaging:
                    word_vector = averaging_embeddings[word]
                else:
                    word_vector = np.zeros(embedding_dim)

            data.append((word_vector, label_vector))

    print('Sucessfully loaded data from {}'.format(path))
    return data

def train(model, train_data, num_epochs=5, batch_size=32, print_every_n_steps=100):
    """Train ``model`` on ``train_data`` with binary cross-entropy and Adam.

    The examples are reshuffled before every epoch and processed in
    consecutive slices of ``batch_size``. The mean loss of the last
    ``print_every_n_steps`` steps is printed periodically; the step counter
    restarts with every epoch.

    Args:
        model: Module mapping a float32 batch of word vectors to values in
            [0, 1], as required by ``nn.BCELoss``. Updated in place.
        train_data: Sequence of ``(word_vector, label_vector)`` pairs. It is
            not modified; shuffling happens on a private copy.
        num_epochs: Number of passes over the data.
        batch_size: Number of examples per optimisation step.
        print_every_n_steps: Number of steps between two loss printouts.
    """
    loss_function = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Shuffle a shallow copy so the caller's list keeps its order. The copy
    # is shuffled in place across epochs, so the sequence of permutations is
    # the same as when shuffling the original list directly.
    examples = list(train_data)

    # An earlier evaluation may have left the model in eval mode.
    model.train()

    for epoch in range(num_epochs):

        random.shuffle(examples)

        steps = 0
        loss_sum = 0

        for start in range(0, len(examples), batch_size):
            optimizer.zero_grad()

            batch = examples[start:start + batch_size]
            word_vectors = torch.from_numpy(
                np.array([word_vector for (word_vector, _) in batch], dtype=np.float32))
            labels = torch.from_numpy(
                np.array([label_vector for (_, label_vector) in batch], dtype=np.float32))

            probabilities = model(word_vectors)
            loss = loss_function(probabilities, labels)
            loss_sum += loss.item()

            steps += 1
            if steps % print_every_n_steps == 0:
                print('current loss = ' + str(loss_sum / print_every_n_steps))
                loss_sum = 0

            loss.backward()
            optimizer.step()

        print('done with epoch ' + str(epoch))

def load_label_dict(path):
    """Read label names from ``path`` and map each to a consecutive index.

    The label is the first whitespace-separated token of a line; anything
    after it is ignored. Blank lines are skipped. A label that occurs more
    than once keeps the index of its first occurrence, so the indices always
    cover ``0 .. len(label_dict) - 1`` without gaps.

    Args:
        path: File to read (UTF-8).

    Returns:
        A dict mapping label name to its integer index.
    """
    label_dict = {}

    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f.read().splitlines():
            fields = line.split()
            if not fields:
                # Blank line: there is no first token to use as a label.
                continue
            # The next free index equals the number of labels seen so far.
            label_dict.setdefault(fields[0], len(label_dict))

    return label_dict

if __name__ == '__main__':
    # Script entry point: train one classifier on the base embeddings, then
    # evaluate it once per embedding file and per word-frequency bucket.
    import os

    base_dir = '<X>'  # TODO: replace this with a directory containing all required files

    embedding_dim = 300

    # Build paths with os.path.join so they also work outside Windows.
    dataset_dir = os.path.join(base_dir, 'dataset')
    train_data_path = os.path.join(dataset_dir, 'train.tsv')
    test_data_path = os.path.join(dataset_dir, 'test.tsv')

    train_embeddings = load_embeddings(os.path.join(base_dir, 'name-typing-base.model'))
    label_dict = load_label_dict(os.path.join(dataset_dir, 'types.tsv'))
    vocab = load_vocab(os.path.join(base_dir, 'vocab-all.txt'))

    test_embedding_files = ['<X>']  # TODO: replace this with the files containing embeddings you want to test

    train_data = load_data(train_data_path, label_dict, train_embeddings, embedding_dim)

    model = NameTypingClassifier(embedding_dim, num_classes=len(label_dict))
    train(model, train_data)

    lines = []
    test_embeddings_dict = {
        emb_file: load_embeddings(os.path.join(base_dir, emb_file))
        for emb_file in test_embedding_files
    }

    # (min_freq, max_freq) buckets; load_data treats both bounds as inclusive.
    freq_buckets = [(1, 1), (2, 3), (4, 7), (8, 15), (16, 31), (32, 63), (64, 100), (1, 100)]

    for min_freq, max_freq in freq_buckets:

        # Closing bracket is ']' because max_freq itself is still included.
        lines.append('freq = [' + str(min_freq) + ", " + str(max_freq) + ']: ' + '\n----------------')

        for emb_file in test_embedding_files:
            test_embeddings = test_embeddings_dict[emb_file]
            test_data = load_data(test_data_path, label_dict, test_embeddings, embedding_dim,
                                  skip_unk=False, min_freq=min_freq, max_freq=max_freq, vocab=vocab)
            # Named `result` rather than `eval` to avoid shadowing the builtin.
            result = test(model, test_data)
            lines.append(' {} ({} words): \t acc = {} \t f1 = {}'.format(
                emb_file, len(test_data), result.acc, result.f1))

        lines.append('----------------\n')

    for line in lines:
        print(line)