Open 1337-Pete opened 4 years ago
I've built a multi-class classification model using the IMDB example linked below. I've added an extra label for "neutral" classification and have had some success running my model. However, I randomly encounter this error from time to time and find myself unable to test on new amounts of training and testing data. I have no idea why this error is being thrown for predictions. Any thoughts?

IMDB Example
https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

ERROR
MY CODE

```python
import os
from datetime import datetime  # used to time training below

import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

import bert
from bert import optimization
from bert import run_classifier
from bert import tokenization
######################## SETTINGS ########################
# directory to store model output and checkpoints
OUTPUT_DIR = 'C:/Users/WTC/Desktop/AI/Sentiment/BERT'  #@param
# Set DO_DELETE to True to wipe OUTPUT_DIR if it already exists
DO_DELETE = False
if DO_DELETE:
    try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except:
        # Doesn't matter if the directory didn't exist
        pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
######################## DATA ########################
# Load all files from a directory into a DataFrame.
def load_directory_data(directory):
    data = {}
    data["sentence"] = []
    # data["sentiment"] = []
    for file_path in os.listdir(directory):
        with open(os.path.join(directory, file_path), "r", encoding="utf8") as f:
            data["sentence"].append(f.read())
            # data["sentiment"].append(re.match(r"\d+_(\d+)\.txt", file_path).group(1))
    return pd.DataFrame.from_dict(data)
# Merge neutral, positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    neu_df = load_directory_data(os.path.join(directory, "neu"))
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    neu_df["polarity"] = '2'
    pos_df["polarity"] = '1'
    neg_df["polarity"] = '0'
    return pd.concat([neu_df, pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
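# Assumed directory layout for the local data (one plain-text file per
# example; the subfolder names are what load_dataset expects):
#   .../data/train/neu/*.txt   .../data/train/pos/*.txt   .../data/train/neg/*.txt
#   .../data/test/neu/*.txt    .../data/test/pos/*.txt    .../data/test/neg/*.txt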
# Load the dataset files. The data already lives on disk, so the local
# directories are read directly (tf.keras.utils.get_file expects a URL
# as `origin` and is not needed here).
def download_and_load_datasets(force_download=False):
    train_df = load_dataset("C:/Users/WTC/Desktop/AI/Sentiment/twitter scrape/data/train")
    test_df = load_dataset("C:/Users/WTC/Desktop/AI/Sentiment/twitter scrape/data/test")
    return train_df, test_df
train, test = download_and_load_datasets()
# work with a subsample while experimenting
train = train.sample(1864)
test = test.sample(481)
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'polarity'
# label list - 0 is for negative, 1 for positive, 2 for neutral
label_list = ['0', '1', '2']
######################## DATA PREPROCESSING ########################
# Transform the data into BERT format:
# 1. Create InputExamples
# 2. Preprocess the data:
#    - Lowercase the text
#    - Tokenize ('sally says hi' -> ['sally', 'says', 'hi'])
#    - Break words into WordPieces ('calling' -> ['call', '##ing'])
#    - Map words to indexes using a vocab file provided by BERT
#    - Add the special 'CLS' and 'SEP' tokens
#    - Append 'index' and 'segment' tokens to each input
# text_a is the text to classify, the 'sentence' column in our dataframe.
# text_b is only used when training a model to understand sentence
# relationships (is text_b a translation of text_a, or vice versa?
# WE WILL NOT USE THIS).
# INPUT EXAMPLES (create examples for the data)
# 'guid' is a globally unique ID for bookkeeping - won't use it here
train_InputExamples = train.apply(
    lambda x: bert.run_classifier.InputExample(
        guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]),
    axis=1)
test_InputExamples = test.apply(
    lambda x: bert.run_classifier.InputExample(
        guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]),
    axis=1)
# PREPROCESSING DATA
# the uncased (all-lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            # load the vocab file and the lowercasing flag
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

# initialize the tokenizer
tokenizer = create_tokenizer_from_hub_module()
# convert our InputExamples into features that BERT can understand
MAX_SEQ_LENGTH = 128 # sequences are at most 128 tokens long
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
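# Optional sanity check (example sentence is made up): prints the WordPiece
# tokens; uncommon words come back split into '##'-prefixed pieces.
print(tokenizer.tokenize("sally says hi to the unbelievable tokenizer"))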
######################## CREATING A MODEL ########################
# Load the BERT tf hub module, then create a single new layer that will be
# trained to adapt BERT to our sentiment task.
# Fine-tuning is the strategy of starting from a mostly pre-trained model.
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Creates a classification model."""
    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    output_layer = bert_outputs["pooled_output"]
    hidden_size = output_layer.shape[-1].value
    # Create our own layer to tune for the sentiment data.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())
    with tf.variable_scope("loss"):
        # Dropout helps prevent overfitting
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
        # If we're predicting, we want the predicted labels and the probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)
        # If we're training/evaluating, compute loss between predicted and actual labels.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns `model_fn` closure for TPUEstimator."""
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
        # TRAIN and EVAL
        if not is_predicting:
            (loss, predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

            # Calculate evaluation metrics.
            # Note: f1_score, auc, recall, precision and the true/false
            # positive/negative counts assume *binary* labels; with three
            # classes, only eval_accuracy is strictly meaningful here.
            def metric_fn(label_ids, predicted_labels):
                accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
                f1_score = tf.contrib.metrics.f1_score(label_ids, predicted_labels)
                auc = tf.metrics.auc(label_ids, predicted_labels)
                recall = tf.metrics.recall(label_ids, predicted_labels)
                precision = tf.metrics.precision(label_ids, predicted_labels)
                true_pos = tf.metrics.true_positives(label_ids, predicted_labels)
                true_neg = tf.metrics.true_negatives(label_ids, predicted_labels)
                false_pos = tf.metrics.false_positives(label_ids, predicted_labels)
                false_neg = tf.metrics.false_negatives(label_ids, predicted_labels)
                return {
                    "eval_accuracy": accuracy,
                    "f1_score": f1_score,
                    "auc": auc,
                    "precision": precision,
                    "recall": recall,
                    "true_positives": true_pos,
                    "true_negatives": true_neg,
                    "false_positives": false_pos,
                    "false_negatives": false_neg
                }

            eval_metrics = metric_fn(label_ids, predicted_labels)
            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  train_op=train_op)
            else:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  eval_metric_ops=eval_metrics)
        else:
            (predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Return the actual model function in the closure
    return model_fn
# Compute train and warmup steps from batch size.
# These hyperparameters are copied from this colab notebook:
# https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb
BATCH_SIZE = 16  # number of samples processed before the model is updated
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 4.0  # number of complete passes through the training dataset
# Warmup is a period of time where the learning rate is small and
# gradually increases -- this usually helps training.
WARMUP_PROPORTION = 0.1
# e.g. with the 1864-example training sample above:
# int(1864 / 16 * 4.0) = 466 train steps, int(466 * 0.1) = 46 warmup steps
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
# Model configs
SAVE_CHECKPOINTS_STEPS = 0
SAVE_SUMMARY_STEPS = 0
# Specify the output directory and the number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE})

# Create an input builder function that takes our training feature set
# (train_features) and produces a generator.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

# actually train it
print('Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

# use the test data to see how well the model did
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
estimator.evaluate(input_fn=test_input_fn, steps=None)
# Code for making predictions on new sentences:
def getPrediction(in_sentences):
    # order must match label_list: '0' = negative, '1' = positive, '2' = neutral
    labels = ["Negative", "Positive", "Neutral"]
    # label is just a dummy so convert_examples_to_features can run; it has
    # to be one of the values in label_list (strings here, not ints)
    input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label="0")
                      for x in in_sentences]
    input_features = run_classifier.convert_examples_to_features(
        input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(
        features=input_features, seq_length=MAX_SEQ_LENGTH,
        is_training=False, drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], labels[prediction['labels']])
            for sentence, prediction in zip(in_sentences, predictions)]
```
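For reference, a minimal sketch of how `getPrediction` would be invoked (the sample sentences are made up):

```python
# Hypothetical smoke test; the sentences are invented, not from the dataset.
pred_sentences = [
    "That movie was absolutely awful",
    "The acting was a bit lacking",
    "I loved it, best film of the year",
]
for sentence, log_probs, label in getPrediction(pred_sentences):
    # log_probs holds log-softmax scores, one per class in label_list
    print(label, "|", sentence)
```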
I was also not able to crack this problem, but I can see that the output layer needs to be changed from log_softmax to sigmoid. There are a few articles that claim to have cracked it:

https://towardsdatascience.com/beginners-guide-to-bert-for-multi-classification-task-92f5445c2d7c
https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d
https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d

Let me know if you were able to solve it.
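For what it's worth, a minimal sketch of what that change would look like inside `create_model`'s loss scope, assuming you want independent per-class sigmoid scores rather than a softmax distribution (untested; variable names follow the code above):

```python
# Replacement for the log_softmax / one-hot cross-entropy lines inside
# `with tf.variable_scope("loss"):` -- a sketch, not a verified drop-in fix.
probs = tf.nn.sigmoid(logits)  # independent per-class probabilities
one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
per_example_loss = tf.reduce_sum(
    tf.nn.sigmoid_cross_entropy_with_logits(
        labels=one_hot_labels, logits=logits),
    axis=-1)
loss = tf.reduce_mean(per_example_loss)
# For single-label prediction keep argmax; for true multi-label output,
# threshold each class independently instead.
predicted_labels = tf.squeeze(tf.argmax(probs, axis=-1, output_type=tf.int32))
```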