simplysameer333 / MachineLearning


train #5

Open simplysameer333 opened 6 years ago

simplysameer333 commented 6 years ago

{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Install packages" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#!pip3 install tensorflow==1.10\n", "#!pip3 install tensorflow-gpu==1.10.0\n", "#!pip3 install nltk\n", "#!pip3 install numpy\n", "#!pip3 install pandas\n", "#!pip3 install gensim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Import dependencies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "from os import listdir\n", "import time\n", "from nltk.corpus import wordnet\n", "import re\n", "import pickle\n", "from gensim.models.keyedvectors import KeyedVectors\n", "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "import tensorflow as tf\n", "from tensorflow.python.layers.core import Dense\n", "from tensorflow.contrib.seq2seq.python.ops import beam_search_ops" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Config & Hyper Parameters" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "base_path = '../data/'\n", "path = base_path + 'sample_5k\\'\n", "#path = base_path + 'stories\\'\n", "articles_pickle_filename = \"articles.pickle\"\n", "headlines_pickle_filename = \"headlines.pickle\"\n", "# articles_pickle_filename = \"articles_full.pickle\"\n", "# headlines_pickle_filename = \"headlines_full.pickle\"\n", "vocab_to_int_pickle_filename = \"vocab_to_int.pickle\"\n", "int_to_vocab_pickle_filename = \"int_to_vocab.pickle\"\n", "\n", "''' https://fasttext.cc/docs/en/english-vectors.html \n", " or https://www.kaggle.com/yesbutwhatdoesitmean/wikinews300d1mvec'''\n", "model_path = '../model/'\n", "model_org_filename = 'wiki-news-300d-1M.vec'\n", "model_pickle_filename = \"model.pickle\"\n", "word_embedding_matrix_filename = \"word_embedding_matrix.pickle\"\n", "checkpoint = \"./../out/best_model.ckpt\"\n", "tensorboard_logs = '../logs'\n", "\n", "# to avoid words that are used less that threshold value\n", "threshold = 2\n", "enable_gpu = False\n", "\n", "# Dimension size as per pre-trained data\n", "embedding_dim = 300\n", "max_text_length = 500\n", "max_summary_length = 20\n", "min_length = 5\n", "unk_text_limit = 100\n", "\n", "# Set the Hyperparameters\n", "epochs = 100\n", "batch_size = 10\n", "rnn_size = 256\n", "num_layers = 5\n", "learning_rate = 0.002\n", "keep_probability = 0.75\n", "beam_width = 3\n", "\n", "# Training Hyperparameters\n", "start = 0\n", "end = 4000\n", "learning_rate_decay = 0.95\n", "min_learning_rate = 0.0002\n", "display_step = 10 # Check training loss after every 10 batches\n", "stop_early = 0\n", "stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training\n", "per_epoch = 3 # Make 3 update checks per epoch\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Stopword list and Initialize Lemmatizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nltk.download('stopwords')\n", "\n", "stop_words = nltk.corpus.stopwords.words('english')\n", "lmtzr = nltk.WordNetLemmatizer().lemmatize" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Read files and load into memory" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_files(filename):\n", " # open the file as read only\n", " file = open(filename, encoding='utf-8')\n", " # read all text\n", " text = 
file.read()\n", " # close the file\n", " file.close()\n", " return text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Split a document into news article body and headlines" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def split_data(doc):\n", " # find first headlines\n", " index = doc.find('@highlight')\n", " # split into story and headlines\n", " article, headlines = doc[:index], doc[index:].split('@highlight')\n", " # strip extra white space around each highlight\n", " headlines = [h.strip() for h in headlines if len(h) > 0]\n", " return article, headlines" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clean a list of lines\n", "This section is used to remove unwanted words and return cleaned articles and headlines." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def clean_text(lines, remove_stopwords=True):\n", " \n", " cleaned = list()\n", " for line in lines:\n", " # strip source cnn office if it exists\n", " index = line.find('(CNN) -- ')\n", " if index > -1:\n", " line = line[index + len('(CNN)'):]\n", " else:\n", " index = line.find('(CNN)')\n", " if index > -1:\n", " line = line[index + len('(CNN)'):]\n", "\n", " # tokenize on white space\n", " line = line.split()\n", "\n", " # convert to lower case\n", " line = [word.lower() for word in line]\n", "\n", " # Optionally, remove stop words\n", " if remove_stopwords:\n", " line = [w for w in line if w not in stopwords]\n", "\n", " # remove punctuation from each token\n", " #line = [w.translate(table) for w in line]\n", "\n", " # remove tokens with numbers in them\n", " line = [word for word in line if word.isalpha()]\n", "\n", " # Format words and remove unwanted characters\n", " text = \" \".join(line)\n", " text = re.sub(r'https?:\/\/.[\r\n]', '', text, flags=re.MULTILINE)\n", " text = re.sub(r'\<a href', ' ', text)\n", " text = re.sub(r'&', '', text)\n", " text = re.sub(r'[\"\-;%()|+&=%.,!?:#$@\[\]/]', ' ', text)\n", " text = re.sub(r'
', ' ', text)\n", " text = re.sub(r'\'', ' ', text)\n", "\n", " # remove empty strings\n", " if len(text )> 0 :\n", " cleaned.append(text)\n", "\n", " return cleaned" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Normalization of data using Lemmatization\n", "Lemmatization is used as it returns better words choice than stemming as Lemmatization returns only valid dictionary(wordnet) words. Trade is it takes more time." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_wordnet_pos(treebank_tag):\n", " if treebank_tag.startswith('J'):\n", " return wordnet.ADJ\n", " elif treebank_tag.startswith('V'):\n", " return wordnet.VERB\n", " elif treebank_tag.startswith('N'):\n", " return wordnet.NOUN\n", " elif treebank_tag.startswith('R'):\n", " return wordnet.ADV\n", " else:\n", " return wordnet.NOUN\n", "\n", "def normalize_text(text):\n", " cleaned = list()\n", "\n", " for line in text :\n", " word_pos = nltk.pos_tag(nltk.word_tokenize(line))\n", " lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]\n", "\n", " word = [x.lower() for x in lemm_words]\n", " cleaned.append(' '.join(word))\n", "\n", " return cleaned" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load all stories in a directory\n", "This is used to load and clean the learn and test dataset. After cleaning data it returns two list cleaned articles and cleaned headlines." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_stories(location):\n", " stories = list()\n", " file_list = listdir(location)\n", " total_files = len (file_list)\n", " count = 0\n", " print (\"Total Files : {total_files}\".format(total_files= total_files))\n", " clean_articles = []\n", " clean_headlines = []\n", " for name in file_list:\n", " count = count + 1\n", " filename = location + '/' + name\n", " # load document\n", " print('Loading - {filename}, files number - {count}, out of - {total_files}'\n", " .format(filename=filename, count=count, total_files=total_files))\n", " doc = load_files(filename)\n", " # split into story and highlights\n", " article, headlines = split_data(doc)\n", " # store\n", " #stories.append({'article': article, 'headlines' : headlines})\n", "\n", " article = clean_text(article.split('\n'))\n", " article = normalize_text(article)\n", " clean_articles.append(' '.join(article))\n", " headlines = clean_text(headlines, remove_stopwords=False)\n", " headlines = normalize_text(headlines)\n", " clean_headlines.append(' '.join(headlines))\n", "\n", " return clean_articles, clean_headlines" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Main Program\n", "Start point of data cleaning, once the articles and headlines are cleaned, they dumped so that can be reused for vectorization and then running model directly. This is becasue cleaning is an expensive operation in terms of time and resources. 
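For reference, here is a quick illustrative run of the split_data / clean_text / normalize_text helpers above on a made-up CNN-style story. Note that clean_text as pasted filters with "stopwords" while the earlier cell defines "stop_words", so the two names need to be aligned (and the NLTK punkt, averaged_perceptron_tagger and wordnet data downloaded) before this runs; the story text and the expected outputs are illustrative only, not from the notebook.

    # Made-up mini story; assumes the stop_words / stopwords naming inside clean_text is aligned
    # and that the nltk punkt, averaged_perceptron_tagger and wordnet data are downloaded.
    doc = "(CNN) -- The mayor resigned on Tuesday.\n\n@highlight\n\nMayor resigns\n\n@highlight\n\nDeputy takes over"
    article, headlines = split_data(doc)
    print(clean_text(article.split('\n')))                # expected: ['mayor resigned']
    print(clean_text(headlines, remove_stopwords=False))  # expected: ['mayor resigns', 'deputy takes over']
    print(normalize_text(['mayor resigned']))             # expected: ['mayor resign'] (POS-aware lemmatization)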
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def main():\n", " start = time.perf_counter()\n", " clean_articles, clean_headlines = load_stories(path)\n", " print(\"Total Articles : {len_articles} , Total Headlines : {len_headlines}- Time Taken : {time_taken}\"\n", " .format(len_articles=len(clean_articles), len_headlines =\n", " len(clean_headlines), time_taken = (time.perf_counter()-start)/60))\n", "\n", " print (\"Serialization of articles\")\n", " # Store Articles (serialize)\n", " with open(base_path + articles_pickle_filename, 'wb') as handle:\n", " pickle.dump(clean_articles, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", "\n", " print(\"Serialization of headlines\")\n", " # Store Articles (serialize)\n", " with open(base_path + headlines_pickle_filename, 'wb') as handle:\n", " pickle.dump(clean_headlines, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", "\n", "\n", "'''-------------------------main------------------------------'''\n", "main()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load Pre-trained English word embedding\n", "\n", "This is used to load pre-trained english word embedding 'fast text' provided facebook. First it checks if pre-trained model dump already exists, if not it load model and put in it dump. Dump is created becasue it loads faster than actual word embedding model.\n", "https://fasttext.cc/docs/en/english-vectors.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_or_load_model():\n", " model_pickle = Path(model_path + model_pickle_filename)\n", " if model_pickle.exists():\n", " print(\"Loading Pre-Trained Model Pickle..... \")\n", " start = time.perf_counter()\n", " with open(model_path + model_pickle_filename, 'rb') as handle:\n", " model = pickle.load(handle)\n", " print(\"Loaded Pre-Trained Model Pickle, time taken\", ((time.perf_counter() - start) / 60))\n", " else:\n", " print(\"Loading Pre-Trained Model ..... \")\n", " start = time.perf_counter()\n", " model = KeyedVectors.load_word2vec_format(model_path + model_org_filename, binary=False)\n", " with open(base_path + model_pickle_filename, 'wb') as handle:\n", " pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", " print(\"Loaded Pre-Trained Model, time taken\", ((time.perf_counter() - start) / 60))\n", " return model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# count_words\n", "\n", "This is a utility method used to count how many times a word is used. 
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def count_words(count_dict, text):\n", " ''' Count the number of occurrences of each word in a set of text'''\n", " for sentence in text:\n", " for word in sentence.split():\n", " if word not in count_dict:\n", " count_dict[word] = 1\n", " else:\n", " count_dict[word] += 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# vectorization\n", "\n", "This is used to get word embedding for each word from pre-trained model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def vectorization(text, embeddings_index, model):\n", " for sentence in text:\n", " try:\n", " for vocab_word in sentence.split():\n", " embeddings_index[vocab_word] = model[vocab_word] \n", " # print(\"Work : {vocab_word} , vector value : {vector_value}\".\n", " #format(vocab_word=vocab_word, vector_value =vector_value))\n", " except KeyError:\n", " '''ignore'''\n", " # print(\"{vocab_word} not in vocabulary\".format(vocab_word=vocab_word))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# missing_word_ratio\n", "\n", "Find the number of words that are missing from CN, and are used more than our threshold." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def missing_word_ratio(word_counts, embeddings_index):\n", " ''' Find the number of words that are missing from CN, and are used more than our threshold.'''\n", " missing_words_count = 0\n", " missing_words = list()\n", "\n", " for word, count in word_counts.items():\n", " if word not in embeddings_index and word not in missing_words and count > threshold:\n", " missing_words_count += 1\n", " missing_words.append(word)\n", " # print(\"{word} is missing \".format(word=word))\n", "\n", " missing_ratio = round(missing_words_count / len(word_counts), 4)
100\n", " return missing_ratio, missing_words_count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# covert_vocab_to_int\n", "\n", "This is used to covert each word in training set to word vector. This is important as ML algorithm can only understand numbers. This integer representation of word is later passed encoder for word processing." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def covert_vocab_to_int(word_counts, embeddings_index):\n", " # dictionary to convert words to integers\n", " vocab_to_int = {}\n", "\n", " value = 0\n", " for word, count in word_counts.items():\n", " if count > threshold or word in embeddings_index:\n", " vocab_to_int[word] = value\n", " value += 1\n", "\n", " # Special tokens that will be added to our vocab\n", " codes = [\"\", \"\", \"\", \"\"]\n", "\n", " # Add codes to vocab\n", " for code in codes:\n", " vocab_to_int[code] = len(vocab_to_int)\n", "\n", " # Dictionary to convert integers to words\n", " int_to_vocab = {}\n", " for word, value in vocab_to_int.items():\n", " int_to_vocab[value] = word\n", "\n", " usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) 100\n", "\n", " print(\"Total number of unique words:\", len(word_counts))\n", " print(\"Number of words we will use:\", len(vocab_to_int))\n", " print(\"Percent of words we will use: {}%\".format(usage_ratio))\n", "\n", " return vocab_to_int, int_to_vocab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# create_combine_word_matrix\n", "\n", "Need to use 300 for embedding dimensions to match corpus(input data) vectors.\n", "This will return cobine matriz that would have 'embeddings_index' for from pre-trained word embedding plus \n", "random embedding generated for words missing in pre-trained word embedding." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_combine_word_matrix(vocab_to_int, embeddings_index):\n", " \n", " nb_words = len(vocab_to_int)\n", " # Create matrix with default values of zero\n", " word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)\n", " for word, i in vocab_to_int.items():\n", " if word in embeddings_index:\n", " word_embedding_matrix[i] = embeddings_index[word]\n", " else:\n", " # If word not in CN, create a random embedding for it\n", " new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))\n", " embeddings_index[word] = new_embedding\n", " word_embedding_matrix[i] = new_embedding\n", "\n", " # Check if value matches len(vocab_to_int)\n", " print(\"word_embedding_matrix length : \", len(word_embedding_matrix))\n", " return word_embedding_matrix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Finding unknown words\n", "\n", "Convert words in text to an integer. If word is not in vocab_to_int, use UNK's integer.\n", "Total the number of words and UNKs. Add EOS token to the end of texts." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def convert_to_ints(text, vocab_to_int, eos=False): \n", " ints = []\n", " word_count = 0\n", " unk_count = 0\n", " for sentence in text:\n", " sentence_ints = []\n", " for word in sentence.split():\n", " word_count += 1\n", " if word in vocab_to_int:\n", " sentence_ints.append(vocab_to_int[word])\n", " else:\n", " sentence_ints.append(vocab_to_int[\"\"])\n", " # print(\"UNK Word : \", word)\n", " unk_count += 1\n", " if eos:\n", " sentence_ints.append(vocab_to_int[\"\"])\n", " ints.append(sentence_ints)\n", "\n", " unk_percent = round(unk_count / word_count, 4) 100\n", "\n", " print(\"Total number of words : \", word_count)\n", " print(\"Total number of UNKs : \", unk_count)\n", " print(\"Percent of words that are UNK: {}%\".format(unk_percent))\n", "\n", " return ints, word_count, unk_count\n", "\n", "\n", "def create_dataFrame(text):\n", " '''Create a data frame of the sentence lengths from a text'''\n", " lengths = []\n", " for sentence in text:\n", " lengths.append(len(sentence))\n", " return pd.DataFrame(lengths, columns=['counts'])\n", "\n", "\n", "def unk_counter(sentence, vocab_to_int):\n", " '''Counts the number of time UNK appears in a sentence.'''\n", " unk_count = 0\n", " for word in sentence:\n", " if word == vocab_to_int[\"\"]:\n", " unk_count += 1\n", " return unk_count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sorting training dataset\n", "\n", "Sort the summaries and texts by the length of the texts, shortest to longest. \n", "This is required so that batch provided to tensorflow will have lesser padding as sentences would be of same size.\n", "Limit the length of summaries and texts based on the min and max ranges. This is to avoid out of range data.\n", "Remove reviews that include too many UNKs as they would not provide much of learning experience." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def sort_corplus(lengths_articles, int_rep_articles, int_rep_headlines, vocab_to_int):\n", "\n", " sorted_articles = []\n", " sorted_headlines = []\n", " #max_text_length = max_text_length\n", " #max_summary_length = max_summary_length\n", " #min_length = config.min_length\n", " #unk_text_limit = config.unk_text_limit\n", " unk_summary_limit = 0\n", "\n", " for count, words in enumerate(int_rep_articles):\n", " if (len(int_rep_articles[count]) >= min_length and len(int_rep_articles[count]) <= max_text_length\n", " and unk_counter(int_rep_headlines[count], vocab_to_int) <= unk_summary_limit and\n", " unk_counter(int_rep_articles[count], vocab_to_int) <= unk_text_limit):\n", " sorted_headlines.append(int_rep_headlines[count])\n", " sorted_articles.append(int_rep_articles[count])\n", "\n", " # Compare lengths to ensure they match\n", " print(len(sorted_headlines))\n", " print(len(sorted_articles))\n", "\n", " return sorted_articles, sorted_headlines" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create input for Tensorflow graph\n", "\n", "For using tensorflow we need to provide below input paramters and create_input_for_graph() is used to generate these variables.\n", "\n", "clean_articles -> articles after removing impurities\n", "\n", "sorted_articles -> articles sorted as the thr length\n", "\n", "sorted_headline -> headlines (sorted as per article length) as the thr length\n", "\n", "vocab_to_int -> interger values of all vocab words\n", "\n", "word_embedding_matrix -> 300 dim matrix for each word in vocab" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def create_input_for_graph():\n", " # Load data (deserialize)\n", " with open(base_path + articles_pickle_filename, 'rb') as handle:\n", " clean_articles = pickle.load(handle)\n", "\n", " with open(base_path + headlines_pickle_filename, 'rb') as handle:\n", " clean_headlines = pickle.load(handle)\n", "\n", " pre_trained_model = create_or_load_model()\n", "\n", " word_counts = {}\n", " print(\"counting Articles\")\n", " count_words(word_counts, clean_articles)\n", " print(\"counting Headlines\")\n", " count_words(word_counts, clean_headlines)\n", "\n", " print(\"Total Stories : \", len(clean_headlines))\n", " print(\"Size of Vocabulary:\", len(word_counts))\n", "\n", " print(\"creating embedding index .....\")\n", " embeddings_index = {};\n", " vectorization(clean_articles, embeddings_index, pre_trained_model)\n", " vectorization(clean_headlines, embeddings_index, pre_trained_model)\n", " print('Word embeddings:', len(embeddings_index))\n", "\n", " # find out missing words and thr %\n", " missing_ratio, missing_words_count = missing_word_ratio(word_counts, embeddings_index)\n", "\n", " print(\"Number of words missing :\", missing_words_count)\n", " print(\"Percent of words that are missing from vocabulary: {}%\".format(missing_ratio))\n", "\n", " '''dictionary to convert words to integers - This is to found total words count that \n", " we get from aur corpus(input date) and out of that what % of words we would be using. 
\n", " This is after removing words that count less than threshold'''\n", " ocab_to_int, int_to_vocab = covert_vocab_to_int(word_counts, embeddings_index)\n", "\n", " # persist vocab_to_int for use in generate stage\n", " with open(config.base_path + config.vocab_to_int_pickle_filename, 'wb') as handle:\n", " pickle.dump(vocab_to_int, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", "\n", " with open(config.base_path + config.int_to_vocab_pickle_filename, 'wb') as handle:\n", " pickle.dump(int_to_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)\n", "\n", " word_embedding_matrix = create_combine_word_matrix(vocab_to_int, embeddings_index)\n", "\n", " # Apply convert_to_ints to clean_articles and clean_headlines\n", " print(\"Article Data\")\n", " int_repr_articles, word_article_count, unk_article_count = convert_to_ints(clean_articles, \n", " vocab_to_int, eos=True)\n", "\n", " print(\"Headline Data\")\n", " int_repr_headlines, word_headline_count, unk_headline_count = convert_to_ints(clean_headlines,\n", " vocab_to_int)\n", "\n", " lengths_articles = create_dataFrame(int_repr_articles)\n", " # lengths_headlines = create_dataFrame(int_repr_headlines)\n", "\n", " sorted_articles, sorted_headlines = sort_corplus(lengths_articles, int_repr_articles,\n", " int_repr_headlines, vocab_to_int)\n", "\n", " return sorted_articles, sorted_headlines, vocab_to_int, word_embedding_matrix" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Define placeholders" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def model_inputs():\n", " '''Create palceholders for inputs to the model'''\n", "\n", " input_data = tf.placeholder(tf.int32, [None, None], name='input_data')\n", " targets = tf.placeholder(tf.int32, [None, None], name='targets')\n", " lr = tf.placeholder(tf.float32, name='learning_rate')\n", " keep_prob = tf.placeholder(tf.float32, name='keep_prob')\n", " headline_length = tf.placeholder(tf.int32, (None,), name='headline_length')\n", " max_headline_length = tf.reduce_max(headline_length, name='max_headline_length')\n", " article_length = tf.placeholder(tf.int32, (None,), name='article_length')\n", "\n", " return input_data, targets, lr, keep_prob, headline_length, max_headline_length, article_length" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##Add padding 'GO' to each sentence to make encoder unserstand taht new article has arrived.\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_encoding_input(target_data, vocab_to_int, batch_size):\n", " '''Remove the last word id from each batch and concat the to the begining of each batch'''\n", " # sample = [[11, 12, 13], [31, 32, 33], [51, 52, 53], [61, 62,63]]\n", " # slice = tf.strided_slice(sample, begin=[0,0], end=[4,4], strides=[1,1])\n", " # process_input = tf.concat([tf.fill([4, 1], 9999), slice], 1)\n", " # process_input = [[9999 11 12 13], [9999 31 32 33] , [9999 51 52 53], [9999 61 62 63]]\n", "\n", " # target data has batch_size rows, -1 means everything, so first elect of each row\n", " slice = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])\n", "\n", " # tf.fill creates array of batch_size X 1 and then fill in value of ''\n", " # create matrix that has first column as value vocab_to_int[''] and second as index [first column of each row)\n", " process_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['']), slice], 1)\n", "\n", " return process_input" ] }, { "cell_type": "code", 
"execution_count": 80, "metadata": {}, "outputs": [], "source": [ "def train_decoding_layer(dec_embed_input, headline_length, dec_cell, initial_state, output_layer, max_headline_length):\n", " '''Create the training logits'''\n", "\n", " # for training : read inputs from dense ground truth vector\n", " training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,\n", " sequence_length=headline_length,\n", " time_major=False)\n", "\n", " training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,\n", " training_helper,\n", " initial_state,\n", " output_layer)\n", "\n", " traininglogits, , _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, impute_finished=True,\n", " output_time_major=False,\n", " maximum_iterations = max_headline_length)\n", "\n", " return training_logits" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, output_layer, max_headline_length,\n", " enc_state_tiled, batch_size):\n", " '''Create the inference logits'''\n", "\n", " start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')\n", " beam_initial_state = dec_cell.zero_state(batch_size=config.batch_size config.beam_width,\n", " dtype=enc_state_tiled[0].dtype).clone(cell_state=enc_state_tiled[0])\n", "\n", " inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(\n", " cell=dec_cell,\n", " embedding=embeddings,\n", " start_tokens=start_tokens,\n", " end_token=end_token,\n", " initial_state=beam_initial_state,\n", " beam_width=config.beam_width,\n", " output_layer=output_layer,\n", " length_penalty_weight=0.0)\n", "\n", " inferencelogits, , _ = tf.contrib.seq2seq.dynamic_decode(\n", " decoder=inference_decoder,\n", " impute_finished=False,\n", " maximum_iterations=2 max_headline_length)\n", "\n", "\n", " return inference_logits" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, article_length, headline_length,\n", " max_headline_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):\n", " '''Create the decoding cell and attention for the training and inference decoding layers'''\n", "\n", " # creating layer and Dropout layers\n", " for layer in range(num_layers):\n", " with tf.variablescope('decoder{}'.format(layer)):\n", " lstm = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))\n", " dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)\n", "\n", " # creating Dense- This is also called out layer. 
This will produce the summary.\n", " output_layer = Dense(vocab_size, activation='relu', kernel_initializer\n", " =tf.truncated_normal_initializer(mean=0.0, stddev=0.1))\n", "\n", " # Using BahdanauAttention as one of the widely used Attention Algorithms\n", " with tf.variable_scope('shared_attention_mechanism'):\n", " attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, article_length,\n", " normalize=False, name='BahdanauAttention')\n", "\n", " dec_cell_atten = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)\n", " initial_state = dec_cell_atten.zero_state(batch_size=batch_size, dtype=enc_state[0].dtype).clone(\n", " cell_state=enc_state[0])\n", "\n", " # Creating training logits - which would be used during training dataset\n", " with tf.variable_scope(\"decode\"):\n", " training_logits = train_decoding_layer(dec_embed_input,\n", " headline_length,\n", " dec_cell_atten,\n", " initial_state,\n", " output_layer,\n", " max_headline_length)\n", "\n", " # BEAM SEARCH TILE\n", " enc_output_tiled = tf.contrib.seq2seq.tile_batch(enc_output, multiplier=config.beam_width)\n", " enc_state_tiled = tf.contrib.seq2seq.tile_batch(enc_state, multiplier=config.beam_width)\n", " article_length_tiled = tf.contrib.seq2seq.tile_batch(article_length, multiplier=config.beam_width)\n", "\n", " # Using BahdanauAttention as one of the widely used Attention Algorithms\n", " with tf.variable_scope('shared_attention_mechanism', reuse=True):\n", " attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output_tiled, article_length_tiled,\n", " normalize=False, name='BahdanauAttention')\n", "\n", " dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)\n", "\n", " # Creating inference logits - which would produce out using train model\n", " with tf.variable_scope(\"decode\", reuse=True):\n", " inference_logits = inference_decoding_layer(embeddings,\n", " vocab_to_int[''],\n", " vocab_to_int[''],\n", " dec_cell,\n", " output_layer,\n", " max_headline_length,\n", " enc_state_tiled,\n", " batch_size)\n", "\n", " return training_logits, inference_logits" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "def encoding_layer(rnn_size, article_length, num_layers, rnn_inputs, keep_prob):\n", " '''Create the encoding layer'''\n", "\n", " # Number of layer inside neural network\n", " for layer in range(num_layers):\n", " with tf.variablescope('encoder{}'.format(layer)):\n", " # forward direction cell with random weights with seed value for reproduce random value\n", " cell_fw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))\n", "\n", " # Dropout to kills cells that are not changing.\n", " cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)\n", "\n", " cell_bw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))\n", " cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)\n", "\n", " # Bidirectional as it is more optimized, spl with Dropouts\n", " enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs, article_length,\n", " dtype=tf.float32)\n", "\n", " # Join outputs since we are using a bidirectional RNN\n", " enc_output = tf.concat(enc_output, 2)\n", "\n", " return enc_output, enc_state\n" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "def seq2seq_model(input_data, target_data, keep_prob, article_length, 
headline_length, max_headliney_length,\n", " vocab_size, rnn_size, num_layers, vocab_to_int, batch_size, word_embedding_matrix):\n", " '''Use the previous functions to create the training and inference logits'''\n", "\n", " # Use fasttext's embeddings and the newly created ones as our embeddings\n", " embeddings = word_embedding_matrix\n", "\n", " # embedding_lookup returns embedding values of input_data that we have provided\n", " print(\"Geting embedding for encoder input\")\n", " enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)\n", "\n", " # Define encoder layers - with respect to size of neurons, hidden layers and design (such as bi-directional)\n", " print(\"Initializing encoder layers\")\n", " enc_output, enc_state = encoding_layer(rnn_size, article_length, num_layers, enc_embed_input, keep_prob)\n", "\n", " print(\"Adding 'GO' to start text\")\n", " dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)\n", "\n", " print(\"Getting embedding for encoder input\")\n", " dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)\n", "\n", " print(\"Getting decoding_layer logits ... \")\n", "\n", " # Train: Learn model parameters.\n", " # Inference: Apply model on unseen data to assess performance.\n", " training_logits, inference_logits = decoding_layer(dec_embed_input,\n", " embeddings,\n", " enc_output,\n", " enc_state,\n", " vocab_size,\n", " article_length,\n", " headline_length,\n", " max_headliney_length,\n", " rnn_size,\n", " vocab_to_int,\n", " keep_prob,\n", " batch_size,\n", " num_layers)\n", "\n", " return training_logits, inference_logits" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "def build_graph(vocab_to_int, word_embedding_matrix):\n", " # Build the graph\n", " train_graph = tf.Graph()\n", " # Set the graph to default to ensure that it is ready for training\n", " with train_graph.as_default():\n", " # Load the model inputs\n", " print(\"Load input parameter ...\")\n", " input_data, targets, lr, keep_prob, headline_length, max_headline_length, \\n", " article_length = model_inputs()\n", "\n", " # Create the training and inference logits\n", " print(\"Create instance of seq2seq model parameter ...\")\n", "\n", " # training_logits gives us matrix of possibilities when we trained the system whereas\n", " # inference_logits are used when we are trying to predict summary out of it.\n", " training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),\n", " targets,\n", " keep_prob,\n", " article_length,\n", " headline_length,\n", " max_headline_length,\n", " len(vocab_to_int) + 1,\n", " rnn_size,\n", " num_layers,\n", " vocab_to_int,\n", " batch_size,\n", " word_embedding_matrix)\n", "\n", " # Create tensors for the training logits and inference logits\n", " training_logits = tf.identity(training_logits.rnn_output, 'logits')\n", "\n", " # inference_logits would be used while predicting the summary\n", " # used for basic decoder\n", " # inference_logits = tf.identity(inference_logits.sample_id, name='predictions')\n", " inference_logits = tf.identity(inference_logits.predicted_ids, name='predictions')\n", "\n", " # Create the weights for sequence_loss\n", " masks = tf.sequence_mask(headline_length, max_headline_length, dtype=tf.float32, name='masks')\n", "\n", " with tf.name_scope(\"optimization\"):\n", " # Loss function\n", " cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)\n", "\n", " # Optimizer\n", " optimizer = 
tf.train.AdamOptimizer(learning_rate)\n", "\n", " # Gradient Clipping\n", " gradients = optimizer.compute_gradients(cost)\n", " capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var)\n", " for grad, var in gradients if grad is not None]\n", " train_op = optimizer.apply_gradients(capped_gradients)\n", " print(\"Graph is built.\")\n", "\n", " # input_data, targets, lr, keep_prob, headline_length, max_headline_length, article_length\n", " return train_graph, train_op, cost, input_data, targets, lr, keep_prob, \\n", " headline_length, max_headline_length, article_length" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "# This could later be improved as tensorflow provide that put padding by it owns.\n", "def pad_sentence_batch(sentence_batch, vocab_to_int):\n", " \"\"\"Pad sentences with so that each sentence of a batch has the same length\"\"\"\n", " max_sentence = max([len(sentence) for sentence in sentence_batch])\n", " padded_batch = [sentence + [vocab_to_int['']] (max_sentence - len(sentence))\n", " for sentence in sentence_batch]\n", " # print (\"padded ==== > \", padded_batch)\n", " return padded_batch" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "def get_batches(headlines, articles, batch_size, vocab_to_int):\n", " \"\"\"Batch headlines, articles, and the lengths of their sentences together\"\"\"\n", " for batch_i in range(0, len(articles) // batch_size):\n", " start_i = batch_i batch_size\n", " headlines_batch = headlines[start_i:start_i + batch_size]\n", " articles_batch = articles[start_i:start_i + batch_size]\n", " pad_headlines_batch = np.array(pad_sentence_batch(headlines_batch, vocab_to_int))\n", " pad_articles_batch = np.array(pad_sentence_batch(articles_batch, vocab_to_int))\n", "\n", " # Need the lengths for the _lengths parameters\n", " pad_headlines_lengths = []\n", " for headline in pad_headlines_batch:\n", " pad_headlines_lengths.append(len(headline))\n", "\n", " pad_articles_lengths = []\n", " for article in pad_articles_batch:\n", " pad_articles_lengths.append(len(article))\n", "\n", " yield pad_headlines_batch, pad_articles_batch, pad_headlines_lengths, pad_articles_lengths" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "def train_model(train_graph, train_op, cost, gen_input_data, gen_targets, gen_lr, gen_keep_prob,\n", " gen_headline_length, gen_max_headline_length, gen_article_length,\n", " sorted_headlines_short, sorted_articles_short, vocab_to_int):\n", " # Record the update losses for saving improvements in the model\n", " headlines_update_loss = []\n", "\n", " # name given to checkpoint\n", " checkpoint = config.checkpoint\n", "\n", " # This make sures that in one epoch it only checked as per value specified of per_epoch\n", " # e.g if length of article is 4000 the => 4000 / 32 (bath size) = > 125\n", " # (it means we will have 125 loops in 1 epoch) then 125 / 3 - 1 = 40\n", " # (so while covering 125 iteration per epoch after 40 iteration\n", " # it will check and print the loss)\n", " update_check = (len(sorted_articles_short) // config.batch_size\n", " // config.per_epoch) - 1\n", " print(\"init value of update_check\", update_check)\n", " gr_learning_rate = config.learning_rate\n", "\n", " session_config = tf.ConfigProto(device_count={'GPU': 0})\n", " if config.enable_gpu:\n", " session_config = tf.ConfigProto(device_count={'GPU': 1})\n", " session_config.gpu_options.allocator_type = 'BFC'\n", " 
session_config.gpu_options.allow_growth = True\n", "\n", " with tf.Session(graph=train_graph, config=session_config) as sess:\n", " # This is to show graph in tensorboard\n", " # project path tensorboard --logdir = logs - -port 6006\n", " # TensorBoard 1.10.0 at http: // Sam: 6006(Press CTRL + C to quit)\n", " tf.summary.FileWriter(config.tensorboard_logs, graph=sess.graph)\n", " sess.run(tf.global_variables_initializer())\n", "\n", " # If we want to continue training a previous session\n", " # loader = tf.train.import_meta_graph(\"./\" + checkpoint + '.meta')\n", " # loader.restore(sess, checkpoint)\n", "\n", " for epoch_i in range(1, config.epochs + 1):\n", " update_loss = 0\n", " batch_loss = 0\n", " for batch_i, (headlines_batch, articles_batch, headlines_lengths,\n", " articles_lengths) in enumerate(get_batches(sorted_headlines_short,\n", " sorted_articles_short, config.batch_size,\n", " vocab_to_int)):\n", " # print(\"batch_i ==== \", batch_i)\n", " starttime = time.time()\n", " , loss = sess.run(\n", " [train_op, cost],\n", " {gen_input_data: articles_batch,\n", " gen_targets: headlines_batch,\n", " gen_lr: gr_learning_rate,\n", " gen_headline_length: headlines_lengths,\n", " gen_article_length: articles_lengths,\n", " gen_keep_prob: config.keep_probability})\n", "\n", " batch_loss += loss\n", " update_loss += loss\n", " end_time = time.time()\n", " batch_time = end_time - start_time\n", "\n", " # This prints status after value specified in display_step.\n", " # Helps to to see progress\n", " if batch_i % config.display_step == 0 and batch_i > 0:\n", " print('Epoch {}/{} Batch {}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'\n", " .format(epoch_i,\n", " config.epochs,\n", " batch_i,\n", " len(sorted_articles_short) // config.batch_size,\n", " batch_loss / config.display_step,\n", " batch_time config.display_step))\n", " batch_loss = 0\n", "\n", " # print loss value after after steps specified in update_check\n", " if batch_i % update_check == 0 and batch_i > 0:\n", " print(\"Average loss for this update:\", round(update_loss / update_check, 3))\n", " headlines_update_loss.append(update_loss)\n", "\n", " # If the update loss is at a new minimum, save the model\n", " if update_loss <= min(headlines_update_loss):\n", " stop_early = 0\n", " saver = tf.train.Saver()\n", " saver.save(sess, checkpoint)\n", " print('New Record! - checkpoint saved')\n", " else:\n", " print(\"No Improvement.\")\n", " stop_early += 1\n", " if stop_early == config.stop:\n", " break\n", " update_loss = 0\n", "\n", " # Reduce learning rate, but not below its minimum value\n", " gr_learning_rate = config.learning_rate_decay\n", " if gr_learning_rate < config.min_learning_rate:\n", " gr_learning_rate = config.min_learning_rate\n", "\n", " if stop_early == config.stop:\n", " print(\"Stopping Training.\")\n", " break" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Prepare input parameters ...\n", "Loading Pre-Trained Model Pickle..... 
\n", "Loaded Pre-Trained Model Pickle, time taken 1.5636331900000036\n", "counting Articles\n", "counting Headlines\n", "Total Stories : 5607\n", "Size of Vocabulary: 47067\n", "creating embedding index .....\n", "Word embeddings: 23102\n", "Number of words missing : 5787\n", "Percent of words that are missing from vocabulary: 12.3%\n", "Total number of unique words: 47067\n", "Number of words we will use: 28893\n", "Percent of words we will use: 61.39%\n", "word_embedding_matrix length : 28893\n", "Article Data\n", "Total number of words : 1643175\n", "Total number of UNKs : 21983\n", "Percent of words that are UNK: 1.34%\n", "Headline Data\n", "Total number of words : 182202\n", "Total number of UNKs : 574\n", "Percent of words that are UNK: 0.32%\n", "4538\n", "4538\n", "Build Graph parameters ...\n", "Load input parameter ...\n", "Create instance of seq2seq model parameter ...\n", "Geting embedding for encoder input\n", "Initializing encoder layers\n", "Adding 'GO' to start text\n", "Getting embedding for encoder input\n", "Getting decoding_layer logits ... \n", "Graph is built.\n", "Total Articles that we have for this run : 4538\n", "Total Articles samples taken for this run : 4000\n", "The shortest text length: 109\n", "The longest text length: 235\n", "init value of update_check 40\n", "batch_i ==== 0\n", "batch_i ==== 1\n", "batch_i ==== 2\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;34m'''-------------------------main------------------------------'''\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32m\u001b[0m in \u001b[0;36mmain\u001b[1;34m()\u001b[0m\n\u001b[0;32m 20\u001b[0m train_model(train_graph, train_op, cost, gen_input_data, gen_targets,\n\u001b[0;32m 21\u001b[0m \u001b[0mgen_lr\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgen_keep_prob\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgen_headline_length\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mgen_max_headline_length\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m gen_article_length, sorted_headlines_short, sorted_articles_short, vocab_to_int)\n\u001b[0m\u001b[0;32m 23\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32m\u001b[0m in \u001b[0;36mtrain_model\u001b[1;34m(train_graph, train_op, cost, gen_input_data, gen_targets, gen_lr, gen_keep_prob, gen_headline_length, gen_max_headline_length, gen_article_length, sorted_headlines_short, sorted_articles_short, vocab_to_int)\u001b[0m\n\u001b[0;32m 44\u001b[0m \u001b[0mgen_headline_length\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mheadlines_lengths\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[0mgen_article_length\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0marticles_lengths\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 46\u001b[1;33m gen_keep_prob: keep_probability})\n\u001b[0m\u001b[0;32m 47\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 48\u001b[0m \u001b[0mbatch_loss\u001b[0m \u001b[1;33m+=\u001b[0m 
\u001b[0mloss\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\users\sameer\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 875\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 876\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[1;32m--> 877\u001b[1;33m run_metadata_ptr)\n\u001b[0m\u001b[0;32m 878\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 879\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\users\sameer\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py\u001b[0m in \u001b[0;36m_run\u001b[1;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 1098\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[1;32mor\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1099\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[1;32m-> 1100\u001b[1;33m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[0;32m 1101\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1102\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\users\sameer\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py\u001b[0m in \u001b[0;36m_do_run\u001b[1;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 1270\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1271\u001b[0m return self._do_call(_run_fn, feeds, fetches, targets, options,\n\u001b[1;32m-> 1272\u001b[1;33m run_metadata)\n\u001b[0m\u001b[0;32m 1273\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1274\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\users\sameer\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py\u001b[0m in \u001b[0;36m_do_call\u001b[1;34m(self, fn, args)\u001b[0m\n\u001b[0;32m 1276\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1277\u001b[0m 
\u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1278\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1279\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1280\u001b[0m \u001b[0mmessage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\users\sameer\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[1;34m(feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[0;32m 1261\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_extend_graph\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1262\u001b[0m return self._call_tf_sessionrun(\n\u001b[1;32m-> 1263\u001b[1;33m options, feed_dict, fetch_list, target_list, run_metadata)\n\u001b[0m\u001b[0;32m 1264\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1265\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mc:\users\sameer\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py\u001b[0m in \u001b[0;36m_call_tf_sessionrun\u001b[1;34m(self, options, feed_dict, fetch_list, target_list, run_metadata)\u001b[0m\n\u001b[0;32m 1348\u001b[0m return tf_session.TF_SessionRun_wrapper(\n\u001b[0;32m 1349\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moptions\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b0m\n", "\u001b[1;31mKeyboardInterrupt\u001b[0m: " ], "output_type": "error" } ], "source": [ "def main():\n", " print(\"Prepare input parameters ...\")\n", " sorted_articles, sorted_headlines, vocab_to_int, word_embedding_matrix \\n", " = vectorization.create_input_for_graph()\n", " print(\"Build Graph parameters ...\")\n", " train_graph, train_op, cost, gen_input_data, gen_targets, gen_lr, gen_keep_prob, \\n", " gen_headline_length, gen_max_headline_length, \\n", " gen_article_length = build_model.build_graph(vocab_to_int, word_embedding_matrix)\n", "\n", " # Subset the data for training, this is used to check if steps are working fine.\n", " # In actual run whole data should be taken\n", " start = config.start\n", " end = start + config.end\n", "\n", " print(\"Total Articles that we have for this run :\", len(sorted_articles))\n", " # Train the Model\n", " sorted_headlines_short = sorted_headlines[start:end]\n", " sorted_articles_short = sorted_articles[start:end]\n", " print(\"Total Articles samples taken for this run :\", len(sorted_articles_short))\n", " print(\"The shortest text length:\", len(sorted_articles_short[0]))\n", " print(\"The longest text length:\", len(sorted_articles_short[-1]))\n", "\n", " 
train_model(train_graph, train_op, cost, gen_input_data, gen_targets,\n", " gen_lr, gen_keep_prob, gen_headline_length, gen_max_headline_length,\n", " gen_article_length, sorted_headlines_short, sorted_articles_short, vocab_to_int)\n", "\n", "\n", "'''-------------------------main------------------------------'''\n", "main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3.0 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 0 }
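A general note on this paste: besides the token names, the markdown rendering also appears to have dropped a number of * and _ characters from the code, which is why expressions such as "round(unk_count / word_count, 4) 100", "config.batch_size config.beam_width", "2 max_headline_length", "start_i = batch_i batch_size" and "batch_time config.display_step" look malformed (they were presumably multiplications), and why "traininglogits, , _" and "tf.variablescope('decoder{}')" appear instead of the presumable "training_logits, _, _" unpacking and "tf.variable_scope('decoder_{}'.format(layer))". As one representative reconstruction (the <PAD> spelling follows the earlier note), the batching helpers were presumably intended along these lines:

    # Presumed original form of pad_sentence_batch / get_batches; the '*' operators below are
    # the characters missing from the paste, everything else mirrors the cells above.
    import numpy as np

    def pad_sentence_batch(sentence_batch, vocab_to_int):
        """Pad sentences with <PAD> so that each sentence of a batch has the same length."""
        max_sentence = max([len(sentence) for sentence in sentence_batch])
        return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence))
                for sentence in sentence_batch]

    def get_batches(headlines, articles, batch_size, vocab_to_int):
        """Batch headlines, articles, and the lengths of their sentences together."""
        for batch_i in range(0, len(articles) // batch_size):
            start_i = batch_i * batch_size
            headlines_batch = headlines[start_i:start_i + batch_size]
            articles_batch = articles[start_i:start_i + batch_size]
            pad_headlines_batch = np.array(pad_sentence_batch(headlines_batch, vocab_to_int))
            pad_articles_batch = np.array(pad_sentence_batch(articles_batch, vocab_to_int))
            pad_headlines_lengths = [len(h) for h in pad_headlines_batch]
            pad_articles_lengths = [len(a) for a in pad_articles_batch]
            yield pad_headlines_batch, pad_articles_batch, pad_headlines_lengths, pad_articles_lengths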