nateraw / Lda2vec-Tensorflow

Tensorflow 1.5 implementation of Chris Moody's Lda2vec, adapted from @meereeum
MIT License

ValueError: Variable topic_embedding already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at: #40

Open dbl001 opened 5 years ago

dbl001 commented 5 years ago

I get this error when pretrained_embeddings=None

m = model(num_docs,
          vocab_size,
          num_topics=num_topics,
          #embedding_size=embed_size,
          restore=False,
          #logdir="/data/",
          pretrained_embeddings=None,
          freqs=freqs)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-2552ac324704> in <module>()
     25           #logdir="/data/",
     26           pretrained_embeddings=None,
---> 27           freqs=freqs)
     28 
     29 m.train(pivot_ids,target_ids,doc_ids, len(pivot_ids), num_epochs, idx_to_word=idx_to_word,  switch_loss_epoch=5)

~/Lda2vec-Tensorflow/lda2vec/Lda2vec.py in __init__(self, num_unique_documents, vocab_size, num_topics, freqs, save_graph_def, embedding_size, num_sampled, learning_rate, lmbda, alpha, power, batch_size, logdir, restore, fixed_words, factors_in, pretrained_embeddings)
     76                                             power=self.power)
     77             # Initialize the Topic-Document Mixture
---> 78             self.mixture = M.EmbedMixture(self.num_unique_documents, self.num_topics, self.embedding_size)
     79 
     80 

~/Lda2vec-Tensorflow/lda2vec/embedding_mixture.py in __init__(self, n_documents, n_topics, n_dim, temperature, W_in, factors_in, name)
     27         self.topic_embedding = tf.get_variable('topic_embedding', shape=[n_topics, n_dim],
     28                                                dtype=tf.float32,
---> 29                                                initializer=tf.orthogonal_initializer(gain=scalar)) if factors_in is None else factors_in
     30 
     31 

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in get_variable(name, shape, dtype, initializer, regularizer, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter, constraint, synchronization, aggregation)
   1485       constraint=constraint,
   1486       synchronization=synchronization,
-> 1487       aggregation=aggregation)
   1488 
   1489 

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in get_variable(self, var_store, name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter, constraint, synchronization, aggregation)
   1235           constraint=constraint,
   1236           synchronization=synchronization,
-> 1237           aggregation=aggregation)
   1238 
   1239   def _get_partitioned_variable(self,

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in get_variable(self, name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter, constraint, synchronization, aggregation)
    538           constraint=constraint,
    539           synchronization=synchronization,
--> 540           aggregation=aggregation)
    541 
    542   def _get_partitioned_variable(self,

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in _true_getter(name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, constraint, synchronization, aggregation)
    490           constraint=constraint,
    491           synchronization=synchronization,
--> 492           aggregation=aggregation)
    493 
    494     # Set trainable value based on synchronization value.

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in _get_single_variable(self, name, shape, dtype, initializer, regularizer, partition_info, reuse, trainable, collections, caching_device, validate_shape, use_resource, constraint, synchronization, aggregation)
    859                          "reuse=tf.AUTO_REUSE in VarScope? "
    860                          "Originally defined at:\n\n%s" % (
--> 861                              name, "".join(traceback.format_list(tb))))
    862       found_var = self._vars[name]
    863       if not shape.is_compatible_with(found_var.get_shape()):

ValueError: Variable topic_embedding already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/embedding_mixture.py", line 29, in __init__
    initializer=tf.orthogonal_initializer(gain=scalar)) if factors_in is None else factors_in
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 78, in __init__
    self.mixture = M.EmbedMixture(self.num_unique_documents, self.num_topics, self.embedding_size)
  File "<ipython-input-8-6f2c3ffe8774>", line 27, in <module>
    freqs=freqs
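
In TF 1.x graph mode, get_variable raises this ValueError when a variable named topic_embedding is created a second time in the same default graph, which commonly happens when the model-construction cell is re-run in the same notebook kernel. A minimal sketch of a workaround, assuming the duplicate definition comes from re-running the cell rather than from the library building the variable twice internally:

import tensorflow as tf

# Start from a fresh graph so 'topic_embedding' (and every other variable)
# can be defined again without tripping the variable-scope reuse check.
tf.reset_default_graph()

m = model(num_docs,
          vocab_size,
          num_topics=num_topics,
          restore=False,
          pretrained_embeddings=None,
          freqs=freqs)
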
nateraw commented 5 years ago

Does the twenty newsgroups example work when you set pretrained_embeddings=False right now? There weren't any errors for me yesterday.

I'm assuming you pulled since my commit last night?

dbl001 commented 5 years ago

Yes, I did a pull this morning. Here’s my results from twenty_newsgroups:

import pandas as pd
from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir ="data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data_twenty_newsgroups"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir+"/"+input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30, nlp="en_core_web_lg")

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

from lda2vec import utils, model

# Path to preprocessed data
data_path = "data/clean_data_twenty_newsgroups"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids,
 target_ids, doc_ids, embed_matrix) = utils.load_preprocessed_data(data_path, load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 500
# Epoch that we want to "switch on" LDA loss
switch_loss_epoch = 0
# Pretrained embeddings value
pretrained_embeddings = embed_matrix if load_embeds else None
# If True, save logdir, otherwise don't
save_graph = True

# Initialize the model
m = model(num_docs, vocab_size, num_topics, embedding_size=embed_size, pretrained_embeddings=pretrained_embeddings, freqs=freqs, batch_size=batch_size, save_graph_def=save_graph)

# Train the model
m.train(pivot_ids, target_ids, doc_ids, len(pivot_ids), num_epochs, idx_to_word=idx_to_word, switch_loss_epoch=switch_loss_epoch)

# Visualize topics with pyldavis
utils.generate_ldavis_data(data_path, m, idx_to_word, freqs, vocab_size)


InvalidArgumentError                      Traceback (most recent call last)
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1333     try:
-> 1334       return fn(*args)
   1335     except errors.OpError as e:

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1318       return self._call_tf_sessionrun(
-> 1319           options, feed_dict, fetch_list, target_list, run_metadata)
   1320

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
   1406         self._session, options, feed_dict, fetch_list, target_list,
-> 1407         run_metadata)
   1408

InvalidArgumentError: indices[0] = 5451 is not in [0, 5451)
	 [[{{node nce_loss/negative_sampling/nce_loss/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_nce_weights/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_weights/read, nce_loss/negative_sampling/nce_loss/concat, nce_loss/negative_sampling/nce_loss/embedding_lookup/axis)]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
<ipython-input> in <module>()
     48         num_epochs,
     49         idx_to_word=idx_to_word,
---> 50         switch_loss_epoch=switch_loss_epoch)
     51
     52 # Visualize topics with pyldavis

~/Lda2vec-Tensorflow/lda2vec/Lda2vec.py in train(self, pivot_words, target_words, doc_ids, data_size, num_epochs, switch_loss_epoch, save_every, report_every, print_topics_every, idx_to_word)
    244
    245         # Run a step of the model
--> 246         summary, _, l, lw2v, llda, step = self.sesh.run(fetches, feed_dict=feed_dict)
    247
    248         # Prints log every "report_every" epoch

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    927     try:
    928       result = self._run(None, fetches, feed_dict, options_ptr,
--> 929                          run_metadata_ptr)
    930       if run_metadata:
    931         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1150     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1151       results = self._do_run(handle, final_targets, final_fetches,
-> 1152                              feed_dict_tensor, options, run_metadata)
   1153     else:
   1154       results = []

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1326     if handle is None:
   1327       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328                            run_metadata)
   1329     else:
   1330       return self._do_call(_prun_fn, handle, feeds, fetches)

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1346       pass
   1347     message = error_interpolation.interpolate(message, self._graph)
-> 1348     raise type(e)(node_def, op, message)
   1349
   1350   def _extend_graph(self):

InvalidArgumentError: indices[0] = 5451 is not in [0, 5451)
	 [[node nce_loss/negative_sampling/nce_loss/embedding_lookup (defined at /home/ubuntu/Lda2vec-Tensorflow/lda2vec/word_embedding.py:46) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_nce_weights/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_weights/read, nce_loss/negative_sampling/nce_loss/concat, nce_loss/negative_sampling/nce_loss/embedding_lookup/axis)]]

Caused by op 'nce_loss/negative_sampling/nce_loss/embedding_lookup', defined at:
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/asyncio/base_events.py", line 427, in run_forever
    self._run_once()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/asyncio/base_events.py", line 1440, in _run_once
    handle._run()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/ioloop.py", line 759, in _run_callback
    ret = callback()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input>", line 41, in <module>
    save_graph_def=save_graph)
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 82, in __init__
    handles = self._build_graph()
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 162, in _build_graph
    loss_word2vec = self.w_embed(context, y)
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/word_embedding.py", line 46, in __call__
    sampled_values=sampler))
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py", line 1248, in nce_loss
    name=name)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py", line 1062, in _compute_sampled_logits
    weights, all_ids, partition_strategy=partition_strategy)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py", line 313, in embedding_lookup
    transform_fn=None)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py", line 133, in _embedding_lookup_and_transform
    result = _clip(array_ops.gather(params[0], ids, name=name),
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 2675, in gather
    return gen_array_ops.gather_v2(params, indices, axis, name=name)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3332, in gather_v2
    "GatherV2", params=params, indices=indices, axis=axis, name=name)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): indices[0] = 5451 is not in [0, 5451)
	 [[node nce_loss/negative_sampling/nce_loss/embedding_lookup (defined at /home/ubuntu/Lda2vec-Tensorflow/lda2vec/word_embedding.py:46) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_nce_weights/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_weights/read, nce_loss/negative_sampling/nce_loss/concat, nce_loss/negative_sampling/nce_loss/embedding_lookup/axis)]]
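
Here the failing lookup is into the NCE weight matrix, whose first dimension is vocab_size, and the message indices[0] = 5451 is not in [0, 5451) means at least one id being looked up equals vocab_size. A quick sanity check along these lines (a sketch using the variable names from the script above; it assumes the out-of-range id comes from the preprocessed token ids rather than from the negative sampler):

import numpy as np

# Every id fed into the word-embedding / NCE lookup must be < vocab_size.
print("vocab_size:", vocab_size)
print("max pivot id:", int(np.max(pivot_ids)))
print("max target id:", int(np.max(target_ids)))
assert np.max(pivot_ids) < vocab_size, "pivot ids exceed vocab_size (len(freqs))"
assert np.max(target_ids) < vocab_size, "target ids exceed vocab_size (len(freqs))"
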
dbl001 commented 5 years ago

tensorflow.__version__ is '1.12.0'

nateraw commented 5 years ago

For my own sanity, I'm re-commenting this for you so I can format it with markdown. I can't reformat email replies.

import pandas as pd
from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir ="data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data_twenty_newsgroups"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir+"/"+input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30, nlp="en_core_web_lg")

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

from lda2vec import utils, model

# Path to preprocessed data
data_path  = "data/clean_data_twenty_newsgroups"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids,
 target_ids, doc_ids, embed_matrix) = utils.load_preprocessed_data(data_path, load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 500
# Epoch that we want to "switch on" LDA loss
switch_loss_epoch = 0
# Pretrained embeddings value
pretrained_embeddings = embed_matrix if load_embeds else None
# If True, save logdir, otherwise don't
save_graph = True

# Initialize the model
m = model(num_docs,
          vocab_size,
          num_topics,
          embedding_size=embed_size,
          pretrained_embeddings=pretrained_embeddings,
          freqs=freqs,
          batch_size = batch_size,
          save_graph_def=save_graph)

# Train the model
m.train(pivot_ids,
        target_ids,
        doc_ids,
        len(pivot_ids),
        num_epochs,
        idx_to_word=idx_to_word,
        switch_loss_epoch=switch_loss_epoch)

# Visualize topics with pyldavis
utils.generate_ldavis_data(data_path, m, idx_to_word, freqs, vocab_size)
dbl001 commented 5 years ago

Sorry! I’ll paste directly into GitHub.


nateraw commented 5 years ago

No worries 😄

nateraw commented 5 years ago

At work right now, so I can't really help. Just adding this to my to-do list tonight. Will get back to you. Sorry that this stuff is always breaking 🙁. Sooo many improvements lately, but they came with many more bugs.

nateraw commented 5 years ago

Also, just to be clear, this is all supposed to be run on TensorFlow 1.5.0.
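
Since the examples target TF 1.5.0, a minimal way to confirm which version the notebook kernel is actually importing (just a sanity check, not part of the example scripts):

import tensorflow as tf

# The thread above reports failures on 1.12.0; the repo's examples are
# written against TensorFlow 1.5.0.
print(tf.__version__)
assert tf.__version__.startswith("1.5."), "expected TensorFlow 1.5.x"
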

klausrossmann commented 5 years ago

Was running into the same error just now. The TF version was the problem; with 1.5.0 it's working!