Closed · dbl001 closed this issue 5 years ago
```python
# Train the model
m.train(pivot_ids,
        target_ids,
        doc_ids,
        len(pivot_ids),
        num_epochs,
        idx_to_word=idx_to_word,
        switch_loss_epoch=switch_loss_epoch)

# Visualize topics with pyldavis
utils.generate_ldavis_data(data_path, m, idx_to_word, freqs, vocab_size)
```
```
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
1318 return self._call_tf_sessionrun(
-> 1319 options, feed_dict, fetch_list, target_list, run_metadata)
1320
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
1406 self._session, options, feed_dict, fetch_list, target_list,
-> 1407 run_metadata)
1408
InvalidArgumentError: indices[250] = 5451 is not in [0, 5451)
[[{{node word_embed_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_word_embedding/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](word_embedding/read, _arg_x_pivot_idxs_0_1, word_embed_lookup/axis)]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
<ipython-input-3-7f4db3f7fa23> in <module>()
6 num_epochs,
7 idx_to_word=idx_to_word,
----> 8 switch_loss_epoch=switch_loss_epoch)
9
10 # Visualize topics with pyldavis
~/Lda2vec-Tensorflow/lda2vec/Lda2vec.py in train(self, pivot_words, target_words, doc_ids, data_size, num_epochs, switch_loss_epoch, save_every, report_every, print_topics_every, idx_to_word)
244
245 # Run a step of the model
--> 246 summary, _, l, lw2v, llda, step = self.sesh.run(fetches, feed_dict=feed_dict)
247
248 # Prints log every "report_every" epoch
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
927 try:
928 result = self._run(None, fetches, feed_dict, options_ptr,
--> 929 run_metadata_ptr)
930 if run_metadata:
931 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1150 if final_fetches or final_targets or (handle and feed_dict_tensor):
1151 results = self._do_run(handle, final_targets, final_fetches,
-> 1152 feed_dict_tensor, options, run_metadata)
1153 else:
1154 results = []
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1326 if handle is None:
1327 return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328 run_metadata)
1329 else:
1330 return self._do_call(_prun_fn, handle, feeds, fetches)
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
InvalidArgumentError: indices[250] = 5451 is not in [0, 5451)
[[node word_embed_lookup (defined at /home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py:152) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_word_embedding/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](word_embedding/read, _arg_x_pivot_idxs_0_1, word_embed_lookup/axis)]]
Caused by op 'word_embed_lookup', defined at:
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 486, in start
self.io_loop.start()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 127, in start
self.asyncio_loop.run_forever()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/asyncio/base_events.py", line 427, in run_forever
self._run_once()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/asyncio/base_events.py", line 1440, in _run_once
handle._run()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/asyncio/events.py", line 145, in _run
self._callback(*self._args)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
handler_func(fileobj, events)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
return fn(*args, **kwargs)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
handler(stream, idents, msg)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
raw_cell, store_history, silent, shell_futures)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
if self.run_code(code, result):
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-1-2177bca49b35>", line 41, in <module>
save_graph_def=save_graph)
File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 82, in __init__
handles = self._build_graph()
File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 152, in _build_graph
word_context = tf.nn.embedding_lookup(self.w_embed.embedding, x, name='word_embed_lookup')
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py", line 313, in embedding_lookup
transform_fn=None)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/embedding_ops.py", line 133, in _embedding_lookup_and_transform
result = _clip(array_ops.gather(params[0], ids, name=name),
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 2675, in gather
return gen_array_ops.gather_v2(params, indices, axis, name=name)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 3332, in gather_v2
"GatherV2", params=params, indices=indices, axis=axis, name=name)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): indices[250] = 5451 is not in [0, 5451)
[[node word_embed_lookup (defined at /home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py:152) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_word_embedding/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](word_embedding/read, _arg_x_pivot_idxs_0_1, word_embed_lookup/axis)]]
```
The `pivot_ids` and `target_ids` in `skipgram.txt` appear to be one-based, but the `embedding_matrix` is zero-based. E.g.:

```
print(np.sort(pivot_ids))
[   1    1    1 ... 5451 5451 5451]
print(np.sort(target_ids))
[   1    1    1 ... 5451 5451 5451]
```

The Keras Tokenizer appears to output one-based ids, and its documentation states: "0 is a reserved index that won't be assigned to any word."
Strange... This totally could be because I was running those other experiments I talked to you about via email. I might have made changes that I shouldn't have pushed back up here. Very stupid bug/mistake on my part.
In load_glove(), the indexes are zero-based, but the skipgram indexes are one-based. Could this have something to do with the exception?

```python
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

# word_index = tokenizer.word_index
nb_words = self.vocab_size
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in self.word_to_idx.items():
    if i >= self.vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
```
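If that's the cause, a minimal sketch of one possible fix (not the repo's actual code; the helper name and signature here are made up) would shift the one-based ids down into the zero-based matrix:

```python
import numpy as np

def build_embedding_matrix(word_to_idx, embeddings_index,
                           vocab_size, embed_size, emb_mean, emb_std):
    # Rows are zero-based; ids coming from the tokenizer are one-based,
    # so shift each id down by one (ids 1..vocab_size -> rows 0..vocab_size-1).
    embedding_matrix = np.random.normal(emb_mean, emb_std,
                                        (vocab_size, embed_size))
    for word, i in word_to_idx.items():
        row = i - 1
        if row < 0 or row >= vocab_size:
            continue
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[row] = vec
    return embedding_matrix
```

The alternative would be to keep the ids as-is and allocate vocab_size + 1 rows, but then the embedding variable in the TF graph would have to be sized the same way.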
I get this error even with `load_embeds = False`.
Found the source of this issue. Fix coming. It's actually a tensorflow version issue, I think. I don't get the issue on TF v1.5.0. Either way, I'll fix it.
That makes sense. Any details on the issue?
I believe the easy fix is to pass an `unk_token="<SKIP>"` when initializing the Keras tokenizer within nlppipe.py. Will check tonight and push. I've also made better installation instructions using a conda environment, so you can make sure you're on the same versions of everything.

The issue isn't that the Keras tokenizer starts at 1. The issue is that we don't have a token representation for idx 0, so that index gets skipped in the load_glove() loop. If you just assume everything runs from 1 to vocab_size, you get a different error in TF.
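For reference, a minimal sketch of that tokenizer change (the actual Keras keyword is `oov_token`, which is what `unk_token` refers to above; exactly which index the OOV token gets depends on the Keras version):

```python
from keras.preprocessing.text import Tokenizer

# With an OOV token, the low indices all have a token representation,
# so nothing gets skipped in the load_glove() loop.
tokenizer = Tokenizer(oov_token="<SKIP>")
tokenizer.fit_on_texts(["a tiny example corpus", "another tiny example"])
print(tokenizer.word_index)
# e.g. {'<SKIP>': 1, 'tiny': 2, 'example': 3, 'a': 4, 'corpus': 5, 'another': 6}
```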
I'm no longer getting the error: `InvalidArgumentError (see above for traceback): indices[478] = 5451 is not in [0, 5451)`
Ahh... I remember messing with that function a while back when something wasn't working. It was most likely related to this issue. Again, the TensorFlow version I'm using didn't give me the InvalidArgumentError, so I didn't know why there was a bug.

I'll take a deeper look at the topics function. I think there are probably only a couple of small tweaks that need to be made. Will close this issue for now, as this specific problem has been solved.
I'm getting this error on Epoch 1 of run_20newsgroups.py:

```
InvalidArgumentError (see above for traceback): indices[478] = 5451 is not in [0, 5451)
[[node word_embed_lookup (defined at /Users/davidlaxer/Lda2vec-Tensorflow/lda2vec/Lda2vec.py:152) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_word_embedding/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](word_embedding/read, _arg_x_pivot_idxs_0_1, word_embed_lookup/axis)]]
```

It seems that the `word_embed_lookup` op is being fed an embedding index one past the length of the `embedding_matrix`. Any ideas where this off-by-one issue could be?
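One way to localize it might be a pre-flight check on the ids before calling m.train() (a hypothetical helper, not part of the repo):

```python
import numpy as np

def check_ids(name, ids, vocab_size):
    # Every id fed to word_embed_lookup must lie in [0, vocab_size)
    # for a zero-based embedding matrix.
    ids = np.asarray(ids)
    print(f"{name}: min={ids.min()}, max={ids.max()}, vocab_size={vocab_size}")
    if ids.min() < 0 or ids.max() >= vocab_size:
        raise ValueError(f"{name} has out-of-range indices; looks like "
                         "one-based ids against a zero-based matrix")

# check_ids("pivot_ids", pivot_ids, vocab_size)
# check_ids("target_ids", target_ids, vocab_size)
```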
Similar to: https://github.com/nateraw/Lda2vec-Tensorflow/issues/5