Closed minuenergy closed 5 months ago
I fixed this by cloning the tensor before the in-place indexed assignment:
frame_event_pos_emb_level1_clone=frame_event_pos_emb_level1.clone()
frame_event_pos_emb_level1_clone[:,pos_level2_idx] = frame_event_pos_emb_level1_clone[:,pos_level2_idx] + frame_event_pos_emb_level2
and now training runs (only a deprecation warning remains):
warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
Epoch 1/15 : |███----| 47.31% [695/1469 07:12<08:01 Total Loss 7.9050 | Smooth Total Loss 7.9254 | SRL_loss 3.5246 | Loss_Vb 4.0605 | Loss_Role 0.3199]
It works, but I had to change the `view` call to `reshape` (which can use more memory) and also add a `clone()`. Is there any other way to make the original code work?
I'm able to evaluate correctly.
But when I run the command below, I get a runtime error:
CUDA_VISIBLE_DEVICES=0 python main_dist.py experiment1 --task_type=grounded_end-to-end --train.bs=16 --train.bsv=16
vidsitu_code/transformer_grounded_vsitu.py:204: UserWarning: Use of index_put_ on expanded tensors is deprecated. Please clone() the tensor before performing this operation. This also applies to advanced indexing e.g. tensor[indices] = tensor (Triggered internally at /pytorch/aten/src/ATen/native/TensorAdvancedIndexing.cpp:300.)
frame_event_pos_emb_level1[:,pos_level2_idx] = frame_event_pos_emb_level1[:,pos_level2_idx] + frame_event_pos_emb_level2
Traceback (most recent call last):
File "main_dist.py", line 169, in <module>
fire.Fire(main_dist)
File "/opt/conda/lib/python3.7/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/opt/conda/lib/python3.7/site-packages/fire/core.py", line 480, in _Fire
target=component.__name__)
File "/opt/conda/lib/python3.7/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "main_dist.py", line 157, in main_dist
launch_job(cfg, init_method="tcp://localhost:9997", func=main_fn)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR_origin/utils/trn_dist_utils.py", line 42, in launch_job
func(cfg=cfg)
File "main_dist.py", line 95, in main_fn
learn.fit(epochs=cfg.train.epochs, lr=cfg.train.lr)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR_origin/utils/trn_utils.py", line 810, in fit
raise e
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR_origin/utils/trn_utils.py", line 769, in fit
train_loss, train_acc = self.train_epoch(mb)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR_origin/utils/trn_utils.py", line 543, in train_epoch
out = self.mdl(batch)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR_origin/vidsitu_code/ground_vsitu.py", line 41, in forward
vb_pred, vb_loss, pred_roles, role_loss, grounded_nouns, bb_attn, selected_roles = self.tx_vo_ro(inp, self.cfg)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "vidsitu_code/transformer_grounded_vsitu.py", line 314, in forward
vid_obj_enc_emb, attn_mask_vid_obj = self.process_vid_obj_inputs(inp)
File "vidsitu_code/transformer_grounded_vsitu.py", line 207, in process_vid_obj_inputs
obj_event_pos_emb = obj_event_pos_emb.view(B,F_N,D)
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
So I changed the `view` call to `reshape` to get past this error, and after that I got the message below:
vidsitu_code/transformer_grounded_vsitu.py:204: UserWarning: Use of index_put_ on expanded tensors is deprecated. Please clone() the tensor before performing this operation. This also applies to advanced indexing e.g. tensor[indices] = tensor (Triggered internally at /pytorch/aten/src/ATen/native/TensorAdvancedIndexing.cpp:300.)
frame_event_pos_emb_level1[:,pos_level2_idx] = frame_event_pos_emb_level1[:,pos_level2_idx] + frame_event_pos_emb_level2
/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py:1639: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
Traceback (most recent call last):
File "main_dist.py", line 166, in <module>
fire.Fire(main_dist)
File "/opt/conda/lib/python3.7/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/opt/conda/lib/python3.7/site-packages/fire/core.py", line 480, in _Fire
target=component.__name__)
File "/opt/conda/lib/python3.7/site-packages/fire/core.py", line 691, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "main_dist.py", line 154, in main_dist
launch_job(cfg, init_method="tcp://localhost:9997", func=main_fn)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR/utils/trn_dist_utils.py", line 42, in launch_job
func(cfg=cfg)
File "main_dist.py", line 92, in main_fn
learn.fit(epochs=cfg.train.epochs, lr=cfg.train.lr)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR/utils/trn_utils.py", line 810, in fit
raise e
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR/utils/trn_utils.py", line 769, in fit
train_loss, train_acc = self.train_epoch(mb)
File "/workspace/mount/SSD_11T/minwoo/projects/LLM/GVSR/utils/trn_utils.py", line 549, in train_epoch
loss.backward()
File "/opt/conda/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: unsupported operation: more than one element of the written-to tensor refers to a single memory location. Please clone() the tensor before performing the operation.
How can I fix this?