Open QiaolingChen00 opened 2 years ago
(Pdb) p valid_outputs['prediction_scores']
tensor([[[-1.4637, -1.8279, -1.3620, ..., -3.4474, -3.5652, -3.7201],
[-2.2772, -1.4135, -0.3563, ..., -3.9874, -3.6914, -4.3784],
[-1.7881, -1.6877, -0.9608, ..., -3.8668, -3.8015, -3.8960],
...,
[16.1533, -2.0781, -2.0611, ..., -0.6250, -0.0593, -0.2079],
[15.8574, -1.7131, -2.5491, ..., 0.1721, 0.1925, -0.1640],
[15.9417, -1.9389, -2.1877, ..., -0.2055, 0.3651, -0.2933]],
[[-2.1948, -1.1345, -1.3834, ..., -3.8225, -3.1896, -4.0065],
[-2.4267, -1.2630, -1.0649, ..., -3.4871, -3.6825, -4.2588],
[-1.9069, -1.0583, -1.1420, ..., -3.6800, -3.3295, -4.2015],
...,
[15.6844, -1.7978, -2.1143, ..., -0.3307, 0.3876, -0.1023],
[15.6787, -1.8057, -1.9951, ..., -0.0831, 0.3188, 0.3700],
[15.9435, -2.0964, -1.8328, ..., 0.1360, 0.2450, -0.2655]],
[[-2.2479, -1.2012, -1.2251, ..., -3.6136, -3.3275, -4.2752],
[-2.3663, -1.2691, -1.2006, ..., -3.6112, -3.9899, -3.8147],
[-3.1647, -1.6516, -0.9750, ..., -3.8327, -3.5308, -3.9213],
...,
[15.9918, -1.8764, -2.2719, ..., -0.3094, 0.3376, 0.0662],
[15.5396, -2.2806, -1.9955, ..., -0.6294, 0.1453, -0.0919],
[15.6536, -2.0522, -2.4244, ..., -0.4132, 0.3085, 0.1209]],
[[-3.4790, -0.6961, -0.7400, ..., -3.3011, -3.3043, -4.1945],
[-5.9494, 0.6520, 1.4165, ..., -3.2651, -3.3162, -3.6195],
[-6.3337, 0.8197, 1.4651, ..., -3.2946, -3.1457, -3.5941],
...,
[15.8770, -1.9922, -2.4893, ..., -0.4149, 0.4579, -0.2464],
[15.6058, -2.1660, -2.2389, ..., -0.0682, 0.5629, 0.0409],
[16.0423, -2.1594, -2.2384, ..., -0.4900, 0.3168, -0.2644]]], device='cuda:0', dtype=oneflow.float32)
(Pdb) p valid_outputs['prediction_scores'].shape
oneflow.Size([4, 512, 23])
(Pdb) n
RuntimeError:
File "/home/ci-user/runners/release/_work/oneflow/oneflow/oneflow/core/functional/impl/array_functor.cpp", line 2865, in operator()
Check failed: ndim <= 2 RuntimeError: t() expects a tensor with <= 2 dimensions, but input tensor is 3D
> /workspace/CQL_BERT/libai/libai/evaluation/evaluator.py(227)inference_on_dataset()
-> evaluator.process(valid_data, valid_outputs)
(Pdb) w
/workspace/CQL_BERT/libai/tools/train_net.py(62)<module>()
-> main(args)
/workspace/CQL_BERT/libai/tools/train_net.py(57)main()
-> return trainer.train()
/workspace/CQL_BERT/libai/libai/engine/default.py(464)train()
-> super().train(self.start_iter, self.max_iter)
/workspace/CQL_BERT/libai/libai/engine/trainer.py(157)train()
-> self.after_train()
/workspace/CQL_BERT/libai/libai/engine/trainer.py(165)after_train()
-> h.after_train()
/workspace/CQL_BERT/libai/libai/engine/hooks.py(350)after_train()
-> self._do_eval()
/workspace/CQL_BERT/libai/libai/engine/hooks.py(317)_do_eval()
-> results = self._func()
/workspace/CQL_BERT/libai/libai/engine/default.py(409)test_and_save_results()
-> self._last_eval_results = self.test(self.cfg, self.test_loader, model)
/workspace/CQL_BERT/libai/libai/engine/default.py(748)test()
-> evaluator,
> /workspace/CQL_BERT/libai/libai/evaluation/evaluator.py(227)inference_on_dataset()
-> evaluator.process(valid_data, valid_outputs)
/workspace/CQL_BERT/libai/libai/evaluation/cls_evaluator.py(56)process()
-> topk_acc = accuracy(pred_logits, labels, topk=self.topk)
/workspace/CQL_BERT/libai/libai/evaluation/cls_evaluator.py(28)accuracy()
-> pred = pred.t()
/usr/local/miniconda3/lib/python3.7/site-packages/oneflow/framework/tensor.py(964)_t()
-> return flow._C.t(self)
# Get the valid (non-padded) sample slice of each input tensor, gathered to rank 0.
valid_data = {
    key: dist.ttol(value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]])[
        :valid_sample
    ]
    for key, value in data.items()
}

# Gather model outputs to rank 0 and keep only the valid samples for batched outputs.
valid_outputs = {}
for key, value in outputs.items():
    value = dist.ttol(value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]])
    if value.ndim > 1:
        valid_outputs[key] = value[:valid_sample]  # Slice if it's batched output
    else:
        valid_outputs[key] = value

# Reduce 3-D prediction scores (batch, seq_len, num_classes) to 2-D
# (batch, num_classes) so accuracy() can call .t() on them (t() rejects
# tensors with more than 2 dimensions — see the RuntimeError above).
# IMPORTANT: keep the dict structure intact. The original hack replaced the
# whole dict with a bare tensor, which breaks evaluator.process() — it looks
# fields up by key — and led to the SIGSEGV.
# NOTE(review): selecting sequence position 1 as the classification logits is
# taken from the original hack — confirm against the model's output layout.
# No .squeeze() here: [:, 1, :] already yields 2-D, and squeeze() would
# wrongly collapse the batch dimension when batch size is 1.
if (
    "prediction_scores" in valid_outputs
    and valid_outputs["prediction_scores"].ndim == 3
):
    valid_outputs["prediction_scores"] = valid_outputs["prediction_scores"][:, 1, :]

if flow.cuda.is_available():
    dist.synchronize()
total_compute_time += time.perf_counter() - start_compute_time
start_eval_time = time.perf_counter()
if dist.is_main_process():
    evaluator.process(valid_data, valid_outputs)
Error output:
[05/04 16:26:08 lb.evaluation.evaluator]: Start inference on 462 samples
Killing subprocess 17190
Traceback (most recent call last):
File "/usr/local/miniconda3/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/local/miniconda3/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/miniconda3/lib/python3.7/site-packages/oneflow/distributed/launch.py", line 231, in <module>
main()
File "/usr/local/miniconda3/lib/python3.7/site-packages/oneflow/distributed/launch.py", line 219, in main
sigkill_handler(signal.SIGTERM, None)
File "/usr/local/miniconda3/lib/python3.7/site-packages/oneflow/distributed/launch.py", line 188, in sigkill_handler
returncode=last_return_code, cmd=cmd
subprocess.CalledProcessError: Command '['/usr/local/miniconda3/bin/python3', '-u', 'tools/train_net.py', '--config-file', 'projects/token_classification/configs/config.py', 'train.train_iter=10']' died with <Signals.SIGSEGV: 11>.
(base) root@training-notebook-cf02d3-cf02d3-jupyter-master-0:/workspace/CQL_BERT/libai#
# Runtime/training configuration overrides for the token-classification benchmark.
train.update(
    {
        "recompute_grad": {"enabled": True},
        "output_dir": "output/benchmark/token",
        "train_micro_batch_size": 4,
        "test_micro_batch_size": 4,
        "train_epoch": 1,
        "train_iter": 0,
        "eval_period": 500,
        "log_period": 50,
        # Single-device layout: no data/tensor/pipeline parallelism.
        "dist": {
            "data_parallel_size": 1,
            "tensor_parallel_size": 1,
            "pipeline_parallel_size": 1,
        },
        # Skip evaluation entirely.
        "evaluation": {"enabled": False},
    }
)