Closed crownz-sec closed 3 years ago
发现是云平台读写文件很耗时,而且作者代码是每次测试一张图片就把结果写入对应文件,我的解决办法是将代码改成多线程,同时把测试结果先保存起来,最后一次性写入文件。后面计算mAP的Reading annotation部分代码也比较耗时,是for循环单个遍历的,代码也改成多线程了,这样的话速度快了很多。
test.py的修改部分
# Validation step: evaluate the current model on the VOC test split and
# report the per-class AP followed by the mean over all classes.
if self.__eval:
    print('*' * 20 + "Validate" + '*' * 20)
    mAP = 0
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        APs = Evaluator(self.__model).APs_voc()
        for class_name, class_ap in APs.items():
            print("{} --> mAP : {}".format(class_name, class_ap))
            mAP += class_ap
        # The mean is taken over the configured class count, not len(APs).
        mAP = mAP / self.__num_class
        print('mAP:%g' % (mAP))
evaluator.py的修改部分
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool # 线程池
from tqdm import tqdm
from collections import defaultdict
class Evaluator(object):
    """Run detection over the VOC test split with a thread pool and score it.

    Detections are buffered in memory (``final_result``) and written to the
    per-class result files once, after every image has been processed,
    instead of appending to the files after each image.

    ``get_bbox`` and ``__calc_APs`` are called here but are not part of this
    excerpt; they are expected to be defined elsewhere in the class.
    """

    # Upper bound on the number of visualised images written to disk.
    _MAX_VISUAL_IMGS = 100

    def __init__(self, model, visiual=True):
        """Read the evaluation settings from ``cfg`` and bind the model.

        Args:
            model: network to evaluate; its device is taken from the first
                parameter.
            visiual: when true, up to ``_MAX_VISUAL_IMGS`` images with drawn
                boxes are saved during evaluation.
        """
        # Local import keeps this block self-contained with respect to the
        # file's import section.
        import threading

        self.classes = cfg.DATA["CLASSES"]
        self.pred_result_path = os.path.join('data', 'results')
        self.val_data_path = os.path.join('data', 'VOC', 'VOCtest-2007')
        self.conf_thresh = cfg.TEST["CONF_THRESH"]
        self.nms_thresh = cfg.TEST["NMS_THRESH"]
        self.val_shape = cfg.TEST["TEST_IMG_SIZE"]

        self.__visiual = visiual
        self.__visual_imgs = 0

        self.model = model
        self.device = next(model.parameters()).device

        # class name -> list of result-file lines, filled by the workers.
        self.final_result = defaultdict(list)
        # Guards __visual_imgs and final_result, which all workers mutate.
        self.__lock = threading.Lock()

    def APs_voc(self):
        """Detect on every test image, dump the results, and compute the APs.

        Returns:
            Whatever ``self.__calc_APs()`` returns.
        """
        img_inds_file = os.path.join(self.val_data_path, 'ImageSets', 'Main', 'test.txt')
        with open(img_inds_file, 'r') as f:
            img_inds = [line.strip() for line in f]

        # Start from an empty result directory on every run.
        if os.path.exists(self.pred_result_path):
            shutil.rmtree(self.pred_result_path)
        os.mkdir(self.pred_result_path)

        # Drop detections left over from a previous call on this instance so
        # they are not written a second time.
        self.final_result = defaultdict(list)

        # The context manager shuts the pool down when the block exits, on
        # success or error, so worker threads do not pile up when validation
        # is run repeatedly (e.g. once per training epoch).
        with ThreadPool(multiprocessing.cpu_count()) as pool:
            with tqdm(total=len(img_inds)) as pbar:
                for _ in pool.imap_unordered(self.Single_APs_voc, img_inds):
                    pbar.update()

        # One write per class, after all images are done.
        for class_name, lines in self.final_result.items():
            result_file = os.path.join(self.pred_result_path,
                                       'comp4_det_test_' + class_name + '.txt')
            with open(result_file, 'a') as f:
                f.write(''.join(lines))

        return self.__calc_APs()

    def _reserve_visual_index(self):
        """Atomically claim the next visualisation index.

        Returns:
            The index to use in the output file name, or ``None`` once
            ``_MAX_VISUAL_IMGS`` images have been claimed.
        """
        with self.__lock:
            if self.__visual_imgs >= self._MAX_VISUAL_IMGS:
                return None
            index = self.__visual_imgs
            self.__visual_imgs += 1
            return index

    @staticmethod
    def _format_detection(img_ind, bbox, classes):
        """Turn one prediction row into ``(class_name, result_line)``.

        ``bbox`` is indexed as ``[xmin, ymin, xmax, ymax, score, class_index]``.
        Coordinates are cast to int32 and the score is printed with four
        decimals.
        """
        coor = np.array(bbox[:4], dtype=np.int32)
        score = '%.4f' % bbox[4]
        class_name = classes[int(bbox[5])]
        xmin, ymin, xmax, ymax = map(str, coor)
        line = ' '.join([img_ind, score, xmin, ymin, xmax, ymax]) + '\n'
        return class_name, line

    def Single_APs_voc(self, img_ind, multi_test=False, flip_test=False):
        """Detect on one image and buffer its result lines (pool worker).

        Args:
            img_ind: image id, i.e. the file name without ``.jpg``.
            multi_test: forwarded to ``get_bbox``.
            flip_test: forwarded to ``get_bbox``.
        """
        img_path = os.path.join(self.val_data_path, 'JPEGImages', img_ind+'.jpg')
        img = cv2.imread(img_path)
        bboxes_prd = self.get_bbox(img, multi_test, flip_test)

        if bboxes_prd.shape[0] != 0 and self.__visiual:
            # Claim the index before drawing so that two workers can never
            # write to the same output file.
            visual_index = self._reserve_visual_index()
            if visual_index is not None:
                boxes = bboxes_prd[..., :4]
                class_inds = bboxes_prd[..., 5].astype(np.int32)
                scores = bboxes_prd[..., 4]
                visualize_boxes(image=img, boxes=boxes, labels=class_inds,
                                probs=scores, class_labels=self.classes)
                path = os.path.join(cfg.PROJECT_PATH,
                                    "data/results/{}.jpg".format(visual_index))
                cv2.imwrite(path, img)

        # Format outside the lock, then publish this image's lines in one
        # short critical section.
        per_class = defaultdict(list)
        for bbox in bboxes_prd:
            class_name, line = self._format_detection(img_ind, bbox, self.classes)
            per_class[class_name].append(line)

        with self.__lock:
            for class_name, lines in per_class.items():
                self.final_result[class_name].extend(lines)
voc_eval.py的修改部分
import multiprocessing
from tqdm import tqdm
from multiprocessing.dummy import Pool as ThreadPool # 线程池
# Parsed annotations keyed by image name. parse_rec() fills this table so
# that thread-pool workers can share a single result store.
recs = {}


def parse_rec(param):
    """Parse one PASCAL VOC annotation XML file.

    Args:
        param: dict with two keys:
            'path' - format template; ``path.format(img)`` is the XML file.
            'img'  - image name substituted into the template.

    Returns:
        A list with one dict per ``<object>`` element, holding 'name',
        'pose', 'truncated', 'difficult' and 'bbox'
        (``[xmin, ymin, xmax, ymax]`` as ints). The same list is also stored
        in the module-level ``recs`` under ``param['img']``, so callers that
        only read ``recs`` keep working.
    """
    img = param['img']
    tree = ET.parse(param['path'].format(img))

    objects = []
    for obj in tree.findall('object'):
        bndbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            'pose': obj.find('pose').text,
            'truncated': int(obj.find('truncated').text),
            'difficult': int(obj.find('difficult').text),
            'bbox': [int(bndbox.find(tag).text)
                     for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })

    recs[img] = objects
    return objects
def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=False):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                [ovthresh],
                                [use_07_metric])
    Top level function that does the PASCAL VOC evaluation.
    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Path to annotations
        annopath.format(imagename) should be the xml annotations file.
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name (duh)
    cachedir: Directory for caching the annotations
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
        (default False)

    Only the annotation-loading stage is shown in this excerpt: it fills the
    module-level ``recs`` table, either by parsing every XML file with a
    thread pool (and caching the result) or by loading the cached pickle.
    """
    # assumes detections are in detpath.format(classname)
    # assumes annotations are in annopath.format(imagename)
    # assumes imagesetfile is a text file with each line an image name
    # cachedir caches the annotations in a pickle file

    # Declared up front: this function both clears and rebinds the
    # module-level table that parse_rec() writes into.
    global recs

    # first load gt
    os.makedirs(cachedir, exist_ok=True)
    cachefile = os.path.join(cachedir, 'annots.pkl')

    # read list of images
    with open(imagesetfile, 'r') as f:
        imagenames = [x.strip() for x in f]

    print("Reading annotation...")
    if not os.path.isfile(cachefile):
        # Drop entries left over from an earlier evaluation in this process
        # so they are not pickled into the new cache.
        recs.clear()

        # load annots
        param_list = [{'path': annopath, 'img': imagename}
                      for imagename in imagenames]
        # The context manager shuts the pool down when the block exits, so
        # its worker threads do not outlive this call.
        with ThreadPool(multiprocessing.cpu_count()) as pool:
            with tqdm(total=len(imagenames)) as pbar:
                for _ in pool.imap_unordered(parse_rec, param_list):
                    pbar.update()

        # save
        print ('Saving cached annotations to {:s}'.format(cachefile))
        with open(cachefile, 'wb') as f:
            pickle.dump(recs, f)
    else:
        # load
        # NOTE(review): pickle.load executes whatever the file contains;
        # cachedir must only ever hold caches written by this function.
        with open(cachefile, 'rb') as f:
            recs = pickle.load(f)

    # NOTE: the excerpt ends here. The docstring promises (rec, prec, ap),
    # so the matching/AP computation presumably follows in the full
    # function - it is not shown and is not changed by this edit.
因为也是刚接触多线程,代码可能多少有些问题,还请大家批评指正。
您好,我参考了您修改后的代码,能够成功运行,validate的速度也有了明显提升。 但是在成功validate几次之后,出现了如下报错
Traceback (most recent call last):
File "train.py", line 160, in <module>
gpu_id=opt.gpu_id).train()
File "train.py", line 140, in train
APs = Evaluator(self.yolov3).APs_voc()
File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 51, in APs_voc
for i, _ in enumerate(pool.imap_unordered(self.Single_APs_voc, img_inds)):
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/multiprocessing/pool.py", line 731, in next
raise value
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/multiprocessing/pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 68, in Single_APs_voc
bboxes_prd = self.get_bbox(img, multi_test, flip_test)
File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 106, in get_bbox
bboxes = self.__predict(img, self.val_shape, (0, np.inf))
File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 119, in __predict
_, p_d = self.model(img)
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/root/boyuansun/YOLOV3/model/yolov3.py", line 50, in forward
x_s, x_m, x_l = self.__backnone(x)
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/root/boyuansun/YOLOV3/model/backbones/darknet53.py", line 53, in forward
x = self.__conv(x)
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/root/boyuansun/YOLOV3/model/layers/conv_module.py", line 36, in forward
x = self.__conv(x)
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 353, in forward
return self._conv_forward(input, self.weight)
File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 350, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
terminate called without an active exception
我认为该报错可能是由于显存没有得到及时释放引起的,在多次validate之后,就有可能出现上述错误。请问您有好的解决方案吗?
您好,我参考了您修改后的代码,能够成功运行,validate的速度也有了明显提升。 但是在成功validate几次之后,出现了如下报错
Traceback (most recent call last): File "train.py", line 160, in <module> gpu_id=opt.gpu_id).train() File "train.py", line 140, in train APs = Evaluator(self.yolov3).APs_voc() File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 51, in APs_voc for i, _ in enumerate(pool.imap_unordered(self.Single_APs_voc, img_inds)): File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/multiprocessing/pool.py", line 731, in next raise value File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/multiprocessing/pool.py", line 119, in worker result = (True, func(*args, **kwds)) File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 68, in Single_APs_voc bboxes_prd = self.get_bbox(img, multi_test, flip_test) File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 106, in get_bbox bboxes = self.__predict(img, self.val_shape, (0, np.inf)) File "/root/boyuansun/YOLOV3/eval/evaluator.py", line 119, in __predict _, p_d = self.model(img) File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "/root/boyuansun/YOLOV3/model/yolov3.py", line 50, in forward x_s, x_m, x_l = self.__backnone(x) File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "/root/boyuansun/YOLOV3/model/backbones/darknet53.py", line 53, in forward x = self.__conv(x) File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "/root/boyuansun/YOLOV3/model/layers/conv_module.py", line 36, in forward x = self.__conv(x) File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/module.py", line 550, in __call__ result = self.forward(*input, **kwargs) File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 353, in forward return self._conv_forward(input, 
self.weight) File "/root/anaconda3/envs/pytorch1.5/lib/python3.5/site-packages/torch/nn/modules/conv.py", line 350, in _conv_forward self.padding, self.dilation, self.groups) RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED terminate called without an active exception
我认为该报错可能是由于显存没有得到及时释放引起的,在多次validate之后,就有可能出现上述错误。请问您有好的解决方案吗?
因为我没有在训练过程中使用这个代码,只是拿来最终测试了一下,所以也没有碰到你这个问题。我在这里找到了一些解决方案,可能减少batch_size是一个好点的解决方案。 也可以去pytorch的论坛用英文关键词找一下,有很多细节问题都可以在上面找到。
我的设备:云平台的4核cpu,24g内存,2080ti-11g显卡。 训练阶段,gpu正常使用,利用率100%。但是在测试时,gpu利用率0%,速度特别慢。之前的相关issues我也看了,但没有找到相关的解决办法。 之后在自己没有gpu的电脑上使用best.pt权重测试,发现测试一张图片的时间为1-2s,但在云平台测试一张图片的时间为15-25s。
注:multi_scale和flip都没有开