Hello, I encountered an issue while using MMAction2:
KeyError: 'Recognizer3D is not in the mmdet::model registry. Please check whether the value of Recognizer3D is correct or it was registered as expected. More details can be found at https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module'
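The failing lookup is the `MODELS.build(config.model)` call near the end of the demo script below. For reference, this is a minimal sketch of the workaround I am considering (an assumption on my part that pinning the default registry scope with mmengine's `init_default_scope` is the right mechanism; I have not verified it is the intended fix):

```python
# Sketch only (assumption): make the 'mmaction' scope active so that MODELS.build()
# resolves 'Recognizer3D' in mmaction's registry instead of mmdet's.
from mmengine.registry import init_default_scope
from mmaction.registry import MODELS

init_default_scope('mmaction')      # e.g. re-pin the scope after detection_inference()
model = MODELS.build(config.model)  # 'config' loaded via mmengine.Config.fromfile(...)
```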
This is my config:
_base_ = [
    '../../_base_/models/mvit_small.py', '../../_base_/default_runtime.py'
]

model = dict(
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        blending=dict(
            type='RandomBatchAugment',
            augments=[
                dict(type='MixupBlending', alpha=0.8, num_classes=800),
                dict(type='CutmixBlending', alpha=1, num_classes=800)
            ]),
        format_shape='NCTHW'))
model = dict(cls_head=dict(num_classes=3))

load_from = '/home/miao/mmaction2/work_dirs/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/best_acc_top1_epoch_16.pth'

# dataset settings
dataset_type = 'VideoDataset'
data_root = '/home/miao/mmaction2/data/kinetics400_tiny/train/'
data_root_val = '/home/miao/mmaction2/data/kinetics400_tiny/val/'
ann_file_train = '/home/miao/mmaction2/data/kinetics400_tiny/kinetics_tiny_train_video.txt'
ann_file_val = '/home/miao/mmaction2/data/kinetics400_tiny/kinetics_tiny_val_video.txt'
ann_file_test = '/home/miao/mmaction2/data/kinetics400_tiny/kinetics_tiny_val_list_video.txt'

file_client_args = dict(io_backend='disk')
train_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='PytorchVideoWrapper', op='RandAugment', magnitude=7,
        num_layers=4),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='RandomErasing', erase_prob=0.25, mode='rand'),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames', clip_len=16, frame_interval=4, num_clips=5,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

repeat_sample = 2
train_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    collate_fn=dict(type='repeat_pseudo_collate'),
    dataset=dict(
        type='RepeatAugDataset',
        num_repeats=repeat_sample,
        sample_once=True,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator

train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

base_lr = 1.6e-3
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
    paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
    clip_grad=dict(max_norm=1, norm_type=2))

param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.01,
        by_epoch=True,
        begin=0,
        end=30,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        T_max=200,
        eta_min=base_lr / 100,
        by_epoch=True,
        begin=30,
        end=200,
        convert_to_iter_based=True)
]

default_hooks = dict(
    checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically or not by default.
# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=True, base_batch_size=2 // repeat_sample)
This is mvit_small.py:
model = dict(
    type='Recognizer3D',
    backbone=dict(type='MViT', arch='small', drop_path_rate=0.2),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type='MViTHead',
        in_channels=768,
        num_classes=400,
        label_smooth_eps=0.1,
        average_clips='prob'))
This is the code I run:
# --- Multi-target, long-video prediction with MViT ---
import argparse
import copy as cp
import tempfile
import cv2
import mmcv
import mmengine
import numpy as np
import torch
from mmengine import DictAction
from mmengine.runner import load_checkpoint
from mmengine.structures import InstanceData
from mmaction.apis import detection_inference
from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.utils import frame_extract, get_str_type
try:
import moviepy.editor as mpy
except ImportError:
raise ImportError('Please install moviepy to enable output file')
# --- Color and visualization parameters ---
FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255) # BGR, white
MSGCOLOR = (128, 128, 128) # BGR, gray
THICKNESS = 1
LINETYPE = 1
def hex2color(h):
"""Convert the 6-digit hex string to tuple of 3 int value (RGB)"""
return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))
plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
plate_blue = plate_blue.split('-')
plate_blue = [hex2color(h) for h in plate_blue]
plate_green = '004b23-006400-007200-008000-38b000-70e000'
plate_green = plate_green.split('-')
plate_green = [hex2color(h) for h in plate_green]
# --- Visualization ---
def visualize(frames, annotations, plate=plate_blue, max_num=5):
assert max_num + 1 <= len(plate)
plate = [x[::-1] for x in plate]
frames_out = cp.deepcopy(frames)
nf, na = len(frames), len(annotations)
assert nf % na == 0
nfpa = len(frames) // len(annotations)
anno = None
h, w, _ = frames[0].shape
scale_ratio = np.array([w, h, w, h])
for i in range(na):
anno = annotations[i]
if anno is None:
continue
for j in range(nfpa):
ind = i * nfpa + j
frame = frames_out[ind]
for ann in anno:
box = ann[0]
label = ann[1]
if not len(label):
continue
score = ann[2]
box = (box * scale_ratio).astype(np.int64)
st, ed = tuple(box[:2]), tuple(box[2:])
cv2.rectangle(frame, st, ed, plate[0], 2)
for k, lb in enumerate(label):
if k >= max_num:
break
text = abbrev(lb)
text = ': '.join([text, f'{score[k]:>.2f}'])
location = (0 + st[0], 18 + k * 18 + st[1])
textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
THICKNESS)[0]
textwidth = textsize[0]
diag0 = (location[0] + textwidth, location[1] - 14)
diag1 = (location[0], location[1] + 2)
cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
FONTCOLOR, THICKNESS, LINETYPE)
return frames_out
# --- Load the label map from a file ---
def load_label_map(file_path):
lines = open(file_path).readlines()
lines = [x.strip().split(': ') for x in lines]
return {int(x[0]): x[1] for x in lines}
# --- Abbreviate label names ---
def abbrev(name):
while name.find('(') != -1:
st, ed = name.find('('), name.find(')')
name = name[:st] + '...' + name[ed + 1:]
return name
# --- Pack and normalize the prediction results ---
def pack_result(human_detection, result, img_h, img_w):
human_detection[:, 0::2] /= img_w
human_detection[:, 1::2] /= img_h
results = []
if result is None:
return None
for prop, res in zip(human_detection, result):
res.sort(key=lambda x: -x[1])
results.append(
(prop.data.cpu().numpy(), [x[0] for x in res], [x[1]
for x in res]))
return results
# --- Arguments ---
def parse_args():
parser = argparse.ArgumentParser(description='MMAction2 demo')
# video file or URL
parser.add_argument('video', help='video file/url')
# output filename
parser.add_argument('out_filename', help='output filename')
# recognition model config file path
parser.add_argument(
'--config',
default=('/home/miao/mmaction2/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py'),
help='detection model config file path')
# recognition model checkpoint file/URL
parser.add_argument(
'--checkpoint',
default=('/home/miao/mmaction2/work_dirs/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/best_acc_top1_epoch_16.pth'),
help='spatiotemporal detection model checkpoint file/url')
# human detection (mmdet) config file path
parser.add_argument(
'--det-config',
default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py',
help='human detection config file path (from mmdet)')
# human detection (mmdet) checkpoint file/URL
parser.add_argument(
'--det-checkpoint',
default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
'faster_rcnn_r50_fpn_2x_coco/'
'faster_rcnn_r50_fpn_2x_coco_'
'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
help='human detection checkpoint file/url')
# human detection score threshold
parser.add_argument(
'--det-score-thr',
type=float,
default=0.9,
help='the threshold of human detection score')
# category id for human detection
parser.add_argument(
'--det-cat-id',
type=int,
default=0,
help='the category id for human detection')
# action score threshold
parser.add_argument(
'--action-score-thr',
type=float,
default=0.5,
help='the threshold of human action score')
# label map file
parser.add_argument(
'--label-map',
default='tools/data/ava/label_map.txt',
help='label map file')
# device option
parser.add_argument(
'--device', type=str, default='cuda:0', help='CPU/CUDA device option')
# short-side length of the image
parser.add_argument(
'--short-side',
type=int,
default=256,
help='specify the short-side length of the image')
# make a prediction every n frames
parser.add_argument(
'--predict-stepsize',
default=8,
type=int,
help='give out a prediction per n frames')
# show one output frame every n frames
parser.add_argument(
'--output-stepsize',
default=4,
type=int,
help=('show one frame per n frames in the demo, we should have: '
'predict_stepsize % output_stepsize == 0'))
# fps of the output video
parser.add_argument(
'--output-fps',
default=6,
type=int,
help='the fps of demo video output')
# override some settings in the config file
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
# parse the arguments
args = parser.parse_args()
return args
def main():
args = parse_args()
tmp_dir = tempfile.TemporaryDirectory()
frame_paths, original_frames = frame_extract(
args.video, out_dir=tmp_dir.name)
num_frame = len(frame_paths)
h, w, _ = original_frames[0].shape
# resize frames to shortside
new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf))
frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
w_ratio, h_ratio = new_w / w, new_h / h
# Get clip_len, frame_interval and calculate center index of each clip
config = mmengine.Config.fromfile(args.config)
config.merge_from_dict(args.cfg_options)
val_pipeline = config.val_pipeline
sampler = [
x for x in val_pipeline if get_str_type(x['type']) == 'SampleFrames'][0]
clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
window_size = clip_len * frame_interval
assert clip_len % 2 == 0, 'We would like to have an even clip_len'
# Note that it's 1 based here
timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
args.predict_stepsize)
# Load label_map
label_map = load_label_map(args.label_map)
try:
if config['data']['train']['custom_classes'] is not None:
label_map = {
id + 1: label_map[cls]
for id, cls in enumerate(config['data']['train']
['custom_classes'])
}
except KeyError:
pass
# Get Human detection results
center_frames = [frame_paths[ind - 1] for ind in timestamps]
human_detections, _ = detection_inference(args.det_config,
args.det_checkpoint,
center_frames,
args.det_score_thr,
args.det_cat_id, args.device)
torch.cuda.empty_cache()
for i in range(len(human_detections)):
det = human_detections[i]
det[:, 0:4:2] *= w_ratio
det[:, 1:4:2] *= h_ratio
human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)
# Build the model
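# NOTE: the KeyError quoted above is raised by the MODELS.build() call below;
# my guess (unverified) is that the earlier detection_inference() call leaves the
# 'mmdet' default scope active, so 'Recognizer3D' gets looked up in mmdet's registry.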
try:
config['model']['test_cfg']['rcnn'] = dict(action_thr=0)
except KeyError:
pass
config.model.backbone.pretrained = None
model = MODELS.build(config.model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
model.to(args.device)
model.eval()
predictions = []
img_norm_cfg = dict(
mean=np.array(config.model.data_preprocessor.mean),
std=np.array(config.model.data_preprocessor.std),
to_rgb=False)
print('Performing Action Detection for each clip')
assert len(timestamps) == len(human_detections)
prog_bar = mmengine.ProgressBar(len(timestamps))
for timestamp, proposal in zip(timestamps, human_detections):
if proposal.shape[0] == 0:
predictions.append(None)
continue
start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
frame_inds = start_frame + np.arange(0, window_size, frame_interval)
frame_inds = list(frame_inds - 1)
imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
_ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
# THWC -> CTHW -> 1CTHW
input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
input_tensor = torch.from_numpy(input_array).to(args.device)
datasample = ActionDataSample()
datasample.proposals = InstanceData(bboxes=proposal)
datasample.set_metainfo(dict(img_shape=(new_h, new_w)))
with torch.no_grad():
result = model(input_tensor, [datasample], mode='predict')
scores = result[0].pred_instances.scores
prediction = []
# N proposals
for i in range(proposal.shape[0]):
prediction.append([])
# Perform action score thr
for i in range(scores.shape[1]):
if i not in label_map:
continue
for j in range(proposal.shape[0]):
if scores[j, i] > args.action_score_thr:
prediction[j].append((label_map[i], scores[j,
i].item()))
predictions.append(prediction)
prog_bar.update()
results = []
for human_detection, prediction in zip(human_detections, predictions):
results.append(pack_result(human_detection, prediction, new_h, new_w))
def dense_timestamps(timestamps, n):
"""Make it nx frames."""
old_frame_interval = (timestamps[1] - timestamps[0])
start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
new_frame_inds = np.arange(
len(timestamps) * n) * old_frame_interval / n + start
return new_frame_inds.astype(np.int64)
dense_n = int(args.predict_stepsize / args.output_stepsize)
frames = [
cv2.imread(frame_paths[i - 1])
for i in dense_timestamps(timestamps, dense_n)
]
print('Performing visualization')
vis_frames = visualize(frames, results)
vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
fps=args.output_fps)
vid.write_videofile(args.out_filename)
tmp_dir.cleanup()
if __name__ == '__main__':
main()
Thank you very much to anyone who replies.