open-mmlab / mmdetection

OpenMMLab Detection Toolbox and Benchmark
https://mmdetection.readthedocs.io
Apache License 2.0

'Recognizer3D' is not in the mmdet::model registry. #11864

Open Ktour123 opened 4 months ago

Ktour123 commented 4 months ago

Hello, I encountered an issue while using MMAction2:

KeyError: 'Recognizer3D is not in the mmdet::model registry. Please check whether the value of `Recognizer3D` is correct or it was registered as expected. More details can be found at https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module'
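From the registry tutorial linked in the error, my understanding is that which registry `Recognizer3D` is looked up in depends on the default scope that is active when `MODELS.build` is called. A rough sketch of what I mean (I am assuming mmengine's `init_default_scope` here and that `'mmaction'` is the scope MMAction2 registers its models under; the config path is only for illustration):

```python
import mmengine
from mmengine.registry import init_default_scope

from mmaction.registry import MODELS

# Illustrative config path; in my case it is the MViT config shown below.
config = mmengine.Config.fromfile(
    'configs/recognition/mvit/'
    'mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py')

# Make 'mmaction' the default scope before building, so 'Recognizer3D' is
# resolved from the mmaction::model registry instead of mmdet::model.
# (Assumption: 'mmdet' may still be the default scope after running the
# human detector earlier in the script.)
init_default_scope('mmaction')
model = MODELS.build(config.model)
```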

This is my config:

_base_ = [
    '../../_base_/models/mvit_small.py',
    '../../_base_/default_runtime.py'
]

model = dict(
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[114.75, 114.75, 114.75],
        std=[57.375, 57.375, 57.375],
        blending=dict(
            type='RandomBatchAugment',
            augments=[
                dict(type='MixupBlending', alpha=0.8, num_classes=800),
                dict(type='CutmixBlending', alpha=1, num_classes=800)
            ]),
        format_shape='NCTHW'),
)
model = dict(cls_head=dict(num_classes=3))

"/home/miao/mmaction2/data/"

load_from = '/home/miao/mmaction2/work_dirs/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/best_acc_top1_epoch_16.pth'

# dataset settings
dataset_type = 'VideoDataset'
data_root = '/home/miao/mmaction2/data/kinetics400_tiny/train/'
data_root_val = '/home/miao/mmaction2/data/kinetics400_tiny/val/'
ann_file_train = '/home/miao/mmaction2/data/kinetics400_tiny/kinetics_tiny_train_video.txt'
ann_file_val = '/home/miao/mmaction2/data/kinetics400_tiny/kinetics_tiny_val_video.txt'
ann_file_test = '/home/miao/mmaction2/data/kinetics400_tiny/kinetics_tiny_val_list_video.txt'

file_client_args = dict(io_backend='disk')

train_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(
        type='PytorchVideoWrapper',
        op='RandAugment',
        magnitude=7,
        num_layers=4),
    dict(type='RandomResizedCrop'),
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='RandomErasing', erase_prob=0.25, mode='rand'),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
val_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=16,
        frame_interval=4,
        num_clips=1,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(
        type='SampleFrames',
        clip_len=16,
        frame_interval=4,
        num_clips=5,
        test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]

repeat_sample = 2
train_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    collate_fn=dict(type='repeat_pseudo_collate'),
    dataset=dict(
        type='RepeatAugDataset',
        num_repeats=repeat_sample,
        sample_once=True,
        ann_file=ann_file_train,
        data_prefix=dict(video=data_root),
        pipeline=train_pipeline))
val_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_val,
        data_prefix=dict(video=data_root_val),
        pipeline=val_pipeline,
        test_mode=True))
test_dataloader = dict(
    batch_size=1,
    num_workers=8,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        ann_file=ann_file_test,
        data_prefix=dict(video=data_root_val),
        pipeline=test_pipeline,
        test_mode=True))

val_evaluator = dict(type='AccMetric')
test_evaluator = val_evaluator

train_cfg = dict(
    type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=1)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

base_lr = 1.6e-3
optim_wrapper = dict(
    optimizer=dict(
        type='AdamW', lr=base_lr, betas=(0.9, 0.999), weight_decay=0.05),
    paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0),
    clip_grad=dict(max_norm=1, norm_type=2))

param_scheduler = [
    dict(
        type='LinearLR',
        start_factor=0.01,
        by_epoch=True,
        begin=0,
        end=30,
        convert_to_iter_based=True),
    dict(
        type='CosineAnnealingLR',
        T_max=200,
        eta_min=base_lr / 100,
        by_epoch=True,
        begin=30,
        end=200,
        convert_to_iter_based=True)
]

default_hooks = dict(
    checkpoint=dict(interval=3, max_keep_ckpts=5), logger=dict(interval=100))

# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
#   or not by default.
# - `base_batch_size` = (8 GPUs) x (8 samples per GPU).
auto_scale_lr = dict(enable=True, base_batch_size=2 // repeat_sample)

This is mvit_small.py:

model = dict(
    type='Recognizer3D',
    backbone=dict(type='MViT', arch='small', drop_path_rate=0.2),
    data_preprocessor=dict(
        type='ActionDataPreprocessor',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        format_shape='NCTHW'),
    cls_head=dict(
        type='MViTHead',
        in_channels=768,
        num_classes=400,
        label_smooth_eps=0.1,
        average_clips='prob'))

This is the code I run:

# --- Use MViT for multi-target, long-video prediction ---
import argparse
import copy as cp
import tempfile

import cv2
import mmcv
import mmengine
import numpy as np
import torch
from mmengine import DictAction
from mmengine.runner import load_checkpoint
from mmengine.structures import InstanceData

from mmaction.apis import detection_inference
from mmaction.registry import MODELS
from mmaction.structures import ActionDataSample
from mmaction.utils import frame_extract, get_str_type

try:
    import moviepy.editor as mpy
except ImportError:
    raise ImportError('Please install moviepy to enable output file')

# --- Color and visualization parameters ---

FONTFACE = cv2.FONT_HERSHEY_DUPLEX
FONTSCALE = 0.5
FONTCOLOR = (255, 255, 255)  # BGR, white
MSGCOLOR = (128, 128, 128)  # BGR, gray
THICKNESS = 1
LINETYPE = 1


def hex2color(h):
    """Convert the 6-digit hex string to tuple of 3 int value (RGB)."""
    return (int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16))


plate_blue = '03045e-023e8a-0077b6-0096c7-00b4d8-48cae4'
plate_blue = plate_blue.split('-')
plate_blue = [hex2color(h) for h in plate_blue]
plate_green = '004b23-006400-007200-008000-38b000-70e000'
plate_green = plate_green.split('-')
plate_green = [hex2color(h) for h in plate_green]

# --- Visualization ---

def visualize(frames, annotations, plate=plate_blue, max_num=5):
    assert max_num + 1 <= len(plate)
    plate = [x[::-1] for x in plate]
    frames_out = cp.deepcopy(frames)
    nf, na = len(frames), len(annotations)
    assert nf % na == 0
    nfpa = len(frames) // len(annotations)
    anno = None
    h, w, _ = frames[0].shape
    scale_ratio = np.array([w, h, w, h])
    for i in range(na):
        anno = annotations[i]
        if anno is None:
            continue
        for j in range(nfpa):
            ind = i * nfpa + j
            frame = frames_out[ind]
            for ann in anno:
                box = ann[0]
                label = ann[1]
                if not len(label):
                    continue
                score = ann[2]
                box = (box * scale_ratio).astype(np.int64)
                st, ed = tuple(box[:2]), tuple(box[2:])
                cv2.rectangle(frame, st, ed, plate[0], 2)
                for k, lb in enumerate(label):
                    if k >= max_num:
                        break
                    text = abbrev(lb)
                    text = ': '.join([text, f'{score[k]:>.2f}'])
                    location = (0 + st[0], 18 + k * 18 + st[1])
                    textsize = cv2.getTextSize(text, FONTFACE, FONTSCALE,
                                               THICKNESS)[0]
                    textwidth = textsize[0]
                    diag0 = (location[0] + textwidth, location[1] - 14)
                    diag1 = (location[0], location[1] + 2)
                    cv2.rectangle(frame, diag0, diag1, plate[k + 1], -1)
                    cv2.putText(frame, text, location, FONTFACE, FONTSCALE,
                                FONTCOLOR, THICKNESS, LINETYPE)

    return frames_out

# --- Load the label map from a file ---

def load_label_map(file_path):
    lines = open(file_path).readlines()
    lines = [x.strip().split(': ') for x in lines]
    return {int(x[0]): x[1] for x in lines}

# --- Abbreviate long label names ---

def abbrev(name):
    while name.find('(') != -1:
        st, ed = name.find('('), name.find(')')
        name = name[:st] + '...' + name[ed + 1:]
    return name

# --- Pack and normalize prediction results ---

def pack_result(human_detection, result, img_h, img_w):
    human_detection[:, 0::2] /= img_w
    human_detection[:, 1::2] /= img_h
    results = []
    if result is None:
        return None
    for prop, res in zip(human_detection, result):
        res.sort(key=lambda x: -x[1])
        results.append((prop.data.cpu().numpy(), [x[0] for x in res],
                        [x[1] for x in res]))
    return results

# --- Command-line arguments ---

def parse_args():
    parser = argparse.ArgumentParser(description='MMAction2 demo')
    # video file or URL
    parser.add_argument('video', help='video file/url')
    # output filename
    parser.add_argument('out_filename', help='output filename')
    # detection model config file path
    parser.add_argument(
        '--config',
        default=('/home/miao/mmaction2/configs/recognition/mvit/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb.py'),
        help='detection model config file path')
    # detection model ckpt file/URL
    parser.add_argument(
        '--checkpoint',
        default=('/home/miao/mmaction2/work_dirs/mvit-small-p244_32xb16-16x4x1-200e_kinetics400-rgb/best_acc_top1_epoch_16.pth'),
        help='spatialtemporal detection model checkpoint file/url')
    # human detection config file path
    parser.add_argument(
        '--det-config',
        default='demo/demo_configs/faster-rcnn_r50_fpn_2x_coco_infer.py',
        help='human detection config file path (from mmdet)')
    # human detection ckpt file/URL
    parser.add_argument(
        '--det-checkpoint',
        default=('http://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/'
                 'faster_rcnn_r50_fpn_2x_coco/'
                 'faster_rcnn_r50_fpn_2x_coco_'
                 'bbox_mAP-0.384_20200504_210434-a5d8aa15.pth'),
        help='human detection checkpoint file/url')
    # detection score threshold
    parser.add_argument(
        '--det-score-thr',
        type=float,
        default=0.9,
        help='the threshold of human detection score')
    # detection category ID
    parser.add_argument(
        '--det-cat-id',
        type=int,
        default=0,
        help='the category id for human detection')
    # action score threshold
    parser.add_argument(
        '--action-score-thr',
        type=float,
        default=0.5,
        help='the threshold of human action score')
    # label map file
    parser.add_argument(
        '--label-map',
        default='tools/data/ava/label_map.txt',
        help='label map file')
    # device option
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='CPU/CUDA device option')
    # specify the short-side length of the image
    parser.add_argument(
        '--short-side',
        type=int,
        default=256,
        help='specify the short-side length of the image')
    # make a prediction every n frames
    parser.add_argument(
        '--predict-stepsize',
        default=8,
        type=int,
        help='give out a prediction per n frames')
    # show one output frame every n frames
    parser.add_argument(
        '--output-stepsize',
        default=4,
        type=int,
        help=('show one frame per n frames in the demo, we should have: '
              'predict_stepsize % output_stepsize == 0'))
    # frame rate of the output video
    parser.add_argument(
        '--output-fps',
        default=6,
        type=int,
        help='the fps of demo video output')
    # override some settings in the config file
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    # parse arguments
    args = parser.parse_args()
    return args

def main():
    args = parse_args()

    tmp_dir = tempfile.TemporaryDirectory()
    frame_paths, original_frames = frame_extract(
        args.video, out_dir=tmp_dir.name)
    num_frame = len(frame_paths)
    h, w, _ = original_frames[0].shape

    # resize frames to shortside
    new_w, new_h = mmcv.rescale_size((w, h), (args.short_side, np.Inf))
    frames = [mmcv.imresize(img, (new_w, new_h)) for img in original_frames]
    w_ratio, h_ratio = new_w / w, new_h / h

    # Get clip_len, frame_interval and calculate center index of each clip
    config = mmengine.Config.fromfile(args.config)
    config.merge_from_dict(args.cfg_options)
    val_pipeline = config.val_pipeline

    sampler = [
        x for x in val_pipeline if get_str_type(x['type']) == 'SampleFrames'
    ][0]
    clip_len, frame_interval = sampler['clip_len'], sampler['frame_interval']
    window_size = clip_len * frame_interval
    assert clip_len % 2 == 0, 'We would like to have an even clip_len'
    # Note that it's 1 based here
    timestamps = np.arange(window_size // 2, num_frame + 1 - window_size // 2,
                           args.predict_stepsize)

    # Load label_map
    label_map = load_label_map(args.label_map)
    try:
        if config['data']['train']['custom_classes'] is not None:
            label_map = {
                id + 1: label_map[cls]
                for id, cls in enumerate(config['data']['train']
                                         ['custom_classes'])
            }
    except KeyError:
        pass

    # Get Human detection results
    center_frames = [frame_paths[ind - 1] for ind in timestamps]

    human_detections, _ = detection_inference(args.det_config,
                                              args.det_checkpoint,
                                              center_frames,
                                              args.det_score_thr,
                                              args.det_cat_id, args.device)
    torch.cuda.empty_cache()
    for i in range(len(human_detections)):
        det = human_detections[i]
        det[:, 0:4:2] *= w_ratio
        det[:, 1:4:2] *= h_ratio
        human_detections[i] = torch.from_numpy(det[:, :4]).to(args.device)

    # Build the model
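    # Note: the MODELS.build call below is presumably where the reported
    # "'Recognizer3D' is not in the mmdet::model registry" KeyError comes
    # from; the default scope may still be 'mmdet' after detection_inference
    # above (this is my assumption, not something I have verified).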

    try:
        config['model']['test_cfg']['rcnn'] = dict(action_thr=0)
    except KeyError:
        pass
    config.model.backbone.pretrained = None
    model = MODELS.build(config.model)
    load_checkpoint(model, args.checkpoint, map_location='cpu')
    model.to(args.device)
    model.eval()

    predictions = []

    img_norm_cfg = dict(
        mean=np.array(config.model.data_preprocessor.mean),
        std=np.array(config.model.data_preprocessor.std),
        to_rgb=False)

    print('Performing Action Detection for each clip')
    assert len(timestamps) == len(human_detections)
    prog_bar = mmengine.ProgressBar(len(timestamps))
    for timestamp, proposal in zip(timestamps, human_detections):
        if proposal.shape[0] == 0:
            predictions.append(None)
            continue

        start_frame = timestamp - (clip_len // 2 - 1) * frame_interval
        frame_inds = start_frame + np.arange(0, window_size, frame_interval)
        frame_inds = list(frame_inds - 1)
        imgs = [frames[ind].astype(np.float32) for ind in frame_inds]
        _ = [mmcv.imnormalize_(img, **img_norm_cfg) for img in imgs]
        # THWC -> CTHW -> 1CTHW
        input_array = np.stack(imgs).transpose((3, 0, 1, 2))[np.newaxis]
        input_tensor = torch.from_numpy(input_array).to(args.device)

        datasample = ActionDataSample()
        datasample.proposals = InstanceData(bboxes=proposal)
        datasample.set_metainfo(dict(img_shape=(new_h, new_w)))
        with torch.no_grad():
            result = model(input_tensor, [datasample], mode='predict')
            scores = result[0].pred_instances.scores
            prediction = []
            # N proposals
            for i in range(proposal.shape[0]):
                prediction.append([])
            # Perform action score thr
            for i in range(scores.shape[1]):
                if i not in label_map:
                    continue
                for j in range(proposal.shape[0]):
                    if scores[j, i] > args.action_score_thr:
                        prediction[j].append((label_map[i], scores[j,
                                                                   i].item()))
            predictions.append(prediction)
        prog_bar.update()

    results = []
    for human_detection, prediction in zip(human_detections, predictions):
        results.append(pack_result(human_detection, prediction, new_h, new_w))

    def dense_timestamps(timestamps, n):
        """Make it nx frames."""
        old_frame_interval = (timestamps[1] - timestamps[0])
        start = timestamps[0] - old_frame_interval / n * (n - 1) / 2
        new_frame_inds = np.arange(
            len(timestamps) * n) * old_frame_interval / n + start
        return new_frame_inds.astype(np.int64)

    dense_n = int(args.predict_stepsize / args.output_stepsize)
    frames = [
        cv2.imread(frame_paths[i - 1])
        for i in dense_timestamps(timestamps, dense_n)
    ]
    print('Performing visualization')
    vis_frames = visualize(frames, results)
    vid = mpy.ImageSequenceClip([x[:, :, ::-1] for x in vis_frames],
                                fps=args.output_fps)
    vid.write_videofile(args.out_filename)

    tmp_dir.cleanup()

if __name__ == '__main__':
    main()

Thank you very much to anyone who replies.

TranThanhTuan2509 commented 1 month ago

Have you solved it yet?