Open YuktiADY opened 1 year ago
@YuktiADY could you share your entire training config?
@YuktiADY could you share your entire training config?
@cir7 Please find below the config:
model = dict( type='SkeletonGCN', backbone=dict( type='STGCN', in_channels=3, edge_importance_weighting=True, graph_cfg=dict(layout='coco', strategy='spatial')), cls_head=dict( type='STGCNHead',
num_classes=16,
in_channels=256,
loss_cls=dict(type='CrossEntropyLoss')),
train_cfg=None,
test_cfg=None)
dataset_type = 'OmnilabDataset'
ann_file_train = '/home/yukti/mmaction2/Dataset/train_dataset.json'
ann_file_val = '/home/yukti/mmaction2/Dataset/val_dataset.json' train_pipeline = [ dict(type='PaddingWithLoop', clip_len=300), dict(type='PoseDecode'), dict(type='FormatGCNInput', input_format='NCTVM'), dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoints', 'action'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
] val_pipeline = [ dict(type='PaddingWithLoop', clip_len=300), dict(type='PoseDecode'), dict(type='FormatGCNInput', input_format='NCTVM'), dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoints', 'action'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
] test_pipeline = [ dict(type='PaddingWithLoop', clip_len=300), dict(type='PoseDecode'), dict(type='FormatGCNInput', input_format='NCTVM'), dict(type='PoseNormalize'),
dict(type='Collect', keys=['keypoints', 'action'], meta_keys=[]),
dict(type='ToTensor', keys=['keypoint'])
] data = dict( videos_per_gpu=16, workers_per_gpu=2, test_dataloader=dict(videos_per_gpu=1), train=dict( type=dataset_type, ann_file=ann_file_train, data_prefix='', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=ann_file_val, data_prefix='', pipeline=val_pipeline), test=dict( type=dataset_type, ann_file=ann_file_val, data_prefix='', pipeline=test_pipeline))
optimizer = dict( type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001, nesterov=True) optimizer_config = dict(grad_clip=None)
lr_config = dict(policy='step', step=[10, 50]) total_epochs = 80 checkpoint_config = dict(interval=5) evaluation = dict(interval=5, metrics=['top_k_accuracy']) log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/stgcn_80e_ntu60_xsub_keypoint/' load_from = None resume_from = None workflow = [('train', 1)]
@YuktiADY why do you want to build a custom dataset and what have you modified?
@YuktiADY why do you want to build a custom dataset and what have you modified?
@cir7 The dataset I am using is custom, and I have to use it for my Master's thesis. I am basically passing a JSON file as input to the train_pipeline; it has the following sections: info, license, category, images and annotations (like the COCO format).
train_pipeline = [ dict(type='PaddingWithLoop', clip_len=300), dict(type='PoseDecode'), dict(type='FormatGCNInput', input_format='NCTVM'), dict(type='PoseNormalize'), dict(type='Collect', keys=['keypoint', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['keypoint'])
So could you tell me what these `type` values are?
Do I have to change the `type` entries in these dicts to match my JSON file? For instance: type=info or type=annotations, keys=[annotations].
Also, what is the `input_format` in the third entry (`FormatGCNInput`, input_format='NCTVM')? Can you clarify this?
@YuktiADY
I suggest you convert your dataset to the format we use, rather than building a custom dataset. We provide several pose annotation pkl files; you can take them as a reference.
`train_pipeline` consists of data transforms, which transform the original JSON file into a tensor that is fed into the network.
`type` is the transform class type, and the remaining keys are the arguments for that transform.
I strongly recommend you read the documentation to understand the structure of the codebase.
@cir7
def prepare_train_frames(self, idx): """Prepare the frames for training given the index.""" results = copy.deepcopy(self.video_infos[idx])
Could you please tell me what index value is it fetching? are these index values for each frame per video or what???
For example: frame 0: 000000.jpg frame 1: 000001.jpg
Are these the index, 0,1 ..... that it is fetching??
NEED SOME CLARIFICATION :))))
@YuktiADY
The `idx` is the video index in the data list, the same as `index` in `__getitem__()` of `torch.utils.data.Dataset`.
The frame index is generated in `SampleFrames`.
If you have no additional requirements, I don't think it is necessary to customize the dataset.
@YuktiADY The `idx` is the video index in the data list, the same as `index` in `__getitem__()` of `torch.utils.data.Dataset`. The frame index is generated in `SampleFrames`. If you have no additional requirements, I don't think it is necessary to customize the dataset.
@cir7 I checked the pkl file using import pickle with open('data/gym_val.pkl', 'rb') as f: data = pickle.load(f)
I found that `type(data)` is `list`. Please see below:
[{'frame_dir': 'A0xAXXysHUo_002184_002237_0035_0036', 'label': 93, 'img_shape': (720, 1280), 'original_shape': (720, 1280), 'total_frames': 48, 'keypoint': array([[[[589. , 503.2], [592.5, 495.8], [581.5, 495.8], [664. , 575. ], [636. , 603. ]]]], dtype=float16), 'keypoint_score': array([[[0.8066 , 0.81 , 0.849 , 0.601 , 0.8145 , 0.6875 , 0.715 , 0.4683 , 0.8145 , 0.227 , 0.8203 , 0.5806 , 0.581 , 0.4094 , 0.614 , 0.502 , 0.3875 ],
[0.1251 , 0.08923, 0.12415, 0.1068 , 0.143 , 0.1553 , 0.1438 ,
0.3403 , 0.1858 , 0.183 , 0.5664 , 0.0936 , 0.107 , 0.3218 ,
0.4246 , 0.1307 , 0.1803 ],
So I need to convert my JSON to a list, because my JSON is a dictionary.
After converting my json which was dict to a list [{"description": "", "url": "", "version": "1.0", "year": "2023", "contributor": "", "date_created": "02/05/23"}, [{"url": "", "id": 1, "name": ""}], [{"supercategory": "person", "id": 1, "name": "person", "keypoints": ["nose", "left_eye", "right_eye", "left_ear", "right_ear", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", "left_wrist", "right_wrist", "left_hip", "right_hip", "left_knee", "right_knee", "left_ankle", "right_ankle"], "skeleton": [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]}], [{"file_name": "xxxx.png", "id": 2082, "width": 1200, "height": 1200, "coco_url": "", "flickr_url": "", "license": 1, , "cvat_task_id": "162", "action": "turnleft", "performer": "xxx", "room": "xxx"], [{"iscrowd": 0, "category_id": 1, "id": 0, "image_id": 2082, "bbox": [766.0, 447.5, 230.0, 135.0], "area": 31050.0, "num_keypoints": 17, "keypoints": [945.44, 463.19, 2, 949.47, 463.19, 2, 949.5, 461.2, 1, 953.49, 473.26, 2, 975.9, 493.4, 1, 903.15, 491.38, 2, 945.44, 521.59, 2, 858.84, 493.4, 2, 915.23, 547.77, 2, 832.66, 483.33, 2, 891.06, 559.85, 2, 852.8, 519.58, 2, 870.92, 537.7, 2, 821.0, 537.9, 2, 838.7, 527.63, 1, 780.3, 569.92, 2, 814.9, 522.8, 2]}]]
I am getting this error :
Traceback (most recent call last):
File "tools/train.py", line 222, in
Can you please tell me whether my data does not match the data format of this repo? Also, is the data used for ST-GCN skeleton-based action recognition in the repo https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_train.pkl?
You wrote: "I suggest you convert your dataset to the format we used, rather than build a custom dataset. we provide several pose anno pkl, you can take them as reference." Can you explain this?
@cir7 Could you please answer my queries? It would be really helpful; otherwise I can't proceed with my thesis work, and there is a deadline.
I hope you understand.
Thanks in advance!
Could you please give me an example of your JSON file?
You must organize your data format according to ntu60_val. These keys are required if you use our `PoseDataset` and the pipeline you are using now: `label`, `img_shape`, `total_frames`, and `keypoint`. Note that the shape of `keypoint` is (num_person, total_frames, num_keypoints (e.g., for COCO-17, this value is 17), 2 (corresponding to the x, y coordinates respectively)). The key `img_shape` will be used in `PoseNormalize` to normalize the data distribution if you are using 2D skeleton data.
{"description": "", "url": "", "version": "1.0", "year": "2023", "contributor": "", "date_created": "02/05/23"}, [{"url": "", "id": 1, "name": ""}], [{"supercategory": "person", "id": 1, "name": "person", "keypoints": ["nose", "left_eye", "right_eye", "left_ear", "right_ear", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", "left_wrist", "right_wrist", "left_hip", "right_hip", "left_knee", "right_knee", "left_ankle", "right_ankle"], "skeleton": [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]}], [{"file_name": "xxxx.png", "id": 2082, "width": 1200, "height": 1200, "coco_url": "", "flickr_url": "", "license": 1, , "cvat_task_id": "162", "action": "turnleft", "performer": "xxx", "room": "xxx"], [{"iscrowd": 0, "category_id": 1, "id": 0, "image_id": 2082, "bbox": [766.0, 447.5, 230.0, 135.0], "area": 31050.0, "num_keypoints": 17, "keypoints": [945.44, 463.19, 2, 949.47, 463.19, 2, 949.5, 461.2, 1, 953.49, 473.26, 2, 975.9, 493.4, 1, 903.15, 491.38, 2, 945.44, 521.59, 2, 858.84, 493.4, 2, 915.23, 547.77, 2, 832.66, 483.33, 2, 891.06, 559.85, 2, 852.8, 519.58, 2, 870.92, 537.7, 2, 821.0, 537.9, 2, 838.7, 527.63, 1, 780.3, 569.92, 2, 814.9, 522.8, 2]}
@Dai-Wenxun This is the structure of my json annotation file
{"description": "", "url": "", "version": "1.0", "year": "2023", "contributor": "", "date_created": "02/05/23"}, [{"url": "", "id": 1, "name": ""}], [{"supercategory": "person", "id": 1, "name": "person", "keypoints": ["nose", "left_eye", "right_eye", "left_ear", "right_ear", "left_shoulder", "right_shoulder", "left_elbow", "right_elbow", "left_wrist", "right_wrist", "left_hip", "right_hip", "left_knee", "right_knee", "left_ankle", "right_ankle"], "skeleton": [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]}], [{"file_name": "xxxx.png", "id": 2082, "width": 1200, "height": 1200, "coco_url": "", "flickr_url": "", "license": 1, , "cvat_task_id": "162", "action": "turnleft", "performer": "xxx", "room": "xxx"], [{"iscrowd": 0, "category_id": 1, "id": 0, "image_id": 2082, "bbox": [766.0, 447.5, 230.0, 135.0], "area": 31050.0, "num_keypoints": 17, "keypoints": [945.44, 463.19, 2, 949.47, 463.19, 2, 949.5, 461.2, 1, 953.49, 473.26, 2, 975.9, 493.4, 1, 903.15, 491.38, 2, 945.44, 521.59, 2, 858.84, 493.4, 2, 915.23, 547.77, 2, 832.66, 483.33, 2, 891.06, 559.85, 2, 852.8, 519.58, 2, 870.92, 537.7, 2, 821.0, 537.9, 2, 838.7, 527.63, 1, 780.3, 569.92, 2, 814.9, 522.8, 2]} Awaiting your response.
You must organize your data format according to ntu60_val. These keys are required if you use our `PoseDataset` and the pipeline you are using now: `label`, `img_shape`, `total_frames`, and `keypoint`. Note that the shape of `keypoint` is (num_person, total_frames, num_keypoints (e.g., for COCO-17, this value is 17), 2 (corresponding to the x, y coordinates respectively)). The key `img_shape` will be used in `PoseNormalize` to normalize the data distribution if you are using 2D skeleton data.
@cir7 I did some changes in the config which are below
dataset_type = 'MyDataset' ann_file_val = '/home/yukti/mmaction2/Dataset/val_dataset.json' train_pipeline = [ dict(type='PaddingWithLoop', clip_len=300), dict(type='PoseDecode'), dict(type='FormatGCNInput', input_format='NCTVM'), dict(type='PoseNormalize'), dict(type='Collect', keys=['keypoints', 'action'], meta_keys=[]), dict(type='ToTensor', keys=['keypoint'])
Changed this function as mine is json file def load_annotations(self): """Load annotation file to get video information."""
assert self.ann_file.endswith('.json')
return self.load_json_annotations()
So could you please tell me with my above shown json file how to proceed further ?
@YuktiADY
Just follow what @Dai-Wenxun mentioned: convert your JSON file to the required format as a pkl file. As far as I can tell from the information so far, `MyDataset` is not needed.
@YuktiADY Just follow what @Dai-Wenxun mentioned: convert your JSON file to the required format as a pkl file. As far as I can tell from the information so far, `MyDataset` is not needed.
@cir7 @Dai-Wenxun I have a question in conversion as. This is the structure of my json annotation file
My JSON file has different fields from the ones required in ntu_val.pkl (label, img_shape, total_frames, and keypoint). Do I need to change those fields in ntu_pose_extraction.py as well, according to my JSON? In my JSON I don't have the frame_dir and img_shape fields. Can I add or remove fields according to my JSON, or are the fields mentioned in ntu_pose_extraction.py and the pkl file mandatory in my JSON too?
Regarding "convert your json file to the required format as a pkl file": after converting my JSON file from a dict to a list, it looks like this:
[{'file_name': '1336.png', 'id': 1336, 'width': 1200, 'height': 1200, 'coco_url': '', 'flickr_url': '', 'license': 1, 'date_captured': '2022-08-31', 'cvat_task_id': '150', 'action': 'push-object', 'performer': 'xxx', 'room': 'xxxxroom'}, ...], [{'iscrowd': 0, 'category_id': 1, 'id': 0, 'image_id': 1372, 'bbox': [789.79, 520.7, 191.11, 153.42999999999995], 'area': 29322.007299999994, 'num_keypoints': 17, 'keypoints': [[468.9, 722.68, 2.0], [472.5, 730.6, 1.0], [461.0, 729.9, 1.0], [485.4, 739.5, 2.0], [449.6, 742.1, 1.0], [508.1, 687.84, 2.0], ]] Here 2.0 and 1.0 is the keypoint score every field is dict of items but as you can see like 'images' , annotations keys is a list consisting of dictionaries inside .
Can it work like that, or must it be exactly like the pkl file, i.e. one big list in which each item is a dictionary? If you can answer these questions it will be really helpful and I can proceed with my work.
Also, my input consists only of frames (images), as you can see in the JSON file. Do I need to change `ntu_pose_extraction`, where `frame_paths = extract_frame(vid)`? How can I pass my images there, given that I don't have videos?
@YuktiADY
`frame_dir` can be any unique name for each video. `img_shape` is (width, height). Skeleton action recognition infers from a sequence of keypoints and can't handle keypoints from a single image. If the images can be concatenated as a video, you must reorganize the keypoints to a shape like (num_person, total_frames, num_keypoints (e.g., for COCO-17, this value is 17), 2 (corresponding to the x, y coordinates respectively)).
`ntu_pose_extraction.py` can only handle video input; you can modify the script or convert your images to a video.
Hi, @Dipankar1997161
`pose_results` is a np.ndarray with shape (num_person, num_frame, 17, 3) that contains the keypoint info of the video. `ntu_pose_extraction.py` is designed to process the NTU dataset; we cannot guarantee its effectiveness on other videos.
@cir7 So I created a pickle file as per the NTU format, but the 'label' in your case is an integer: 'label': 48,
however, in my case it is a string 'label': 'fall-on-face',
For this, a class file would be required, let's say: 0: dancing, 1: walking,
and so on.
I found one file, label_map_ntu120.txt;
however, where is this file accessed? I would also have to create one and pass my own file.
I have a total of 16 action classes.
Do let me know. Have a great day.
@YuktiADY The label is just an index; map each category to an index as you like. Just make sure the labels range from 0 to num_class - 1, i.e. 0 to 15 in your case.
@YuktiADY The log can't give enough info to locate the issue, it looks like the keypoint shape is not matched as required, my suggestion is to try to use the pickle data we provided and find the difference from your data.
@YuktiADY that is as expected. I have no idea why the performance is slow, you can try:
@YuktiADY that is as expected. I have no idea why the performance is slow, you can try:
1. visualize your dataset to make sure that it is as expected 2. use pre-train checkpoint if your dataset is small 3. try tuning hyperparameters.
@cir7 In the config file, val and test are identical: val=dict( type=dataset_type, ann_file=ann_file_val, data_prefix='', pipeline=val_pipeline), test=dict( type=dataset_type, ann_file=ann_file_val, data_prefix='', pipeline=test_pipeline)). Does that mean it validates on the same data (pickle) during training and also uses the same data when testing or evaluating the model? How would the model predict on unseen data if it has already seen this data during training? Is it okay if I split my data into train, val, and test sets and use the separate test data to evaluate the trained model?
Also, I am using a pre-trained model, so I set load_from in the config. Is it also required to pass a load-from argument to the training script?
@YuktiADY
@cir7 Since my dataset is small, could you please tell me how I can increase it, i.e. by doing time-series data augmentation? My dataset has 16 actions in total. Each action has 60 frames. The dataset consists of 4790 images in total. How can I do this?
One way I am thinking of doing it:
Do you know of any other method through which the data can be increased substantially and efficiently? If you have any reference code I can refer to, it would be really helpful.
Awaiting your response.
You must organize your data format according to ntu60_val. These keys are required if you use our `PoseDataset` and the pipeline you are using now: `label`, `img_shape`, `total_frames`, and `keypoint`. Note that the shape of `keypoint` is (num_person, total_frames, num_keypoints (e.g., for COCO-17, this value is 17), 2 (corresponding to the x, y coordinates respectively)). The key `img_shape` will be used in `PoseNormalize` to normalize the data distribution if you are using 2D skeleton data.
1. All keys in [ntu60_val](https://download.openmmlab.com/mmaction/posec3d/ntu60_xsub_val.pkl) are required. `frame_dir` can be any unique name for each video. `img_shape` is (width, height). Skeleton action recognition infers from a sequence of keypoints and can't handle keypoints from a single image. If the images can be concatenated as a video, you must reorganize the keypoints to a shape like (num_person, total_frames, num_keypoints (e.g., for COCO-17, this value is 17), 2 (corresponding to the x, y coordinates respectively)). 2. `ntu_pose_extraction.py` can only handle video input; you can modify the script or convert your images to a video.
@cir7 @Dai-Wenxun Is this ntu60_val pkl file format the same for 3D action recognition too? I have to do 3D action recognition on my custom dataset.
The keys are
'frame_dir':
'label':
'img_shape':
'original_shape':
'total_frames':
'keypoint': x,y,z coordinates
'keypoint_score':
But is the `keypoint_score` key required for 3D action recognition?
Hello @cir7 ,
Hope you are doing well.
I get a KeyError in `def prepare_train_frames(self, idx)`. Is this `idx` the frame index generated by DenseFlow after passing the videos as input, or is it something else?
I am doing skeleton-based action recognition using the ST-GCN method, where my annotation is in .json format.
When I try to train the model, I get this error:
Traceback (most recent call last): File "tools/train.py", line 222, in
main()
File "tools/train.py", line 218, in main
meta=meta)
File "/home/yukti/mmaction2/mmaction/apis/train.py", line 232, in train_model
runner.run(data_loaders, cfg.workflow, cfg.total_epochs, runner_kwargs)
File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 136, in run
epoch_runner(data_loaders[i], kwargs)
File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 49, in train
for i, data_batch in enumerate(self.data_loader):
File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 628, in next
data = self._next_data()
File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1333, in _next_data
return self._process_data(data)
File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1359, in _process_data
data.reraise()
File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/_utils.py", line 543, in reraise
raise exception
KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last): File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop data = fetcher.fetch(index) File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in fetch data = [self.dataset[idx] for idx in possibly_batched_index] File "/home/yukti/miniconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 58, in
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/home/yukti/mmaction2/mmaction/datasets/base.py", line 289, in getitem
return self.prepare_train_frames(idx)
File "/home/yukti/mmaction2/mmaction/datasets/base.py", line 252, in prepare_train_frames
results = copy.deepcopy(self.video_infos[idx])
KeyError: 4
I think I need to make two changes in the base.py file:
def load_json_annotations(self): """Load json annotation file to get video information.""" video_infos = mmcv.load(self.ann_file) num_videos = len(video_infos) path_key = 'frame_dir' if 'frame_dir' in video_infos[0] else 'filename' for i in range(num_videos): path_value = video_infos[i][path_key] if self.data_prefix is not None: path_value = osp.join(self.data_prefix, path_value) video_infos[i][path_key] = path_value if self.multi_class: assert self.num_classes is not None else: assert len(video_infos[i]['label']) == 1 video_infos[i]['label'] = video_infos[i]['label'][0] return video_infos
def evaluate(self, results, metrics='top_k_accuracy', metric_options=dict(top_k_accuracy=dict(topk=(1, 5))), logger=None, **deprecated_kwargs):
def prepare_train_frames(self, idx): """Prepare the frames for training given the index.""" results = copy.deepcopy(self.video_infos[idx]) results['modality'] = self.modality results['start_index'] = self.start_index
Could you please help me how to resolve this issue and adapt to my dataset.