sinhasaptarshi / EveryShotCounts

Codebase for "Every Shot Counts: Using Exemplars for Repetition Counting in Videos"
MIT License

Thanks #3

Open syanng opened 1 month ago

syanng commented 1 month ago

There is no issue at all, but I want to thank you for this great project: great ideas, detailed paper, and easy-to-implement code.

sinhasaptarshi commented 1 month ago

thank you for your appreciation!!!! :-) :-)

syanng commented 4 weeks ago

@sinhasaptarshi Is it possible for me to provide an exemplar during the inference phase? I ask because I found that the network sometimes still fails on unusual actions or different speeds.

sinhasaptarshi commented 3 weeks ago

Yes, it is possible to use exemplars during inference. You can change the shot_num_ at L179 here to get results using exemplars.
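
In essence, the change is the shot_num argument passed to the decoder. A minimal sketch, using the same names as the demo code in the next comment (the exact call lives in demo.py):

## zero-shot: no exemplar tokens, shot_num=0
predicted_density_map = decoder(tokens, thw=[shapes,], shot_num=0)
## with an exemplar: also pass the encoded exemplar tokens and set shot_num=1
predicted_density_map = decoder(tokens, exemplar, thw=[shapes,], shot_num=1)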

sinhasaptarshi commented 3 weeks ago

Alternatively, you can update the demo.py code to use exemplars, for example as below.

## assumed imports; create_video_transform and read_video_timestamps are the same helpers
## demo.py already uses, and encoder / decoder / args are set up exactly as in demo.py
import math

import cv2
import einops
import numpy as np
import torch

def extract_exemplar_tokens(video, model, args, num_frames=16, starts=[0,0], ends=[-1,-1]):
    C, T, H, W = video.shape
    padding = torch.zeros([C, 64, H, W])
    video = torch.cat([video, padding], 1)
    clip_list = []
    num_exemplars = len(starts)
    for j in range(num_exemplars):
        s = starts[j]  ## start times of each repetition
        e = ends[j]  ## end times of each repetition
        if s == e:
            continue
        ## sample num_frames frames from the repetition segment defined by the start and end times
        idx = np.linspace(s, min(e, video.shape[1]-1), num_frames+1)[:num_frames].astype(int)
        clips = video[:, idx]
        clip_list.append(clips)
    data = torch.stack(clip_list).to(args.resource)
    dtype = 'cuda' if 'cuda' in args.resource else 'cpu'
    with torch.autocast(enabled='cuda' in args.resource, device_type=dtype):
        with torch.no_grad():
            encoded, thw = model(data)  ## extract encodings
            encoded = encoded.transpose(1,2).reshape(encoded.shape[0], encoded.shape[-1], thw[0], thw[1], thw[2])  ## (B, N, C) -> (B, C, T, H, W)
    del data
    return encoded

def extract_tokens(video, model, args, num_frames=16):
    C, T, H, W = video.shape
    padding = torch.zeros([C, 64, H, W])
    video = torch.cat([video, padding], 1)
    clip_list = []
    for j in range(0, T, 16):
        ## 64-frame window starting every 16 frames; sample num_frames frames from each window
        idx = np.linspace(j, j+64, num_frames+1)[:num_frames].astype(int)
        clips = video[:, idx]
        clip_list.append(clips)
    data = torch.stack(clip_list).to(args.resource)
    dtype = 'cuda' if 'cuda' in args.resource else 'cpu'
    with torch.autocast(enabled='cuda' in args.resource, device_type=dtype):
        with torch.no_grad():
            encoded, thw = model(data)  ## extract encodings
            encoded = encoded.transpose(1,2).reshape(encoded.shape[0], encoded.shape[-1], thw[0], thw[1], thw[2])
    del data
    return encoded

cap = cv2.VideoCapture(video)  ## here, video is the path to the input video file
num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)
cap.release()

## one exemplar repetition, given by its start/end times in seconds and converted to frame indices
start = 5.134 * fps
end = 10.281 * fps
transform = create_video_transform(mode="test",
                                   convert_to_float=False,
                                   min_size=224,
                                   crop_size=224,
                                   num_samples=None,
                                   video_mean=[0.485, 0.456, 0.406],
                                   video_std=[0.229, 0.224, 0.225])
frame_idx = np.arange(0, num_frames, 1)
print(len(frame_idx))
no_frames = len(frame_idx)
### read frames from video
vid_frames, _ = read_video_timestamps(video, frame_idx)
vid_frames = transform(vid_frames/255.)
### load encoder checkpoint pretrained on Kinetics (same as in demo.py; omitted here)
### encode exemplars
print(args.zero_shot)
if not args.zero_shot:
    exemplar = extract_exemplar_tokens(vid_frames, encoder, args, starts=[int(start)], ends=[int(end)])
    exemplar = exemplar[0:1]  ## keep a single exemplar
    exemplar = einops.rearrange(exemplar, 'S C T H W -> S (T H W) C')

### encode video
encoded = extract_tokens(vid_frames, encoder, args)
del encoder
encoded = encoded[0::4]  ## keep every 4th clip so the 64-frame windows do not overlap
encoded = einops.rearrange(encoded, 'S C T H W -> C (S T) H W')
if args.pool_tokens < 1.0:
    ## spatially pool the tokens to a smaller grid
    factor = math.ceil(encoded.shape[-1] * args.pool_tokens)
    tokens = torch.nn.functional.adaptive_avg_pool3d(encoded, (encoded.shape[-3], factor, factor))
else:
    tokens = encoded  ## no pooling; keep the full token grid

del encoded
## prepare tokens for the decoder
tokens = tokens.unsqueeze(0)
shapes = tokens.shape[-3:]
tokens = einops.rearrange(tokens, 'B C T H W -> B (T H W) C')
dtype = 'cuda' if 'cuda' in args.resource else 'cpu'
with torch.autocast(enabled='cuda' in args.resource, device_type=dtype):
    with torch.no_grad():
        if args.zero_shot:
            predicted_density_map = decoder(tokens, thw=[shapes,], shot_num=0)
        else:
            print(exemplar.shape)
            predicted_density_map = decoder(tokens, exemplar, thw=[shapes,], shot_num=1)
predicted_density_map = predicted_density_map[0]
## the count is the sum of the predicted density map, rescaled by args.scale_counts
predicted_counts = predicted_density_map.sum().item() / args.scale_counts
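
If you have more than one annotated repetition, extract_exemplar_tokens already accepts lists of segment boundaries. A sketch (the second segment's times below are made up for illustration, and whether shot_num should then match the number of exemplars is worth checking against the training code):

## two exemplar segments; the 12.0-17.5 s times are illustrative only
starts = [int(5.134 * fps), int(12.0 * fps)]
ends = [int(10.281 * fps), int(17.5 * fps)]
exemplar = extract_exemplar_tokens(vid_frames, encoder, args, starts=starts, ends=ends)
exemplar = einops.rearrange(exemplar, 'S C T H W -> S (T H W) C')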
syanng commented 3 weeks ago

That is very kind of you to provide the code to run. I have tried both the with- and without-exemplar methods and do not get a correct count on some random YouTube videos like this. I am not sure where I am going wrong, but I have tried several other repos and only RepNet gives the closest count.