Open syanng opened 1 month ago
thank you for your appreciation!!!! :-) :- )
@sinhasaptarshi Is it possible for me to provide an exemplar during the inference phase? I found that the network sometimes still fails to count unusual actions or repetitions performed at different speeds.
Yes, it is possible to use exemplars during inference.
You can change `shot_num_`
at line 179 of the linked file to get results using exemplars.
Alternatively, you can update the `demo.py`
code to use exemplars, as shown below.
def extract_exemplar_tokens(video, model, args, num_frames=16, starts=(0, 0), ends=(-1, -1)):
    """Encode one fixed-length clip per exemplar segment of ``video``.

    For every ``(starts[j], ends[j])`` pair, ``num_frames`` frame indices are
    sampled evenly across the segment, the frames are gathered into a clip,
    and all clips are pushed through ``model`` as a single batch.

    Args:
        video: Tensor unpacked as ``(C, T, H, W)``; frames are indexed along
            dimension 1.
        model: Callable returning ``(encoded, thw)`` for a batch of clips.
            ``encoded`` is assumed to be ``(N, tokens, dim)`` and ``thw`` a
            3-item sequence with ``thw[0] * thw[1] * thw[2] == tokens``
            -- TODO confirm against the encoder.
        args: Object with a ``resource`` string (for example ``'cpu'`` or
            ``'cuda:0'``) naming the device to run on.
        num_frames: Number of frames sampled from each segment.
        starts: Start frame index of each exemplar segment.
        ends: End frame index of each exemplar segment. Values are clamped to
            the last index of the padded video but are otherwise used as-is;
            a negative end is NOT interpreted as "end of video".

    Returns:
        Tensor of shape ``(N, dim, thw[0], thw[1], thw[2])`` where ``N`` is
        the number of segments with ``start != end``.

    Raises:
        ValueError: If every segment is empty (``start == end``), or no
            segments were supplied, so there is nothing to encode.
    """
    C, _, H, W = video.shape
    # Append 64 blank frames so indices slightly past the real end stay valid.
    padding = torch.zeros([C, 64, H, W])
    video = torch.cat([video, padding], 1)
    last_frame = video.shape[1] - 1

    clip_list = []
    for j, s in enumerate(starts):
        e = ends[j]
        if s == e:
            # Zero-length segment: nothing to sample from.
            continue
        # num_frames + 1 points with the last one dropped, so the segment's
        # end point itself is never sampled.
        idx = np.linspace(s, min(e, last_frame), num_frames + 1)[:num_frames].astype(int)
        clip_list.append(video[:, idx])

    if not clip_list:
        # torch.stack would otherwise fail with an opaque
        # "non-empty TensorList" message.
        raise ValueError("no usable exemplar segments: every start equals its end")

    data = torch.stack(clip_list).to(args.resource)
    use_cuda = 'cuda' in args.resource
    device_type = 'cuda' if use_cuda else 'cpu'
    # Mixed precision is only enabled on CUDA; gradients are never needed here.
    with torch.autocast(enabled=use_cuda, device_type=device_type), torch.no_grad():
        encoded, thw = model(data)
        # (N, tokens, dim) -> (N, dim, t, h, w)
        encoded = encoded.transpose(1, 2).reshape(
            encoded.shape[0], encoded.shape[-1], thw[0], thw[1], thw[2]
        )
    del data
    return encoded
def extract_tokens(video, model, args, num_frames=16):
    """Encode overlapping sliding-window clips covering the whole video.

    A window is started every 16 frames; each window spans 64 frames, from
    which ``num_frames`` evenly spaced frames are taken. The video is padded
    with 64 blank frames so the final windows stay in range.

    Args:
        video: Tensor unpacked as ``(C, T, H, W)``; frames lie on dimension 1.
        model: Callable returning ``(encoded, thw)`` for a batch of clips;
            ``encoded`` is assumed to be ``(N, tokens, dim)`` -- TODO confirm.
        args: Object whose ``resource`` string names the target device.
        num_frames: Number of frames sampled per window.

    Returns:
        Tensor of shape ``(N, dim, thw[0], thw[1], thw[2])`` with one entry
        per window.
    """
    C, T, H, W = video.shape
    padded = torch.cat([video, torch.zeros([C, 64, H, W])], 1)

    # One clip per window start; the end point of each window is not sampled.
    windows = [
        padded[:, np.linspace(begin, begin + 64, num_frames + 1)[:num_frames].astype(int)]
        for begin in range(0, T, 16)
    ]
    batch = torch.stack(windows).to(args.resource)

    on_cuda = 'cuda' in args.resource
    with torch.autocast(enabled=on_cuda, device_type='cuda' if on_cuda else 'cpu'), torch.no_grad():
        features, thw = model(batch)
        n_clips, dim = features.shape[0], features.shape[-1]
        # (N, tokens, dim) -> (N, dim, t, h, w)
        features = features.transpose(1, 2).reshape(n_clips, dim, thw[0], thw[1], thw[2])
    del batch
    return features
# Probe the video for its frame count and frame rate; release the capture
# handle even if a property query fails.
cap = cv2.VideoCapture(video)
try:
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
finally:
    cap.release()

# Hard-coded exemplar segment: times in seconds converted to (fractional)
# frame positions. Adjust these to mark one repetition in your own video.
start = 5.134 * fps
end = 10.281 * fps

transform = create_video_transform(mode="test",
                                   convert_to_float=False,
                                   min_size=224,
                                   crop_size=224,
                                   num_samples=None,
                                   video_mean=[0.485, 0.456, 0.406],
                                   video_std=[0.229, 0.224, 0.225])
frame_idx = np.arange(0, num_frames, 1)
print(len(frame_idx))
no_frames = len(frame_idx)

### read frames from video
vid_frames, _ = read_video_timestamps(video, frame_idx)
# Scale pixel values by 1/255 before applying the transform.
vid_frames = transform(vid_frames / 255.)

### encode exemplars
print(args.zero_shot)
if not args.zero_shot:
    exemplar = extract_exemplar_tokens(vid_frames, encoder, args, starts=[int(start)], ends=[int(end)])
    exemplar = exemplar[0:1]  # keep a single exemplar, preserving the leading axis
    exemplar = einops.rearrange(exemplar, 'S C T H W -> S (T H W) C')

### encode video
encoded = extract_tokens(vid_frames, encoder, args)
del encoder
# Keep every 4th clip. extract_tokens starts a clip every 16 frames and each
# spans 64 frames, so this presumably keeps non-overlapping clips -- confirm.
encoded = encoded[0::4]
encoded = einops.rearrange(encoded, 'S C T H W -> C (S T) H W')

if args.pool_tokens < 1.0:
    # Spatially pool the tokens down to a fraction of the original width;
    # the temporal length is left unchanged.
    factor = math.ceil(encoded.shape[-1] * args.pool_tokens)
    tokens = torch.nn.functional.adaptive_avg_pool3d(encoded, (encoded.shape[-3], factor, factor))
else:
    # No pooling requested: use the encoder output as-is. Without this branch
    # `tokens` would be undefined below and raise NameError.
    tokens = encoded
del encoded

tokens = tokens.unsqueeze(0)  # add a batch axis
shapes = tokens.shape[-3:]
tokens = einops.rearrange(tokens, 'B C T H W -> B (T H W) C')

dtype = 'cuda' if 'cuda' in args.resource else 'cpu'
with torch.autocast(enabled='cuda' in args.resource, device_type=dtype):
    with torch.no_grad():
        if args.zero_shot:
            predicted_density_map = decoder(tokens, thw=[shapes, ], shot_num=0)
        else:
            print(exemplar.shape)
            predicted_density_map = decoder(tokens, exemplar, thw=[shapes, ], shot_num=1)

predicted_density_map = predicted_density_map[0]
# The count is the sum of the density map, undoing the scale factor.
predicted_counts = predicted_density_map.sum().item() / args.scale_counts
It is very kind of you to provide me with code to run. I have tried both with and without an exemplar, but I did not get a correct result on some random YouTube videos like this one. I am not sure what I am doing wrong; I have tried several other repos, and only RepNet gives a count close to the correct one.
There is no issue at all, but I want to thank you for this great project: great ideas, detailed paper, and easy-to-implement code.