jfzhang95 / pytorch-video-recognition

PyTorch implemented C3D, R3D, R2Plus1D models for video activity recognition.
MIT License

Some questions about inference.py #54

Open · Zhangzhiwei1995 opened 3 years ago

Zhangzhiwei1995 commented 3 years ago

I tested videos from UCF101 on the trained VGG19 network (the training accuracy reached 85%), but the predictions were all wrong classifications.

May I ask what the reason might be? Thank you very much.

Here is my revised code.

```python
import os

import cv2
import numpy as np
import torch
from torch import nn
from torchvision import transforms
from PIL import Image

import config
from network import C3D_model
from model import CNNEncoder, RNNDecoder

torch.backends.cudnn.benchmark = True


def CenterCrop(frame, size):
    """Center-crop a frame to (th, tw)."""
    h, w = np.shape(frame)[0:2]
    th, tw = size
    x1 = int(round((w - tw) / 2.))
    y1 = int(round((h - th) / 2.))
    frame = frame[y1:y1 + th, x1:x1 + tw, :]
    return np.array(frame).astype(np.uint8)


def center_crop(frame):
    # Hard-coded 112x112 crop window; assumes the frame is 171x128.
    frame = frame[8:120, 30:142, :]
    return np.array(frame).astype(np.uint8)


def transform(img):
    return transforms.Compose([
        transforms.Resize((config.img_w, config.img_h)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])(img)


def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device being used:", device)

    with open('./data/ucf_labels.txt', 'r') as f:
        class_names = f.readlines()

    # init model
    # model = C3D_model.C3D(num_classes=101)
    model = nn.Sequential(
        CNNEncoder(**config.cnn_encoder_params),
        RNNDecoder(**config.rnn_decoder_params)
    )

    # load the most recently saved checkpoint
    file_path = './checkpoints/VGG19'
    filenames = os.listdir(file_path)
    print(filenames)
    file_list = sorted(filenames, key=lambda x: os.path.getmtime(os.path.join(file_path, x)))

    checkpoint = torch.load(os.path.join(file_path, file_list[-1]),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    # read video
    video = 'G:/dataset/UCF-101/PlayingPiano/v_PlayingPiano_g01_c03.avi'
    cap = cv2.VideoCapture(video)
    # cap = cv2.VideoCapture(0)  # NOTE: this line was overriding the file capture
    #                            # above, so frames came from the webcam, not the clip

    retaining = True
    clip = []
    while retaining:
        retaining, frame = cap.read()
        if not retaining and frame is None:
            continue
        # tmp = transform(frame)
        # tmp_ = center_crop(cv2.resize(frame, (171, 128)))
        tmp_ = center_crop(cv2.resize(frame, (config.img_w, config.img_h)))
        # tmp_ = CenterCrop(frame, (config.img_w, config.img_h))
        tmp = tmp_ - np.array([[[90.0, 98.0, 102.0]]])
        clip.append(tmp)
        if len(clip) == 20:
            inputs = np.array(clip).astype(np.float32)
            inputs = np.expand_dims(inputs, axis=0)
            # inputs = np.transpose(inputs, (0, 4, 1, 2, 3))
            inputs = torch.from_numpy(inputs).to(device)
            with torch.no_grad():
                outputs = model(inputs)

            probs = nn.Softmax(dim=1)(outputs)
            label = torch.max(probs, 1)[1].detach().cpu().numpy()[0]

            cv2.putText(frame, class_names[label].split(' ')[-1].strip(), (20, 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)
            cv2.putText(frame, "prob: %.4f" % probs[0][label], (20, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)
            clip.pop(0)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        cv2.imshow('result', frame)
        cv2.waitKey(30)

    cap.release()
    cv2.destroyAllWindows()


if __name__ == '__main__':
    main()
```
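A likely culprit (an assumption on my part, since the training script is not shown here): the inference preprocessing above does not match the `transform` used at training time. The loop subtracts the C3D per-channel means `[90, 98, 102]` and uses a crop window hard-coded for 171x128 frames, while `transform` resizes and applies ImageNet mean/std normalization; the stacked array also stays in (batch, time, H, W, channels) layout, whereas a CNN encoder normally expects (batch, time, channels, H, W). A minimal sketch of a consistent clip builder, reusing the `transform` function and `config` from the code above:

```python
import cv2
import torch
from PIL import Image

def preprocess_clip(frames):
    """Build a model input from raw cv2 frames using the training transform.

    frames: list of BGR uint8 arrays from cap.read().
    Assumes CNNEncoder expects input shaped (batch, time, channels, H, W).
    """
    tensors = []
    for frame in frames:
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)     # cv2 delivers BGR; PIL expects RGB
        tensors.append(transform(Image.fromarray(rgb)))  # (C, H, W), ImageNet-normalized
    clip_tensor = torch.stack(tensors)                   # (T, C, H, W)
    return clip_tensor.unsqueeze(0)                      # (1, T, C, H, W)

# usage inside the loop, in place of the mean-subtraction branch
# (here `clip` would hold the raw frames instead of mean-subtracted arrays):
# inputs = preprocess_clip(clip).to(device)
```

Whatever normalization and layout the encoder saw during training is what has to be reproduced here; feeding C3D-style mean-subtracted frames to an encoder trained on ImageNet-normalized input will typically yield confident but wrong predictions.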