google-ai-edge / mediapipe-samples


How could I use VIDEO mode on gesture recognizer? #41

Closed. ashannon018 closed this issue 1 year ago.

ashannon018 commented 1 year ago

Hello, I've tried to read video frames into a numpy array. Did I miss something when building the input for the recognizer?

import cv2
import numpy as np
import mediapipe as mp
from mediapipe import ImageFormat
from mediapipe.tasks.python import vision

# `options` is a GestureRecognizerOptions configured with running_mode=VIDEO,
# created as shown in the answer below.
with vision.GestureRecognizer.create_from_options(options) as recognizer:
  cap = cv2.VideoCapture('TRAIN_300.mp4')
  print("==== Video Info. ===== ")
  fps = cap.get(cv2.CAP_PROP_FPS)  # the actual frame rate, not the property id
  timestamps = []
  calc_timestamps = [0.0]
  timearray = []

  frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
  frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

  buf = np.empty((frameCount, frameHeight, frameWidth, 3), np.dtype('uint8'))

  fc = 0
  ret = True

  while fc < frameCount and ret:
    ret, buf[fc] = cap.read()
    fc += 1
    ts = cap.get(cv2.CAP_PROP_POS_MSEC)     # decoded timestamp in ms
    timestamps.append(ts)
    cts = calc_timestamps[-1] + 1000 / fps  # expected timestamp from the frame rate
    calc_timestamps.append(cts)
    timearray.append(abs(ts - cts))         # drift between decoded and expected time
  cap.release()

  frame_timestamp_ms = int(timestamps[9])  # decoded timestamp of the 10th frame, in ms
  print(type(buf[9]))
  mp_image = mp.Image(format=ImageFormat.SRGB, data=np.stack(buf[9]))

  gesture_recognition_result = recognizer.recognize_for_video(mp_image,frame_timestamp_ms)


  cv2.destroyAllWindows()

==== Video Info. =====
<class 'numpy.ndarray'>
W20230204 14:13:15.370810 88347 gesture_recognizer_graph.cc:122] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleartion to Xnnpack.
I20230204 14:13:15.374961 88347 hand_gesture_recognizer_graph.cc:250] Custom gesture classifier is not defined.

TypeError                                 Traceback (most recent call last)
Cell In[10], line 35
     33 frame_timestamp_ms = timearray[9]
     34 print(type(buf[9]))
---> 35 mp_image = mp.Image(format=ImageFormat.SRGB, data=np.stack(buf[9]))
     37 gesture_recognition_result = recognizer.recognize_for_video(mp_image, frame_timestamp_ms)

TypeError: __init__(): incompatible constructor arguments. The following argument types are supported:

  1. mediapipe.python._framework_bindings.image.Image(image_format: mediapipe::ImageFormat_Format, data: numpy.ndarray[numpy.uint8])
  2. mediapipe.python._framework_bindings.image.Image(image_format: mediapipe::ImageFormat_Format, data: numpy.ndarray[numpy.uint16])
  3. mediapipe.python._framework_bindings.image.Image(image_format: mediapipe::ImageFormat_Format, data: numpy.ndarray[numpy.float32])

Invoked with: kwargs: format=<ImageFormat.SRGB: 1>, data=array([[[113, 123, 106],
        [113, 123, 106],
        [113, 123, 106],
        ...,
        [ 19,  24,  22],
        [ 20,  25,  23]]], dtype=uint8)
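The supported signatures above point at the fix: the keyword argument is image_format, not format, and the data must already be a uint8, uint16, or float32 numpy array. A minimal sketch of a corrected construction, using a hypothetical placeholder frame:

import numpy as np
import mediapipe as mp

# Hypothetical placeholder frame; any (H, W, 3) uint8 array works here.
frame = np.zeros((480, 640, 3), dtype=np.uint8)

# The keyword is image_format, matching signature 1 above.
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)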
ashannon018 commented 1 year ago

Self-answer (I resolved it myself): the mp.Image constructor takes the keyword image_format, not format. With that fixed, each frame read by OpenCV can be wrapped in an mp.Image and passed to recognize_for_video together with its CAP_PROP_POS_MSEC timestamp as an integer.

# STEP 1: Imports.
import cv2
import mediapipe as mp
from mediapipe import ImageFormat
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# STEP 2: Create a GestureRecognizer object in VIDEO running mode.
VisionRunningMode = mp.tasks.vision.RunningMode
base_options = python.BaseOptions(model_asset_path='gesture_recognizer.task')
options = vision.GestureRecognizerOptions(base_options=base_options,
                                          running_mode=VisionRunningMode.VIDEO)
recognizer = vision.GestureRecognizer.create_from_options(options)

results = []
cap = cv2.VideoCapture(video_file_path)  # e.g. 'TRAIN_300.mp4' as in the question
fps = cap.get(cv2.CAP_PROP_FPS)          # the actual frame rate, not the property id
calc_timestamps = [0.0]

res_landmark = []        # top gesture label per frame ('-' when nothing is found)
res_not_found_count = 0

ret = True
while ret:
  ret, img = cap.read()  # read one frame from the capture object; img is (H, W, C)
  if ret:
    ts = cap.get(cv2.CAP_PROP_POS_MSEC)     # decoded timestamp in ms
    cts = calc_timestamps[-1] + 1000 / fps  # expected timestamp from the frame rate
    calc_timestamps.append(cts)
    # print(abs(ts - cts))                  # drift between decoded and expected time
    recognition_result = recognizer.recognize_for_video(
        mp.Image(image_format=ImageFormat.SRGB, data=img), int(ts))

    try:
      top_gesture = recognition_result.gestures[0][0]
      results.append(top_gesture)
      res_landmark.append(top_gesture.category_name)
    except IndexError:   # no hand/gesture detected in this frame
      res_landmark.append('-')
      res_not_found_count += 1

cap.release()
cv2.destroyAllWindows()
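One caveat the thread never addresses: OpenCV decodes frames in BGR channel order, while ImageFormat.SRGB expects RGB, so the recognizer sees channel-swapped images unless each frame is converted first. A hedged sketch of the per-frame conversion, reusing the recognizer and the example path from above:

import cv2
import mediapipe as mp
from mediapipe import ImageFormat

cap = cv2.VideoCapture('TRAIN_300.mp4')  # example path from the question
while True:
  ret, bgr = cap.read()
  if not ret:
    break
  rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)   # BGR -> RGB for SRGB input
  ts_ms = int(cap.get(cv2.CAP_PROP_POS_MSEC))  # must increase call over call
  mp_image = mp.Image(image_format=ImageFormat.SRGB, data=rgb)
  # recognition_result = recognizer.recognize_for_video(mp_image, ts_ms)
cap.release()

recognize_for_video also requires monotonically increasing timestamps across calls, which CAP_PROP_POS_MSEC provides for a normally decoded file.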