help with recognizing keystrokes based on webcam video

I'm creating an AI that plays Minecraft, and I'm using training data to give the AI a little head start. I have some test data recorded with a camera over the player's hand, including the keyboard, and I'm trying to figure out a way to extract keystrokes from the videos using MediaPipe by looking at the y level of the middle finger. I've had some success, but the keystrokes it outputs are very messy and might mess up the training of the AI. Here is my code so far:

!pip install pytube
!pip install mediapipe opencv-python
import os
from pytube import YouTube
from IPython.display import YouTubeVideo
import mediapipe as mp
import cv2

# Function to download a YouTube video and return the path to the downloaded video
def download_video(video_url, output_dir):
    """Download the highest-resolution stream of a YouTube video.

    Args:
        video_url: URL of the YouTube video to download.
        output_dir: Directory the video file is written into.

    Returns:
        The path of the file that was actually written, as reported by
        pytube's ``Stream.download``.

    Raises:
        ValueError: If pytube reports no stream for the video.
    """
    yt = YouTube(video_url)
    video_stream = yt.streams.get_highest_resolution()
    if video_stream is None:
        raise ValueError(f"no downloadable stream found for {video_url!r}")
    # Return the path pytube reports instead of rebuilding it by hand: the
    # previous version saved the file as "<title>" but returned
    # "<title>.mp4", so the returned path never matched the file on disk.
    # Leaving `filename` unset also lets pytube pick a file-system-safe name
    # when the title contains characters that are not valid in file names.
    return video_stream.download(output_path=output_dir)

def determine_keystrokes(hand_landmarks, amount):
    """Translate the middle fingertip position into a key label.

    Args:
        hand_landmarks: Landmarks of one detected hand, or a falsy value
            when no hand was detected.
        amount: Threshold compared against the y value of the middle
            finger tip landmark.

    Returns:
        'W' when a hand is present and the middle finger tip's y value is
        greater than ``amount``; otherwise the string 'None'.
    """
    if not hand_landmarks:
        return 'None'
    middle_tip = hand_landmarks.landmark[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]
    return 'W' if middle_tip.y > amount else 'None'

def crop_handcam(frame):
    """Cut a fixed window out of ``frame``.

    The window is a placeholder: it starts at row 100 / column 100 and spans
    200 rows by 200 columns. ``frame`` must support two-axis slicing
    (``frame[rows, cols]``).
    """
    left, top = 100, 100
    crop_width, crop_height = 200, 200
    row_span = slice(top, top + crop_height)
    col_span = slice(left, left + crop_width)
    return frame[row_span, col_span]

def map_to_tensor(video_id, frame_id, keystrokes):
    """Bundle one frame's result into a dict.

    The returned dict has the keys "video_id", "frame_id" and "keystrokes",
    in that order, holding the corresponding arguments unchanged.
    """
    record = dict(video_id=video_id, frame_id=frame_id, keystrokes=keystrokes)
    return record

mp_hands = mp.solutions.hands
# A single Hands instance is created here and reused for every frame.
hands = mp_hands.Hands()

# List of video URLs for training data.
# The blank placeholder entry was removed: an empty URL cannot be downloaded
# and made the download step fail.
training_data_videos = [
    "https://youtube.com/shorts/_QDte-cXaMs?feature=share",
    "https://www.youtube.com/shorts/xNJlBWk5iN4",
    "https://www.youtube.com/watch?v=b97wWi6tw6k",
]

data_dir = 'data'
# exist_ok makes this a no-op when the directory already exists, so no
# separate existence check is needed.
os.makedirs(data_dir, exist_ok=True)

# Threshold for the middle finger tip's y value, keyed by the video's index
# in training_data_videos. Each video needs its own hand-tuned value.
w_key_thresholds = {0: 0.79, 1: 0.52, 2: 0.7614}

for video_id, video_url in enumerate(training_data_videos):
    if not video_url:
        # Blank placeholder entries cannot be downloaded.
        continue

    amount = w_key_thresholds.get(video_id)
    if amount is None:
        # Previously this case either raised NameError or silently reused
        # the threshold of the preceding video.
        print(f"no threshold configured for video {video_id}, skipping")
        continue

    # Use the path of the file that was just downloaded (a hard-coded debug
    # file name used to overwrite it here).
    video_path = download_video(video_url, data_dir)
    if not os.path.exists(video_path):
        # The reported path may carry an extension that the file on disk
        # lacks; fall back to the extension-less name when that file exists.
        bare_path = os.path.splitext(video_path)[0]
        if os.path.exists(bare_path):
            video_path = bare_path
    print(video_id)

    video_capture = cv2.VideoCapture(video_path)
    try:
        if not video_capture.isOpened():
            print(f"could not open {video_path}, skipping")
            continue

        total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"the total amount of frames in this video is {total_frames}")

        # Use the video's own frame rate for the progress output; fall back
        # to 30 when the container does not report a usable value.
        reported_fps = video_capture.get(cv2.CAP_PROP_FPS)
        frames_per_second = (
            max(1, round(reported_fps)) if reported_fps and reported_fps > 0 else 30
        )

        # Read until the capture runs dry rather than trusting the reported
        # frame count, which can be missing or inaccurate.
        frame_id = 0
        while True:
            ret, frame = video_capture.read()
            if not ret:
                break

            # Swap in crop_handcam(frame) here once cropping is implemented.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(rgb_frame)

            # Only the first detected hand is considered.
            hand_landmarks = (
                results.multi_hand_landmarks[0]
                if results.multi_hand_landmarks
                else None
            )
            detected_keystrokes = determine_keystrokes(hand_landmarks, amount)

            tensor_data = map_to_tensor(video_id, frame_id, detected_keystrokes)
            if frame_id % frames_per_second == 0:
                print(f"second {frame_id / frames_per_second:.0f}")

            print(tensor_data)
            frame_id += 1
    finally:
        # Release the video capture for each video, even if processing fails.
        video_capture.release()

# Release the hands module
hands.close()

As of now I'm only tracking the middle finger, which will be attributed to the W key (in the future it will be all the fingers and the mouse), and I'm using some test videos, the first of which is just me pressing the W key in a predictable way. The code kind of works, but I'd like it to be better. Thank you for any help. I don't use GitHub much, so if I posted this in the wrong place or I should have given more information, please tell me. Thank you again.

google-ai-edge / mediapipe-samples

help with recognizing keystrokes based on webcam video #291