jdepoix / youtube-transcript-api

This is a python API which allows you to get the transcript/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require an API key nor a headless browser, like other selenium based solutions do!
MIT License
2.84k stars 320 forks source link

Client Error: Too Many Requests for url #331

Open MohamedFakhry2007 opened 1 week ago

MohamedFakhry2007 commented 1 week ago

What code / cli command are you executing?

For example: I am running this code:

import os
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import logging
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import random

# Set up logging.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers (some hosted/notebook environments install one), in which case the
# format below is not applied and records appear in the default format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()

# YouTube API credentials: taken from the YOUTUBE_API_KEY environment variable
# so the key does not have to be committed to source; falls back to the
# original placeholder when the variable is not set.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'API_KEY')

def get_video_ids_and_titles(playlist_id, max_results=250, page_delay=10):
    """Collect video IDs and titles from a YouTube playlist.

    Pages through ``playlistItems().list`` of the YouTube Data API client,
    asking for at most 50 items per request, until ``max_results`` videos
    have been gathered or the response carries no ``nextPageToken``.

    Args:
        playlist_id: ID of the playlist to read.
        max_results: Upper bound on the number of videos returned. A value
            of zero or less returns an empty list without contacting the API.
        page_delay: Seconds to sleep between consecutive page requests.

    Returns:
        A list of ``{'id': ..., 'title': ...}`` dicts in the order the API
        returned them.
    """
    # Nothing to fetch: skip building a client and sending a request that
    # would ask for zero (or a negative number of) items.
    if max_results <= 0:
        return []

    youtube = build('youtube', 'v3', developerKey=API_KEY)

    videos = []
    next_page_token = None

    while True:
        request = youtube.playlistItems().list(
            part='contentDetails,snippet',
            playlistId=playlist_id,
            # Never request more than is still needed, capped at 50 per page.
            maxResults=min(max_results - len(videos), 50),
            pageToken=next_page_token,
        )
        response = request.execute()

        # A response without 'items' is treated as an empty page.
        videos.extend(
            {
                'id': item['contentDetails']['videoId'],
                'title': item['snippet']['title'],
            }
            for item in response.get('items', [])
        )

        next_page_token = response.get('nextPageToken')
        if not next_page_token or len(videos) >= max_results:
            break

        # Pause before requesting the next page to stay gentle on the API.
        time.sleep(page_delay)

    return videos

def extract_subtitles(video, output_dir, max_retries=10):
    """Save the auto-generated Arabic transcript of one video as a text file.

    Looks up the video's transcripts, picks the first one whose language code
    is ``'ar'`` and that is flagged as generated, fetches it and writes one
    line of text per transcript entry to
    ``<output_dir>/<sanitised title>_ar_auto.txt``.

    Any exception raised while listing, fetching or writing is retried with
    exponential backoff plus up to one second of random jitter.

    Args:
        video: Dict with the keys ``'id'`` and ``'title'``.
        output_dir: Directory the text file is written to.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        None. Success, "no Arabic transcript" and final failure are all
        reported through the module logger only.

    Raises:
        KeyError: If ``video`` lacks ``'id'`` or ``'title'``.
    """
    # Read the identifiers before entering the retry loop so that the except
    # handler and the final error message can always reference video_id.
    video_id = video['id']
    video_title = video['title']

    for attempt in range(1, max_retries + 1):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            arabic_auto_transcript = next(
                (
                    candidate
                    for candidate in transcript_list
                    if candidate.language_code == 'ar' and candidate.is_generated
                ),
                None,
            )

            if arabic_auto_transcript is None:
                logger.warning(f"No Arabic auto-generated subtitles found for video {video_id}")
                return

            transcript = arabic_auto_transcript.fetch()

            # Keep only letters, digits and spaces so the title is usable as
            # a file name.
            safe_title = "".join(
                c for c in video_title if c.isalpha() or c.isdigit() or c == ' '
            ).rstrip()
            if not safe_title:
                # A title made only of punctuation/emoji would otherwise give
                # every such video the same "_ar_auto.txt" file name.
                safe_title = video_id
            filename = os.path.join(output_dir, f"{safe_title}_ar_auto.txt")

            with open(filename, 'w', encoding='utf-8') as f:
                f.writelines(f"{entry['text']}\n" for entry in transcript)

            logger.info(f"Arabic auto-generated subtitles extracted for video: {video_title}")
            return
        except Exception as e:
            if attempt >= max_retries:
                # Last attempt: do not sleep through another backoff period
                # when no retry is going to follow.
                logger.warning(f"Error extracting subtitles for video {video_id}. No retries left. Error: {str(e)}")
                break
            # Exponential backoff (2, 4, 8, ... seconds) plus 0-1 s of jitter.
            wait_time = (2 ** attempt) + (random.randint(0, 1000) / 1000)
            logger.warning(f"Error extracting subtitles for video {video_id}. Retrying in {wait_time:.2f} seconds. Error: {str(e)}")
            time.sleep(wait_time)

    logger.error(f"Failed to extract subtitles for video {video_id} after {max_retries} attempts")

def main():
    """Download Arabic auto-generated subtitles for a channel's uploads.

    Resolves the hard-coded channel's "uploads" playlist through the YouTube
    Data API client, lists up to ``max_videos`` videos from it with
    ``get_video_ids_and_titles`` and calls ``extract_subtitles`` for each one,
    pausing between videos. Errors are logged rather than raised.
    """
    channel_id = "UCGnCvNgWZ3T7hJJajjGYucA"
    output_dir = "Kwili"
    max_videos = 250  # Set this to the number of videos you want to process
    video_delay = 30  # Seconds to wait between two consecutive videos

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    logger.info("Starting Arabic auto-generated subtitle extraction process")

    try:
        # Get the uploads playlist ID for the channel
        youtube = build('youtube', 'v3', developerKey=API_KEY)
        request = youtube.channels().list(
            part='contentDetails',
            id=channel_id
        )
        response = request.execute()

        logger.info(f"API Response: {json.dumps(response, indent=2)}")

        if not response.get('items'):
            logger.error(f"No channel found for ID: {channel_id}")
            return

        playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

        logger.info(f"Found uploads playlist ID: {playlist_id}")

        # Pause before the next API request
        time.sleep(10)

        videos = get_video_ids_and_titles(playlist_id, max_videos)
        logger.info(f"Retrieved {len(videos)} videos")

        # Videos are handled one at a time in this thread so that the delay
        # really separates the requests. Sleeping while merely collecting
        # results from an executor does not slow its worker down, because
        # every task has already been queued and runs back-to-back.
        for index, video in enumerate(videos):
            if index:
                time.sleep(video_delay)
            try:
                extract_subtitles(video, output_dir)
            except Exception as exc:
                logger.error(f"Video {video.get('id')} generated an exception: {exc}")

        logger.info("Arabic auto-generated subtitle extraction process completed")

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing.
        logger.error(f"An error occurred: {str(e)}")
# Run the extraction only when the file is executed as a script.
if __name__ == "__main__":
    main()

### Which Python version are you using?
Python 3.10.12

### Which version of youtube-transcript-api are you using?
Version: 0.6.2

# Expected behavior
Describe what you expected to happen. 
I expected to get the Arabic auto-generated transcripts for every video in each channel's uploads playlist and to export them to .txt files. I have added delays between the calls; if longer delays would fix the issue, please let me know.

# Actual behaviour

WARNING:root:Error extracting subtitles for video R8mtJd4wUFI. Retrying in 2.84 seconds. Error: Could not retrieve a transcript for the video https://www.youtube.com/watch?v=429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.youtube.com/watch%3Fv%3DR8mtJd4wUFI&q=EgQiqF_7GPe6lbcGIjAPLKF3hlhkSNN1i4W9XS6xKPjE-w0Ks_uSLMc6FDZ-vX9W-tCj2esBEOlM6LWDlo8yAXJaAUM! This is most likely caused by:

Request to YouTube failed: R8mtJd4wUFI

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem! WARNING:root:Error extracting subtitles for video R8mtJd4wUFI. Retrying in 4.70 seconds. Error: Could not retrieve a transcript for the video https://www.youtube.com/watch?v=429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.youtube.com/watch%3Fv%3DR8mtJd4wUFI&q=EgQiqF_7GPq6lbcGIjAbXTmkWPBB4MkQnI1NXYcrP17UrWbZbKIiKPhJaTYZnMXOUovndjDz0TJ9eLzvKGkyAXJaAUM! This is most likely caused by:

Request to YouTube failed: R8mtJd4wUFI

bltnico commented 2 days ago

I’ve been having the same problem since this weekend!