This is a Python API that lets you retrieve the transcript/subtitles for a given YouTube video. It also works for automatically generated subtitles, and it requires neither an API key nor a headless browser, unlike Selenium-based solutions!
import os
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
import logging
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import random
# Set up logging: timestamped messages at INFO level and above, emitted
# through the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger()
# YouTube Data API credentials. The key is read from the YOUTUBE_API_KEY
# environment variable so the real value never has to be written into the
# file; the original 'API_KEY' placeholder remains the fallback.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'API_KEY')
def get_video_ids_and_titles(playlist_id, max_results=250, page_delay=10):
    """Collect the IDs and titles of up to ``max_results`` playlist videos.

    Pages through ``playlistItems().list`` of the YouTube Data API client,
    asking for at most 50 items per request, until the playlist has no
    further page or ``max_results`` videos have been collected.

    Args:
        playlist_id: ID of the playlist to read.
        max_results: Maximum number of videos to return. Zero or a negative
            value returns an empty list without contacting the API.
        page_delay: Seconds to sleep between consecutive page requests.

    Returns:
        A list of ``{'id': ..., 'title': ...}`` dicts in the order the API
        returned them.
    """
    # Nothing to fetch: avoid sending maxResults <= 0 to the API.
    if max_results <= 0:
        return []

    youtube = build('youtube', 'v3', developerKey=API_KEY)
    videos = []
    next_page_token = None
    while True:
        request = youtube.playlistItems().list(
            part='contentDetails,snippet',
            playlistId=playlist_id,
            # Never request more than is still needed to reach max_results.
            maxResults=min(max_results - len(videos), 50),
            pageToken=next_page_token,
        )
        response = request.execute()
        videos.extend(
            {
                'id': item['contentDetails']['videoId'],
                'title': item['snippet']['title'],
            }
            for item in response.get('items', [])
        )
        next_page_token = response.get('nextPageToken')
        if not next_page_token or len(videos) >= max_results:
            break
        # Pause before requesting the next page.
        time.sleep(page_delay)
    return videos
def extract_subtitles(video, output_dir, max_retries=10):
    """Save a video's auto-generated Arabic transcript as a text file.

    Lists the transcripts available for the video, picks the first one
    whose language code is ``'ar'`` and that is marked as generated, and
    writes one transcript entry per line to
    ``<output_dir>/<sanitized title>_ar_auto.txt``. Any exception raised
    while doing so triggers a retry with exponential backoff plus jitter.

    Args:
        video: Mapping with the keys ``'id'`` and ``'title'``.
        output_dir: Directory the text file is written to.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        None. The outcome (success, no Arabic auto-generated transcript,
        or failure after all attempts) is reported through the logger.

    Raises:
        KeyError: If ``video`` lacks ``'id'`` or ``'title'``.
    """
    # Read the identifiers before the retry loop: a malformed ``video`` is
    # not retryable, and the handlers below need ``video_id`` to be bound.
    video_id = video['id']
    video_title = video['title']

    for attempt in range(1, max_retries + 1):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            arabic_auto_transcript = next(
                (
                    candidate
                    for candidate in transcript_list
                    if candidate.language_code == 'ar' and candidate.is_generated
                ),
                None,
            )
            if arabic_auto_transcript is None:
                logger.warning(f"No Arabic auto-generated subtitles found for video {video_id}")
                return

            transcript = arabic_auto_transcript.fetch()

            # Keep only letters, digits and spaces so the title is usable
            # as a file name.
            safe_title = "".join(
                c for c in video_title if c.isalpha() or c.isdigit() or c == ' '
            ).rstrip()
            if not safe_title:
                # The title had no usable characters; fall back to the ID
                # rather than writing every such video to "_ar_auto.txt".
                safe_title = video_id
            filename = os.path.join(output_dir, f"{safe_title}_ar_auto.txt")
            with open(filename, 'w', encoding='utf-8') as f:
                f.writelines(f"{entry['text']}\n" for entry in transcript)
            logger.info(f"Arabic auto-generated subtitles extracted for video: {video_title}")
            return
        except Exception as e:
            if attempt >= max_retries:
                # Last attempt: sleeping again would only delay the
                # failure report below.
                logger.warning(f"Error extracting subtitles for video {video_id}. Error: {str(e)}")
                break
            # Exponential backoff (2, 4, 8, ... seconds) plus up to one
            # second of random jitter.
            wait_time = (2 ** attempt) + (random.randint(0, 1000) / 1000)
            logger.warning(f"Error extracting subtitles for video {video_id}. Retrying in {wait_time:.2f} seconds. Error: {str(e)}")
            time.sleep(wait_time)

    logger.error(f"Failed to extract subtitles for video {video_id} after {max_retries} attempts")
def main():
    """Download Arabic auto-generated subtitles for a channel's uploads.

    Looks up the channel's uploads playlist, retrieves up to ``max_videos``
    videos from it, and extracts the subtitles of each video into
    ``output_dir``, one video at a time with a pause between videos. Errors
    are logged rather than raised.
    """
    channel_id = "UCGnCvNgWZ3T7hJJajjGYucA"
    output_dir = "Kwili"
    max_videos = 250  # Set this to the number of videos you want to process
    video_delay = 30  # Seconds to wait between consecutive videos

    os.makedirs(output_dir, exist_ok=True)
    logger.info("Starting Arabic auto-generated subtitle extraction process")
    try:
        # Get the uploads playlist ID for the channel
        youtube = build('youtube', 'v3', developerKey=API_KEY)
        response = youtube.channels().list(
            part='contentDetails',
            id=channel_id,
        ).execute()
        logger.info(f"API Response: {json.dumps(response, indent=2)}")
        if not response.get('items'):
            logger.error(f"No channel found for ID: {channel_id}")
            return
        playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
        logger.info(f"Found uploads playlist ID: {playlist_id}")

        # Pause before the next API request
        time.sleep(10)

        videos = get_video_ids_and_titles(playlist_id, max_videos)
        logger.info(f"Retrieved {len(videos)} videos")

        # Process the videos strictly one after another. Submitting them
        # all to an executor up front lets the worker start the next video
        # immediately, so a sleep in the result loop would not space out
        # the requests at all.
        for index, video in enumerate(videos):
            if index:
                time.sleep(video_delay)
            try:
                extract_subtitles(video, output_dir)
            except Exception as exc:
                logger.error(f"Video {video.get('id')} generated an exception: {exc}")
        logger.info("Arabic auto-generated subtitle extraction process completed")
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
# Run the extraction only when the file is executed as a script.
if __name__ == "__main__":
    main()
### Which Python version are you using?
Python 3.10.12
### Which version of youtube-transcript-api are you using?
Version: 0.6.2
# Expected behavior
Describe what you expected to happen.
I expected to retrieve the Arabic auto-generated transcripts for every video in the channel's uploads playlist and export them to .txt files. I have added delays between the calls; if longer delays are required, please let me know whether that would fix the issue.
# Actual behaviour
What code / cli command are you executing?
For example: I am running this code:
WARNING:root:Error extracting subtitles for video R8mtJd4wUFI. Retrying in 2.84 seconds. Error: Could not retrieve a transcript for the video https://www.youtube.com/watch?v=429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.youtube.com/watch%3Fv%3DR8mtJd4wUFI&q=EgQiqF_7GPe6lbcGIjAPLKF3hlhkSNN1i4W9XS6xKPjE-w0Ks_uSLMc6FDZ-vX9W-tCj2esBEOlM6LWDlo8yAXJaAUM! This is most likely caused by:
Request to YouTube failed: R8mtJd4wUFI
If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem! WARNING:root:Error extracting subtitles for video R8mtJd4wUFI. Retrying in 4.70 seconds. Error: Could not retrieve a transcript for the video https://www.youtube.com/watch?v=429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.youtube.com/watch%3Fv%3DR8mtJd4wUFI&q=EgQiqF_7GPq6lbcGIjAbXTmkWPBB4MkQnI1NXYcrP17UrWbZbKIiKPhJaTYZnMXOUovndjDz0TJ9eLzvKGkyAXJaAUM! This is most likely caused by:
Request to YouTube failed: R8mtJd4wUFI