m-bain / whisperX

WhisperX: Automatic Speech Recognition with Word-level Timestamps (& Diarization)
BSD 2-Clause "Simplified" License
12.49k stars 1.31k forks source link

A tip: color coding the outputs (e.g. srt) from the JSON files #787

Open Manamama opened 7 months ago

Manamama commented 7 months ago
  1. It works in Termux directly, without prooting (yay), although some submodules needed to be "hand compiled" or to have their requirements tinkered with.

  2. Some tips, which could be included as options later on:

import pandas as pd
import json
import sys
from colorama import Fore, Style

from colorama import Fore, Back, Style

def color_code(word, score):
    """Return *word* wrapped in a terminal colour that reflects *score*.

    The score is compared against descending thresholds (0.9, 0.8, 0.7,
    0.6, 0.5) using a strict ``>``; the first threshold it exceeds picks
    the colour, running from green down to light red. A score that
    exceeds none of them is rendered in plain red. The colour is always
    reset after the word.
    """
    # Ordered from most to least confident; first match wins.
    bands = (
        (0.9, Fore.GREEN),           # Green
        (0.8, Fore.LIGHTGREEN_EX),   # Light Green
        (0.7, Fore.YELLOW),          # Yellow
        (0.6, Fore.LIGHTYELLOW_EX),  # Light Yellow
        (0.5, Fore.LIGHTRED_EX),     # Light Red
    )
    tint = next((colour for floor, colour in bands if score > floor), Fore.RED)
    return tint + word + Style.RESET_ALL

def process_file(filename):
    """Print every transcript segment with its words coloured by confidence.

    Args:
        filename: Path to a JSON file whose top level has a ``segments``
            list. Each segment is expected to carry a ``words`` list of
            dicts with a ``word`` string and, usually, a numeric ``score``.

    Raises:
        KeyError: If the JSON has no ``segments`` key or a word entry has
            no ``word`` key.
    """
    # Read explicitly as UTF-8 so the result does not depend on the locale.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for segment in data['segments']:
        words = segment.get('words') or []

        if not words:
            # Nothing to colour: fall back to the segment's plain text, if any.
            print(segment.get('text', '').strip())
            continue

        colored = []
        for entry in words:
            # NOTE(review): some word entries appear to lack a 'score'
            # (presumably tokens that could not be aligned - confirm).
            # Treat them as lowest confidence instead of crashing.
            score = entry.get('score')
            if score is None:
                score = 0.0
            colored.append(color_code(entry['word'], score))

        print(' '.join(colored))

# Run only when executed as a script, so importing this module has no side effects.
if __name__ == '__main__':
    # A missing argument would otherwise surface as a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <transcript.json>')

    # Get the filename from the command-line arguments
    filename = sys.argv[1]

    # Call the function with the filename as an argument
    process_file(filename)

And its srt version:

import pandas as pd
import json
import sys
import os

def color_code(word, score):
    """Return *word* wrapped in a ``<font color=...>`` tag chosen by *score*.

    The score is compared against descending thresholds (0.9, 0.8, 0.7,
    0.6, 0.5) using a strict ``>``; the first threshold it exceeds picks
    the colour. A score that exceeds none of them is rendered in red.
    """
    # Ordered from most to least confident; first match wins.
    bands = (
        (0.9, '<font color="#008000">'),  # Green
        (0.8, '<font color="#32CD32">'),  # LimeGreen
        (0.7, '<font color="#ADFF2F">'),  # GreenYellow
        (0.6, '<font color="#FFFF00">'),  # Yellow
        (0.5, '<font color="#FFA500">'),  # Orange
    )
    opening = '<font color="#FF0000">'  # Red
    for floor, tag in bands:
        if score > floor:
            opening = tag
            break
    return opening + word + '</font>'

def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    The value is rounded once to whole milliseconds and then split with
    integer arithmetic. Truncating the fractional part instead loses a
    millisecond to float representation error (1.001 would become
    ``00:00:01,000``).

    Args:
        seconds: Non-negative offset in seconds (int or float).

    Returns:
        The timestamp string, e.g. ``'01:01:01,500'`` for ``3661.5``.
    """
    total_ms = round(seconds * 1000)
    hours, total_ms = divmod(total_ms, 3600000)
    minutes, total_ms = divmod(total_ms, 60000)
    secs, milliseconds = divmod(total_ms, 1000)
    return f'{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}'

def process_file(filename):
    """Write a colour-coded ``.srt`` file next to the given JSON transcript.

    Each segment becomes one numbered subtitle whose words are wrapped in
    ``<font>`` tags by ``color_code``. The output path is the input path
    with its extension replaced by ``_colorcoded.srt``.

    Args:
        filename: Path to a JSON file whose top level has a ``segments``
            list. Each segment needs ``start`` and ``end`` times in seconds
            and is expected to carry a ``words`` list of dicts with a
            ``word`` string and, usually, a numeric ``score``.

    Raises:
        KeyError: If ``segments``, a segment's ``start``/``end``, or a word
            entry's ``word`` key is missing.
    """
    # Read explicitly as UTF-8 so the result does not depend on the locale.
    with open(filename, 'r', encoding='utf-8') as src:
        data = json.load(src)

    # Get the base filename without the extension
    base_filename = os.path.splitext(filename)[0]

    # Write UTF-8 too, so non-ASCII subtitle text cannot fail to encode.
    with open(f'{base_filename}_colorcoded.srt', 'w', encoding='utf-8') as out:
        for i, segment in enumerate(data['segments'], start=1):
            words = segment.get('words') or []

            if words:
                colored = []
                for entry in words:
                    # NOTE(review): some word entries appear to lack a
                    # 'score' (presumably tokens that could not be aligned
                    # - confirm). Treat them as lowest confidence.
                    score = entry.get('score')
                    if score is None:
                        score = 0.0
                    colored.append(color_code(entry['word'], score))
                sentence = ' '.join(colored)
            else:
                # Nothing to colour: keep the cue, using the plain text if any.
                sentence = segment.get('text', '').strip()

            # Subtitle number, time range, text, then the blank separator line.
            out.write(
                f'{i}\n'
                f'{format_time(segment["start"])} --> {format_time(segment["end"])}\n'
                f'{sentence}\n\n'
            )

# Run only when executed as a script, so importing this module has no side effects.
if __name__ == '__main__':
    # A missing argument would otherwise surface as a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <transcript.json>')

    # Get the filename from the command-line arguments
    filename = sys.argv[1]

    # Call the function with the filename as an argument
    process_file(filename)
Manamama commented 6 months ago

Ad 1. It works in Termux, ... also see here for a summary of how it can be done.

Ad 2. The tips above are actually a feature request.