mmguero / monkeyplug

monkeyplug is a little script to mute profanity in audio files
BSD 3-Clause "New" or "Revised" License
27 stars 1 forks source link

Feature Request: Improved matching using transcript #8

Open therealmichaelberna opened 4 days ago

therealmichaelberna commented 4 days ago

I have a similar project that was developed before I discovered yours. I like that your project supports Whisper in addition to VOSK, since VOSK seems to be the less accurate of the two. Something I developed that I think may benefit your project is a transcript-based improvement to matching accuracy.

https://github.com/therealmichaelberna/FoulFilter/blob/main/word_finder.py

Basically, it does a diff and if a word is incorrect, it will fix the word match while retaining the timestamp. This is useful if you have a known accurate transcript such as a pdf of an audiobook. Just figured I'd share and hope it can be of help.

def improve_match_with_transcript(results_list, transcript_word_array): # attempts to improve accuracy of the match using transcript
    """Correct recognized words against a trusted transcript, keeping timestamps.

    The recognized words are aligned with the transcript words using
    ``difflib.SequenceMatcher``. Wherever the alignment reports that a run of
    recognized words was *replaced* relative to the transcript, each recognized
    entry in that run has its ``'word'`` value overwritten with the transcript
    word at the same position in the run. All other keys of the entry (for
    example start/end times) are left untouched.

    Entries are only rewritten when there is a transcript word to pair them
    with:

    * words present only in the transcript are skipped, because there is no
      recognized entry (and therefore no timing) to attach them to;
    * words present only in the recognition results are left unchanged
      rather than being blanked;
    * when a replaced run has a different number of words on each side, the
      words are paired positionally and the unpaired remainder is left alone.

    Args:
        results_list: List of dicts, each read via ``item.get('word')``.
            The dicts are modified in place.
        transcript_word_array: Indexable sequence of transcript words, in
            order. Comparison with recognized words is exact (``==``).

    Returns:
        The same ``results_list`` object, after in-place correction.
    """
    print("using transcript")

    # Only the words take part in the alignment; timing data stays in the dicts.
    recognized_words = [item.get('word') for item in results_list]

    matcher = difflib.SequenceMatcher(a=transcript_word_array, b=recognized_words)

    for tag, t_start, t_end, r_start, r_end in matcher.get_opcodes():
        # 'equal' needs no fix; 'delete' has no recognized entry to carry a
        # timestamp; 'insert' has no transcript word to substitute.
        if tag != 'replace':
            continue

        # zip() stops at the shorter side, so surplus words on either side of
        # an uneven replacement are deliberately left as they were.
        for t_index, r_index in zip(range(t_start, t_end), range(r_start, r_end)):
            results_list[r_index]['word'] = transcript_word_array[t_index]

    return results_list