awslabs / amazon-transcribe-streaming-sdk

The Amazon Transcribe Streaming SDK is an async Python SDK for converting audio into text via Amazon Transcribe.
Apache License 2.0
153 stars 41 forks source link

Web socket connection to : ${aws_presigned_url} failed #100

Open MaxBarbet opened 9 months ago

MaxBarbet commented 9 months ago

Hi,

I downloaded the sample code from AWS and am using it with this SDK. Every time I try to connect to AWS, I get the error "WebSocket connection to ${aws_presigned_url} failed".

Here are my dependencies :

"@aws-sdk/client-transcribe-streaming": "3.523.0",
    "@aws-sdk/client-translate": "^3.523.0",
    "@types/node": "^16.18.84",
    "@types/react": "^18.2.60",
    "@types/react-dom": "^18.2.19",
    "microphone-stream": "^6.0.1",
    "react": "16.10",
    "react-dom": "16.8.4",
    "react-scripts": "4.0.3",
    "typescript": "^4.9.5"

// APP.tsx

import React, { useState } from 'react';

import * as TranscribeClient from "./transcribeClient";
import * as TranslateClient from "./translateClient";
import { LanguageCode } from '@aws-sdk/client-transcribe-streaming';

/**
 * Root component of the demo: a record toggle plus two read-only panes showing
 * the latest transcription and its translation.
 *
 * The displayed texts are kept in React state. Plain `let` locals are
 * re-created on every render and assigning to them never schedules a render,
 * so text stored that way would never reach the screen.
 */
const App = () => {
    const [record, setRecord] = useState(false)
    const [transcribedText, setTranscribedText] = useState("")
    const [translatedText, setTranslatedText] = useState("")

    /** Toggles between recording and idle. */
    const onRecordPress = () => {
        if (record) {
            stopRecording();
        } else {
            startRecording();
        }
    };

    /** Clears both panes, then starts a French transcription session. */
    const startRecording = async () => {
        clearTranscription();
        const selectedLanguage = "fr-FR" as LanguageCode;
        setRecord(true)
        try {
            await TranscribeClient.startRecording(selectedLanguage, onTranscriptionDataReceived);
        } catch (error: unknown) {
            // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
            const message = error instanceof Error ? error.message : String(error);
            alert("An error occurred while recording: " + message);
            stopRecording();
        }
    };

    /**
     * Receives a piece of transcribed text, shows it, then shows its
     * French-to-English translation.
     *
     * NOTE(review): each call replaces the displayed text, as the original
     * code did. If the transcribe client delivers one fragment per call,
     * appending may be what is wanted here - confirm.
     */
    const onTranscriptionDataReceived = async (data: string) => {
        setTranscribedText(data);
        try {
            const translation = await TranslateClient.translateTextToLanguage(data, "fr", "en");
            if (translation) {
                setTranslatedText(translation);
            }
        } catch (error: unknown) {
            // The caller does not await this callback, so an uncaught failure
            // here would surface as an unhandled promise rejection.
            console.error("Translation failed:", error);
        }
    }

    /** Marks the UI as idle and releases the microphone and the transcribe client. */
    const stopRecording = function () {
        setRecord(false)
        TranscribeClient.stopRecording();
    };

    /** Empties both text panes. */
    const clearTranscription = () => {
        setTranscribedText("");
        setTranslatedText("");
    };

    return (
        <div>
            <h1>Streaming Speech to Text</h1>
            <button onClick={onRecordPress}>
                ◉
            </button>
            <div id="outputSection">
                <div id="headerText"><h2>Transcription</h2></div>
                <div>{transcribedText}</div>
                <div id="controlContainer">
                    <button onClick={clearTranscription}>Clear</button>
                </div>
                <div id="headerText"><h2>Translation</h2></div>
                <div>{translatedText}</div>
            </div>
        </div>
    );
}

export default App;

// TranscribeClient

import { LanguageCode, TranscribeStreamingClient } from "@aws-sdk/client-transcribe-streaming";
import MicrophoneStream from "microphone-stream";
import { StartStreamTranscriptionCommand } from "@aws-sdk/client-transcribe-streaming";
import { Buffer } from "buffer";

// Sample rate (Hz) declared to Transcribe as MediaSampleRateHertz; also used
// as the upper bound on the length of a chunk that gets forwarded.
// NOTE(review): assumes the microphone actually captures at 44.1 kHz - TODO confirm.
const SAMPLE_RATE = 44100;
// Upper bound on automatic retries after an HTTP 500 from the streaming request.
const MAX_RETRIES = 3;
// Active microphone wrapper and streaming client; both are undefined while idle.
let microphoneStream: { stop: () => void; setStream: (arg0: MediaStream) => void; } | undefined = undefined;
let transcribeClient: TranscribeStreamingClient | undefined = undefined;

// Region and static credentials handed to the Transcribe client.
// NOTE(review): keys embedded in front-end code are readable by anyone who
// loads the page - consider short-lived credentials instead.
const AWS_REGION = "eu-west-3"
const AWS_ACCESS_KEY_ID = "XXXXX"
const AWS_SECRET_ACCESS_KEY = "XXXXXXXX"

// Number of retries consumed so far by startStreaming.
let retries = 0;

/**
 * Starts a new transcription session: tears down any previous session, creates
 * the streaming client, attaches the microphone and streams until the
 * transcript stream ends.
 *
 * @param language - Language code to transcribe; a falsy value aborts the call.
 * @param callback - Invoked with each finalized piece of transcribed text.
 * @returns `false` when no language is given, otherwise `undefined` once
 *          streaming has finished.
 * @throws Whatever acquiring the microphone rejects with (for example when the
 *         user denies permission); the session is cleaned up before rethrowing.
 */
export const startRecording = async (language: LanguageCode, callback: any) => {
    if (!language) {
        return false;
    }
    if (microphoneStream || transcribeClient) {
        stopRecording();
    }
    // Every user-initiated session gets a fresh retry budget; without this the
    // counter stays exhausted for the lifetime of the page.
    retries = 0;
    createTranscribeClient();
    try {
        // Wait for the microphone: a rejection must reach the caller instead of
        // being dropped, and streaming must not start before audio is attached.
        await createMicrophoneStream();
    } catch (error: unknown) {
        stopRecording();
        throw error;
    }
    await startStreaming(language, callback);
};

/**
 * Ends the current session: stops the microphone stream and destroys the
 * transcribe client if they exist, then forgets both.
 */
export const stopRecording = function () {
    // Both handles are either an object or undefined, so optional calls are enough.
    microphoneStream?.stop();
    microphoneStream = undefined;

    transcribeClient?.destroy();
    transcribeClient = undefined;
};

/**
 * Builds the module-level Transcribe streaming client from the configured
 * region and credentials, and routes its logging to the console.
 */
const createTranscribeClient = () => {
    const credentials = {
        accessKeyId: AWS_ACCESS_KEY_ID,
        secretAccessKey: AWS_SECRET_ACCESS_KEY
    };
    transcribeClient = new TranscribeStreamingClient({ region: AWS_REGION, credentials });
    transcribeClient.config.logger = console;
}

/**
 * Creates the module-level microphone stream and feeds it an audio-only
 * MediaStream requested from the browser.
 */
const createMicrophoneStream = async () => {
    // Publish the wrapper immediately, but keep a local handle for the call
    // that follows the await.
    const stream = new MicrophoneStream();
    microphoneStream = stream;

    const constraints = { video: false, audio: true };
    const media = await window.navigator.mediaDevices.getUserMedia(constraints);
    stream.setStream(media);
}

/**
 * Sends a StartStreamTranscription request fed by the microphone and forwards
 * every finalized transcript item to `callback` as `"<content> "`.
 *
 * Errors are logged and not rethrown. An error carrying HTTP status 500 is
 * retried by calling this function again, up to MAX_RETRIES times.
 *
 * @param language - Language code passed to Transcribe.
 * @param callback - Receives each item's content followed by a space.
 */
const startStreaming = async (language: LanguageCode, callback: any) => {
    console.log("startStreaming")
    const command = new StartStreamTranscriptionCommand({
        LanguageCode: language,
        MediaEncoding: "pcm",
        MediaSampleRateHertz: SAMPLE_RATE,
        AudioStream: getAudioStream(),
    });
    console.log("command", command)
    try {
        if (!transcribeClient) {
            return;
        }
        console.log("transcribeClient")
        const data = await transcribeClient.send(command);
        console.log(data)
        if (!data.TranscriptResultStream) {
            return;
        }
        console.log("TranscriptResultStream")
        for await (const event of data.TranscriptResultStream) {
            const transcript = event.TranscriptEvent?.Transcript;
            if (!transcript) {
                continue;
            }
            console.log("event", event)
            for (const result of transcript.Results ?? []) {
                // Only finalized results are forwarded; partial hypotheses are skipped.
                if (result.IsPartial !== false || !result.Alternatives) {
                    continue;
                }
                console.log("result", result)
                const items = result.Alternatives[0]?.Items;
                if (!items) {
                    continue;
                }
                console.log("items", items)
                for (const item of items) {
                    // Content is optional; concatenating an undefined value
                    // would hand the callback the literal text "undefined ".
                    if (item.Content === undefined) {
                        continue;
                    }
                    console.log(item.Content);
                    callback(item.Content + " ");
                }
            }
        }
    } catch (error: any) {
        console.error("Error in startStreaming:", error);
        const statusCode = error?.$metadata?.httpStatusCode;
        if (statusCode === 500 && retries < MAX_RETRIES) {
            console.log(`Retrying (${retries + 1}/${MAX_RETRIES})...`);
            retries++;
            await startStreaming(language, callback);
        } else {
            console.error("Maximum retries reached or non-retryable error.");
            if (statusCode) {
                console.error("HTTP Status Code:", statusCode);
            }
        }
    }
}

/**
 * Async generator that turns microphone chunks into Transcribe audio events.
 * A chunk is forwarded only when its length does not exceed SAMPLE_RATE.
 */
const getAudioStream = async function* () {
    for await (const audioChunk of microphoneStream as any) {
        const withinLimit = audioChunk.length <= SAMPLE_RATE;
        if (!withinLimit) {
            continue;
        }
        const AudioChunk = encodePCMChunk(audioChunk);
        yield { AudioEvent: { AudioChunk } };
    }
};

/**
 * Converts one microphone chunk into little-endian 16-bit PCM bytes.
 *
 * Each sample from `MicrophoneStream.toRaw` is clamped to [-1, 1] and scaled
 * by 0x8000 when negative or 0x7fff otherwise before being written.
 *
 * @param chunk - Chunk as emitted by the microphone stream.
 * @returns A Buffer holding two bytes per sample.
 */
const encodePCMChunk = (chunk: any) => {
    const samples = MicrophoneStream.toRaw(chunk);
    const pcmBytes = new ArrayBuffer(samples.length * 2);
    const writer = new DataView(pcmBytes);
    for (let index = 0; index < samples.length; index++) {
        const clamped = Math.max(-1, Math.min(1, samples[index]));
        const scale = clamped < 0 ? 0x8000 : 0x7fff;
        // Two bytes per sample, hence the doubled index as the byte offset.
        writer.setInt16(index * 2, clamped * scale, true);
    }
    return Buffer.from(pcmBytes);
};

//TranslateClient

import {
    TranslateClient,
    TranslateTextCommand,
} from "@aws-sdk/client-translate";

// Region and static credentials handed to the Translate client.
// NOTE(review): keys embedded in front-end code are readable by anyone who
// loads the page - consider short-lived credentials instead.
const AWS_REGION = "eu-west-3"
const AWS_ACCESS_KEY_ID = "XXXXXXXX"
const AWS_SECRET_ACCESS_KEY = "XXXXXXXXXXXX"

/**
 * Public entry point for translation; delegates to
 * translateTextFromLanguageToLanguage.
 *
 * @param text - Text to translate.
 * @param sourceLanguage - Language code of `text`.
 * @param targetLanguage - Language code to translate into.
 * @returns The translated text as returned by the service.
 */
export const translateTextToLanguage = async (text: string, sourceLanguage: string, targetLanguage: string) => {
    const translated = await translateTextFromLanguageToLanguage(text, sourceLanguage, targetLanguage);
    return translated;
};

/** Returns a new Translate client for the configured region and credentials. */
const createTranslateClient = () => {
    const credentials = {
        accessKeyId: AWS_ACCESS_KEY_ID,
        secretAccessKey: AWS_SECRET_ACCESS_KEY
    };
    return new TranslateClient({ region: AWS_REGION, credentials });
};

/**
 * Sends one TranslateText request using a newly created client.
 *
 * @param text - Text to translate.
 * @param sourceLanguage - Language code of `text`.
 * @param targetLanguage - Language code to translate into.
 * @returns The `TranslatedText` field of the service response.
 */
const translateTextFromLanguageToLanguage = async (
    text: string,
    sourceLanguage: string,
    targetLanguage: string
) => {
    const client = createTranslateClient();
    const request = new TranslateTextCommand({
        Text: text,
        SourceLanguageCode: sourceLanguage,
        TargetLanguageCode: targetLanguage,
    });
    const { TranslatedText } = await client.send(request);
    return TranslatedText;
};

I am getting the following errors:

Capture d’écran 2024-02-29 à 10 48 57

Does anyone have an idea on how to fix this ?

sjdeak commented 8 months ago

+1, I ran into the same issue. In my case it rarely happens when the user is on a US network, but it is very common on a Hong Kong network. Can you investigate?