yacine-bens / MsEdge-TTS-Extension

26 stars 7 forks source link

Could you help me build an edge-tts float audio player? #23

Open taowang1993 opened 1 month ago

taowang1993 commented 1 month ago

Hi, your plugin is great. I am very inspired by your work.

I also want to build a mini-app on top of edge tts.

My idea is a floating audio player that anyone can embed on their website.

Then, users can read your website with microsoft tts.

Here is my code. I can get it to load the voices, but I can't get it to read the web page content.

So I am here to ask for help.


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Floating TTS Audio Player</title>
    <style>
        #tts-player {
            position: fixed;
            bottom: 20px;
            left: 50%;
            transform: translateX(-50%);
            background-color: #f0f0f0;
            border-radius: 25px;
            padding: 10px 20px;
            display: flex;
            align-items: center;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
        }
        #tts-player button {
            background: none;
            border: none;
            font-size: 24px;
            cursor: pointer;
            margin: 0 10px;
        }
        #tts-player select, #tts-player input {
            margin: 0 10px;
        }
    </style>
</head>
<body>
    <h1>Welcome to the TTS-enabled webpage</h1>
    <p>This is a sample paragraph that can be read aloud using the TTS player below.</p>

    <div id="tts-player">
        <button id="play-pause">▶️</button>
        <select id="voice-select"></select>
        <input type="range" id="speed-control" min="0.5" max="2" step="0.1" value="1">
        <span id="speed-value">1x</span>
    </div>

    <script>
        const BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud";
        const TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
        const VOICES_URL = `https://${BASE_URL}/voices/list?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`;
        const SYNTH_URL = `wss://${BASE_URL}/edge/v1?TrustedClientToken=${TRUSTED_CLIENT_TOKEN}`;
        const BINARY_DELIM = "Path:audio\r\n";
        const VOICE_LANG_REGEX = /\w{2}-\w{2}/;

        const playPauseButton = document.getElementById('play-pause');
        const voiceSelect = document.getElementById('voice-select');
        const speedControl = document.getElementById('speed-control');
        const speedValue = document.getElementById('speed-value');
        let isPlaying = false;
        let audioContext;
        let sourceNode;
        let websocket;
        let reconnectAttempts = 0;
        let voices = [];

        // Initialize audio context
        function initAudioContext() {
            audioContext = new (window.AudioContext || window.webkitAudioContext)();
        }

        // Format voices
        function formatVoices(voices) {
            return voices.map(v => ({
                language: v.FriendlyName.match(/- ([a-zA-Z]+) \(/)[1],
                country: v.FriendlyName.match(/- .*\(([^)]+)\)/)[1],
                name: v.FriendlyName.match(/Microsoft (.+) Online/)[1],
                gender: v.Gender,
                shortName: v.ShortName,
            })).reduce((acc, voice) => {
                acc[voice.language] = acc[voice.language] || {};
                acc[voice.language][voice.country] = acc[voice.language][voice.country] || {};
                acc[voice.language][voice.country][voice.name] = { name: voice.name, shortName: voice.shortName };
                return acc;
            }, {});
        }

        // Fetch available voices
        async function fetchVoices() {
            try {
                const response = await fetch(VOICES_URL);
                voices = await response.json();
                const formattedVoices = formatVoices(voices);

                for (const language in formattedVoices) {
                    const optgroup = document.createElement('optgroup');
                    optgroup.label = language;

                    for (const country in formattedVoices[language]) {
                        for (const name in formattedVoices[language][country]) {
                            const voice = formattedVoices[language][country][name];
                            const option = document.createElement('option');
                            option.value = voice.shortName;
                            option.textContent = `${country} - ${name}`;
                            optgroup.appendChild(option);
                        }
                    }

                    voiceSelect.appendChild(optgroup);
                }

                console.log('Voices loaded:', formattedVoices);
            } catch (error) {
                console.error('Error fetching voices:', error);
            }
        }

        // Connect to WebSocket
        function connectWebSocket() {
            if (websocket && websocket.readyState === WebSocket.OPEN) {
                console.log('WebSocket is already connected');
                return;
            }

            const connectionId = Math.random().toString(36).substr(2, 10);
            websocket = new WebSocket(`${SYNTH_URL}&ConnectionId=${connectionId}`);

            websocket.onopen = () => {
                console.log('WebSocket connected');
                reconnectAttempts = 0;
            };

            websocket.onmessage = (event) => {
                if (typeof event.data === 'string') {
                    const message = JSON.parse(event.data);
                    console.log('Received message:', message);
                } else {
                    const reader = new FileReader();
                    reader.onload = () => {
                        const audioData = reader.result.split(BINARY_DELIM)[1];
                        if (audioData) {
                            playAudio(audioData);
                        }
                    };
                    reader.readAsText(event.data);
                }
            };

            websocket.onerror = (error) => {
                console.error('WebSocket error:', error);
            };

            websocket.onclose = (event) => {
                console.log('WebSocket closed:', event);
                reconnectWithBackoff();
            };
        }

        function reconnectWithBackoff() {
            const backoffTime = Math.min(30000, (Math.pow(2, reconnectAttempts) - 1) * 1000);
            console.log(`Attempting to reconnect in ${backoffTime}ms...`);
            setTimeout(() => {
                connectWebSocket();
                reconnectAttempts++;
            }, backoffTime);
        }

        // Play audio
        function playAudio(audioData) {
            const arrayBuffer = new Uint8Array(atob(audioData).split('').map(char => char.charCodeAt(0))).buffer;
            audioContext.decodeAudioData(arrayBuffer, (buffer) => {
                sourceNode = audioContext.createBufferSource();
                sourceNode.buffer = buffer;
                sourceNode.connect(audioContext.destination);
                sourceNode.playbackRate.value = parseFloat(speedControl.value);
                sourceNode.start(0);
            });
        }

        // Send TTS request
        function sendTTSRequest(text) {
            if (!websocket || websocket.readyState !== WebSocket.OPEN) {
                console.error('WebSocket is not ready. Reconnecting...');
                connectWebSocket();
                setTimeout(() => sendTTSRequest(text), 1000); // Retry after 1 second
                return;
            }

            const selectedVoice = voices.find(v => v.ShortName === voiceSelect.value);
            const voiceLocale = selectedVoice.Locale.match(VOICE_LANG_REGEX)[0];

            const request = {
                context: {
                    synthesis: {
                        audio: {
                            metadataOptions: {
                                sentenceBoundaryEnabled: "false",
                                wordBoundaryEnabled: "false"
                            },
                            outputFormat: "audio-24khz-48kbitrate-mono-mp3"
                        }
                    }
                },
                ssml: `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='${voiceLocale}'><voice name='${selectedVoice.ShortName}'><prosody rate='${speedControl.value}'>${text}</prosody></voice></speak>`
            };
            websocket.send(JSON.stringify(request));
        }

        // Toggle play/pause
        function togglePlayPause() {
            if (!audioContext) {
                initAudioContext();
            }

            if (!websocket || websocket.readyState !== WebSocket.OPEN) {
                connectWebSocket();
                setTimeout(togglePlayPause, 1000); // Retry after 1 second
                return;
            }

            if (isPlaying) {
                if (sourceNode) {
                    sourceNode.stop();
                }
                playPauseButton.textContent = '▶️';
            } else {
                const pageText = document.body.innerText;
                sendTTSRequest(pageText);
                playPauseButton.textContent = '⏸️';
            }
            isPlaying = !isPlaying;
        }

        // Event listeners
        playPauseButton.addEventListener('click', togglePlayPause);
        speedControl.addEventListener('input', () => {
            speedValue.textContent = `${speedControl.value}x`;
            if (sourceNode) {
                sourceNode.playbackRate.value = parseFloat(speedControl.value);
            }
        });

        // Initialize
        fetchVoices();
        connectWebSocket();
    </script>
</body>
</html>