AceCentre / SAPI-POC

MIT License
0 stars 0 forks source link

Not working.. #2

Open willwade opened 4 days ago

willwade commented 4 days ago

What does work

What doesn't work

The code in VoiceServer.

Note that engine.cpp does some neat little tricks to call our Python file in voices/ directly.

We don't need to do that. I wonder about either:

a. Calling our pipe service directly..


#include <windows.h>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <json/json.h>  // Include a JSON library for serializing the request

// Function to send request to pipe server
bool SendRequestToPipe(const std::string& text, std::vector<char>& audio_data) {
    // Connect to the pipe
    HANDLE pipe = CreateFile(
        R"(\\.\pipe\AACSpeakHelper)", // Pipe name
        GENERIC_READ | GENERIC_WRITE,
        0,
        NULL,
        OPEN_EXISTING,
        0,
        NULL);

    if (pipe == INVALID_HANDLE_VALUE) {
        std::cerr << "Error: Could not connect to pipe server.\n";
        return false;
    }

    // Create JSON request
    Json::Value request;
    request["action"] = "speak";
    request["text"] = text;
    request["engine"] = "AzureNeural"; // Or any other engine

    std::string request_data = Json::writeString(Json::StreamWriterBuilder(), request);

    DWORD bytes_written;
    WriteFile(pipe, request_data.c_str(), request_data.size(), &bytes_written, NULL);

    // Read response
    char buffer[65536];
    DWORD bytes_read;
    ReadFile(pipe, buffer, sizeof(buffer), &bytes_read, NULL);

    // Deserialize JSON response
    Json::Value response;
    Json::CharReaderBuilder reader;
    std::string errors;
    std::string response_data(buffer, bytes_read);

    if (!Json::parseFromStream(reader, response_data, &response, &errors)) {
        std::cerr << "Error parsing response from pipe server: " << errors << std::endl;
        CloseHandle(pipe);
        return false;
    }

    if (response["status"] == "success") {
        // Extract audio data
        const Json::Value& audio_chunks = response["audio_data"];
        for (const auto& chunk : audio_chunks) {
            std::vector<char> chunk_data = chunk.asCString();
            audio_data.insert(audio_data.end(), chunk_data.begin(), chunk_data.end());
        }

        CloseHandle(pipe);
        return true;
    }

    CloseHandle(pipe);
    return false;
}

// Synthesizes each text fragment by asking the pipe server for audio and
// writing the returned bytes to the SAPI output site.
//
// @param pTextFragList  Linked list of fragments; walked through pNext until null.
// @param pOutputSite    Receives the audio via Write() and is passed to handle_actions().
// @return S_OK when all fragments were written or handle_actions() requested a
//         stop (returned 1); E_FAIL when the pipe request or the write fails.
// dwSpeakFlags, rguidFormatId and pWaveFormatEx are not used by this implementation.
HRESULT __stdcall Engine::Speak(DWORD dwSpeakFlags, REFGUID rguidFormatId, const WAVEFORMATEX* pWaveFormatEx,
                                const SPVTEXTFRAG* pTextFragList, ISpTTSEngineSite* pOutputSite)
{
    slog("Engine::Speak");

    for (const auto* text_frag = pTextFragList; text_frag != nullptr; text_frag = text_frag->pNext) {
        if (handle_actions(pOutputSite) == 1) {
            return S_OK;
        }

        // A fragment is delimited by ulTextLen, not by a terminator, so copy
        // exactly that many characters before logging or converting it.
        const std::wstring fragment_text = text_frag->pTextStart != nullptr
            ? std::wstring(text_frag->pTextStart, text_frag->ulTextLen)
            : std::wstring();

        slog(L"action={}, offset={}, length={}, text=\"{}\"",
            (int)text_frag->State.eAction,
            text_frag->ulTextSrcOffset,
            text_frag->ulTextLen,
            fragment_text.c_str());

        // NOTE(review): every fragment is sent to the server regardless of
        // State.eAction - confirm whether non-speak actions should be skipped.

        // Convert wide string to UTF-8
        const std::string text = utf8_encode(fragment_text);

        std::vector<char> audio_data;
        if (!SendRequestToPipe(text, audio_data)) {
            std::cerr << "Failed to get audio data from pipe server.\n";
            return E_FAIL;
        }

        // Write() takes a ULONG byte count; refuse anything that would be
        // silently truncated by the conversion from size_t.
        if (audio_data.size() > MAXDWORD) {
            std::cerr << "Error writing audio data to output site.\n";
            return E_FAIL;
        }
        const ULONG audio_size = static_cast<ULONG>(audio_data.size());

        // Write audio data to the output
        ULONG written = 0;
        const HRESULT result = pOutputSite->Write(audio_data.data(), audio_size, &written);
        if (result != S_OK || written != audio_size) {
            std::cerr << "Error writing audio data to output site.\n";
            return E_FAIL;
        }

        slog("Engine::Speak written={} bytes", written);
    }

    return S_OK;
}

or

b. Call an executable (e.g. a frozen Python exe) that calls our pipe service

e.g.

#include <windows.h>
#include <iostream>
#include <vector>
#include <string>

// Function to execute the external executable and get its output (audio data)
bool RunExternalTTSProcess(const std::string& text, std::vector<char>& audio_data) {
    STARTUPINFO si;
    PROCESS_INFORMATION pi;
    SECURITY_ATTRIBUTES sa;
    HANDLE hReadPipe, hWritePipe;
    char buffer[4096];
    DWORD bytes_read;

    ZeroMemory(&si, sizeof(si));
    si.cb = sizeof(si);
    ZeroMemory(&pi, sizeof(pi));

    sa.nLength = sizeof(SECURITY_ATTRIBUTES);
    sa.bInheritHandle = TRUE;
    sa.lpSecurityDescriptor = NULL;

    // Create a pipe for the child process's STDOUT
    if (!CreatePipe(&hReadPipe, &hWritePipe, &sa, 0)) {
        std::cerr << "CreatePipe failed\n";
        return false;
    }

    // Ensure the read handle to the pipe is not inherited
    if (!SetHandleInformation(hReadPipe, HANDLE_FLAG_INHERIT, 0)) {
        std::cerr << "SetHandleInformation failed\n";
        return false;
    }

    std::string command = "tts_pipe_exe " + text;  // Call your TTS exe
    if (!CreateProcess(NULL, (LPSTR)command.c_str(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
        std::cerr << "CreateProcess failed\n";
        return false;
    }

    // Read output from the pipe
    while (ReadFile(hReadPipe, buffer, sizeof(buffer), &bytes_read, NULL) && bytes_read > 0) {
        audio_data.insert(audio_data.end(), buffer, buffer + bytes_read);
    }

    // Wait for process to exit and clean up
    WaitForSingleObject(pi.hProcess, INFINITE);
    CloseHandle(pi.hProcess);
    CloseHandle(pi.hThread);
    CloseHandle(hReadPipe);
    CloseHandle(hWritePipe);

    return true;
}

HRESULT __stdcall Engine::Speak(DWORD dwSpeakFlags, REFGUID rguidFormatId, const WAVEFORMATEX* pWaveFormatEx,
                                const SPVTEXTFRAG* pTextFragList, ISpTTSEngineSite* pOutputSite)
{
    slog("Engine::Speak");

    for (const auto* text_frag = pTextFragList; text_frag != nullptr; text_frag = text_frag->pNext) {
        if (handle_actions(pOutputSite) == 1) {
            return S_OK;
        }

        slog(L"action={}, offset={}, length={}, text=\"{}\"",
            (int)text_frag->State.eAction,
            text_frag->ulTextSrcOffset,
            text_frag->ulTextLen,
            text_frag->pTextStart);

        // Convert wide string to UTF-8
        std::string text = utf8_encode(std::wstring(text_frag->pTextStart, text_frag->ulTextLen));

        std::vector<char> audio_data;
        if (!RunExternalTTSProcess(text, audio_data)) {
            std::cerr << "Failed to run external TTS process.\n";
            return E_FAIL;
        }

        // Write audio data to the output
        ULONG written;
        HRESULT result = pOutputSite->Write(audio_data.data(), audio_data.size(), &written);
        if (result != S_OK || written != audio_data.size()) {
            std
willwade commented 4 days ago

I'm attempting the pipe service approach - here's an update

Registration appears to work, but it doesn't: it adds registry keys, yet the voice is not listed among the SAPI voices. I don't understand how the old code worked, because I can't see any reference in the registry to the DLL that gets built.

Adding an InprocServer32 key seems to help, but I feel this is a hack.

# Registry key to create under the voice token, and the DLL path to store as its default value
$inprocKey = "HKLM:\SOFTWARE\Microsoft\Speech\Voices\Tokens\PYTTS-Microsoft\InprocServer32"
$engineDll = "C:\GitHub\SAPI-POC\VoiceServer\_libs\pysapittsengine.dll"

# Create the 'InprocServer32' key (-Force also creates any missing parent keys)
$newKeyArgs = @{
    Path  = $inprocKey
    Force = $true
}
New-Item @newKeyArgs

# Point the key's default value at the DLL
$defaultValueArgs = @{
    Path  = $inprocKey
    Name  = "(default)"
    Value = $engineDll
}
Set-ItemProperty @defaultValueArgs

# Read the key back to confirm the value was written
Get-ItemProperty -Path $inprocKey

If you do this (after registering with our code), you can see the voice in PowerShell, but still not in a proper SAPI application such as Balabolka.

So why? And this still doesn't get us as far as trying to speak.

Update

So my recent commit, which writes keys in both the 64-bit and 32-bit registry views, DOES show the voice (tested with https://www.cross-plus-a.com/balabolka.htm).

BUT when speaking text with it you get a fault, and the request never reaches the pipe service.