ken107 / piper-browser-extension

Provides Piper neural text-to-speech voices as a browser extension
https://chrome.google.com/webstore/detail/ppnfahcipommelgaapjalhooaeeblmeg
MIT License
32 stars · 1 fork

Piper-phonemize Web Assembly module #2

Open · ken107 opened 6 months ago

ken107 commented 6 months ago

https://github.com/rhasspy/piper-phonemize

Currently we still have to call a server to do this. A wasm or JS phonemize module would complete the final requirement for fully offline functionality.

Piper-phonemize depends on onnxruntime to perform Arabic diacritization. This dependency should be removed, and the diacritization should be performed by the calling application.

jozefchutka commented 5 months ago

Here are the build steps for the wasm version of piper-phonemize.

Notice the removed onnxruntime dependency.
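
For orientation, here's a sketch of the kind of emcc invocation such a build ends up at; the exact flags in the linked build steps may differ, and the source paths below are placeholders. MODULARIZE/EXPORT_NAME and the preloaded espeak-ng data match how the module is consumed later in this thread.

# Sketch only; source paths and include/lib layout are placeholders.
# --preload-file emits the piper_phonemize.data consumed by locateFile below.
em++ -O3 src/main.cpp -I include -L lib -lespeak-ng \
  -s MODULARIZE=1 \
  -s EXPORT_NAME=createPiperPhonemize \
  -s EXPORTED_RUNTIME_METHODS=callMain \
  -s INVOKE_RUN=0 \
  --preload-file espeak-ng-data \
  -o piper_phonemize.js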

If you are able/interested, consider moving the Arabic inference to JS using onnxruntime-web, and keep me in the loop.

See this PoC for a minimal working example of piper-phonemize + inference.
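
A heavily hedged sketch of what moving that inference into JS could look like. The model path, tensor names, and text encoding below are assumptions for illustration, not libtashkeel's real interface:

import * as ort from "onnxruntime-web";

// Hypothetical: diacritize Arabic text in JS before phonemizing, so the
// wasm phonemizer no longer needs onnxruntime linked in.
async function diacritize(text) {
  const session = await ort.InferenceSession.create("./tashkeel.onnx"); // placeholder model
  const ids = BigInt64Array.from([...text], (ch) => BigInt(ch.codePointAt(0)));
  const feeds = { input: new ort.Tensor("int64", ids, [1, ids.length]) };
  const results = await session.run(feeds); // output name "output" is assumed
  return decode(results.output);
}

function decode(outputTensor) {
  // Placeholder decoder: a real one maps the model's output labels back to diacritics.
  return String.fromCharCode(...Array.from(outputTensor.data, Number));
}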

guest271314 commented 1 month ago

I'm doing something like this as an unpacked extension, using modified code from here that evidently was inspired by code in this forked repository: https://github.com/guest271314/vits-web/tree/patch-1/docs. See https://guest271314.github.io/vits-web/ for a GitHub Pages example of roughly similar code.

@jozefchutka One issue with using Emscripten is that the generated JavaScript includes references to XMLHttpRequest() and window.location.pathname, which don't exist in Node or Deno environments. It should be possible to compile to a .wasm file and use the same code to produce a Float32Array in any JavaScript engine or runtime that supports WebAssembly.

import { createPiperPhonemize } from "./deno-piper-bundle.js";
import * as ort from "./onnxruntime-web.js";

// Voice models and their JSON configs, fetched/imported up front.
const voiceData = {
  "en_US-hfc_female-medium": {
    "arraybuffer": await (await fetch(new URL("./en_US-hfc_female-medium.onnx", import.meta.url)))
      .arrayBuffer(),
    "json": (await import("./en_US-hfc_female-medium.onnx.json", {
      with: { type: "json" },
    })).default,
  },
  "en_US-hfc_male-medium": {
    "arraybuffer": await (await fetch(new URL("./en_US-hfc_male-medium.onnx", import.meta.url)))
      .arrayBuffer(),
    "json": (await import("./en_US-hfc_male-medium.onnx.json", {
      with: { type: "json" },
    })).default,
  },
};
// Voice IDs mapped to their .onnx model file names.
const PATH_MAP = {
  "en_US-hfc_female-medium": "en_US-hfc_female-medium.onnx",
  "en_US-hfc_male-medium": "en_US-hfc_male-medium.onnx",
};

function pcm2wav(buffer, numChannels, sampleRate) {
  const bufferLength = buffer.length;
  // 44-byte RIFF/WAVE header followed by 16-bit PCM samples.
  const view = new DataView(
    new ArrayBuffer(bufferLength * numChannels * 2 + 44),
  );
  view.setUint32(0, 0x46464952, true); // "RIFF"
  view.setUint32(4, view.buffer.byteLength - 8, true); // remaining chunk size
  view.setUint32(8, 0x45564157, true); // "WAVE"
  view.setUint32(12, 0x20746d66, true); // "fmt "
  view.setUint32(16, 0x10, true); // fmt chunk size: 16
  view.setUint16(20, 0x0001, true); // audio format: PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, numChannels * 2 * sampleRate, true); // byte rate
  view.setUint16(32, numChannels * 2, true); // block align
  view.setUint16(34, 16, true); // bits per sample
  view.setUint32(36, 0x61746164, true); // "data"
  view.setUint32(40, 2 * bufferLength, true); // data chunk size
  let p = 44;
  for (let i = 0; i < bufferLength; i++) {
    // Clamp floats to [-1, 1] and convert to signed 16-bit samples.
    const v = buffer[i];
    if (v >= 1) view.setInt16(p, 0x7fff, true);
    else if (v <= -1) view.setInt16(p, -0x8000, true);
    else view.setInt16(p, (v * 0x8000) | 0, true);
    p += 2;
  }
  return view.buffer;
}

const piperPhonemizeWasm = new URL("./piper_phonemize.wasm", import.meta.url).href; 
const piperPhonemizeData = new URL("./piper_phonemize.data", import.meta.url).href; 

async function tts(config) {
  const input = JSON.stringify([
    {
      text: config.text.trim(),
    },
  ]);

  ort.env.wasm.numThreads = 2; // navigator.hardwareConcurrency;
  ort.env.wasm.wasmPaths = "./";
  const modelConfig = config.voiceId.json;
  const modelArrayBuffer = config.voiceId.arraybuffer;

  // The phonemizer prints JSON to stdout; capture phoneme_ids via the print callback.
  const phonemeIds = await new Promise(async (resolve) => {
    const module = await createPiperPhonemize({
      print: (data) => {
        resolve(JSON.parse(data).phoneme_ids);
      },
      printErr: (message) => {
        throw new Error(message);
      },
      locateFile: (url) => {
        if (url.endsWith(".wasm")) return piperPhonemizeWasm;
        if (url.endsWith(".data")) return piperPhonemizeData;
        return url;
      },
    });
    module.callMain([
      "-l",
      modelConfig.espeak.voice,
      "--input",
      input,
      "--espeak_data",
      "/espeak-ng-data",
    ]);
  });
  const sampleRate = modelConfig.audio.sample_rate;
  const noiseScale = modelConfig.inference.noise_scale;
  const lengthScale = modelConfig.inference.length_scale;
  const noiseW = modelConfig.inference.noise_w;
  const session = await ort.InferenceSession.create(modelArrayBuffer);
  const feeds = {
    input: new ort.Tensor("int64", phonemeIds, [1, phonemeIds.length]),
    input_lengths: new ort.Tensor("int64", [phonemeIds.length]),
    scales: new ort.Tensor("float32", [noiseScale, lengthScale, noiseW]),
  };
  if (Object.keys(modelConfig.speaker_id_map).length) {
    Object.assign(feeds, {
      sid: new ort.Tensor("int64", [0]),
    });
  }
  const {
    output: { data: pcm },
  } = await session.run(feeds);
  return pcm;
  /*
  return new Blob([pcm2wav(pcm, 1, sampleRate)], {
    type: "audio/x-wav",
  });
  */
}

if (typeof navigator !== "undefined" && /Chrome|Firefox/.test(navigator.userAgent)) {
  const text = "Text to speech in the browser is amazing!";
  const select = document.querySelector("select");
  const audio = document.querySelector("audio");
  const textarea = document.querySelector("textarea");
  const button = document.querySelector("button");
  const a = document.querySelector("a");
  let blobURL; // captured from the recorder for the download button below
  textarea.placeholder = text;
  const entries = Object.entries(PATH_MAP);

  for (let i = 0; i < entries.length + 1; i++) {
    if (i === 0) {
      select[i] = new Option(
        "Choose a voice for Text-To-Speech:",
        "",
        true,
        true,
      );
      continue;
    }
    const [key] = entries[i - 1];
    select[i] = new Option(key);
  }

  select.addEventListener("change", async (e) => {
    // audio.setAttribute("autoplay", true);
    const value = e.target.value.trim();
    if (value) {
      const floats = await tts({
        "text": textarea.value || text,
        "voiceId": voiceData[value],
      });
      let ac = new AudioContext({sampleRate: 22050, latencyHint: 0});
      let buffer = new AudioBuffer({
        length: floats.length,
        numberOfChannels: 1,
        sampleRate: 22050,
      });
      buffer.getChannelData(0).set(floats);
      let absn = new AudioBufferSourceNode(ac, { buffer });
      let msd = new MediaStreamAudioDestinationNode(ac, {channelCount: 1});
      let { stream } = msd;
      absn.connect(msd);
      absn.connect(ac.destination);
      // Record the stream so the result can be replayed and downloaded.
      let recorder = new MediaRecorder(stream);
      recorder.start();
      recorder.addEventListener("dataavailable", (e) => {
        // audio.srcObject = null;
        audio.removeAttribute("autoplay");
        blobURL = URL.createObjectURL(e.data);
        audio.src = blobURL;
      });
      // Wobbly
      // audio.srcObject = stream;
      absn.addEventListener("ended", async (e) => {
        await ac.close();
        recorder.stop();
      });
      absn.start();
    }
  });

  button.addEventListener("click", (e) => {
    if (blobURL) {
      a.href = blobURL;
      // MediaRecorder output is webm/ogg (see recorder.mimeType), not WAV.
      a.download = `${select.value}.webm`;
      a.click();
    }
  });
} else {
  // TODO: Get rid of XMLHttpRequest() in piper.js
  const result = await tts({
    "text": "Text-To-Speech for Node.js, Deno, Bun, txiki.js, and the browser",
    "voiceId": voiceData["en_US-hfc_male-medium"],
  });
  console.log(result.length);
}
jozefchutka commented 1 month ago

I do not have experience with wasm for Node, but I would try -s ENVIRONMENT=web (see https://emscripten.org/docs/tools_reference/settings_reference.html?highlight=environment).
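
(For reference, that setting takes a comma-separated list, so a link-time flag along these lines would emit support code for both browser and Node targets; values per the linked Emscripten docs. The "..." stands in for the rest of the invocation:)

em++ ... -s ENVIRONMENT=web,node -o piper_phonemize.js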

guest271314 commented 1 month ago

@jozefchutka Thanks. I think shell would be the most JavaScript-engine- and runtime-agnostic setting. Ideally we compile to a .wasm file for the ability to run in any environment that supports WebAssembly, including standalone WASM runtimes.
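
A minimal sketch of that idea, assuming a standalone piper_phonemize.wasm and leaving the import object empty for illustration; only the byte-loading step differs per runtime:

// Load the same .wasm bytes in any runtime, then instantiate once.
async function loadWasmBytes(path) {
  if (typeof Deno !== "undefined") return Deno.readFile(path); // Deno
  if (typeof process !== "undefined" && process.versions?.node) {
    const { readFile } = await import("node:fs/promises"); // Node.js, Bun
    return readFile(path);
  }
  const res = await fetch(path); // browsers and workers
  return new Uint8Array(await res.arrayBuffer());
}

const bytes = await loadWasmBytes("./piper_phonemize.wasm");
const { instance } = await WebAssembly.instantiate(bytes, { /* imports */ });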

guest271314 commented 1 month ago

@jozefchutka I'm getting all kinds of errors trying to run your build steps.

bash build.sh
fatal: could not create leading directories of '/wasm/modules/emsdk': Permission denied
build.sh: line 4: cd: /wasm/modules/emsdk: No such file or directory

After manually creating the directories:

build.sh: line 5: ./emsdk: Is a directory
build.sh: line 6: ./emsdk: Is a directory
build.sh: line 7: ./emsdk_env.sh: No such file or directory
sed: can't read ./upstream/emscripten/cache/sysroot/include/wchar.h: No such file or directory
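
The first errors suggest build.sh assumes a writable /wasm/modules root, and "Is a directory" suggests it invokes ./emsdk from the wrong working directory (the executable lives inside the checkout). A sketch of the standard emsdk bootstrap using a relative, writable directory instead, per the emsdk README:

git clone https://github.com/emscripten-core/emsdk.git emsdk
cd emsdk
./emsdk install latest
./emsdk activate latest
source ./emsdk_env.sh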