JamesBrill / react-speech-recognition

💬 Speech recognition for your React app
https://webspeechrecognition.com/
MIT License

Handling multiple matching commands #140

Open noinskit opened 2 years ago

noinskit commented 2 years ago

Is it possible to handle multiple commands matching the same spoken text? As I understand the API, if multiple commands match (which can happen easily with wildcards etc.), multiple callbacks will trigger in an undefined (?) order, and there's no clean way to inspect the matches together and, for example, prioritize one of them.

To give a specific example: if I define two commands, "What's up" and "What's *", then on recognizing "what's up", both callbacks will trigger. I'm looking for a way to resolve this - in my case, to give priority to the more specific command. For example, a callback that receives a list of all matching commands (instead of one callback per command) would be perfect.
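
For illustration, a minimal sketch of the overlap (the commands shape is the library's documented API; the component and logging are just for demonstration):

import React from 'react';
import { useSpeechRecognition } from 'react-speech-recognition';

const Example = () => {
  const { transcript } = useSpeechRecognition({
    commands: [
      {
        command: "What's up",
        callback: () => console.log('specific command fired'),
      },
      {
        // Wildcard: matches any phrase starting with "What's".
        command: "What's *",
        callback: (word) => console.log('wildcard command fired with:', word),
      },
    ],
  });
  // Saying "what's up" fires BOTH callbacks, in no documented order.
  return <p>{transcript}</p>;
};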

ugnelis commented 1 year ago

Have you managed to solve this issue?

ugnelis commented 1 year ago

I have made it so that only the best-matching command is triggered. Basically, I use a timer to track whether the transcript has changed.

I am placing my code below, with a usage sketch after it. Note: this is a customized version of react-speech-recognition, so not everything behaves the same as the original library.

SpeechRecognition.js

import { useCallback, useEffect, useReducer, useRef, useState } from 'react';
import {
  browserSupportsPolyfills,
  compareTwoStringsUsingDiceCoefficient,
  concatTranscripts,
} from './utils';
import { appendTranscript, clearTranscript } from './actions';
import { transcriptReducer } from './reducers';
import RecognitionManager from './RecognitionManager';
import NativeSpeechRecognition from './NativeSpeechRecognition';

let _browserSupportsSpeechRecognition = !!NativeSpeechRecognition;
let _browserSupportsContinuousListening = _browserSupportsSpeechRecognition;
let recognitionManager;

const useSpeechRecognition = ({
  transcribing = true,
  clearTranscriptOnListen = true,
  matchAfterTranscribingTimeInMs = 2000,
  commands = [],
} = {}) => {
  const [recognitionManager] = useState(SpeechRecognition.getRecognitionManager());
  const [browserSupportsSpeechRecognition, setBrowserSupportsSpeechRecognition] = useState(
    _browserSupportsSpeechRecognition,
  );
  const [browserSupportsContinuousListening, setBrowserSupportsContinuousListening] = useState(
    _browserSupportsContinuousListening,
  );
  const [{ interimTranscript, finalTranscript }, dispatch] = useReducer(transcriptReducer, {
    interimTranscript: recognitionManager.interimTranscript,
    finalTranscript: '',
  });
  const [listening, setListening] = useState(recognitionManager.listening);

  const [isMicrophoneAvailable, setMicrophoneAvailable] = useState(
    recognitionManager.isMicrophoneAvailable,
  );

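  // Debounce timer: command matching is deferred until the transcript has
  // stopped changing for matchAfterTranscribingTimeInMs.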
  const timerRef = useRef(null);

  const commandsRef = useRef(commands);
  commandsRef.current = commands;

  const dispatchClearTranscript = () => {
    dispatch(clearTranscript());
  };

  const resetTranscript = useCallback(() => {
    recognitionManager.resetTranscript();
    dispatchClearTranscript();
  }, [recognitionManager]);

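  // Strip special characters from the command, then score it against the
  // spoken input with a Dice-coefficient string similarity. Returns the match
  // details if the score clears fuzzyMatchingThreshold, otherwise null.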
  const testFuzzyMatch = (command, input, fuzzyMatchingThreshold) => {
    const commandToString = typeof command === 'object' ? command.toString() : command;
    const commandWithoutSpecials = commandToString
      .replace(/[&/\\#,+()!$~%.'":*?<>{}]/g, '')
      .replace(/  +/g, ' ')
      .trim();
    const howSimilar = compareTwoStringsUsingDiceCoefficient(commandWithoutSpecials, input);
    if (howSimilar >= fuzzyMatchingThreshold) {
      return {
        command,
        commandWithoutSpecials,
        howSimilar,
      };
    }
    return null;
  };

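  // Score every registered command against the new transcript, then fire only
  // the callback with the highest similarity, instead of one callback per
  // matching command.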
  const matchCommands = useCallback(
    (newInterimTranscript, newFinalTranscript) => {
      const commandsMap = new Map();

      commandsRef.current.forEach(
        ({ command, callback, matchInterim = false, fuzzyMatchingThreshold = 0.8 }) => {
          const input =
            !newFinalTranscript && matchInterim
              ? newInterimTranscript.trim()
              : newFinalTranscript.trim();
          const subcommands = Array.isArray(command) ? command : [command];
          const results = subcommands
            .map((subcommand) => {
              return testFuzzyMatch(subcommand, input, fuzzyMatchingThreshold);
            })
            .filter((x) => x);

          // Keep only the best-scoring subcommand for this command entry.
          // Note: commandsMap is keyed by the similarity score, so two
          // commands with identical scores will overwrite one another.
          results.sort((a, b) => b.howSimilar - a.howSimilar);
          if (results.length > 0) {
            const { command: matchedCommand, commandWithoutSpecials, howSimilar } =
              results[0];
            commandsMap.set(howSimilar, () =>
              callback(commandWithoutSpecials, input, howSimilar, {
                command: matchedCommand,
                resetTranscript,
              }),
            );
          }
        },
      );
      // Fire only the callback with the highest similarity score. Sort with a
      // numeric comparator; the default sort would compare entries as strings.
      const bestMatch = [...commandsMap].sort((a, b) => b[0] - a[0])[0];
      if (bestMatch) {
        bestMatch[1]();
      }
    },
    [resetTranscript],
  );

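  // While transcribing, restart the debounce timer on every transcript change
  // so that commands are matched only once the transcript settles; otherwise
  // match immediately.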
  const handleTranscriptChange = useCallback(
    (newInterimTranscript, newFinalTranscript) => {
      if (transcribing) {
        dispatch(appendTranscript(newInterimTranscript, newFinalTranscript));

        if (timerRef.current) {
          clearTimeout(timerRef.current);
        }

        timerRef.current = setTimeout(() => {
          matchCommands(newInterimTranscript, newFinalTranscript);
        }, matchAfterTranscribingTimeInMs);
      } else {
        matchCommands(newInterimTranscript, newFinalTranscript);
      }
    },
    [matchCommands, transcribing, matchAfterTranscribingTimeInMs],
  );

  const handleClearTranscript = useCallback(() => {
    if (clearTranscriptOnListen) {
      dispatchClearTranscript();
    }
  }, [clearTranscriptOnListen]);

  // Clear any pending match timer on unmount so matchCommands is not
  // invoked after the component has been removed.
  useEffect(() => {
    return () => {
      if (timerRef.current) {
        clearTimeout(timerRef.current);
      }
    };
  }, []);

  useEffect(() => {
    const id = SpeechRecognition.counter;
    SpeechRecognition.counter += 1;
    const callbacks = {
      onListeningChange: setListening,
      onMicrophoneAvailabilityChange: setMicrophoneAvailable,
      onTranscriptChange: handleTranscriptChange,
      onClearTranscript: handleClearTranscript,
      onBrowserSupportsSpeechRecognitionChange: setBrowserSupportsSpeechRecognition,
      onBrowserSupportsContinuousListeningChange: setBrowserSupportsContinuousListening,
    };
    recognitionManager.subscribe(id, callbacks);

    return () => {
      recognitionManager.unsubscribe(id);
    };
  }, [
    transcribing,
    clearTranscriptOnListen,
    recognitionManager,
    handleTranscriptChange,
    handleClearTranscript,
  ]);

  const transcript = concatTranscripts(finalTranscript, interimTranscript);
  return {
    transcript,
    interimTranscript,
    finalTranscript,
    listening,
    isMicrophoneAvailable,
    resetTranscript,
    browserSupportsSpeechRecognition,
    browserSupportsContinuousListening,
  };
};

// Singleton controller shared by all hook instances: owns the
// RecognitionManager, applies/removes polyfills, and exposes the
// start/stop/abort listening controls.
const SpeechRecognition = {
  counter: 0,
  applyPolyfill: (PolyfillSpeechRecognition) => {
    if (recognitionManager) {
      recognitionManager.setSpeechRecognition(PolyfillSpeechRecognition);
    } else {
      recognitionManager = new RecognitionManager(PolyfillSpeechRecognition);
    }
    const browserSupportsPolyfill = !!PolyfillSpeechRecognition && browserSupportsPolyfills();
    _browserSupportsSpeechRecognition = browserSupportsPolyfill;
    _browserSupportsContinuousListening = browserSupportsPolyfill;
  },
  removePolyfill: () => {
    if (recognitionManager) {
      recognitionManager.setSpeechRecognition(NativeSpeechRecognition);
    } else {
      recognitionManager = new RecognitionManager(NativeSpeechRecognition);
    }
    _browserSupportsSpeechRecognition = !!NativeSpeechRecognition;
    _browserSupportsContinuousListening = _browserSupportsSpeechRecognition;
  },
  getRecognitionManager: () => {
    if (!recognitionManager) {
      recognitionManager = new RecognitionManager(NativeSpeechRecognition);
    }
    return recognitionManager;
  },
  getRecognition: () => {
    const recognitionManager = SpeechRecognition.getRecognitionManager();
    return recognitionManager.getRecognition();
  },
  startListening: async ({ continuous, language } = {}) => {
    const recognitionManager = SpeechRecognition.getRecognitionManager();
    await recognitionManager.startListening({ continuous, language });
  },
  stopListening: async () => {
    const recognitionManager = SpeechRecognition.getRecognitionManager();
    await recognitionManager.stopListening();
  },
  abortListening: async () => {
    const recognitionManager = SpeechRecognition.getRecognitionManager();
    await recognitionManager.abortListening();
  },
  browserSupportsSpeechRecognition: () => _browserSupportsSpeechRecognition,
  browserSupportsContinuousListening: () => _browserSupportsContinuousListening,
};

export { useSpeechRecognition };
export default SpeechRecognition;
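
For reference, here is a rough sketch of how the customized hook above might be consumed. The component and command strings are made up, but the callback signature and the matchAfterTranscribingTimeInMs option follow from the code, and note that in this version every command is fuzzy-matched (default threshold 0.8):

Usage.js

import React from 'react';
import SpeechRecognition, { useSpeechRecognition } from './SpeechRecognition';

const Dictaphone = () => {
  const { transcript, listening } = useSpeechRecognition({
    // Wait 1.5s after the transcript stops changing, then fire only the
    // single best-scoring callback.
    matchAfterTranscribingTimeInMs: 1500,
    commands: [
      {
        command: "What's up",
        callback: (matched, input, howSimilar) =>
          console.log('specific command won:', matched, howSimilar),
      },
      {
        command: "What's *",
        callback: (matched, input, howSimilar) =>
          console.log('wildcard command won:', matched, howSimilar),
      },
    ],
  });

  return (
    <div>
      <button onClick={() => SpeechRecognition.startListening({ continuous: true })}>
        Start
      </button>
      <button onClick={SpeechRecognition.stopListening}>Stop</button>
      <p>{listening ? 'listening' : 'idle'}</p>
      <p>{transcript}</p>
    </div>
  );
};

export default Dictaphone;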