react-native-voice / voice

:microphone: React Native Voice Recognition library for iOS and Android (Online and Offline Support)
MIT License
1.82k stars 492 forks source link

SpeechResults called prematurely in iOS - on partial results #299

Open mikob opened 3 years ago

mikob commented 3 years ago

If you say a phrase, onSpeechResults is called while you are still speaking, before the phrase is finished. Only onSpeechPartialResults should be called during the phrase; onSpeechResults should be called only once you are done with the phrase.

@react-native-voice/voice==2.0.4 ios 14.4 iPhone SE

    // Minimal reproduction: log every partial and every "final" result the library reports.
    Voice.onSpeechPartialResults = (event) => {
      this.log("log", `Partial [${this.transcriptI}]: ${event.value}`);
    };
    Voice.onSpeechResults = (event) => {
      // Problem here: this handler fires while the phrase is still being spoken.
      // E.g. when saying "the new penguin", it is already called on "the",
      // before the speaker has finished.
      this.log("log", "Speech results: " + event.value);
    };
xzed90 commented 3 years ago

Same issue here. Did anyone fix it?

JaVlinPi commented 3 years ago

Same here. I'm having many issues on iOS.

yuritoledo commented 3 years ago

Can you share a piece of your implementation using the library? I followed the docs, but when I call Voice.start, nothing happens ...

tearjuIce commented 3 years ago

anyone solved this? i get the same issue

just-dodo commented 3 years ago

+1

roots-ai commented 3 years ago

Any updates here?

mfkrause commented 2 years ago

So I think I kinda solved this for my use case - PSA, this is a pretty hacky solution but works okay in my case. PSA 2, I'm using this library in a React hook. So you might need to make some changes to this if you're using it another way.

iOS seems to return a full transcript every time a single word has been processed. To counteract this, I'm keeping track of the current transcript and then check if the transcript was changed on a regular basis (1 second in my case which seems to work well for natural speech). If it has been changed, I check again the next second and so on until the previous and the new transcript are the same. Then I use the last transcript received as the "processed" one. This whole process is only used on iOS, on Android I simply use the transcript returned by the library since this seems to work fine. My full hook looks something like this (TypeScript)... the "magic" happens mostly in the onSpeechResults function:

import { useEffect, useRef, useState } from 'react';

import Voice, {
  SpeechResultsEvent,
  SpeechErrorEvent,
  SpeechVolumeChangeEvent,
} from '@react-native-voice/voice';
import { Platform } from 'react-native';

/**
 * Speech-recognition hook built on `@react-native-voice/voice`.
 *
 * While `listen` is true the recognizer is running; when it becomes false the
 * recognizer is destroyed.
 *
 * On iOS `onSpeechResults` fires after every recognized word, so a transcript is
 * only published once no newer result has arrived for `settleDelayMs`. After
 * publishing, the recognizer is destroyed and restarted so the next phrase
 * starts from an empty transcript. On other platforms the transcript reported
 * by the library is published as-is.
 *
 * @param listen Whether speech recognition should currently be running.
 * @returns A tuple of the latest finished transcript (or `null`) and an object
 *          holding the latest partial result, volume and error.
 */
const useVoice = (listen: boolean): [
  string | null,
  {
    partialResults: string | null,
    volume: number | null,
    error: SpeechErrorEvent | null,
  },
] => {
  // How long an iOS transcript must stay unchanged before it counts as final.
  const settleDelayMs = 1000;

  const [currentTranscript, setCurrentTranscript] = useState<string | null>(null);
  const [processedTranscript, setProcessedTranscript] = useState<string | null>(null);
  const [partialResults, setPartialResults] = useState<string | null>(null);
  const [volume, setVolume] = useState<number | null>(null);
  const [error, setError] = useState<SpeechErrorEvent | null>(null);

  // The Voice handlers are registered once on mount, so they must read mutable
  // values through refs instead of capturing props/state from the first render.
  const listenRef = useRef(listen);
  const settleTimer = useRef<ReturnType<typeof setTimeout> | null>(null);

  // Cancels the pending "has the user stopped speaking?" check, if any.
  const clearSettleTimer = () => {
    if (settleTimer.current !== null) {
      clearTimeout(settleTimer.current);
      settleTimer.current = null;
    }
  };

  const startListening = () => {
    Voice.start('de-DE').catch((e) => console.error(e));
  };

  const onSpeechResults = (e: SpeechResultsEvent) => {
    console.log('onSpeechResults: ', e);
    // receivedTranscript is always the "freshest" transcript the system marked as complete.
    const receivedTranscript = e.value?.[0] ?? null;

    if (Platform.OS !== 'ios') {
      setCurrentTranscript(receivedTranscript);
      return;
    }

    // Every new result supersedes the previous one, so restart the wait. This
    // replaces comparing against state inside a setState updater: updaters must
    // be pure, and the old one destroyed/restarted the recognizer from within.
    clearSettleTimer();
    if (receivedTranscript === null) return;

    settleTimer.current = setTimeout(() => {
      settleTimer.current = null;
      // No newer result arrived in the meantime: the user seems to have stopped speaking.
      console.log('Processed transcript (iOS): ', receivedTranscript);
      setProcessedTranscript(receivedTranscript);
      // Only restart recognition if the caller still wants to listen; otherwise
      // a late timer would silently turn the microphone back on.
      if (!listenRef.current) return;
      // Reset the transcript
      Voice.destroy()
        .then(startListening)
        .catch((err) => console.error(err));
    }, settleDelayMs);
  };

  const onSpeechPartialResults = (e: SpeechResultsEvent) => {
    setPartialResults(e.value?.[0] ?? null);
  };

  const onSpeechVolumeChanged = (e: SpeechVolumeChangeEvent) => {
    // Nullish check rather than truthiness: a volume of 0 is a real reading.
    setVolume(e.value ?? null);
  };

  const onSpeechError = (e: SpeechErrorEvent) => {
    console.log('onSpeechError: ', e);
    setError(e.error ? e : null);
  };

  useEffect(() => {
    Voice.onSpeechResults = onSpeechResults;
    Voice.onSpeechPartialResults = onSpeechPartialResults;
    Voice.onSpeechVolumeChanged = onSpeechVolumeChanged;
    Voice.onSpeechError = onSpeechError;

    return () => {
      // Make sure no timer fires (and restarts recognition) after unmount.
      clearSettleTimer();
      Voice.destroy()
        .then(Voice.removeAllListeners)
        .catch((e) => console.error(e));
    };
  }, []);

  useEffect(() => {
    listenRef.current = listen;
    if (listen) {
      startListening();
      console.log('Listening started.');
    } else {
      Voice.destroy().catch((e) => console.error(e));
      console.log('Listening stopped.');
    }
  }, [listen]);

  return [Platform.OS === 'ios' ? processedTranscript : currentTranscript, { partialResults, volume, error }];
};

export default useVoice;
barghi commented 7 months ago

I changed @mfkrause's code a bit and improved the performance. This hook has 3 states (idle, listening, receiving) and keeps the internal transcript in a ref to prevent redundant renders.

Tested on React Native 0.73.1 with a Redmi 9 Pro (Android 10) and an iPhone 6s (iOS 15.7.3).


import {
   useCallback, useEffect, useRef, useState,
} from 'react';
import { Platform } from 'react-native';
import Voice, {
   SpeechErrorEvent,
   SpeechResultsEvent,
} from '@react-native-voice/voice';

/** Options accepted by the `useVoice` hook. */
type ParamTypes = {
   /** Value forwarded to `Voice.start`, e.g. a locale tag such as 'en-US'. */
   voice?: string;
};

/** Lifecycle phases reported by the hook through its `state` value. */
export type VoiceStateType =
   | 'idle'
   | 'listening'
   | 'receiving';

/**
 * Speech-recognition hook built on `@react-native-voice/voice`.
 *
 * The hook moves between three states: `'idle'` (not recognizing),
 * `'listening'` (started, nothing heard yet) and `'receiving'` (results are
 * arriving). The running transcript is kept in a ref so that intermediate
 * updates do not trigger renders; it is copied into `result` when recognition
 * stops.
 *
 * On iOS `onSpeechResults` fires repeatedly while the user is still speaking,
 * so recognition is stopped only after the transcript has been unchanged for
 * one second. On other platforms the first result stops recognition at once.
 *
 * @param voice Value forwarded to `Voice.start`; defaults to `'en-US'`.
 * @returns `result` (last finished transcript), `state`, `error`, and the
 *          `start` / `stop` controls.
 */
export default function useVoice({ voice = 'en-US' }: ParamTypes) {
   const [error, setError] = useState<SpeechErrorEvent | undefined>();
   const [state, setState] = useState<VoiceStateType>('idle');
   const [result, setResult] = useState<string | undefined>();

   // Latest transcript received from the recognizer.
   const script = useRef<string | undefined>(undefined);
   // Pending iOS "transcript has settled" timer.
   const timeout = useRef<ReturnType<typeof setTimeout> | undefined>(undefined);

   // Cancels the pending settle timer so it cannot call stop() later on.
   const cancelPendingStop = useCallback(() => {
      clearTimeout(timeout.current);
      timeout.current = undefined;
   }, []);

   const start = useCallback(() => {
      cancelPendingStop();
      // Drop the previous session's transcript so an early stop() cannot
      // publish stale text.
      script.current = undefined;
      setState('listening');
      // Voice.start reports failure by rejecting its promise, which a
      // try/catch around the call would never see.
      Voice.start(voice).catch((err: unknown) => {
         console.log(err);
         setState('idle');
      });
   }, [voice, cancelPendingStop]);

   const stop = useCallback(() => {
      // A manual stop must also cancel the iOS timer, otherwise it would fire
      // afterwards and stop/destroy a second time.
      cancelPendingStop();
      Voice.destroy().catch((err: unknown) => console.log(err));
      setResult(script.current);
      setState('idle');
   }, [cancelPendingStop]);

   const onSpeechResults = useCallback((e: SpeechResultsEvent) => {
      if (!e.value) {
         script.current = undefined;
         return;
      }

      const receivedTranscript = e.value[0];
      script.current = receivedTranscript;
      setState('receiving');

      if (Platform.OS !== 'ios') {
         stop();
         return;
      }

      // iOS: restart the wait on every result; stop only once the transcript
      // has stayed the same for a full second.
      clearTimeout(timeout.current);
      timeout.current = setTimeout(() => {
         timeout.current = undefined;
         if (script.current === receivedTranscript) {
            stop();
         }
      }, 1000);
   }, [stop]);

   const onSpeechError = useCallback((e: SpeechErrorEvent) => {
      if (e.error) {
         setError(e);
         setState('idle');
      }
      else {
         setError(undefined);
      }
   }, []);

   const onSpeechPartialResults = useCallback(() => {
      setState('receiving');
   }, []);

   useEffect(() => {
      Voice.onSpeechResults = onSpeechResults;
      Voice.onSpeechError = onSpeechError;
      Voice.onSpeechPartialResults = onSpeechPartialResults;

      return () => {
         // Prevent the settle timer from firing against an unmounted component.
         cancelPendingStop();
         Voice.destroy()
            .then(Voice.removeAllListeners)
            .catch((err: unknown) => console.log(err));
      };
      // Register once on mount: re-running this effect would destroy an
      // active recognition session in its cleanup.
      // eslint-disable-next-line react-hooks/exhaustive-deps
   }, []);

   return {
      result,
      state,
      error,
      start,
      stop,
   };
}