csdcorp / speech_to_text

A Flutter plugin that exposes device specific speech to text recognition capability.
BSD 3-Clause "New" or "Revised" License
351 stars 218 forks source link

Record voice while speech recognition is active #295

Open FerasAlhammoud opened 2 years ago

FerasAlhammoud commented 2 years ago

I know this issue has already been discussed, but I found a solution on Stack Overflow where someone said he was able to record; here is what he said:

We can save that audio by using AudioRecord class. I have done that successfully.

public class MainActivity extends AppCompatActivity { TextView textView; ImageView imageView; static int request = 1; private static final int RECORDER_SAMPLERATE = 8000; private static final int RECORDER_CHANNELS = AudioFormat.CHANNEL_IN_MONO; private static final int RECORDER_AUDIO_ENCODING = AudioFormat.ENCODING_PCM_16BIT; private AudioRecord recorder = null; private Thread recordingThread = null; private boolean isRecording = false; private int[] mSampleRates = new int[]{8000, 11025, 22050, 44100}; int bufferSize;

@Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_main);

textView = findViewById(R.id.textView);
imageView = findViewById(R.id.mic);

int bufferSize = AudioRecord.getMinBufferSize(RECORDER_SAMPLERATE,
        RECORDER_CHANNELS, RECORDER_AUDIO_ENCODING);

recorder = findAudioRecord();

if (ContextCompat.checkSelfPermission(this,
        Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED) {
    ActivityCompat.requestPermissions(this,
            new String[]{Manifest.permission.RECORD_AUDIO, Manifest.permission.WRITE_EXTERNAL_STORAGE, Manifest.permission.READ_EXTERNAL_STORAGE},
            1234);
}

imageView.setOnClickListener(new View.OnClickListener() {
    @Override
    public void onClick(View v) {
        Intent speech = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);
        speech.putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM);
        speech.putExtra(RecognizerIntent.EXTRA_PROMPT, "Speak to Text");

        if (ContextCompat.checkSelfPermission(MainActivity.this,
                Manifest.permission.RECORD_AUDIO)
                == PackageManager.PERMISSION_GRANTED) {
            startRecording();
            startActivityForResult(speech, request);
        }

    }
});

textView.setOnClickListener(new View.OnClickListener() {
    @Override
    public void onClick(View v) {
        stopRecording();
    }
});

}

@Override protected void onActivityResult(int requestCode, int resultCode, @Nullable Intent data) { super.onActivityResult(requestCode, resultCode, data);

if (requestCode == request && resultCode == RESULT_OK) {
    stopRecording();
    ArrayList<String> dataa = data.getStringArrayListExtra(RecognizerIntent.EXTRA_RESULTS);
    textView.setText(dataa.get(0).toString());
}

}

int BufferElements2Rec = 1024; // want to play 2048 (2K) since 2 bytes we use only 1024 int BytesPerElement = 2; // 2 bytes in 16bit format

private void startRecording() {

recorder.startRecording();
isRecording = true;
recordingThread = new Thread(new Runnable() {
    public void run() {
        writeAudioDataToFile();
    }
}, "AudioRecorder Thread");
recordingThread.start();

}

@Override public void onRequestPermissionsResult(int requestCode, String permissions[], int[] grantResults) { switch (requestCode) { case 1234: { if (grantResults.length > 0 && grantResults[0] == PackageManager.PERMISSION_GRANTED) { } else { Log.d("TAG", "permission denied by user"); } return; } } } private byte[] short2byte(short[] sData) { int shortArrsize = sData.length; byte[] bytes = new byte[shortArrsize 2]; for (int i = 0; i < shortArrsize; i++) { bytes[i 2] = (byte) (sData[i] & 0x00FF); bytes[(i * 2) + 1] = (byte) (sData[i] >> 8); sData[i] = 0; } return bytes;

} public AudioRecord findAudioRecord() { for (int rate : mSampleRates) { for (short audioFormat : new short[]{ AudioFormat.ENCODING_PCM_8BIT, AudioFormat.ENCODING_PCM_16BIT}) { for (short channelConfig : new short[]{ AudioFormat.CHANNEL_IN_MONO, AudioFormat.CHANNEL_IN_STEREO}) { try { Log.d("Mic2", "Attempting rate " + rate

private void writeAudioDataToFile() { String filePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/file.pcm"; short sData[] = new short[BufferElements2Rec];

FileOutputStream os = null;
try {
    os = new FileOutputStream(filePath);
} catch (FileNotFoundException e) {
    e.printStackTrace();
}

while (isRecording) {

    recorder.read(sData, 0, BufferElements2Rec);
    System.out.println("Short writing to file" + sData.toString());
    try {
        byte bData[] = short2byte(sData);
        os.write(bData, 0, BufferElements2Rec * BytesPerElement);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
try {
    os.close();
} catch (IOException e) {
    e.printStackTrace();
}

}

private void stopRecording() { if (null != recorder) { isRecording = false; recorder.stop(); recorder.release(); recorder = null; recordingThread = null; } }

@Override public boolean onKeyDown(int keyCode, KeyEvent event) { if (keyCode == KeyEvent.KEYCODE_BACK) { finish(); } return super.onKeyDown(keyCode, event); }

sowens-csd commented 2 years ago

Can you add a link to the SO thread where you found this?

FerasAlhammoud commented 2 years ago

Here is the link:

https://stackoverflow.com/questions/23047433/record-save-audio-from-voice-recognition-intent/23070879#23070879

hsangtini commented 2 years ago
public void startSpeechRecognition() {
   // Fire an intent to start the speech recognition activity.
   Intent intent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);
   // secret parameters that when added provide audio url in the result
   intent.putExtra("android.speech.extra.GET_AUDIO_FORMAT", "audio/AMR");
   intent.putExtra("android.speech.extra.GET_AUDIO", true);

   startActivityForResult(intent, "<some code you choose>");
}

// handle result of speech recognition
@Override
public void onActivityResult(int requestCode, int resultCode, Intent data) {
    // the resulting text is in the getExtras:
    Bundle bundle = data.getExtras();
    ArrayList<String> matches = bundle.getStringArrayList(RecognizerIntent.EXTRA_RESULTS)
    // the recording url is in getData:
    Uri audioUri = data.getData();
    ContentResolver contentResolver = getContentResolver();
    InputStream filestream = contentResolver.openInputStream(audioUri);
    // TODO: read audio file from inputstream
}

Tested — it works. Could you implement it?

sowens-csd commented 2 years ago

Looks interesting, yes, I'll try it.

sowens-csd commented 2 years ago

I just tried this on a Samsung device running Android 11 and it did not work. Where did you try it?

hsangtini commented 2 years ago

Hi there, sorry for the late reply. I've been really busy this week.

I tried on Samsung too. Galaxy Note20 Ultra 5G - android 12 ( and some Samsung devices android 11, worked too).

bind activity result binding.addActivityResultListener(this)

private val resultRequestCode = 111

override fun onActivityResult(requestCode: Int, resultCode: Int, data: Intent?): Boolean {
        when (requestCode) {
            resultRequestCode -> {
                if (resultCode == RESULT_OK && data != null) {

                    val filestream: InputStream? = data.data?.let { currentActivity?.getContentResolver()?.openInputStream(it) }

                    val outputFile = File(currentActivity?.cacheDir, "recording.amr")

                    var outputFilePath: String?;
                    if (outputFile.exists()) {
                        outputFile.delete()
                    } else {
                        outputFile.parentFile?.mkdirs()
                    }
                    val outputStream = FileOutputStream(outputFile)
                    filestream.use { input ->
                        outputStream.use { output ->
                            input?.copyTo(output)
                            outputFilePath = outputFile.absolutePath;
                        }
                    }

                    updateResults(data.extras, true, outputFilePath, true)
                    notifyListening(isRecording = false)
                } else {
                    onError(resultCode)
                }
                return true
            }

        }
        return false
    }
private fun updateResults(speechBundle: Bundle?, isFinal: Boolean, outputFilePath: String?, dialogMode: Boolean) {
        if (isDuplicateFinal(isFinal)) {
            debugLog("Discarding duplicate final")
            return
        }

        val userSaid = if(dialogMode) {
            speechBundle?.getStringArrayList(RecognizerIntent.EXTRA_RESULTS)
        } else {
            speechBundle?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
        }
        if (null != userSaid && userSaid.isNotEmpty()) {
            val speechResult = JSONObject()
            speechResult.put("finalResult", isFinal)
            val confidence = if(dialogMode) {
                speechBundle?.getFloatArray(RecognizerIntent.EXTRA_CONFIDENCE_SCORES)
            }else {
                speechBundle?.getFloatArray(SpeechRecognizer.CONFIDENCE_SCORES)
            }
            val alternates = JSONArray()
            for (resultIndex in 0..userSaid.size - 1) {
                val speechWords = JSONObject()
                speechWords.put("recognizedWords", userSaid[resultIndex])
                if (null != confidence && confidence.size >= userSaid.size) {
                    speechWords.put("confidence", confidence[resultIndex])
                } else {
                    speechWords.put("confidence", missingConfidence)
                }
                alternates.put(speechWords)
            }
            speechResult.put("alternates", alternates)
            speechResult.put("audioPath", outputFilePath)
            val jsonResult = speechResult.toString()
            debugLog("Calling results callback")
            resultSent = true
            channel?.invokeMethod(SpeechToTextCallbackMethods.textRecognition.name,
                    jsonResult)
        } else {
            debugLog("Results null or empty")
        }
    }

Startlisterning

handler.post {
            run {
                if (dialogMode) {
                    recognizerIntent?.addFlags(Intent.FLAG_ACTIVITY_CLEAR_TOP or Intent.FLAG_ACTIVITY_NO_HISTORY)
                    currentActivity?.startActivityForResult(recognizerIntent, resultRequestCode)
                } else {
                    speechRecognizer?.startListening(recognizerIntent)
                }
            }
        }

Also must add extra for intent

putExtra("android.speech.extra.GET_AUDIO_FORMAT", "audio/AMR")
putExtra("android.speech.extra.GET_AUDIO", true)

And add permissions

    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
    <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE"/>
hsangtini commented 2 years ago

You can check at my fork. https://github.com/hsangtini/speech_to_text

For testing only, the source code is a bit dirty, my sincere apologies.

sowens-csd commented 1 year ago

I can now get the file reference, thanks for the link to your fork. The issue is that it seems to work only when the Google dialog is presented, not when going directly through the API. The key difference seems to be doing this:

                currentActivity?.startActivityForResult(recognizerIntent, resultRequestCode)

instead of this:

                speechRecognizer?.startListening(recognizerIntent)

From a quick look at your code it looks like that is the only case where you'd expect it to happen as well. Since the onActivityResult method is only called when you've done the startActivityForResult version.

Other than the Google dialog appearing I'm not sure what other differences there are. This might be a useful tool to add to the speech plugin for developers that need access to the speech content after recognition but it does rely on some undocumented features and changes the UI of the resulting application. I've also seen in some posts that the resulting audio is relatively low quality, not sure how important that is.

phamconganh commented 1 year ago

@sowens-csd Can this issue be resolved in the next release? I've been searching for this for a week. The only workaround I found is to record the sound yourself and send it to the Google speech service, which gives high-quality audio, but that does not work for local recognition.

sowens-csd commented 1 year ago

No, I wouldn't count on this feature in the next release. I'm not convinced yet that this approach resolves the issue for most users because of the UI changes it requires, and it relies on undocumented behaviour that may not be well supported across devices.

Tryad0 commented 1 year ago

Is it still not possible to get the recorded audio from speech recognition?