/// <summary>
/// Function that is called when a speech-to-text result is received. If it is a final result and this widget
/// is waiting for the last result of the session, then the widget will begin processing the end results
/// of the session.
/// </summary>
/// <param name="result">The speech-to-text result</param>
void OnTextResult(SpeechToTextResult result)
{
    if (m_WillDisplayReceivedResults)
    {
        // For the purposes of comparing results, this just uses the first alternative.
        m_LastResultWasFinal = result.IsFinal;
        if (result.IsFinal)
        {
            m_PreviousFinalResults += result.TextAlternatives[0].Text;
            m_ResultsTextUI.color = m_FinalTextResultColor;
            m_ResultsTextUI.text = m_PreviousFinalResults;
            SmartLogger.Log(DebugFlags.SpeechToTextWidgets, m_SpeechToTextService.GetType().ToString() + " final result");
            if (m_WaitingForLastFinalResultOfSession)
            {
                m_WaitingForLastFinalResultOfSession = false;
                ProcessEndResults();
            }
        }
        else
        {
            m_ResultsTextUI.color = m_InterimTextResultColor;
            m_ResultsTextUI.text = m_PreviousFinalResults + result.TextAlternatives[0].Text;
        }
    }
}
/// <summary>
/// Waits until the last processed result is a final result.
/// If this does not happen before the timeout, the last result is treated as a final result.
/// </summary>
/// <returns>A coroutine that finishes once a final result has been processed or forced</returns>
IEnumerator FinishSession()
{
    SmartLogger.Log(DebugFlags.WindowsSpeechToText, "finish session");

    // Wait a specified number of seconds for a final result.
    float timeElapsedAfterRecording = 0;
    while (!m_LastResult.IsFinal && timeElapsedAfterRecording < m_SessionTimeoutAfterDoneRecording)
    {
        yield return null;
        timeElapsedAfterRecording += Time.deltaTime;
    }

    // If still waiting on a final result, just treat the last result processed as final.
    if (!m_LastResult.IsFinal)
    {
        SmartLogger.Log(DebugFlags.WindowsSpeechToText, "treat last interim result as final");
        m_LastResult.IsFinal = true;
        if (m_OnTextResult != null)
        {
            m_OnTextResult(m_LastResult);
        }
    }
}
/// <summary>
/// Extracts the speech-to-text result info from the next response JSON in the queue.
/// </summary>
void ProcessNextResponseJSON()
{
    // Create a JSON object from the next string in the queue and process the speech-to-text result.
    var responseJSON = new JSONObject(m_ResponseJSONsQueue.Dequeue(), int.MaxValue);
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, responseJSON.ToString());
    string errorText = GoogleSpeechToTextResponseJSONParser.GetErrorFromResponseJSON(responseJSON);
    if (errorText != null)
    {
        if (m_OnError != null)
        {
            m_OnError(errorText);
        }
    }

    JSONObject resultsJSON = responseJSON.GetField(Constants.GoogleResponseJSONResultsFieldKey);
    if (resultsJSON != null && resultsJSON.Count > 0)
    {
        JSONObject resultJSON = resultsJSON[0];
        SpeechToTextResult textResult = GoogleSpeechToTextResponseJSONParser.GetTextResultFromResultJSON(resultJSON);
        bool isFinal = false;
        resultJSON.GetField(out isFinal, Constants.GoogleResponseJSONResultIsFinalFieldKey, isFinal);
        textResult.IsFinal = isFinal;
        SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "processing result - isFinal = " + isFinal);
        if (m_OnTextResult != null)
        {
            m_OnTextResult(textResult);
        }
        m_LastResult = textResult;
    }
}
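// For reference, a representative response for the fields accessed above, assuming the
// standard Google Cloud Speech streaming response shape (the actual field names are
// defined by the Constants keys, so treat this as an illustrative sketch):
//
// {
//   "results": [
//     {
//       "alternatives": [ { "transcript": "hello world", "confidence": 0.92 } ],
//       "isFinal": true
//     }
//   ]
// }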
/// <summary>
/// Function that is called when the recording times out.
/// </summary>
void OnSpeechToTextRecordingTimeout()
{
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, SpeechToTextServiceString() + " call timeout");
    if (m_OnRecordingTimeout != null)
    {
        m_OnRecordingTimeout();
    }
}
/// <summary>
/// Unregisters callbacks with each SpeechToTextServiceWidget.
/// </summary>
void UnregisterSpeechToTextServiceWidgetsCallbacks()
{
    if (m_SpeechToTextServiceWidgets != null)
    {
        SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "unregister service widgets callbacks");
        foreach (var serviceWidget in m_SpeechToTextServiceWidgets)
        {
            // Mirror the registration method: unregister (not register) each callback.
            serviceWidget.UnregisterOnRecordingTimeout(OnRecordTimeout);
            serviceWidget.UnregisterOnReceivedLastResponse(OnSpeechToTextReceivedLastResponse);
        }
    }
}
/// <summary>
/// Clears the current results text and tells the speech-to-text service to start recording.
/// </summary>
public void StartRecording()
{
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "Start service widget recording");
    m_WillDisplayReceivedResults = true;
    m_WaitingForLastFinalResultOfSession = false;
    m_LastResultWasFinal = false;
    m_PreviousFinalResults = "";
    m_ResultsTextUI.text = m_PreviousFinalResults;
    m_SpeechToTextService.StartRecording();
}
/// <summary>
/// Function that is called when the MonoBehaviour will be destroyed.
/// </summary>
protected override void OnDestroy()
{
    base.OnDestroy();
    m_TempAudioComponent.ClearTempAudioFiles();
    if (m_StreamingSpeechToTextProcessHasStarted && !m_StreamingSpeechToTextProcess.HasExited)
    {
        m_StreamingSpeechToTextProcess.Kill();
        SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "kill streaming speech-to-text process");
    }
}
/// <summary>
/// Function that is called when the given SpeechToTextServiceWidget has gotten its last response. If there are no waiting
/// SpeechToTextServiceWidgets left, then this function will wrap up the current comparison session.
/// </summary>
/// <param name="serviceWidget">The speech-to-text service widget that received a last response</param>
void OnSpeechToTextReceivedLastResponse(SpeechToTextServiceWidget serviceWidget)
{
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "Response from " + serviceWidget.SpeechToTextServiceString());
    m_WaitingSpeechToTextServiceWidgets.Remove(serviceWidget);
    if (m_WaitingSpeechToTextServiceWidgets.Count == 0)
    {
        SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "Responses from everyone");
        FinishComparisonSession();
    }
}
/// <summary>
/// Registers callbacks with each SpeechToTextServiceWidget.
/// </summary>
void RegisterSpeechToTextServiceWidgetsCallbacks()
{
    if (m_SpeechToTextServiceWidgets != null)
    {
        SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "register service widgets callbacks");
        foreach (var serviceWidget in m_SpeechToTextServiceWidgets)
        {
            SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "register service widget callbacks");
            serviceWidget.RegisterOnRecordingTimeout(OnRecordTimeout);
            serviceWidget.RegisterOnReceivedLastResponse(OnSpeechToTextReceivedLastResponse);
        }
    }
}
/// <summary>
/// Does any final processing necessary for the results of the last started session and then
/// stops the widget from displaying results until the start of the next session.
/// </summary>
void ProcessEndResults()
{
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, m_SpeechToTextService.GetType().ToString() + " got last response");
    if (m_ComparisonPhrase != null)
    {
        DisplayAccuracyOfEndResults(m_ComparisonPhrase);
    }
    LogFileManager.Instance.WriteTextToFileIfShouldLog(SpeechToTextServiceString() + ": " + m_ResultsTextUI.text);
    if (m_OnReceivedLastResponse != null)
    {
        m_OnReceivedLastResponse(this);
    }
    m_WillDisplayReceivedResults = false;
}
/// <summary>
/// Translates speech to text by making a request to the speech-to-text API.
/// </summary>
protected override IEnumerator TranslateRecordingToText()
{
    m_TempAudioComponent.ClearTempAudioFiles();

    // Save recorded audio to a WAV file.
    string recordedAudioFilePath = SavWav.Save(m_TempAudioComponent.TempAudioRelativePath(), AudioRecordingManager.Instance.RecordedAudio);

    // Construct a request with the WAV file and send it.
    var request = new Request("POST", Constants.WitAiSpeechToTextBaseURL + "?" +
        Constants.WitAiVersionParameterName + "=" + DateTime.Now.ToString(Constants.WitAiVersionDateFormat));
    request.headers.Add("Authorization", "Bearer " + m_APIAccessToken);
    request.headers.Add("Content-Type", "audio/wav");
    request.Bytes = File.ReadAllBytes(recordedAudioFilePath);
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "Sending request");
    request.Send();
    float startTime = Time.time;
    while (!request.isDone)
    {
        yield return null;
    }
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "response time: " + (Time.time - startTime));

    // Finally, grab the response JSON once the request is done.
    var responseJSON = new JSONObject(request.response.Text, int.MaxValue);
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "Received request result");
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, responseJSON.ToString());
    string errorText = WitAiSpeechToTextResponseJSONParser.GetErrorFromResponseJSON(responseJSON);
    if (errorText != null)
    {
        if (m_OnError != null)
        {
            m_OnError(errorText);
        }
    }
    if (m_OnTextResult != null)
    {
        m_OnTextResult(WitAiSpeechToTextResponseJSONParser.GetTextResultFromResponseJSON(responseJSON));
    }

    m_TempAudioComponent.ClearTempAudioFiles();
}
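// For reference, a rough sketch of the response this parser consumes. The exact shape
// depends on the Wit.ai API version in use, so the field names here are assumptions
// rather than a definitive schema:
//
// { "_text": "hello world", "entities": { ... } }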
/// <summary>
/// Computes the accuracy (percentage) of the end text results in comparison to the given phrase, by using
/// the Levenshtein Distance between the two strings, and displays this percentage in the results text UI.
/// </summary>
/// <param name="originalPhrase">The phrase to compare against</param>
void DisplayAccuracyOfEndResults(string originalPhrase)
{
    string speechToTextResult = StringUtilities.TrimSpecialFormatting(m_ResultsTextUI.text, new HashSet<char>(),
        m_LeadingCharsForSpecialWords, m_SurroundingCharsForSpecialText);
    originalPhrase = StringUtilities.TrimSpecialFormatting(originalPhrase, new HashSet<char>(),
        m_LeadingCharsForSpecialWords, m_SurroundingCharsForSpecialText);
    int levenDistance = StringUtilities.LevenshteinDistance(speechToTextResult, originalPhrase);
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, m_SpeechToTextService.GetType().ToString() +
        " compute accuracy of text: \"" + speechToTextResult + "\"");
    float accuracy = Mathf.Max(0, 100f - (100f * (float)levenDistance / (float)originalPhrase.Length));
    m_PreviousFinalResults = "[Accuracy: " + accuracy + "%] " + m_PreviousFinalResults;
    m_ResultsTextUI.text = m_PreviousFinalResults;
}
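// For reference, a minimal sketch of the standard dynamic-programming Levenshtein
// distance. This is an assumption about what StringUtilities.LevenshteinDistance
// computes; the project's implementation may differ in details such as casing or
// Unicode handling.
static int LevenshteinDistanceSketch(string a, string b)
{
    var d = new int[a.Length + 1, b.Length + 1];
    for (int i = 0; i <= a.Length; ++i) { d[i, 0] = i; } // delete all of a's first i chars
    for (int j = 0; j <= b.Length; ++j) { d[0, j] = j; } // insert all of b's first j chars
    for (int i = 1; i <= a.Length; ++i)
    {
        for (int j = 1; j <= b.Length; ++j)
        {
            int substitutionCost = (a[i - 1] == b[j - 1]) ? 0 : 1;
            d[i, j] = Mathf.Min(Mathf.Min(
                d[i - 1, j] + 1,                      // deletion
                d[i, j - 1] + 1),                     // insertion
                d[i - 1, j - 1] + substitutionCost);  // substitution or match
        }
    }
    return d[a.Length, b.Length];
}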
/// <summary>
/// Starts recording audio for each speech-to-text service widget if not already recording.
/// </summary>
void StartRecording()
{
    if (!m_IsRecording)
    {
        SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "Start comparison recording");
        m_IsCurrentlyInSpeechToTextSession = true;
        m_IsRecording = true;
        m_RecordButtonTextUI.text = m_RecordingText;
        m_RecordButtonImage.color = m_RecordingButtonColor;
        m_WaitingSpeechToTextServiceWidgets.Clear();
        foreach (var serviceWidget in m_SpeechToTextServiceWidgets)
        {
            SmartLogger.Log(DebugFlags.SpeechToTextWidgets, "tell service widget to start recording");
            serviceWidget.StartRecording();
            m_WaitingSpeechToTextServiceWidgets.Add(serviceWidget);
        }
    }
}
/// <summary>
/// Callback function for when output data is received from the streaming speech-to-text process.
/// </summary>
/// <param name="sender">Sender of this event</param>
/// <param name="e">Arguments for data received event</param>
void OnStreamingSpeechToTextProcessOutputDataReceived(object sender, DataReceivedEventArgs e)
{
    if (e.Data != null)
    {
        string trimmedData = e.Data.Trim();
        SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "process output: " + trimmedData);
        if (trimmedData == k_ReadyToStreamDataOutputPrompt)
        {
            SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "set ready to stream data");
            m_ReadyToStreamData = true;
        }
        else if (trimmedData.StartsWith(k_ResponsePrefix))
        {
            trimmedData = trimmedData.Remove(0, k_ResponsePrefix.Length);
            m_ResponseJSONsQueue.Enqueue(trimmedData);
        }
    }
}
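// Note: OutputDataReceived fires on a background thread, so the queue shared with the
// main thread needs synchronization. A minimal sketch using ConcurrentQueue, assuming
// the .NET 4.x scripting runtime is available; the project may instead rely on locks
// around its existing Queue:
//
//     using System.Collections.Concurrent;
//     ConcurrentQueue<string> m_ResponseJSONsQueue = new ConcurrentQueue<string>();
//     // Background thread: m_ResponseJSONsQueue.Enqueue(trimmedData);
//     // Main thread: if (m_ResponseJSONsQueue.TryDequeue(out string json)) { /* parse json */ }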
/// <summary>
/// Computes the accuracy (percentage) of the end text results against each of the given phrases, by using
/// the Levenshtein Distance between the two strings, then stores each percentage and raises the result event.
/// </summary>
/// <param name="originalPhrases">The phrases to compare against</param>
void DisplayAccuracyOfEndResults(string[] originalPhrases)
{
    print("The computer understood " + results);
    string speechToTextResult = StringUtilities.TrimSpecialFormatting(results, new HashSet<char>(),
        m_LeadingCharsForSpecialWords, m_SurroundingCharsForSpecialText);
    for (int i = 0; i < originalPhrases.Length; i++)
    {
        originalPhrases[i] = StringUtilities.TrimSpecialFormatting(originalPhrases[i], new HashSet<char>(),
            m_LeadingCharsForSpecialWords, m_SurroundingCharsForSpecialText);
        int levenDistance = StringUtilities.LevenshteinDistance(speechToTextResult, originalPhrases[i]);
        SmartLogger.Log(DebugFlags.SpeechToTextWidgets, m_SpeechToTextService.GetType().ToString() +
            " compute accuracy of text: \"" + speechToTextResult + "\"");
        float accuracy = Mathf.Max(0, 100f - (100f * (float)levenDistance / (float)originalPhrases[i].Length));
        m_PreviousFinalResults = "[Accuracy: " + accuracy + "%] " + m_PreviousFinalResults;
        speechAccuracy.Add(accuracy);
        print(accuracy);
    }
    results = m_PreviousFinalResults;
    OnResult.Invoke();
}
/// <summary>
/// Translates speech to text by making a request to the speech-to-text API.
/// </summary>
protected override IEnumerator TranslateRecordingToText()
{
    m_TempAudioComponent.ClearTempAudioFiles();

    // Save recorded audio to a WAV file.
    string recordedAudioFilePath = SavWav.Save(m_TempAudioComponent.TempAudioRelativePath(), AudioRecordingManager.Instance.RecordedAudio);

    // Construct a UnityWebRequest with the WAV bytes and send it.
    string url = Constants.WitAiSpeechToTextBaseURL + "?" +
        Constants.WitAiVersionParameterName + "=" + DateTime.Now.ToString(Constants.WitAiVersionDateFormat);
    UnityWebRequest www = new UnityWebRequest(url, UnityWebRequest.kHttpVerbPOST);
    byte[] bytes = File.ReadAllBytes(recordedAudioFilePath);
    UploadHandlerRaw uploadHandler = new UploadHandlerRaw(bytes);
    // The payload is WAV audio, not JSON, so the content type must say so.
    uploadHandler.contentType = "audio/wav";
    www.uploadHandler = uploadHandler;
    www.downloadHandler = new DownloadHandlerBuffer();
    www.SetRequestHeader("Content-Type", "audio/wav");
    www.SetRequestHeader("Authorization", "Bearer " + m_APIAccessToken);

    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "sent request");
    float startTime = Time.time;
    // Yielding on Send() waits for the request to complete, so no extra isDone loop is needed.
    yield return www.Send();
    if (www.isError)
    {
        SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, www.error);
    }
    else
    {
        SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "Form upload complete!");
    }
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "response time: " + (Time.time - startTime));

    // Grab the response JSON once the request is done and parse it.
    var responseJSON = new JSONObject(www.downloadHandler.text, int.MaxValue);
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, "Received request result");
    SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, responseJSON.ToString());
    string errorText = WitAiSpeechToTextResponseJSONParser.GetErrorFromResponseJSON(responseJSON);
    if (errorText != null)
    {
        if (m_OnError != null)
        {
            m_OnError(errorText);
        }
    }
    if (m_OnTextResult != null)
    {
        m_OnTextResult(WitAiSpeechToTextResponseJSONParser.GetTextResultFromResponseJSON(responseJSON));
    }

    m_TempAudioComponent.ClearTempAudioFiles();
}
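// Note: Send() and isError are the pre-2017.2 UnityWebRequest API. On newer Unity
// versions the equivalents would be the following (an assumption about the target
// Unity version, so the deprecated calls are kept above):
//
//     yield return www.SendWebRequest();
//     if (www.isNetworkError || www.isHttpError) { SmartLogger.Log(DebugFlags.WitAINonStreamingSpeechToText, www.error); }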
/// <summary>
/// Sends queued chunks of audio to the server and listens for responses.
/// </summary>
protected override IEnumerator StreamAudioAndListenForResponses()
{
    m_TempAudioComponent.ClearTempAudioFiles();
    m_ResponseJSONsQueue.Clear();
    m_StreamingSpeechToTextProcessHasStarted = false;
    m_ReadyToStreamData = false;

    string jsonCredentialsPath = Path.Combine(
        Path.Combine(Application.streamingAssetsPath, k_StreamingSpeechToTextApplicationFolderName),
        m_JSONCredentialsFileName);
    if (!File.Exists(jsonCredentialsPath))
    {
        if (m_OnError != null)
        {
            m_OnError("Missing JSON credentials file in StreamingAssets/GoogleStreamingSpeechToTextProgram");
        }
        yield break;
    }

    // Initialize the streaming speech-to-text process with appropriate start info, including the path to the credentials file.
    m_StreamingSpeechToTextProcess = new Process();
    m_StreamingSpeechToTextProcess.StartInfo.FileName = Path.Combine(
        Path.Combine(Application.streamingAssetsPath, k_StreamingSpeechToTextApplicationFolderName),
        k_StreamingSpeechToTextApplicationFileName);
    m_StreamingSpeechToTextProcess.StartInfo.Arguments = jsonCredentialsPath;
    m_StreamingSpeechToTextProcess.StartInfo.CreateNoWindow = true;
    m_StreamingSpeechToTextProcess.StartInfo.UseShellExecute = false;
    m_StreamingSpeechToTextProcess.StartInfo.RedirectStandardInput = true;
    m_StreamingSpeechToTextProcess.StartInfo.RedirectStandardOutput = true;
    m_StreamingSpeechToTextProcess.OutputDataReceived += OnStreamingSpeechToTextProcessOutputDataReceived;
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "start streaming speech-to-text process");
    m_StreamingSpeechToTextProcess.Start();
    m_StreamingSpeechToTextProcess.BeginOutputReadLine();
    m_StreamingSpeechToTextProcessHasStarted = true;

    while (!m_ReadyToStreamData)
    {
        yield return null;
    }

    // TODO: I don't know why, but I need to write garbage text first.
    // For some reason the first standard input begins with "0x3F3F3F".
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "ready to stream data");
    m_StreamingSpeechToTextProcess.StandardInput.WriteLine("clear input stream");

    // Tell the process to start streaming.
    m_StreamingSpeechToTextProcess.StandardInput.WriteLine(k_StartStreamingDataInputPrompt);
    StartCoroutine(ProcessResponseJSONs());

    // While still recording, send chunks as they arrive in the queue.
    // The inner wait also checks m_IsRecording so this coroutine cannot spin
    // forever if recording stops while the queue is empty.
    while (m_IsRecording)
    {
        while (m_IsRecording && m_AudioChunksQueue.Count == 0)
        {
            yield return null;
        }
        if (m_AudioChunksQueue.Count > 0)
        {
            yield return SaveAndSendNextChunk();
        }
    }
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "stopped recording");

    // Send any remaining chunks.
    while (m_AudioChunksQueue.Count > 0)
    {
        yield return SaveAndSendNextChunk();
    }
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "sent all chunks");

    // Tell the process to stop streaming.
    m_StreamingSpeechToTextProcess.StandardInput.WriteLine(k_StopStreamingDataInputPrompt);

    // Wait a specified number of seconds for a final result.
    float timeElapsedAfterRecording = 0;
    while (!m_LastResult.IsFinal && timeElapsedAfterRecording < m_SessionTimeoutAfterDoneRecording)
    {
        yield return null;
        timeElapsedAfterRecording += Time.deltaTime;
    }
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "session timeout");

    // If still waiting on a final result, just treat the last result processed as final.
    if (!m_LastResult.IsFinal)
    {
        SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "treat last result as final result");
        m_LastResult.IsFinal = true;
        if (m_OnTextResult != null)
        {
            m_OnTextResult(m_LastResult);
        }
    }

    while (!m_StreamingSpeechToTextProcess.HasExited)
    {
        yield return null;
    }
    SmartLogger.Log(DebugFlags.GoogleStreamingSpeechToText, "streaming speech-to-text process exited");
    m_TempAudioComponent.ClearTempAudioFiles();
}
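// For reference, the line-based protocol between Unity and the helper process, as far
// as it can be inferred from this class (the exact prompt strings and whatever
// SaveAndSendNextChunk writes are defined elsewhere in the project):
//
//   process -> Unity:  k_ReadyToStreamDataOutputPrompt, then lines of the form
//                      k_ResponsePrefix + <response JSON>
//   Unity -> process:  k_StartStreamingDataInputPrompt, one line per audio chunk
//                      (written by SaveAndSendNextChunk), then k_StopStreamingDataInputPrompt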
/// <summary>
/// Removes a function from the recording timeout delegate.
/// </summary>
/// <param name="action">Function to unregister</param>
public void UnregisterOnRecordingTimeout(Action action)
{
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, SpeechToTextServiceString() + " unregister timeout");
    m_OnRecordingTimeout -= action;
}
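// The matching registration method is presumably the mirror image of the above
// (a sketch inferred from UnregisterOnRecordingTimeout; the actual implementation
// may log differently):
public void RegisterOnRecordingTimeout(Action action)
{
    SmartLogger.Log(DebugFlags.SpeechToTextWidgets, SpeechToTextServiceString() + " register timeout");
    m_OnRecordingTimeout += action;
}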
/// <summary>
/// Translates speech to text by making a request to the speech-to-text API.
/// </summary>
protected override IEnumerator TranslateRecordingToText()
{
    m_TempAudioComponent.ClearTempAudioFiles();

    // Save recorded audio to a WAV file and convert it to FLAC format.
    string wavAudioFilePath = SavWav.Save(m_TempAudioComponent.TempAudioRelativePath(), AudioRecordingManager.Instance.RecordedAudio);
    string flacAudioFilePath = IOUtilities.MakeFilePathUnique(Path.ChangeExtension(wavAudioFilePath, "flac"));
    SmartLogger.Log(DebugFlags.GoogleNonStreamingSpeechToText, "converting audio");
    var audioConversionJob = new SoXAudioConversionJob(wavAudioFilePath, flacAudioFilePath, 16000);
    audioConversionJob.Start();
    yield return StartCoroutine(audioConversionJob.WaitFor());
    if (audioConversionJob.ErrorMessage != null)
    {
        if (m_OnError != null)
        {
            m_OnError(audioConversionJob.ErrorMessage);
        }
        yield break;
    }

    var request = new Request("POST", Constants.GoogleNonStreamingSpeechToTextURL + "?" +
        Constants.GoogleAPIKeyParameterName + "=" + m_APIKey);
    request.headers.Add("Content-Type", "application/json");

    // Construct JSON request body.
    JSONObject requestJSON = new JSONObject();
    JSONObject requestConfig = new JSONObject();
    requestConfig.AddField(Constants.GoogleRequestJSONConfigEncodingFieldKey, "FLAC");
    requestConfig.AddField(Constants.GoogleRequestJSONConfigSampleRateFieldKey, "16000");
    JSONObject requestAudio = new JSONObject();
    requestAudio.AddField(Constants.GoogleRequestJSONAudioContentFieldKey,
        Convert.ToBase64String(File.ReadAllBytes(flacAudioFilePath)));
    requestJSON.AddField(Constants.GoogleRequestJSONConfigFieldKey, requestConfig);
    requestJSON.AddField(Constants.GoogleRequestJSONAudioFieldKey, requestAudio);

    request.Text = requestJSON.ToString();
    request.Send();
    SmartLogger.Log(DebugFlags.GoogleNonStreamingSpeechToText, "sent request");
    while (!request.isDone)
    {
        yield return null;
    }

    // Grab the response JSON once the request is done and parse it.
    var responseJSON = new JSONObject(request.response.Text, int.MaxValue);
    SmartLogger.Log(DebugFlags.GoogleNonStreamingSpeechToText, responseJSON.ToString());
    string errorText = GoogleSpeechToTextResponseJSONParser.GetErrorFromResponseJSON(responseJSON);
    if (errorText != null)
    {
        if (m_OnError != null)
        {
            m_OnError(errorText);
        }
    }

    SpeechToTextResult textResult;
    JSONObject resultsJSON = responseJSON.GetField(Constants.GoogleResponseJSONResultsFieldKey);
    if (resultsJSON != null && resultsJSON.Count > 0)
    {
        JSONObject resultJSON = resultsJSON[0];
        textResult = GoogleSpeechToTextResponseJSONParser.GetTextResultFromResultJSON(resultJSON);
    }
    else
    {
        textResult = GoogleSpeechToTextResponseJSONParser.GetDefaultGoogleSpeechToTextResult();
    }
    if (m_OnTextResult != null)
    {
        m_OnTextResult(textResult);
    }

    m_TempAudioComponent.ClearTempAudioFiles();
}
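// For reference, the request body constructed above serializes to roughly this shape
// (base64 audio elided; the literal field names come from the Constants keys):
//
// {
//   "config": { "encoding": "FLAC", "sampleRate": "16000" },
//   "audio":  { "content": "<base64-encoded FLAC bytes>" }
// }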