/// <summary>
/// Transcribe a short audio file with language detected from a list of possible languages
/// </summary>
/// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
public static void SampleRecognize(string localFilePath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // string localFilePath = "resources/brooklyn_bridge.flac"
    RecognizeRequest request = new RecognizeRequest
    {
        Config = new RecognitionConfig
        {
            // The language of the supplied audio. Even though additional languages are
            // provided by alternative_language_codes, a primary language is still required.
            LanguageCode = "fr",
            AlternativeLanguageCodes = { "es", "en", },
        },
        Audio = new RecognitionAudio
        {
            Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
        },
    };
    RecognizeResponse response = speechClient.Recognize(request);
    foreach (var result in response.Results)
    {
        // The language code that was detected as the most likely language spoken in the audio
        Console.WriteLine($"Detected language: {result.LanguageCode}");
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
    }
}
/// <summary>
/// Adds additional details about the short audio file included in this recognition request
/// </summary>
/// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
public static void SampleRecognize(string localFilePath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // string localFilePath = "resources/commercial_mono.wav"
    RecognizeRequest request = new RecognizeRequest
    {
        Config = new RecognitionConfig
        {
            Metadata = new RecognitionMetadata
            {
                InteractionType = RecognitionMetadata.Types.InteractionType.VoiceSearch,
                RecordingDeviceType = RecognitionMetadata.Types.RecordingDeviceType.Smartphone,
                RecordingDeviceName = "Pixel 3",
            },
            // The language of the supplied audio.
            LanguageCode = "en-US",
        },
        Audio = new RecognitionAudio
        {
            Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
        },
    };
    RecognizeResponse response = speechClient.Recognize(request);
    foreach (var result in response.Results)
    {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
    }
}
/// <summary>
/// Transcribe a short audio file with punctuation
/// </summary>
/// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
public static void SampleRecognize(string localFilePath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // string localFilePath = "resources/commercial_mono.wav"
    RecognizeRequest request = new RecognizeRequest
    {
        Config = new RecognitionConfig
        {
            // When enabled, transcription results may include punctuation
            // (available for select languages).
            EnableAutomaticPunctuation = true,
            // The language of the supplied audio.
            LanguageCode = "en-US",
        },
        Audio = new RecognitionAudio
        {
            Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
        },
    };
    RecognizeResponse response = speechClient.Recognize(request);
    foreach (var result in response.Results)
    {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
    }
}
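// Note: the Google Cloud samples in this section all assume the same using
// directives (a sketch; the Google.Cloud.Speech.V1 package provides SpeechClient,
// RecognizeRequest, RecognitionConfig, etc., and Google.Protobuf provides ByteString):
using System;
using System.IO;
using Google.Cloud.Speech.V1; // SpeechClient, RecognizeRequest, RecognitionConfig, ...
using Google.Protobuf;        // ByteString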
public void IBM_audio_guess()
{
    if (IBM_APIkey.TextLength == 0 || IBM_URL.TextLength == 0)
    {
        return;
    }
    IamAuthenticator authenticator = new IamAuthenticator(apikey: IBM_APIkey.Text);
    SpeechToTextService service = new SpeechToTextService(authenticator);
    service.SetServiceUrl(IBM_URL.Text);
    // Watson's broadband models expect 16 kHz or higher audio;
    // fall back to the narrowband model for lower sample rates.
    var model_to_use = "en-US_BroadbandModel";
    using (var reader = new WaveFileReader(folder_path + "\\" + transcriptions.ElementAt(current_line_number).Key))
    {
        if (reader.WaveFormat.SampleRate < 16000)
        {
            model_to_use = "en-US_NarrowbandModel";
        }
    }
    DetailedResponse<SpeechRecognitionResults> result = service.Recognize(
        audio: File.ReadAllBytes(folder_path + "\\" + transcriptions.ElementAt(current_line_number).Key),
        contentType: "audio/wav",
        profanityFilter: false,
        model: model_to_use);
    SpeechRecognitionResults results = result.Result;
    SpeechRecognitionResult final_result = results.Results[0];
    SpeechRecognitionAlternative real_result = final_result.Alternatives[0];
    TranscriptionBox.Text = real_result.Transcript;
    SaveTranscriptionLine();
}
/// <summary>
/// Print confidence level for individual words in a transcription of a short audio file
/// </summary>
/// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
public static void SampleRecognize(string localFilePath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // string localFilePath = "resources/brooklyn_bridge.flac"
    RecognizeRequest request = new RecognizeRequest
    {
        Config = new RecognitionConfig
        {
            // When enabled, the first result returned by the API will include a list
            // of words and the confidence level for each of those words.
            EnableWordConfidence = true,
            // The language of the supplied audio
            LanguageCode = "en-US",
        },
        Audio = new RecognitionAudio
        {
            Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
        },
    };
    RecognizeResponse response = speechClient.Recognize(request);
    // The first result includes confidence levels per word
    SpeechRecognitionResult result = response.Results[0];
    // First alternative is the most probable result
    SpeechRecognitionAlternative alternative = result.Alternatives[0];
    Console.WriteLine($"Transcript: {alternative.Transcript}");
    // Print the confidence level of each word
    foreach (var word in alternative.Words)
    {
        Console.WriteLine($"Word: {word.Word}");
        Console.WriteLine($"Confidence: {word.Confidence}");
    }
}
private void recognizeSpeaker(string lastUtterance, SpeechRecognitionAlternative alt)
{
    string[] utteranceWords = lastUtterance.Split(" ");
    int numWords = utteranceWords.Length;
    if (numWords == 0)
    {
        return;
    }
    double startTime = 0;
    double endTime = 0;
    // Walk the utterance backwards, matching its words against the tail of
    // alt.Words, to recover the utterance's start and end timestamps.
    for (int i = 0; i < utteranceWords.Length; ++i)
    {
        string utteranceWord = utteranceWords[utteranceWords.Length - 1 - i].Trim();
        WordInfo wordInfo = alt.Words[alt.Words.Count - 1 - i];
        if (wordInfo.Word.Trim() != utteranceWord)
        {
            // Word mismatch: this is not expected.
            return;
        }
        if (i == 0)
        {
            endTime = wordInfo.EndTime.Seconds + wordInfo.EndTime.Nanos / 1e9;
        }
        if (i == utteranceWords.Length - 1)
        {
            startTime = wordInfo.StartTime.Seconds + wordInfo.StartTime.Nanos / 1e9;
        }
    }
    if (endTime - startTime < SPEAKER_ID_MIN_DURATION_SECONDS)
    {
        Debug.WriteLine(
            $"Utterance duration too short for speaker ID: " +
            $"{endTime - startTime} < {SPEAKER_ID_MIN_DURATION_SECONDS}");
        return;
    }
    // Convert the time window into byte offsets within the raw capture buffer.
    int bytesPerSample = audioFormat.BitsPerSample / 8;
    int bufferStartIndex = bytesPerSample * (int)(audioFormat.SampleRate * startTime);
    int bufferEndIndex = bytesPerSample * (int)(audioFormat.SampleRate * endTime);
    byte[] snippetBuffer = new byte[bufferEndIndex - bufferStartIndex];
    lock (speakerIdBufferLock)
    {
        Array.Copy(
            speakerIdBuffer, bufferStartIndex, snippetBuffer, 0,
            bufferEndIndex - bufferStartIndex);
    }
    SendSpeakerIdHttpRequest(snippetBuffer);
}
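// A worked example of the buffer-index arithmetic in recognizeSpeaker above,
// assuming (hypothetically) 16-bit mono audio at 16 kHz; the real values come
// from audioFormat at runtime.
int bitsPerSample = 16;
int sampleRate = 16000;
double startTime = 1.25, endTime = 2.75; // seconds, taken from the word timings

int bytesPerSample = bitsPerSample / 8;                                // 2
int bufferStartIndex = bytesPerSample * (int)(sampleRate * startTime); // 2 * 20000 = 40000
int bufferEndIndex = bytesPerSample * (int)(sampleRate * endTime);     // 2 * 44000 = 88000
// The snippet sent for speaker ID is bufferEndIndex - bufferStartIndex = 48000 bytes,
// i.e. exactly 1.5 seconds of 16-bit mono audio.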
/// <summary>
/// Performs synchronous speech recognition with speech adaptation.
/// </summary>
/// <param name="sampleRateHertz">Sample rate in Hertz of the audio data sent in all `RecognitionAudio`
/// messages. Valid values are: 8000-48000.</param>
/// <param name="languageCode">The language of the supplied audio.</param>
/// <param name="phrase">Phrase "hints" help Speech-to-Text API recognize the specified phrases from
/// your audio data.</param>
/// <param name="boost">Positive value will increase the probability that a specific phrase will be
/// recognized over other similar sounding phrases.</param>
/// <param name="uriPath">Path to the audio file stored on GCS.</param>
public static void SampleRecognize(int sampleRateHertz, string languageCode, string phrase, float boost, string uriPath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // Example arguments:
    // int sampleRateHertz = 44100
    // string languageCode = "en-US"
    // string phrase = "Brooklyn Bridge"
    // float boost = 20f
    // string uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3"
    RecognizeRequest request = new RecognizeRequest
    {
        Config = new RecognitionConfig
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Mp3,
            // Sample rate in Hertz of the audio data sent in all `RecognitionAudio` messages.
            // Valid values are: 8000-48000.
            SampleRateHertz = sampleRateHertz,
            // The language of the supplied audio.
            LanguageCode = languageCode,
            SpeechContexts =
            {
                new SpeechContext
                {
                    Phrases = { phrase, },
                    // Positive value will increase the probability that a specific phrase
                    // will be recognized over other similar sounding phrases.
                    Boost = boost,
                },
            },
        },
        Audio = new RecognitionAudio
        {
            // Path to the audio file stored on GCS.
            Uri = uriPath,
        },
    };
    RecognizeResponse response = speechClient.Recognize(request);
    foreach (var result in response.Results)
    {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
    }
}
/// <summary>
/// Populates and returns a SpeechToTextResult object from a given Watson SpeechResult object.
/// </summary>
/// <param name="watsonResult">Watson SpeechResult object</param>
/// <returns>A SpeechToTextResult object</returns>
public SpeechToTextResult CreateSpeechToTextResult(SpeechRecognitionResult watsonResult)
{
    var textResult = new SpeechToTextResult();
    textResult.IsFinal = watsonResult.final;
    textResult.TextAlternatives = new TextAlternative[watsonResult.alternatives.Length];
    for (int i = 0; i < textResult.TextAlternatives.Length; ++i)
    {
        SpeechRecognitionAlternative watsonAlternative = watsonResult.alternatives[i];
        var alternative = new WatsonTextAlternative();
        alternative.Text = watsonAlternative.transcript;
        alternative.Confidence = (float)watsonAlternative.confidence;
        alternative.TimeStamps = watsonAlternative.Timestamps;
        alternative.WordConfidenceValues = watsonAlternative.WordConfidence;
        textResult.TextAlternatives[i] = alternative;
    }
    return textResult;
}
/// <summary>
/// Performs synchronous speech recognition with static context classes.
/// </summary>
/// <param name="sampleRateHertz">Sample rate in Hertz of the audio data sent in all `RecognitionAudio`
/// messages. Valid values are: 8000-48000.</param>
/// <param name="languageCode">The language of the supplied audio.</param>
/// <param name="phrase">Phrase "hints" help Speech-to-Text API recognize the specified phrases from
/// your audio data. In this sample we are using a static class phrase ($TIME). Classes represent
/// groups of words that represent common concepts that occur in natural language. We recommend
/// checking out the docs page for more info on static classes.</param>
/// <param name="uriPath">Path to the audio file stored on GCS.</param>
public static void SampleRecognize(int sampleRateHertz, string languageCode, string phrase, string uriPath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // Example arguments:
    // int sampleRateHertz = 24000
    // string languageCode = "en-US"
    // string phrase = "$TIME"
    // string uriPath = "gs://cloud-samples-data/speech/time.mp3"
    RecognizeRequest request = new RecognizeRequest
    {
        Config = new RecognitionConfig
        {
            Encoding = RecognitionConfig.Types.AudioEncoding.Mp3,
            // Sample rate in Hertz of the audio data sent in all `RecognitionAudio` messages.
            // Valid values are: 8000-48000.
            SampleRateHertz = sampleRateHertz,
            // The language of the supplied audio.
            LanguageCode = languageCode,
            SpeechContexts =
            {
                new SpeechContext
                {
                    Phrases = { phrase, },
                },
            },
        },
        Audio = new RecognitionAudio
        {
            // Path to the audio file stored on GCS.
            Uri = uriPath,
        },
    };
    RecognizeResponse response = speechClient.Recognize(request);
    foreach (var result in response.Results)
    {
        // First alternative is the most probable result
        SpeechRecognitionAlternative alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
    }
}
/// <summary>
/// Print confidence level for individual words in a transcription of a short audio file
/// Separating different speakers in an audio file recording
/// </summary>
/// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
public static void SampleLongRunningRecognize(string localFilePath)
{
    SpeechClient speechClient = SpeechClient.Create();
    // string localFilePath = "resources/commercial_mono.wav"
    LongRunningRecognizeRequest request = new LongRunningRecognizeRequest
    {
        Config = new RecognitionConfig
        {
            // If enabled, each word in the first alternative of each result will be
            // tagged with a speaker tag to identify the speaker.
            EnableSpeakerDiarization = true,
            // Optional. Specifies the estimated number of speakers in the conversation.
            DiarizationSpeakerCount = 2,
            // The language of the supplied audio
            LanguageCode = "en-US",
        },
        Audio = new RecognitionAudio
        {
            Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
        },
    };
    // Poll until the returned long-running operation is complete
    LongRunningRecognizeResponse response =
        speechClient.LongRunningRecognize(request).PollUntilCompleted().Result;
    foreach (var result in response.Results)
    {
        // First alternative has words tagged with speakers
        SpeechRecognitionAlternative alternative = result.Alternatives[0];
        Console.WriteLine($"Transcript: {alternative.Transcript}");
        // Print the speakerTag of each word
        foreach (var word in alternative.Words)
        {
            Console.WriteLine($"Word: {word.Word}");
            Console.WriteLine($"Speaker tag: {word.SpeakerTag}");
        }
    }
}
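// Note: newer versions of the v1 API expose the same options through the
// DiarizationConfig message (used by the streaming snippet later in this section)
// instead of the EnableSpeakerDiarization/DiarizationSpeakerCount fields above.
// A sketch of the equivalent configuration:
var diarizedConfig = new RecognitionConfig
{
    DiarizationConfig = new SpeakerDiarizationConfig
    {
        EnableSpeakerDiarization = true,
        MinSpeakerCount = 2,
        MaxSpeakerCount = 2,
    },
    LanguageCode = "en-US",
};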
/* === TransformResponse.Simplify method ===
 * We want to extract all the useful data from the response that comes back from the cloud,
 * but we don't want the superfluous fields that make it more complicated to use.
 *
 * The raw response structure contains:
 *   A single unnamed object with a "Results" array.
 *   The "Results" array consists of unnamed objects, each containing:
 *     "Alternatives" array, "ChannelTag" integer, "LanguageCode" string
 *   The "Alternatives" arrays appear to always consist of a single unnamed object containing:
 *     "Transcript" string, "Confidence" decimal, "Words" array
 *     WHEN DOES THIS EVER CONSIST OF MORE THAN ONE ALTERNATIVE?
 *   The "Words" array consists of unnamed objects containing:
 *     "StartTime" object, "EndTime" object, "Word" object
 *   The "StartTime" and "EndTime" objects both contain:
 *     "Seconds" integer, "Nanos" integer
 *   The "Word" objects contain:
 *     "Word" string, "Confidence" decimal, "SpeakerTag" integer
 *
 * The new structure contains:
 *   A single unnamed object with a "Results" array.
 *   The "Results" array consists of unnamed objects, each containing:
 *     "Transcript" string, "Confidence" decimal, "Words" array and "WordCount" integer
 *   The "Words" array consists of unnamed objects, each containing:
 *     "Word" string, "Confidence" decimal, "StartTime" integer, "EndTime" integer,
 *     "SpeakerTag" integer, and "WordNum" integer.
 *   Both StartTime and EndTime integers are in milliseconds.
 * "WordCount" and "WordNum" are new fields added to help in fixing speaker tags,
 * but we leave them in the final structure for possible future use.
 */
public static Transcribed_Dto Simplify(RepeatedField<SpeechRecognitionResult> recogResults)
{
    Transcribed_Dto transcript = new Transcribed_Dto();
    int altCount = 0;
    int wordNum = 0;
    foreach (SpeechRecognitionResult recogResult in recogResults)
    {
        if (recogResult.Alternatives.Count > 1)
        {
            altCount++;
            Console.WriteLine($"ERROR: more than 1 alternative - result {altCount}");
        }
        SpeechRecognitionAlternative recogAlt = recogResult.Alternatives[0];
        TranscribedTalk_Dto result = new TranscribedTalk_Dto(recogAlt.Transcript, recogAlt.Confidence)
        {
            // The new "WordCount" field in Result is populated with the total word count.
            WordCount = recogAlt.Words.Count,
        };
        Console.WriteLine($"Next result: {recogAlt.Words.Count} words");
        foreach (var item in recogAlt.Words)
        {
            // Convert the Seconds/Nanos timestamp pairs to milliseconds.
            long startTime = item.StartTime.Seconds * 1000 + item.StartTime.Nanos / 1000000;
            long endTime = item.EndTime.Seconds * 1000 + item.EndTime.Nanos / 1000000;
            // The new "WordNum" field in RespWord is populated with the sequential word number.
            wordNum++;
            result.Words.Add(new TranscribedWord_Dto(item.Word, item.Confidence, startTime, endTime, item.SpeakerTag, wordNum));
        }
        transcript.Talks.Add(result);
    }
    return transcript;
}
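// To answer the question in the comment block above: Results carry more than one
// alternative only when MaxAlternatives is set above 1 in the RecognitionConfig;
// the default (0 or 1) returns a single alternative. A sketch:
var multiHypothesisConfig = new RecognitionConfig
{
    LanguageCode = "en-US",
    MaxAlternatives = 3, // ask for up to three hypotheses per result
};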
private string FormatLine(SpeechRecognitionAlternative alternative)
{
    // Render confidence as a percentage with two decimal places, e.g. "[97.53%] hello world".
    return $"[{decimal.Round((decimal)alternative.Confidence * 100, 2)}%] {alternative.Transcript}\r\n";
}
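// Usage sketch for FormatLine (resultsBox is a hypothetical TextBox; response
// follows the RecognizeResponse shape used elsewhere in this section):
foreach (var result in response.Results)
{
    // First alternative is the most probable result.
    resultsBox.AppendText(FormatLine(result.Alternatives[0]));
}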
internal AlternativeBridge(SpeechRecognitionAlternative s)
{
    this.Confidence = s.Confidence;
    this.Transcript = s.Transcript;
}
private static async Task<object> StreamingMicRecognizeAsync(int seconds)
{
    if (WaveIn.DeviceCount < 1)
    {
        File.WriteAllText("error.txt", "No microphone!");
        return -1;
    }
    string credentialsPath = INISetting.GetValueWithAdd<string>("CredentialsFilePath", "credentials.json").ToLower();
    Console.WriteLine(credentialsPath);
    GoogleCredential googleCredential;
    using (Stream stream = new FileStream(credentialsPath, FileMode.Open))
    {
        googleCredential = GoogleCredential.FromStream(stream);
    }
    SpeechClient.StreamingRecognizeStream streamingCall = SpeechClient
        .Create(new Channel(SpeechClient.DefaultEndpoint.Host, googleCredential.ToChannelCredentials()))
        .StreamingRecognize();
    // Send the initial configuration request before any audio.
    await streamingCall.WriteAsync(new StreamingRecognizeRequest
    {
        StreamingConfig = new StreamingRecognitionConfig
        {
            Config = new RecognitionConfig
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                SampleRateHertz = 16000,
                LanguageCode = "ru"
            },
            InterimResults = true
        }
    });
    // Print each response's top hypothesis and persist it to a temp file.
    Task printResponses = Task.Run(async () =>
    {
        string s = "";
        while (await streamingCall.ResponseStream.MoveNext(new CancellationToken()))
        {
            foreach (StreamingRecognitionResult result in streamingCall.ResponseStream.Current.Results)
            {
                foreach (SpeechRecognitionAlternative alternative in result.Alternatives)
                {
                    Console.WriteLine(alternative.Transcript);
                    s += alternative.Transcript;
                    break; // only the first (most probable) alternative
                }
                break; // only the first result
            }
            File.WriteAllText(Path.GetTempPath() + "\\speechtext\\speechtext.txt", s);
            s = "";
        }
    });
    object writeLock = new object();
    bool writeMore = true;
    WaveInEvent waveIn = new WaveInEvent();
    waveIn.DeviceNumber = 0;
    waveIn.WaveFormat = new WaveFormat(16000, 1);
    // Forward captured audio chunks to the streaming call.
    waveIn.DataAvailable += (sender, args) =>
    {
        lock (writeLock)
        {
            if (!writeMore)
            {
                return;
            }
            streamingCall.WriteAsync(new StreamingRecognizeRequest
            {
                AudioContent = ByteString.CopyFrom(args.Buffer, 0, args.BytesRecorded)
            }).Wait();
        }
    };
    waveIn.StartRecording();
    Console.WriteLine("Speak now " + seconds);
    await Task.Delay(TimeSpan.FromSeconds(seconds));
    waveIn.StopRecording();
    lock (writeLock)
    {
        writeMore = false;
    }
    await streamingCall.WriteCompleteAsync();
    await printResponses;
    return 0;
}
private SpeechRecognitionEvent ParseRecognizeResponse(IDictionary resp)
{
    if (resp == null)
    {
        return null;
    }
    try
    {
        List<SpeechRecognitionResult> results = new List<SpeechRecognitionResult>();
        IList iresults = resp["results"] as IList;
        if (iresults == null)
        {
            return null;
        }
        foreach (var r in iresults)
        {
            IDictionary iresult = r as IDictionary;
            if (iresult == null) // was `iresults == null`, which never skipped a bad entry
            {
                continue;
            }
            SpeechRecognitionResult result = new SpeechRecognitionResult();
            result.final = (bool)iresult["final"];
            IList iwordAlternatives = iresult["word_alternatives"] as IList;
            if (iwordAlternatives != null)
            {
                List<WordAlternativeResults> wordAlternatives = new List<WordAlternativeResults>();
                foreach (var w in iwordAlternatives)
                {
                    IDictionary iwordAlternative = w as IDictionary;
                    if (iwordAlternative == null)
                    {
                        continue;
                    }
                    WordAlternativeResults wordAlternativeResults = new WordAlternativeResults();
                    if (iwordAlternative.Contains("start_time"))
                    {
                        wordAlternativeResults.start_time = (double)iwordAlternative["start_time"];
                    }
                    if (iwordAlternative.Contains("end_time"))
                    {
                        wordAlternativeResults.end_time = (double)iwordAlternative["end_time"];
                    }
                    if (iwordAlternative.Contains("alternatives"))
                    {
                        List<WordAlternativeResult> wordAlternativeResultList = new List<WordAlternativeResult>();
                        IList iwordAlternativeResult = iwordAlternative["alternatives"] as IList;
                        if (iwordAlternativeResult == null)
                        {
                            continue;
                        }
                        foreach (var a in iwordAlternativeResult)
                        {
                            WordAlternativeResult wordAlternativeResult = new WordAlternativeResult();
                            IDictionary ialternative = a as IDictionary;
                            if (ialternative.Contains("word"))
                            {
                                wordAlternativeResult.word = (string)ialternative["word"];
                            }
                            if (ialternative.Contains("confidence"))
                            {
                                wordAlternativeResult.confidence = (double)ialternative["confidence"];
                            }
                            wordAlternativeResultList.Add(wordAlternativeResult);
                        }
                        wordAlternativeResults.alternatives = wordAlternativeResultList.ToArray();
                    }
                    wordAlternatives.Add(wordAlternativeResults);
                }
                result.word_alternatives = wordAlternatives.ToArray();
            }
            IList ialternatives = iresult["alternatives"] as IList;
            if (ialternatives != null)
            {
                List<SpeechRecognitionAlternative> alternatives = new List<SpeechRecognitionAlternative>();
                foreach (var a in ialternatives)
                {
                    IDictionary ialternative = a as IDictionary;
                    if (ialternative == null)
                    {
                        continue;
                    }
                    SpeechRecognitionAlternative alternative = new SpeechRecognitionAlternative();
                    alternative.transcript = (string)ialternative["transcript"];
                    if (ialternative.Contains("confidence"))
                    {
                        alternative.confidence = (double)ialternative["confidence"];
                    }
                    if (ialternative.Contains("timestamps"))
                    {
                        // Each timestamp is a three-element array: [word, start, end].
                        IList itimestamps = ialternative["timestamps"] as IList;
                        TimeStamp[] timestamps = new TimeStamp[itimestamps.Count];
                        for (int i = 0; i < itimestamps.Count; ++i)
                        {
                            IList itimestamp = itimestamps[i] as IList;
                            if (itimestamp == null)
                            {
                                continue;
                            }
                            TimeStamp ts = new TimeStamp();
                            ts.Word = (string)itimestamp[0];
                            ts.Start = (double)itimestamp[1];
                            ts.End = (double)itimestamp[2];
                            timestamps[i] = ts;
                        }
                        alternative.Timestamps = timestamps;
                    }
                    if (ialternative.Contains("word_confidence"))
                    {
                        // Each entry is a two-element array: [word, confidence].
                        IList iconfidence = ialternative["word_confidence"] as IList;
                        WordConfidence[] confidence = new WordConfidence[iconfidence.Count];
                        for (int i = 0; i < iconfidence.Count; ++i)
                        {
                            IList iwordconf = iconfidence[i] as IList;
                            if (iwordconf == null)
                            {
                                continue;
                            }
                            WordConfidence wc = new WordConfidence();
                            wc.Word = (string)iwordconf[0];
                            wc.Confidence = (double)iwordconf[1];
                            confidence[i] = wc;
                        }
                        alternative.WordConfidence = confidence;
                    }
                    alternatives.Add(alternative);
                }
                result.alternatives = alternatives.ToArray();
            }
            IDictionary iKeywords = iresult["keywords_result"] as IDictionary;
            if (iKeywords != null)
            {
                result.keywords_result = new KeywordResults();
                List<KeywordResult> keywordResults = new List<KeywordResult>();
                foreach (string keyword in Keywords)
                {
                    if (iKeywords[keyword] != null)
                    {
                        IList iKeywordList = iKeywords[keyword] as IList;
                        if (iKeywordList == null)
                        {
                            continue;
                        }
                        foreach (var k in iKeywordList)
                        {
                            IDictionary iKeywordDictionary = k as IDictionary;
                            KeywordResult keywordResult = new KeywordResult();
                            keywordResult.keyword = keyword;
                            keywordResult.confidence = (double)iKeywordDictionary["confidence"];
                            keywordResult.end_time = (double)iKeywordDictionary["end_time"];
                            keywordResult.start_time = (double)iKeywordDictionary["start_time"];
                            keywordResult.normalized_text = (string)iKeywordDictionary["normalized_text"];
                            keywordResults.Add(keywordResult);
                        }
                    }
                }
                result.keywords_result.keyword = keywordResults.ToArray();
            }
            results.Add(result);
        }
        return new SpeechRecognitionEvent(results.ToArray());
    }
    catch (Exception e)
    {
        Log.Error("SpeechToText.ParseRecognizeResponse()", "ParseJsonResponse exception: {0}", e.ToString());
        return null;
    }
}
/** (Re-)initializes the Cloud-based streaming speech recognizer. */
private void ReInitStreamRecognizer()
{
    lock (speakerIdBufferLock)
    {
        speakerIdBufferPos = 0;
    }
    recogStream = speechClient.StreamingRecognize();
    SpeakerDiarizationConfig diarizationConfig = new SpeakerDiarizationConfig()
    {
        EnableSpeakerDiarization = ENABLE_SPEAKER_DIARIZATION,
        MaxSpeakerCount = MAX_SPEAKER_COUNT,
        MinSpeakerCount = MIN_SPEAKER_COUNT,
    };
    recogStream.WriteAsync(new StreamingRecognizeRequest()
    {
        StreamingConfig = new StreamingRecognitionConfig()
        {
            Config = new RecognitionConfig()
            {
                Encoding = RecognitionConfig.Types.AudioEncoding.Linear16,
                AudioChannelCount = 1,
                SampleRateHertz = audioFormat.SampleRate,
                LanguageCode = LANGUAGE_CODE,
                DiarizationConfig = diarizationConfig,
            },
            SingleUtterance = false,
        },
    });
    Task.Run(async () =>
    {
        while (await recogStream.GetResponseStream().MoveNextAsync())
        {
            foreach (var result in recogStream.GetResponseStream().Current.Results)
            {
                if (result.Alternatives.Count == 0)
                {
                    continue;
                }
                // Identify the alternative with the highest confidence.
                SpeechRecognitionAlternative bestAlt = null;
                foreach (var alternative in result.Alternatives)
                {
                    if (bestAlt == null || alternative.Confidence > bestAlt.Confidence)
                    {
                        bestAlt = alternative;
                    }
                }
                string transcript = bestAlt.Transcript.Trim();
                if (transcript.Length == 0)
                {
                    continue;
                }
                string transcriptInfo =
                    $"Speech transcript: {DateTime.Now}: \"{transcript}\" (confidence={bestAlt.Confidence})";
                if (ENABLE_SPEAKER_DIARIZATION)
                {
                    int speakerTag = bestAlt.Words[bestAlt.Words.Count - 1].SpeakerTag;
                    transcriptInfo += $" (speakerTag={speakerTag})";
                }
                Debug.WriteLine(transcriptInfo);
                if (ENABLE_SPEAKER_DIARIZATION && ENABLE_SPEAKER_ID)
                {
                    recognizeSpeaker(transcript, bestAlt);
                }
            }
        }
    });
    cummulativeRecogSeconds = 0f;
}
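// The cummulativeRecogSeconds counter reset above suggests the caller re-creates
// the stream periodically; Google's v1 streaming sessions are limited to roughly
// five minutes of audio. A sketch of how a caller might use it
// (STREAMING_LIMIT_SECONDS and chunkDurationSeconds are assumed names):
cummulativeRecogSeconds += chunkDurationSeconds;
if (cummulativeRecogSeconds > STREAMING_LIMIT_SECONDS)
{
    ReInitStreamRecognizer(); // start a fresh stream before hitting the limit
}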
private LexiconSpeechResult CreateSpeechResult(SpeechRecognitionResult watsonResult, float realtimeStart)
{
    if (watsonResult.alternatives.Length == 0)
    {
        return null;
    }
    LexiconSpeechResult speechResult = new LexiconSpeechResult();
    SpeechRecognitionAlternative bestAlternative = watsonResult.alternatives[0];
    speechResult.Transcript = bestAlternative.transcript.Trim();
    speechResult.IsFinal = watsonResult.final;
    speechResult.Confidence = (float)bestAlternative.confidence;
    speechResult.RealtimeStart = realtimeStart;
    speechResult.RealtimeEnd = -1;
    string[] words = speechResult.Transcript.Split(' ');
    int wordCount = words.Length;
    if (wordCount > 0)
    {
        speechResult.WordResults = new LexiconSpeechResult.WordResult[wordCount];
        for (int i = 0; i < wordCount; i++)
        {
            speechResult.WordResults[i] = new LexiconSpeechResult.WordResult();
            speechResult.WordResults[i].Word = words[i];
        }
        if (bestAlternative.Timestamps != null)
        {
            if (bestAlternative.Timestamps.Length == wordCount)
            {
                for (int i = 0; i < wordCount; i++)
                {
                    if (string.Equals(words[i], bestAlternative.Timestamps[i].Word, StringComparison.OrdinalIgnoreCase))
                    {
                        speechResult.WordResults[i].TimeStart = (float)bestAlternative.Timestamps[i].Start;
                        speechResult.WordResults[i].TimeEnd = (float)bestAlternative.Timestamps[i].End;
                        speechResult.WordResults[i].RealtimeStart = realtimeStart + speechResult.WordResults[i].TimeStart;
                        speechResult.WordResults[i].RealtimeEnd = realtimeStart + speechResult.WordResults[i].TimeEnd;
                    }
                    else
                    {
                        Debug.LogWarning("word: " + words[i] + " does not match timestamp word: " + bestAlternative.Timestamps[i].Word);
                    }
                }
                if (speechResult.WordResults.Length > 0)
                {
                    speechResult.RealtimeEnd = speechResult.WordResults[speechResult.WordResults.Length - 1].RealtimeEnd;
                }
            }
            else
            {
                Debug.LogWarning("word count: " + wordCount + ", timestamp count: " + bestAlternative.Timestamps.Length);
            }
        }
        if (bestAlternative.WordConfidence != null)
        {
            if (bestAlternative.WordConfidence.Length == wordCount)
            {
                for (int i = 0; i < wordCount; i++)
                {
                    if (string.Equals(words[i], bestAlternative.WordConfidence[i].Word, StringComparison.OrdinalIgnoreCase))
                    {
                        speechResult.WordResults[i].Confidence = (float)bestAlternative.WordConfidence[i].Confidence;
                    }
                    else
                    {
                        Debug.LogWarning("word: " + words[i] + " does not match confidence word: " + bestAlternative.WordConfidence[i].Word);
                    }
                }
            }
            else
            {
                Debug.LogWarning("word count: " + wordCount + ", confidence count: " + bestAlternative.WordConfidence.Length);
            }
        }
    }
    if (watsonResult.keywords_result != null && watsonResult.keywords_result.keyword != null && watsonResult.keywords_result.keyword.Length > 0)
    {
        speechResult.KeywordResults = new LexiconSpeechResult.KeywordResult[watsonResult.keywords_result.keyword.Length];
        for (int i = 0; i < watsonResult.keywords_result.keyword.Length; i++)
        {
            KeywordResult watsonKeywordResult = watsonResult.keywords_result.keyword[i];
            LexiconSpeechResult.KeywordResult keywordResult = new LexiconSpeechResult.KeywordResult();
            keywordResult.Keyword = watsonKeywordResult.keyword;
            keywordResult.TranscriptText = watsonKeywordResult.normalized_text;
            keywordResult.Confidence = (float)watsonKeywordResult.confidence;
            keywordResult.TimeStart = (float)watsonKeywordResult.start_time;
            keywordResult.TimeEnd = (float)watsonKeywordResult.end_time;
            keywordResult.RealtimeStart = realtimeStart + keywordResult.TimeStart;
            keywordResult.RealtimeEnd = realtimeStart + keywordResult.TimeEnd;
            speechResult.KeywordResults[i] = keywordResult;
        }
    }
    if (watsonResult.word_alternatives != null && watsonResult.word_alternatives.Length > 0)
    {
        speechResult.AlternativeWordResults = new LexiconSpeechResult.WordAlternativeResults[watsonResult.word_alternatives.Length];
        for (int i = 0; i < watsonResult.word_alternatives.Length; i++)
        {
            WordAlternativeResults watsonAlternativeResults = watsonResult.word_alternatives[i];
            LexiconSpeechResult.WordAlternativeResults alternativeResults = new LexiconSpeechResult.WordAlternativeResults();
            alternativeResults.Alternatives = new LexiconSpeechResult.WordAlternative[watsonAlternativeResults.alternatives.Length];
            alternativeResults.TimeStart = (float)watsonAlternativeResults.start_time;
            alternativeResults.TimeEnd = (float)watsonAlternativeResults.end_time;
            alternativeResults.RealtimeStart = realtimeStart + alternativeResults.TimeStart;
            alternativeResults.RealtimeEnd = realtimeStart + alternativeResults.TimeEnd;
            for (int j = 0; j < watsonAlternativeResults.alternatives.Length; j++)
            {
                LexiconSpeechResult.WordAlternative alternative = new LexiconSpeechResult.WordAlternative();
                alternative.Word = watsonAlternativeResults.alternatives[j].word;
                alternative.Confidence = (float)watsonAlternativeResults.alternatives[j].confidence;
                alternativeResults.Alternatives[j] = alternative;
            }
            speechResult.AlternativeWordResults[i] = alternativeResults;
        }
    }
    return speechResult;
}