Example #1
        /// <summary>
        /// Transcribe a short audio file with language detected from a list of possible languages
        /// </summary>
        /// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
        public static void SampleRecognize(string localFilePath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // string localFilePath = "resources/brooklyn_bridge.flac"
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    // The language of the supplied audio. Even though additional languages are
                    // provided by alternative_language_codes, a primary language is still required.
                    LanguageCode             = "fr",
                    AlternativeLanguageCodes =
                    {
                        "es",
                        "en",
                    },
                },
                Audio = new RecognitionAudio
                {
                    Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
                },
            };
            RecognizeResponse response = speechClient.Recognize(request);

            foreach (var result in response.Results)
            {
                // The language code that was detected as the most likely language spoken in the audio
                Console.WriteLine($"Detected language: {result.LanguageCode}");
                // First alternative is the most probable result
                SpeechRecognitionAlternative alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
            }
        }
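A note on the sample above: the generated SpeechClient also exposes asynchronous variants of each call. The sketch below is an assumption based on the standard generated client surface (CreateAsync/RecognizeAsync), not part of the original sample:

        /// <summary>
        /// Async variant of the sample above (a sketch; assumes the generated
        /// SpeechClient's CreateAsync and RecognizeAsync methods).
        /// </summary>
        public static async Task SampleRecognizeAsync(string localFilePath)
        {
            SpeechClient speechClient = await SpeechClient.CreateAsync();
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    LanguageCode             = "fr",
                    AlternativeLanguageCodes = { "es", "en" },
                },
                Audio = new RecognitionAudio
                {
                    Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
                },
            };
            RecognizeResponse response = await speechClient.RecognizeAsync(request);
            foreach (var result in response.Results)
            {
                Console.WriteLine($"Detected language: {result.LanguageCode}");
                Console.WriteLine($"Transcript: {result.Alternatives[0].Transcript}");
            }
        }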
Example #2
        /// <summary>
        /// Transcribe a short audio file, adding additional details about the recording to the recognition request
        /// </summary>
        /// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
        public static void SampleRecognize(string localFilePath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // string localFilePath = "resources/commercial_mono.wav"
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    Metadata = new RecognitionMetadata
                    {
                        InteractionType     = RecognitionMetadata.Types.InteractionType.VoiceSearch,
                        RecordingDeviceType = RecognitionMetadata.Types.RecordingDeviceType.Smartphone,
                        RecordingDeviceName = "Pixel 3",
                    },
                    // The language of the supplied audio. Even though additional languages are
                    // provided by alternative_language_codes, a primary language is still required.
                    LanguageCode = "en-US",
                },
                Audio = new RecognitionAudio
                {
                    Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
                },
            };
            RecognizeResponse response = speechClient.Recognize(request);

            foreach (var result in response.Results)
            {
                // First alternative is the most probable result
                SpeechRecognitionAlternative alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
            }
        }
Example #3
        /// <summary>
        /// Transcribe a short audio file with punctuation
        /// </summary>
        /// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
        public static void SampleRecognize(string localFilePath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // string localFilePath = "resources/commercial_mono.wav"
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    // When enabled, transcription results may include punctuation (available for select languages).
                    EnableAutomaticPunctuation = true,
                    // The language of the supplied audio. Even though additional languages are
                    // provided by alternative_language_codes, a primary language is still required.
                    LanguageCode = "en-US",
                },
                Audio = new RecognitionAudio
                {
                    Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
                },
            };
            RecognizeResponse response = speechClient.Recognize(request);

            foreach (var result in response.Results)
            {
                // First alternative is the most probable result
                SpeechRecognitionAlternative alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
            }
        }
Example #4
        public void IBM_audio_guess()
        {
            if (IBM_APIkey.TextLength == 0 || IBM_URL.TextLength == 0)
            {
                return;
            }
            IamAuthenticator authenticator = new IamAuthenticator(
                apikey: IBM_APIkey.Text);

            SpeechToTextService service = new SpeechToTextService(authenticator);

            service.SetServiceUrl(IBM_URL.Text);
            var model_to_use = "en-US_BroadbandModel";
            string audioPath = Path.Combine(folder_path, transcriptions.ElementAt(current_line_number).Key);

            using (var reader = new WaveFileReader(audioPath))
            {
                // Narrowband models are intended for audio sampled below 16 kHz.
                if (reader.WaveFormat.SampleRate < 16000)
                {
                    model_to_use = "en-US_NarrowbandModel";
                }
            }

            DetailedResponse <SpeechRecognitionResults> result = service.Recognize(
                audio: File.ReadAllBytes(audioPath),
                contentType: "audio/wav",
                profanityFilter: false,
                model: model_to_use
                );
            SpeechRecognitionResults     results      = result.Result;
            SpeechRecognitionResult      final_result = results.Results[0];
            SpeechRecognitionAlternative real_result  = final_result.Alternatives[0];

            TranscriptionBox.Text = real_result.Transcript;
            SaveTranscriptionLine();
        }
Example #5
        /// <summary>
        /// Print confidence level for individual words in a transcription of a short audio file
        /// </summary>
        /// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
        public static void SampleRecognize(string localFilePath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // string localFilePath = "resources/brooklyn_bridge.flac"
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    // When enabled, the first result returned by the API will include a list
                    // of words and the confidence level for each of those words.
                    EnableWordConfidence = true,
                    // The language of the supplied audio
                    LanguageCode = "en-US",
                },
                Audio = new RecognitionAudio
                {
                    Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
                },
            };
            RecognizeResponse response = speechClient.Recognize(request);
            // The first result includes confidence levels per word
            SpeechRecognitionResult result = response.Results[0];
            // First alternative is the most probable result
            SpeechRecognitionAlternative alternative = result.Alternatives[0];

            Console.WriteLine($"Transcript: {alternative.Transcript}");
            // Print the confidence level of each word
            foreach (var word in alternative.Words)
            {
                Console.WriteLine($"Word: {word.Word}");
                Console.WriteLine($"Confidence: {word.Confidence}");
            }
        }
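The per-word confidence values above lend themselves to simple post-processing. The helper below is a sketch, not part of the original sample; the 0.5f threshold is an arbitrary illustrative value:

        private static void PrintLowConfidenceWords(SpeechRecognitionAlternative alternative, float threshold = 0.5f)
        {
            foreach (var word in alternative.Words)
            {
                // WordInfo.Confidence ranges from 0.0 to 1.0; words below the
                // threshold are worth a manual review.
                if (word.Confidence < threshold)
                {
                    Console.WriteLine($"Low confidence: {word.Word} ({word.Confidence:F2})");
                }
            }
        }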
Example #6
        private void recognizeSpeaker(string lastUtterance, SpeechRecognitionAlternative alt)
        {
            string[] utteranceWords = lastUtterance.Split(' ');
            int      numWords       = utteranceWords.Length;

            if (numWords == 0 || alt.Words.Count < numWords)
            {
                // Not enough timed words to align with the utterance text.
                return;
            }
            double startTime = 0;
            double endTime   = 0;

            for (int i = 0; i < utteranceWords.Length; ++i)
            {
                string   utteranceWord = utteranceWords[utteranceWords.Length - 1 - i].Trim();
                WordInfo wordInfo      = alt.Words[alt.Words.Count - 1 - i];
                if (wordInfo.Word.Trim() != utteranceWord)
                {
                    // Word mismatch: this is not expected.
                    return;
                }
                if (i == 0)
                {
                    endTime = wordInfo.EndTime.Seconds + wordInfo.EndTime.Nanos / 1e9;
                }
                if (i == utteranceWords.Length - 1)
                {
                    startTime = wordInfo.StartTime.Seconds + wordInfo.StartTime.Nanos / 1e9;
                }
            }
            if (endTime - startTime < SPEAKER_ID_MIN_DURATION_SECONDS)
            {
                Debug.WriteLine(
                    $"Utterance duration too short for speaker ID: " +
                    $"{endTime - startTime} < {SPEAKER_ID_MIN_DURATION_SECONDS}");
                return;
            }

            int bytesPerSample   = audioFormat.BitsPerSample / 8;
            int bufferStartIndex = bytesPerSample * (int)(audioFormat.SampleRate * startTime);
            int bufferEndIndex   = bytesPerSample * (int)(audioFormat.SampleRate * endTime);

            byte[] snippetBuffer = new byte[bufferEndIndex - bufferStartIndex];
            lock (speakerIdBufferLock)
            {
                Array.Copy(
                    speakerIdBuffer, bufferStartIndex, snippetBuffer, 0,
                    bufferEndIndex - bufferStartIndex);
            }
            SendSpeakerIdHttpRequest(snippetBuffer);
        }
Example #7
        /// <summary>
        /// Performs synchronous speech recognition with speech adaptation.
        /// </summary>
        /// <param name="sampleRateHertz">Sample rate in Hertz of the audio data sent in all `RecognitionAudio`
        /// messages. Valid values are: 8000-48000.</param>
        /// <param name="languageCode">The language of the supplied audio.</param>
        /// <param name="phrase">Phrase "hints" help Speech-to-Text API recognize the specified phrases from
        /// your audio data.</param>
        /// <param name="boost">Positive value will increase the probability that a specific phrase will be
        /// recognized over other similar sounding phrases.</param>
        /// <param name="uriPath">Path to the audio file stored on GCS.</param>
        public static void SampleRecognize(int sampleRateHertz, string languageCode, string phrase, float boost, string uriPath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // int sampleRateHertz = 44100
            // string languageCode = "en-US"
            // string phrase = "Brooklyn Bridge"
            // float boost = 20f
            // string uriPath = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3"
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    Encoding = RecognitionConfig.Types.AudioEncoding.Mp3,
                    // Sample rate in Hertz of the audio data sent in all `RecognitionAudio` messages. Valid values are:
                    // 8000-48000.
                    SampleRateHertz = 44100,
                    // The language of the supplied audio.
                    LanguageCode   = "en-US",
                    SpeechContexts =
                    {
                        new SpeechContext
                        {
                            Phrases =
                            {
                                "Brooklyn Bridge",
                            },
                            // Positive value will increase the probability that a specific phrase will be recognized over other
                            // similar sounding phrases.
                            Boost = 20f,
                        },
                    },
                },
                Audio = new RecognitionAudio
                {
                    // Path to the audio file stored on GCS.
                    Uri = "gs://cloud-samples-data/speech/brooklyn_bridge.mp3",
                },
            };
            RecognizeResponse response = speechClient.Recognize(request);

            foreach (var result in response.Results)
            {
                // First alternative is the most probable result
                SpeechRecognitionAlternative alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
            }
        }
Example #8
        /// <summary>
        /// Populates and returns a SpeechToTextResult object from a given Watson SpeechResult object.
        /// </summary>
        /// <param name="watsonResult">Watson SpeechResult object</param>
        /// <returns>A SpeechToTextResult object</returns>
        public SpeechToTextResult CreateSpeechToTextResult(SpeechRecognitionResult watsonResult)
        {
            var textResult = new SpeechToTextResult();

            textResult.IsFinal          = watsonResult.final;
            textResult.TextAlternatives = new TextAlternative[watsonResult.alternatives.Length];
            for (int i = 0; i < textResult.TextAlternatives.Length; ++i)
            {
                SpeechRecognitionAlternative watsonAlternative = watsonResult.alternatives[i];
                var alternative = new WatsonTextAlternative();
                alternative.Text                 = watsonAlternative.transcript;
                alternative.Confidence           = (float)watsonAlternative.confidence;
                alternative.TimeStamps           = watsonAlternative.Timestamps;
                alternative.WordConfidenceValues = watsonAlternative.WordConfidence;
                textResult.TextAlternatives[i]   = alternative;
            }
            return(textResult);
        }
Example #9
        /// <summary>
        /// Performs synchronous speech recognition with static context classes.
        /// </summary>
        /// <param name="sampleRateHertz">Sample rate in Hertz of the audio data sent in all `RecognitionAudio`
        /// messages. Valid values are: 8000-48000.</param>
        /// <param name="languageCode">The language of the supplied audio.</param>
        /// <param name="phrase">Phrase "hints" help Speech-to-Text API recognize the specified phrases from
        /// your audio data. In this sample we are using a static class phrase ($TIME). Classes represent
        /// groups of words that represent common concepts that occur in natural language. We recommend
        /// checking out the docs page for more info on static classes.</param>
        /// <param name="uriPath">Path to the audio file stored on GCS.</param>
        public static void SampleRecognize(int sampleRateHertz, string languageCode, string phrase, string uriPath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // int sampleRateHertz = 24000
            // string languageCode = "en-US"
            // string phrase = "$TIME"
            // string uriPath = "gs://cloud-samples-data/speech/time.mp3"
            RecognizeRequest request = new RecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    Encoding = RecognitionConfig.Types.AudioEncoding.Mp3,
                    // Sample rate in Hertz of the audio data sent in all `RecognitionAudio` messages. Valid values are:
                    // 8000-48000.
                    SampleRateHertz = 24000,
                    // The language of the supplied audio.
                    LanguageCode   = "en-US",
                    SpeechContexts =
                    {
                        new SpeechContext
                        {
                            Phrases =
                            {
                                "$TIME",
                            },
                        },
                    },
                },
                Audio = new RecognitionAudio
                {
                    // Path to the audio file stored on GCS.
                    Uri = "gs://cloud-samples-data/speech/time.mp3",
                },
            };
            RecognizeResponse response = speechClient.Recognize(request);

            foreach (var result in response.Results)
            {
                // First alternative is the most probable result
                SpeechRecognitionAlternative alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
            }
        }
Example #10
        /// <summary>
        /// Separate different speakers in a transcription of a short audio file recording
        /// </summary>
        /// <param name="localFilePath">Path to local audio file, e.g. /path/audio.wav</param>
        public static void SampleLongRunningRecognize(string localFilePath)
        {
            SpeechClient speechClient = SpeechClient.Create();
            // string localFilePath = "resources/commercial_mono.wav"
            LongRunningRecognizeRequest request = new LongRunningRecognizeRequest
            {
                Config = new RecognitionConfig
                {
                    // If enabled, each word in the first alternative of each result will be
                    // tagged with a speaker tag to identify the speaker.
                    EnableSpeakerDiarization = true,
                    // Optional. Specifies the estimated number of speakers in the conversation.
                    DiarizationSpeakerCount = 2,
                    // The language of the supplied audio
                    LanguageCode = "en-US",
                },
                Audio = new RecognitionAudio
                {
                    Content = ByteString.CopyFrom(File.ReadAllBytes(localFilePath)),
                },
            };
            // Poll until the returned long-running operation is complete
            LongRunningRecognizeResponse response = speechClient.LongRunningRecognize(request).PollUntilCompleted().Result;

            foreach (var result in response.Results)
            {
                // First alternative has words tagged with speakers
                SpeechRecognitionAlternative alternative = result.Alternatives[0];
                Console.WriteLine($"Transcript: {alternative.Transcript}");
                // Print the speakerTag of each word
                foreach (var word in alternative.Words)
                {
                    Console.WriteLine($"Word: {word.Word}");
                    Console.WriteLine($"Speaker tag: {word.SpeakerTag}");
                }
            }
        }
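Since every word in the diarized result carries a SpeakerTag, the transcript can be reassembled into speaker turns. The helper below is a sketch of that grouping, not part of the original sample:

        private static void PrintSpeakerTurns(SpeechRecognitionAlternative alternative)
        {
            int currentTag = -1;
            var turn = new System.Text.StringBuilder();
            foreach (var word in alternative.Words)
            {
                // Start a new turn whenever the speaker tag changes.
                if (word.SpeakerTag != currentTag && turn.Length > 0)
                {
                    Console.WriteLine($"Speaker {currentTag}: {turn}");
                    turn.Clear();
                }
                currentTag = word.SpeakerTag;
                turn.Append(word.Word).Append(' ');
            }
            if (turn.Length > 0)
            {
                Console.WriteLine($"Speaker {currentTag}: {turn}");
            }
        }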
Example #11
        /*  === TransformResponse.Simplify method ===
         *  We want to extract all the useful data from the response that comes back from the cloud,
         *  without the superfluous fields that make it more complicated to use.
         *
         *  The raw response structure contains:
         *  A single unnamed object with a "Results" array.
         *  The "Results" array consists of unnamed objects, each containing:
         *      "Alternatives" array, "ChannelTag" integer, "LanguageCode" string
         *  The "Alternatives" arrays appear to always consist of a single unnamed object containing:
         *      "Transcript" string, "Confidence" decimal, "Words" array
         *      WHEN DOES THIS EVER CONSIST OF MORE THAN ONE ALTERNATIVE?
         *  The "Words" array consists of unnamed objects containing:
         *      "StartTime" object, "EndTime" object, "Word" object
         *  The "StartTime" and "EndTime" objects both contain:
         *      "Seconds" integer, "Nanos" integer
         *  The "Word" objects contain:
         *      "Word" string, "Confidence" decimal, "SpeakerTag" integer
         *
         *  The new structure contains:
         *  A single unnamed object with a "Results" array.
         *  The "Results" array consists of unnamed objects, each containing:
         *      "Transcript" string, "Confidence" decimal, "Words" array, and "WordCount" integer
         *  The "Words" array consists of unnamed objects, each containing:
         *      "Word" string, "Confidence" decimal, "StartTime" integer, "EndTime" integer, "SpeakerTag" integer,
         *      and "WordNum" integer.
         *      Both StartTime and EndTime integers are in milliseconds.
         *      "WordCount" and "WordNum" are new fields added to help in fixing speaker tags,
         *      but we leave them in the final structure for possible future use.
         */

        public static Transcribed_Dto Simplify(RepeatedField <SpeechRecognitionResult> recogResults)
        {
            Transcribed_Dto transcript = new Transcribed_Dto();
            int             altCount   = 0;
            int             wordNum    = 0;

            foreach (SpeechRecognitionResult recogResult in recogResults)
            {
                if (recogResult.Alternatives.Count > 1)
                {
                    altCount++;
                    Console.WriteLine($"ERROR: more than 1 alternative - result {altCount}");
                }

                SpeechRecognitionAlternative recogAlt = recogResult.Alternatives[0];

                TranscribedTalk_Dto result = new TranscribedTalk_Dto(recogAlt.Transcript, recogAlt.Confidence)
                {
                    // The new "WordCount" field in Result is populated with the total word count.
                    WordCount = recogAlt.Words.Count,
                };
                Console.WriteLine($"Next result: {recogAlt.Words.Count} words");

                foreach (var item in recogAlt.Words)
                {
                    long startTime = item.StartTime.Seconds * 1000 + item.StartTime.Nanos / 1000000;
                    long endTime   = item.EndTime.Seconds * 1000 + item.EndTime.Nanos / 1000000;

                    // The new "WordNum" field in RespWord is popluated with the sequencial "wordnum"
                    wordNum++;
                    result.Words.Add(new TranscribedWord_Dto(item.Word, item.Confidence, startTime, endTime, item.SpeakerTag, wordNum));
                }
                transcript.Talks.Add(result);
            }
            return(transcript);
        }
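The millisecond arithmetic in Simplify (Seconds * 1000 + Nanos / 1000000) can also be written against protobuf's well-known Duration type. A minimal sketch, assuming the words' StartTime/EndTime values are Google.Protobuf.WellKnownTypes.Duration instances:

        private static long ToMilliseconds(Google.Protobuf.WellKnownTypes.Duration d)
        {
            // Equivalent to d.Seconds * 1000 + d.Nanos / 1000000, up to
            // truncation of sub-millisecond precision.
            return (long)d.ToTimeSpan().TotalMilliseconds;
        }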
Example #12
 private string FormatLine(SpeechRecognitionAlternative alternative)
 {
     return($"[{decimal.Round((decimal)alternative.Confidence * 100, 2)}%] {alternative.Transcript}\r\n");
 }
Example #13
 internal AlternativeBridge(SpeechRecognitionAlternative s)
 {
     this.Confidence = s.Confidence;
     this.Transcript = s.Transcript;
 }
Example #14
        private static async Task <object> StreamingMicRecognizeAsync(int seconds)
        {
            if (WaveIn.DeviceCount < 1)
            {
                File.WriteAllText("error.txt", "No microphone!");
                return((object)-1);
            }
            string credentialsPath = INISetting.GetValueWithAdd <string>("CredentialsFilePath", "credentials.json").ToLower();

            Console.WriteLine(credentialsPath);
            GoogleCredential googleCredential;

            using (Stream stream = new FileStream(credentialsPath, FileMode.Open))
                googleCredential = GoogleCredential.FromStream(stream);
            SpeechClient.StreamingRecognizeStream streamingCall = SpeechClient.Create(new Channel(SpeechClient.DefaultEndpoint.Host, googleCredential.ToChannelCredentials())).StreamingRecognize();
            await streamingCall.WriteAsync(new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = new RecognitionConfig()
                    {
                        Encoding        = RecognitionConfig.Types.AudioEncoding.Linear16,
                        SampleRateHertz = 16000,
                        LanguageCode    = "ru"
                    },
                    InterimResults = true
                }
            });

            Task printResponses = Task.Run(async() =>
            {
                string s = "";
                while (await streamingCall.ResponseStream.MoveNext(new CancellationToken()))
                {
                    // Use only the first alternative of the first result
                    // (FirstOrDefault requires System.Linq).
                    SpeechRecognitionAlternative alternative = streamingCall.ResponseStream.Current.Results
                                                               .FirstOrDefault()?.Alternatives.FirstOrDefault();
                    if (alternative != null)
                    {
                        Console.WriteLine(alternative.Transcript);
                        s += alternative.Transcript;
                    }
                    File.WriteAllText(Path.GetTempPath() + "\\speechtext\\speechtext.txt", s);
                    s = "";
                }
            });
            object      writeLock = new object();
            bool        writeMore = true;
            WaveInEvent waveIn    = new WaveInEvent();

            waveIn.DeviceNumber   = 0;
            waveIn.WaveFormat     = new WaveFormat(16000, 1);
            waveIn.DataAvailable += (sender, args) =>
            {
                lock (writeLock)
                {
                    if (!writeMore)
                    {
                        return;
                    }
                    // Forward each captured buffer to the streaming recognizer.
                    streamingCall.WriteAsync(new StreamingRecognizeRequest()
                    {
                        AudioContent = ByteString.CopyFrom(args.Buffer, 0, args.BytesRecorded)
                    }).Wait();
                }
            };
            waveIn.StartRecording();
            Console.WriteLine("Speak now " + (object)seconds);
            await Task.Delay(TimeSpan.FromSeconds((double)seconds));

            waveIn.StopRecording();
            lock (writeLock)
                writeMore = false;
            await streamingCall.WriteCompleteAsync();

            await printResponses;

            return((object)0);
        }
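For completeness, a hypothetical entry point for the streaming sample above (the 30-second duration is an arbitrary illustrative value; async Main requires C# 7.1 or later):

        private static async Task Main(string[] args)
        {
            // Capture the microphone and stream it for recognition for 30 seconds.
            object status = await StreamingMicRecognizeAsync(30);
            Console.WriteLine($"Exit status: {status}");
        }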
Example #15
        private SpeechRecognitionEvent ParseRecognizeResponse(IDictionary resp)
        {
            if (resp == null)
            {
                return(null);
            }

            try
            {
                List <SpeechRecognitionResult> results = new List <SpeechRecognitionResult>();
                IList iresults = resp["results"] as IList;
                if (iresults == null)
                {
                    return(null);
                }

                foreach (var r in iresults)
                {
                    IDictionary iresult = r as IDictionary;
                    if (iresult == null)
                    {
                        continue;
                    }

                    SpeechRecognitionResult result = new SpeechRecognitionResult();
                    result.final = (bool)iresult["final"];

                    IList iwordAlternatives = iresult["word_alternatives"] as IList;
                    if (iwordAlternatives != null)
                    {
                        List <WordAlternativeResults> wordAlternatives = new List <WordAlternativeResults>();
                        foreach (var w in iwordAlternatives)
                        {
                            IDictionary iwordAlternative = w as IDictionary;
                            if (iwordAlternative == null)
                            {
                                continue;
                            }

                            WordAlternativeResults wordAlternativeResults = new WordAlternativeResults();
                            if (iwordAlternative.Contains("start_time"))
                            {
                                wordAlternativeResults.start_time = (double)iwordAlternative["start_time"];
                            }
                            if (iwordAlternative.Contains("end_time"))
                            {
                                wordAlternativeResults.end_time = (double)iwordAlternative["end_time"];
                            }
                            if (iwordAlternative.Contains("alternatives"))
                            {
                                List <WordAlternativeResult> wordAlternativeResultList = new List <WordAlternativeResult>();
                                IList iwordAlternativeResult = iwordAlternative["alternatives"] as IList;
                                if (iwordAlternativeResult == null)
                                {
                                    continue;
                                }

                                foreach (var a in iwordAlternativeResult)
                                {
                                    WordAlternativeResult wordAlternativeResult = new WordAlternativeResult();
                                    IDictionary           ialternative          = a as IDictionary;
                                    if (ialternative.Contains("word"))
                                    {
                                        wordAlternativeResult.word = (string)ialternative["word"];
                                    }
                                    if (ialternative.Contains("confidence"))
                                    {
                                        wordAlternativeResult.confidence = (double)ialternative["confidence"];
                                    }
                                    wordAlternativeResultList.Add(wordAlternativeResult);
                                }

                                wordAlternativeResults.alternatives = wordAlternativeResultList.ToArray();
                            }

                            wordAlternatives.Add(wordAlternativeResults);
                        }

                        result.word_alternatives = wordAlternatives.ToArray();
                    }

                    IList ialternatives = iresult["alternatives"] as IList;
                    if (ialternatives != null)
                    {
                        List <SpeechRecognitionAlternative> alternatives = new List <SpeechRecognitionAlternative>();
                        foreach (var a in ialternatives)
                        {
                            IDictionary ialternative = a as IDictionary;
                            if (ialternative == null)
                            {
                                continue;
                            }

                            SpeechRecognitionAlternative alternative = new SpeechRecognitionAlternative();
                            alternative.transcript = (string)ialternative["transcript"];
                            if (ialternative.Contains("confidence"))
                            {
                                alternative.confidence = (double)ialternative["confidence"];
                            }

                            if (ialternative.Contains("timestamps"))
                            {
                                IList itimestamps = ialternative["timestamps"] as IList;

                                TimeStamp[] timestamps = new TimeStamp[itimestamps.Count];
                                for (int i = 0; i < itimestamps.Count; ++i)
                                {
                                    IList itimestamp = itimestamps[i] as IList;
                                    if (itimestamp == null)
                                    {
                                        continue;
                                    }

                                    TimeStamp ts = new TimeStamp();
                                    ts.Word       = (string)itimestamp[0];
                                    ts.Start      = (double)itimestamp[1];
                                    ts.End        = (double)itimestamp[2];
                                    timestamps[i] = ts;
                                }

                                alternative.Timestamps = timestamps;
                            }
                            if (ialternative.Contains("word_confidence"))
                            {
                                IList iconfidence = ialternative["word_confidence"] as IList;

                                WordConfidence[] confidence = new WordConfidence[iconfidence.Count];
                                for (int i = 0; i < iconfidence.Count; ++i)
                                {
                                    IList iwordconf = iconfidence[i] as IList;
                                    if (iwordconf == null)
                                    {
                                        continue;
                                    }

                                    WordConfidence wc = new WordConfidence();
                                    wc.Word       = (string)iwordconf[0];
                                    wc.Confidence = (double)iwordconf[1];
                                    confidence[i] = wc;
                                }

                                alternative.WordConfidence = confidence;
                            }

                            alternatives.Add(alternative);
                        }

                        result.alternatives = alternatives.ToArray();
                    }

                    IDictionary iKeywords = iresult["keywords_result"] as IDictionary;
                    if (iKeywords != null)
                    {
                        result.keywords_result = new KeywordResults();
                        List <KeywordResult> keywordResults = new List <KeywordResult>();
                        foreach (string keyword in Keywords)
                        {
                            if (iKeywords[keyword] != null)
                            {
                                IList iKeywordList = iKeywords[keyword] as IList;
                                if (iKeywordList == null)
                                {
                                    continue;
                                }

                                foreach (var k in iKeywordList)
                                {
                                    IDictionary   iKeywordDictionary = k as IDictionary;
                                    KeywordResult keywordResult      = new KeywordResult();
                                    keywordResult.keyword         = keyword;
                                    keywordResult.confidence      = (double)iKeywordDictionary["confidence"];
                                    keywordResult.end_time        = (double)iKeywordDictionary["end_time"];
                                    keywordResult.start_time      = (double)iKeywordDictionary["start_time"];
                                    keywordResult.normalized_text = (string)iKeywordDictionary["normalized_text"];
                                    keywordResults.Add(keywordResult);
                                }
                            }
                        }
                        result.keywords_result.keyword = keywordResults.ToArray();
                    }

                    results.Add(result);
                }

                return(new SpeechRecognitionEvent(results.ToArray()));
            }
            catch (Exception e)
            {
                Log.Error("SpeechToText.ParseRecognizeResponse()", "ParseJsonResponse exception: {0}", e.ToString());
                return(null);
            }
        }
Example #16
        /// <summary>
        /// (Re-)initializes the Cloud-based streaming speech recognizer.
        /// </summary>
        private void ReInitStreamRecognizer()
        {
            lock (speakerIdBufferLock)
            {
                speakerIdBufferPos = 0;
            }
            recogStream = speechClient.StreamingRecognize();
            SpeakerDiarizationConfig diarizationConfig = new SpeakerDiarizationConfig()
            {
                EnableSpeakerDiarization = ENABLE_SPEAKER_DIARIZATION,
                MaxSpeakerCount          = MAX_SPEAKER_COUNT,
                MinSpeakerCount          = MIN_SPEAKER_COUNT,
            };

            recogStream.WriteAsync(new StreamingRecognizeRequest()
            {
                StreamingConfig = new StreamingRecognitionConfig()
                {
                    Config = new RecognitionConfig()
                    {
                        Encoding          = RecognitionConfig.Types.AudioEncoding.Linear16,
                        AudioChannelCount = 1,
                        SampleRateHertz   = audioFormat.SampleRate,
                        LanguageCode      = LANGUAGE_CODE,
                        DiarizationConfig = diarizationConfig,
                    },
                    SingleUtterance = false,
                },
            });
            Task.Run(async() =>
            {
                while (await recogStream.GetResponseStream().MoveNextAsync())
                {
                    foreach (var result in recogStream.GetResponseStream().Current.Results)
                    {
                        if (result.Alternatives.Count == 0)
                        {
                            continue;
                        }
                        // Identify the alternative with the highest confidence.
                        SpeechRecognitionAlternative bestAlt = null;
                        foreach (var alternative in result.Alternatives)
                        {
                            if (bestAlt == null || alternative.Confidence > bestAlt.Confidence)
                            {
                                bestAlt = alternative;
                            }
                        }
                        string transcript = bestAlt.Transcript.Trim();
                        if (transcript.Length == 0)
                        {
                            continue;
                        }
                        string transcriptInfo =
                            $"Speech transcript: {DateTime.Now}: \"" +
                            $"{transcript}\" (confidence={bestAlt.Confidence})";
                        if (ENABLE_SPEAKER_DIARIZATION)
                        {
                            int speakerTag  = bestAlt.Words[bestAlt.Words.Count - 1].SpeakerTag;
                            transcriptInfo += $" (speakerTag={speakerTag})";
                        }
                        Debug.WriteLine(transcriptInfo);
                        if (ENABLE_SPEAKER_DIARIZATION && ENABLE_SPEAKER_ID)
                        {
                            recognizeSpeaker(transcript, bestAlt);
                        }
                    }
                }
            });
            cummulativeRecogSeconds = 0f;
        }
Example #17
        private LexiconSpeechResult CreateSpeechResult(SpeechRecognitionResult watsonResult, float realtimeStart)
        {
            if (watsonResult.alternatives.Length == 0)
            {
                return(null);
            }

            LexiconSpeechResult speechResult = new LexiconSpeechResult();

            SpeechRecognitionAlternative bestAlternative = watsonResult.alternatives[0];

            speechResult.Transcript    = bestAlternative.transcript.Trim();
            speechResult.IsFinal       = watsonResult.final;
            speechResult.Confidence    = (float)bestAlternative.confidence;
            speechResult.RealtimeStart = realtimeStart;
            speechResult.RealtimeEnd   = -1;

            string[] words     = speechResult.Transcript.Split(' ');
            int      wordCount = words.Length;

            if (wordCount > 0)
            {
                speechResult.WordResults = new LexiconSpeechResult.WordResult[wordCount];

                for (int i = 0; i < wordCount; i++)
                {
                    speechResult.WordResults[i]      = new LexiconSpeechResult.WordResult();
                    speechResult.WordResults[i].Word = words[i];
                }

                if (bestAlternative.Timestamps != null)
                {
                    if (bestAlternative.Timestamps.Length == wordCount)
                    {
                        for (int i = 0; i < wordCount; i++)
                        {
                            if (string.Equals(words[i], bestAlternative.Timestamps[i].Word, StringComparison.OrdinalIgnoreCase))
                            {
                                speechResult.WordResults[i].TimeStart     = (float)bestAlternative.Timestamps[i].Start;
                                speechResult.WordResults[i].TimeEnd       = (float)bestAlternative.Timestamps[i].End;
                                speechResult.WordResults[i].RealtimeStart = realtimeStart + speechResult.WordResults[i].TimeStart;
                                speechResult.WordResults[i].RealtimeEnd   = realtimeStart + speechResult.WordResults[i].TimeEnd;
                            }
                            else
                            {
                                Debug.LogWarning("word: " + words[i] + " does not match timestamp word: " + bestAlternative.Timestamps[i].Word);
                            }
                        }

                        if (speechResult.WordResults.Length > 0)
                        {
                            speechResult.RealtimeEnd = speechResult.WordResults[speechResult.WordResults.Length - 1].RealtimeEnd;
                        }
                    }
                    else
                    {
                        Debug.LogWarning("word count: " + wordCount + ", timestamp count: " + bestAlternative.Timestamps.Length);
                    }
                }

                if (bestAlternative.WordConfidence != null)
                {
                    if (bestAlternative.WordConfidence.Length == wordCount)
                    {
                        for (int i = 0; i < wordCount; i++)
                        {
                            if (string.Equals(words[i], bestAlternative.WordConfidence[i].Word, StringComparison.OrdinalIgnoreCase))
                            {
                                speechResult.WordResults[i].Confidence = (float)bestAlternative.WordConfidence[i].Confidence;
                            }
                            else
                            {
                                Debug.LogWarning("word: " + words[i] + " does not match confidence word: " + bestAlternative.WordConfidence[i].Word);
                            }
                        }
                    }
                    else
                    {
                        Debug.LogWarning("word count: " + wordCount + ", confidence count: " + bestAlternative.WordConfidence.Length);
                    }
                }
            }

            if (watsonResult.keywords_result != null && watsonResult.keywords_result.keyword != null && watsonResult.keywords_result.keyword.Length > 0)
            {
                speechResult.KeywordResults = new LexiconSpeechResult.KeywordResult[watsonResult.keywords_result.keyword.Length];

                for (int i = 0; i < watsonResult.keywords_result.keyword.Length; i++)
                {
                    KeywordResult watsonKeywordResult = watsonResult.keywords_result.keyword[i];
                    LexiconSpeechResult.KeywordResult keywordResult = new LexiconSpeechResult.KeywordResult();

                    keywordResult.Keyword        = watsonKeywordResult.keyword;
                    keywordResult.TranscriptText = watsonKeywordResult.normalized_text;
                    keywordResult.Confidence     = (float)watsonKeywordResult.confidence;
                    keywordResult.TimeStart      = (float)watsonKeywordResult.start_time;
                    keywordResult.TimeEnd        = (float)watsonKeywordResult.end_time;
                    keywordResult.RealtimeStart  = realtimeStart + keywordResult.TimeStart;
                    keywordResult.RealtimeEnd    = realtimeStart + keywordResult.TimeEnd;

                    speechResult.KeywordResults[i] = keywordResult;
                }
            }

            if (watsonResult.word_alternatives != null && watsonResult.word_alternatives.Length > 0)
            {
                speechResult.AlternativeWordResults = new LexiconSpeechResult.WordAlternativeResults[watsonResult.word_alternatives.Length];

                for (int i = 0; i < watsonResult.word_alternatives.Length; i++)
                {
                    WordAlternativeResults watsonAlternativeResults = watsonResult.word_alternatives[i];
                    LexiconSpeechResult.WordAlternativeResults alternativeResults = new LexiconSpeechResult.WordAlternativeResults();

                    alternativeResults.Alternatives  = new LexiconSpeechResult.WordAlternative[watsonAlternativeResults.alternatives.Length];
                    alternativeResults.TimeStart     = (float)watsonAlternativeResults.start_time;
                    alternativeResults.TimeEnd       = (float)watsonAlternativeResults.end_time;
                    alternativeResults.RealtimeStart = realtimeStart + alternativeResults.TimeStart;
                    alternativeResults.RealtimeEnd   = realtimeStart + alternativeResults.TimeEnd;

                    for (int j = 0; j < watsonAlternativeResults.alternatives.Length; j++)
                    {
                        LexiconSpeechResult.WordAlternative alternative = new LexiconSpeechResult.WordAlternative();

                        alternative.Word       = watsonAlternativeResults.alternatives[j].word;
                        alternative.Confidence = (float)watsonAlternativeResults.alternatives[j].confidence;

                        alternativeResults.Alternatives[j] = alternative;
                    }

                    speechResult.AlternativeWordResults[i] = alternativeResults;
                }
            }

            return(speechResult);
        }