Example #1
        public void IBM_audio_guess()
        {
            if (IBM_APIkey.TextLength == 0 || IBM_URL.TextLength == 0)
            {
                return;
            }
            IamAuthenticator authenticator = new IamAuthenticator(
                apikey: IBM_APIkey.Text);

            SpeechToTextService service = new SpeechToTextService(authenticator);

            service.SetServiceUrl(IBM_URL.Text);
            var model_to_use = "en-US_BroadbandModel";

            // Watson's narrowband models target audio sampled below 16 kHz, so inspect the WAV header first.
            using (var reader = new WaveFileReader(folder_path + "\\" + transcriptions.ElementAt(current_line_number).Key))
            {
                if (reader.WaveFormat.SampleRate < 16000)
                {
                    model_to_use = "en-US_NarrowbandModel";
                }
            }

            DetailedResponse <SpeechRecognitionResults> result = service.Recognize(
                audio: File.ReadAllBytes(folder_path + "\\" + transcriptions.ElementAt(current_line_number).Key),
                contentType: "audio/wav",
                profanityFilter: false,
                model: model_to_use
                );
            SpeechRecognitionResults     results      = result.Result;
            SpeechRecognitionResult      final_result = results.Results[0];
            SpeechRecognitionAlternative real_result  = final_result.Alternatives[0];

            TranscriptionBox.Text = real_result.Transcript;
            SaveTranscriptionLine();
        }
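A hedged note on dependencies for the snippet above: it appears to rely on the Watson .NET SDK and on NAudio for WaveFileReader. The using directives below are an assumption about what the containing file needs, not part of the original example.

// Assumed using directives for Example #1; adjust to the SDK version actually referenced.
using System.IO;                                 // File, WaveFileReader input
using IBM.Cloud.SDK.Core.Authentication.Iam;     // IamAuthenticator
using IBM.Cloud.SDK.Core.Http;                   // DetailedResponse<T>
using IBM.Watson.SpeechToText.v1;                // SpeechToTextService
using IBM.Watson.SpeechToText.v1.Model;          // SpeechRecognitionResults
using NAudio.Wave;                               // WaveFileReader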
Example #2
        public static async Task <dynamic> SpeechToText(AppSettings appSettings, SpeechToTextRequest requestBody)
        {
            string  methodName = "SpeechToText";
            dynamic result     = new ExpandoObject();

            try
            {
                WatsonSettings   settings      = appSettings.WatsonServices.SpeechToText;
                IamAuthenticator authenticator = new IamAuthenticator(apikey: $"{requestBody.Apikey}");
                IBM.Watson.SpeechToText.v1.SpeechToTextService speechToText = new IBM.Watson.SpeechToText.v1.SpeechToTextService(authenticator);
                speechToText.SetServiceUrl($"{requestBody.Endpoint}");
                List <string> audioWavList = await CommonService.AudioToWav(appSettings, requestBody.Url);

                List <SpeechRecognitionResults> speechRecognitionResultsList = new List <SpeechRecognitionResults>();
                string text = "";
                foreach (var audioWav in audioWavList)
                {
                    string     audioWavTemp = CommonService.GetExternalPlatforms(appSettings).STTAudioFileUrl + audioWav;
                    HttpClient client       = new HttpClient();
                    // Await the download rather than blocking on .Result inside an async method.
                    byte[] audio = await client.GetByteArrayAsync(audioWavTemp);
                    SpeechRecognitionResults speechRecognitionResults = new SpeechRecognitionResults();
                    //HttpClient httpClient = new HttpClient();
                    //httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("audio/wav"));
                    //httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", $"apikey:{requestBody.Apikey}");
                    //HttpContent httpContent = new ByteArrayContent(audio);
                    //var r = await httpClient.PostAsync("https://stream.watsonplatform.net/speech-to-text/api/v1/recognize", httpContent);
                    try
                    {
                        speechRecognitionResults = speechToText.Recognize(
                            audio: new MemoryStream(audio),
                            contentType: "audio/wav",
                            model: requestBody.Model ?? "es-ES_NarrowbandModel"
                            ).Result;
                        speechRecognitionResultsList.Add(speechRecognitionResults);
                        foreach (var item in speechRecognitionResults.Results)
                        {
                            text += item.Alternatives[0].Transcript;
                        }
                        SpeechRecognitionResultsDTO speechRecognitionResultsDTO = new SpeechRecognitionResultsDTO();
                        speechRecognitionResultsDTO.Text = text;
                        speechRecognitionResultsDTO.SpeechRecognitionResults = speechRecognitionResultsList;
                        result = speechRecognitionResultsDTO;
                    }
                    catch (Exception e)
                    {
                        Log.Write(appSettings, LogEnum.ERROR.ToString(), label, className, methodName, $"ERROR: {JsonConvert.SerializeObject(requestBody)}");
                        Log.Write(appSettings, LogEnum.ERROR.ToString(), label, className, methodName, $"ERROR: {e.Source + Environment.NewLine + e.Message + Environment.NewLine + e.StackTrace}");
                        result = e.Message;
                    }
                }
                return(result);
            }
            catch (Exception e)
            {
                Log.Write(appSettings, LogEnum.ERROR.ToString(), label, className, methodName, $"ERROR: {JsonConvert.SerializeObject(requestBody)}");
                Log.Write(appSettings, LogEnum.ERROR.ToString(), label, className, methodName, $"ERROR: {e.Source + Environment.NewLine + e.Message + Environment.NewLine + e.StackTrace}");
                // Rethrow without resetting the original stack trace.
                throw;
            }
        }
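The SpeechToTextRequest and SpeechRecognitionResultsDTO types are not shown in this example. A minimal sketch consistent with the properties the method reads and writes could look like the following; the property names come from the code above, everything else is an assumption.

        // Hypothetical DTO shapes inferred from Example #2; the real classes may differ.
        // Requires System.Collections.Generic and IBM.Watson.SpeechToText.v1.Model.
        public class SpeechToTextRequest
        {
            public string Apikey   { get; set; }   // IAM API key handed to IamAuthenticator
            public string Endpoint { get; set; }   // service URL passed to SetServiceUrl
            public string Url      { get; set; }   // source audio location given to CommonService.AudioToWav
            public string Model    { get; set; }   // optional; the code falls back to "es-ES_NarrowbandModel"
        }

        public class SpeechRecognitionResultsDTO
        {
            public string Text { get; set; }
            public List<SpeechRecognitionResults> SpeechRecognitionResults { get; set; }
        }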
Example #3
    public IEnumerator RunRecognize()
    {
        Debug.Log("[SpeachToText] Attempting to Recognize...");
        audioPath = Application.persistentDataPath + "/voice_query" + VoiceRecorder.RecordNumber + ".wav";
        Debug.Log("[SpeachToText] audioPath = " + audioPath);
        SpeechRecognitionResults recognizeResponse = null;

        byte [] audioBytes = File.ReadAllBytes(audioPath);
        service.Recognize(
            callback: (DetailedResponse <SpeechRecognitionResults> response, IBMError error) => {
            Debug.Log("[SpeachToText] response.Result = " + response.Response);
            if (error != null)
            {
                Debug.Log("[SpeachToText] error = " + error);
            }
            recognizeResponse = response.Result;
            debugText.text    = response.Response;
            try {
                JSONObject json = new JSONObject(debugText.text);
                debugText.text  = json ["results"] [0] ["alternatives"] [0] ["transcript"].str;
                ChatBot.PostAMessage(debugText.text);
            } catch {
                debugText.text         = DidnunderstandSentences [Random.Range(0, DidnunderstandSentences.Length)];
                textToSpeach.InputText = debugText.text;
                textToSpeach.Synthesize();
            }
        },
            audio: audioBytes,
            model: usBroadbandModel,
            contentType: "audio/wav"
            );

        while (recognizeResponse == null)
        {
            yield return(null);
        }
        ReadyToSend = true;
        SelectableUIElement.ChangeLock(-1);
        Debug.Log("[SpeachToText] Ready");
    }
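The coroutine above assumes a service field that is already authenticated. A minimal setup sketch following the Unity SDK's usual pattern is shown below; the API key, service URL, and the CanAuthenticate wait loop are assumptions, not part of the original example.

    // Hypothetical one-time setup for the `service` field used by RunRecognize().
    private SpeechToTextService service;

    private IEnumerator CreateService()
    {
        var authenticator = new IamAuthenticator(apikey: "{apikey}");   // placeholder credentials
        while (!authenticator.CanAuthenticate())
        {
            yield return null;   // wait for the IAM token exchange to complete
        }
        service = new SpeechToTextService(authenticator);
        service.SetServiceUrl("{serviceUrl}");                          // placeholder endpoint
    }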
Example #4
        /// <summary>
        /// Sends audio and returns transcription results for a sessionless recognition request. Returns only the final results; to enable interim results, use Sessions or WebSockets. The service imposes a data size limit of 100 MB. It automatically detects the endianness of the incoming audio and, for audio that includes multiple channels, downmixes the audio to one-channel mono during transcoding.
        /// You specify the parameters of the request as a path parameter, request headers, and query parameters. You provide the audio as the body of the request. This method is preferred to the multipart approach for submitting a sessionless recognition request.
        /// For requests to transcribe live audio as it becomes available, you must set the Transfer-Encoding header to chunked to use streaming mode. In streaming mode, the server closes the connection (response code 408) if the service receives no data chunk for 30 seconds and the service has no audio to transcribe for 30 seconds. The server also closes the connection (response code 400) if no speech is detected for inactivity_timeout seconds of audio (not processing time); use the inactivity_timeout parameter to change the default of 30 seconds.
        /// </summary>
        /// <param name="contentType"></param>
        /// <param name="transferEncoding"></param>
        /// <param name="audio"></param>
        /// <param name="model"></param>
        /// <param name="customizationId"></param>
        /// <param name="continuous"></param>
        /// <param name="inactivityTimeout"></param>
        /// <param name="keywords"></param>
        /// <param name="keywordsThreshold"></param>
        /// <param name="maxAlternatives"></param>
        /// <param name="wordAlternativesThreshold"></param>
        /// <param name="wordConfidence"></param>
        /// <param name="timestamps"></param>
        /// <param name="profanityFilter"></param>
        /// <param name="smartFormatting"></param>
        /// <param name="speakerLabels"></param>
        /// <returns></returns>
        public SpeechRecognitionResults Recognize(string contentType, Stream audio, string transferEncoding = "",
                                                  string model = "en-US_BroadbandModel", string languageCustomizationId = null,
                                                  bool? continuous = null, int? inactivityTimeout = null, string[] keywords = null,
                                                  double? keywordsThreshold = null, int? maxAlternatives = null,
                                                  double? wordAlternativesThreshold = null, bool? wordConfidence = null,
                                                  bool? timestamps = null, bool profanityFilter = false,
                                                  bool? smartFormatting = null, bool? speakerLabels = null,
                                                  string customizationId = null)
        {
            if (audio == null)
            {
                throw new ArgumentNullException($"{nameof(audio)}");
            }

            SpeechRecognitionResults result = null;

            try
            {
                string   urlService  = string.Empty;
                IRequest restRequest = null;

                IClient client;
                if (_tokenManager == null)
                {
                    client = this.Client.WithAuthentication(this.UserName, this.Password);
                }
                else
                {
                    client = this.Client.WithAuthentication(_tokenManager.GetToken());
                }

                restRequest = client.PostAsync($"{this.Endpoint}/v1/recognize");

                if (!string.IsNullOrEmpty(model))
                {
                    restRequest.WithArgument("model", model);
                }

                if (!string.IsNullOrEmpty(transferEncoding))
                {
                    restRequest.WithHeader("Transfer-Encoding", transferEncoding);
                }

                if (!string.IsNullOrEmpty(languageCustomizationId))
                {
                    restRequest.WithArgument("language_customization_id", languageCustomizationId);
                }

                if (continuous.HasValue)
                {
                    restRequest.WithArgument("continuous", continuous.Value);
                }

                if (inactivityTimeout.HasValue && inactivityTimeout.Value > 0)
                {
                    restRequest.WithArgument("inactivity_timeout", inactivityTimeout.Value);
                }

                if (keywords != null && keywords.Length > 0)
                {
                    restRequest.WithArgument("keywords", keywords);
                }

                if (keywordsThreshold.HasValue && keywordsThreshold.Value > 0)
                {
                    restRequest.WithArgument("keywords_threshold", keywordsThreshold.Value);
                }

                if (maxAlternatives.HasValue && maxAlternatives.Value > 0)
                {
                    restRequest.WithArgument("max_alternatives", maxAlternatives.Value);
                }

                if (wordAlternativesThreshold.HasValue && wordAlternativesThreshold.Value > 0)
                {
                    restRequest.WithArgument("word_alternatives_threshold", wordAlternativesThreshold.Value);
                }

                if (wordConfidence.HasValue)
                {
                    restRequest.WithArgument("word_confidence", wordConfidence.Value);
                }

                if (timestamps.HasValue)
                {
                    restRequest.WithArgument("timestamps", timestamps.Value);
                }

                if (profanityFilter)
                {
                    restRequest.WithArgument("profanity_filter", profanityFilter);
                }

                if (smartFormatting.HasValue)
                {
                    restRequest.WithArgument("smart_formatting", smartFormatting.Value);
                }

                if (speakerLabels.HasValue)
                {
                    restRequest.WithArgument("speaker_labels", speakerLabels.Value);
                }

                if (!string.IsNullOrEmpty(customizationId))
                {
                    restRequest.WithArgument("customization_id", customizationId);
                }

                StreamContent bodyContent = new StreamContent(audio);
                if (!string.IsNullOrEmpty(contentType))
                {
                    bodyContent.Headers.Add("Content-Type", contentType);
                }

                restRequest.WithBodyContent(bodyContent);
                restRequest.WithHeader("X-IBMCloud-SDK-Analytics", "service_name=speech_to_text;service_version=v1;operation_id=Recognize");

                result = restRequest.As <SpeechRecognitionResults>()
                         .Result;
            }
            catch (AggregateException ae)
            {
                throw ae.InnerException as ServiceResponseException;
            }

            return(result);
        }
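A minimal call-site sketch for the overload above, assuming an already constructed service instance (called speechToText here); the file name and option values are placeholders, not part of the original code.

        // Hypothetical usage of the Recognize overload above.
        using (FileStream audioStream = File.OpenRead("sample.wav"))
        {
            SpeechRecognitionResults results = speechToText.Recognize(
                contentType: "audio/wav",
                audio: audioStream,
                model: "en-US_BroadbandModel",
                inactivityTimeout: 60,      // override the 30-second default described in the summary
                maxAlternatives: 3,
                profanityFilter: false);

            foreach (var recognitionResult in results.Results)
            {
                // The first alternative is the most likely transcript for each result.
                Console.WriteLine(recognitionResult.Alternatives[0].Transcript);
            }
        }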
        /// <summary>
        /// Sends audio and returns transcription results for a sessionless recognition request. Returns only the final results; to enable interim results, use Sessions or WebSockets. The service imposes a data size limit of 100 MB. It automatically detects the endianness of the incoming audio and, for audio that includes multiple channels, downmixes the audio to one-channel mono during transcoding.
        /// You specify the parameters of the request as a path parameter, request headers, and query parameters. You provide the audio as the body of the request. This method is preferred to the multipart approach for submitting a sessionless recognition request.
        /// For requests to transcribe live audio as it becomes available, you must set the Transfer-Encoding header to chunked to use streaming mode. In streaming mode, the server closes the connection (response code 408) if the service receives no data chunk for 30 seconds and the service has no audio to transcribe for 30 seconds. The server also closes the connection (response code 400) if no speech is detected for inactivity_timeout seconds of audio (not processing time); use the inactivity_timeout parameter to change the default of 30 seconds.
        /// </summary>
        /// <param name="sessionId"></param>
        /// <param name="contentType"></param>
        /// <param name="transferEncoding"></param>
        /// <param name="audio"></param>
        /// <param name="model"></param>
        /// <param name="customizationId"></param>
        /// <param name="continuous"></param>
        /// <param name="inactivityTimeout"></param>
        /// <param name="keywords"></param>
        /// <param name="keywordsThreshold"></param>
        /// <param name="maxAlternatives"></param>
        /// <param name="wordAlternativesThreshold"></param>
        /// <param name="wordConfidence"></param>
        /// <param name="timestamps"></param>
        /// <param name="profanityFilter"></param>
        /// <param name="smartFormatting"></param>
        /// <param name="speakerLabels"></param>
        /// <returns></returns>
        private SpeechRecognitionResults Recognize(string sessionId, string contentType, Metadata metaData, Stream audio,
                                                   string transferEncoding = "", string model = "", string customizationId = "",
                                                   bool? continuous = null, int? inactivityTimeout = null, string[] keywords = null,
                                                   double? keywordsThreshold = null, int? maxAlternatives = null,
                                                   double? wordAlternativesThreshold = null, bool? wordConfidence = null,
                                                   bool? timestamps = null, bool profanityFilter = false,
                                                   bool? smartFormatting = null, bool? speakerLabels = null)
        {
            if (string.IsNullOrEmpty(contentType))
            {
                throw new ArgumentNullException($"{nameof(contentType)}");
            }

            SpeechRecognitionResults result = null;

            try
            {
                string   urlService  = string.Empty;
                IRequest restRequest = null;

                IClient client;
                if (_tokenManager == null)
                {
                    client = this.Client.WithAuthentication(this.UserName, this.Password);
                }
                else
                {
                    client = this.Client.WithAuthentication(_tokenManager.GetToken());
                }

                if (string.IsNullOrEmpty(sessionId))
                {
                    restRequest = client.PostAsync($"{this.Endpoint}/v1/recognize");
                }
                else
                {
                    restRequest = client.PostAsync($"{this.Endpoint}/v1/sessions/{sessionId}")
                                  .WithHeader("Cookie", sessionId);
                }

                if (!string.IsNullOrEmpty(transferEncoding))
                {
                    restRequest.WithHeader("Transfer-Encoding", transferEncoding);
                }

                if (metaData == null)
                {
                    // if a session exists, the model should not be sent
                    if (string.IsNullOrEmpty(sessionId))
                    {
                        restRequest.WithArgument("model", model);
                    }

                    if (!string.IsNullOrEmpty(customizationId))
                    {
                        restRequest.WithArgument("customization_id", customizationId);
                    }

                    if (continuous.HasValue)
                    {
                        restRequest.WithArgument("continuous", continuous.Value);
                    }

                    if (inactivityTimeout.HasValue && inactivityTimeout.Value > 0)
                    {
                        restRequest.WithArgument("inactivity_timeout", inactivityTimeout.Value);
                    }

                    if (keywords != null && keywords.Length > 0)
                    {
                        restRequest.WithArgument("keywords", keywords);
                    }

                    if (keywordsThreshold.HasValue && keywordsThreshold.Value > 0)
                    {
                        restRequest.WithArgument("keywords_threshold", keywordsThreshold.Value);
                    }

                    if (maxAlternatives.HasValue && maxAlternatives.Value > 0)
                    {
                        restRequest.WithArgument("max_alternatives", maxAlternatives.Value);
                    }

                    if (wordAlternativesThreshold.HasValue && wordAlternativesThreshold.Value > 0)
                    {
                        restRequest.WithArgument("word_alternatives_threshold", wordAlternativesThreshold.Value);
                    }

                    if (wordConfidence.HasValue)
                    {
                        restRequest.WithArgument("word_confidence", wordConfidence.Value);
                    }

                    if (timestamps.HasValue)
                    {
                        restRequest.WithArgument("timestamps", timestamps.Value);
                    }

                    if (profanityFilter)
                    {
                        restRequest.WithArgument("profanity_filter", profanityFilter);
                    }

                    if (smartFormatting.HasValue)
                    {
                        restRequest.WithArgument("smart_formatting", smartFormatting.Value);
                    }

                    if (speakerLabels.HasValue)
                    {
                        restRequest.WithArgument("speaker_labels", speakerLabels.Value);
                    }

                    StreamContent bodyContent = new StreamContent(audio);
                    bodyContent.Headers.Add("Content-Type", contentType);

                    restRequest.WithBodyContent(bodyContent);
                }
                else
                {
                    var json = JsonConvert.SerializeObject(metaData);

                    StringContent metadata = new StringContent(json);
                    metadata.Headers.ContentType = MediaTypeHeaderValue.Parse(HttpMediaType.APPLICATION_JSON);

                    var audioContent = new ByteArrayContent((audio as Stream).ReadAllBytes());
                    audioContent.Headers.ContentType = MediaTypeHeaderValue.Parse(contentType);

                    MultipartFormDataContent formData = new MultipartFormDataContent();

                    // if a session exists, the model should not be sent
                    if (string.IsNullOrEmpty(sessionId))
                    {
                        restRequest.WithArgument("model", model);
                    }

                    formData.Add(metadata, "metadata");
                    formData.Add(audioContent, "upload");

                    restRequest.WithBodyContent(formData);
                }

                result = restRequest.As <SpeechRecognitionResults>()
                         .Result;
            }
            catch (AggregateException ae)
            {
                throw ae.InnerException as ServiceResponseException;
            }

            return(result);
        }