Example 1
        /// <summary>
        /// Returns speech-to-text results for the selected Opus audio file streamed from a blob container in Azure Storage.
        /// </summary>
        /// <param name="opusBlob">Name of the Opus file.</param>
        /// <param name="container">Azure blob container name.</param>
        /// <returns>A list of <see cref="Speech"/> results.</returns>
        public async Task<List<Speech>> RunRecognitionAsync(string opusBlob, string container)
        {
            SpeechResult = new List<Speech>();

            var blobService = new BlobService();
            var blobClient  = await blobService.GetBlobFromContainerAsync(opusBlob, container);

            using var audioInputStream = AudioInputStream.CreatePushStream();
            using var audioConfig      = AudioConfig.FromStreamInput(audioInputStream);
            using (var recognizer = new SpeechRecognizer(_speechConfig, _languagesToDetect, audioConfig))
            {
                recognizer.Recognizing    += Recognizing;
                recognizer.Recognized     += Recognized;
                recognizer.SessionStarted += SessionStarted;
                recognizer.SessionStopped += SessionStopped;
                recognizer.Canceled       += SessionCanceled;

                await InjectStreamIntoRecognizerAsync(audioInputStream, blobClient);

                await recognizer.StartContinuousRecognitionAsync();

                Task.WaitAny(new[] { _stopRecognition.Task });
                await recognizer.StopContinuousRecognitionAsync();
            }

            return SpeechResult;
        }
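
The helper InjectStreamIntoRecognizerAsync is referenced above but not shown. A minimal sketch, assuming GetBlobFromContainerAsync returns an Azure.Storage.Blobs BlobClient, might pump the blob's bytes into the push stream and then close it so the recognizer sees end-of-stream:

        // Hypothetical helper: copies the blob contents into the push stream and
        // closes it so the recognizer sees end-of-stream.
        private static async Task InjectStreamIntoRecognizerAsync(PushAudioInputStream audioInputStream, BlobClient blobClient)
        {
            using var blobStream = await blobClient.OpenReadAsync();
            var buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = await blobStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
            {
                audioInputStream.Write(buffer, bytesRead);
            }
            audioInputStream.Close();
        }
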
        public async Task<string> DetectLanguage(byte[] audioBytes, string fileExtension, string locale1, string locale2)
        {
            var wavBytes = ConvertToWaveBytes(audioBytes, fileExtension);

            var autoDetectSourceLanguageConfig = AutoDetectSourceLanguageConfig.FromLanguages(new string[] { locale1, locale2 });

            var config          = SpeechConfig.FromSubscription(SubscriptionKey, SubscriptionRegion);
            var stopRecognition = new TaskCompletionSource<int>();
            var detected        = new List<string>();

            using var pushStream = AudioInputStream.CreatePushStream();
            using (var audioInput = AudioConfig.FromStreamInput(pushStream))
            {
                using var recognizer = new SpeechRecognizer(
                          config,
                          autoDetectSourceLanguageConfig,
                          audioInput);
                pushStream.Write(wavBytes);
                pushStream.Close();

                recognizer.Recognized += (s, e) =>
                {
                    var autoDetectSourceLanguageResult = AutoDetectSourceLanguageResult.FromResult(e.Result);
                    var detectedLanguage = autoDetectSourceLanguageResult.Language;
                    detected.Add(detectedLanguage);
                    if (detected.Count > UtteranceCount)
                    {
                        stopRecognition.TrySetResult(0);
                    }
                };

                recognizer.SessionStopped += (s, e) =>
                {
                    stopRecognition.TrySetResult(0);
                };

                await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                var t = Task.Factory.StartNew(async () => { await SetTimeOutForRecognition(stopRecognition).ConfigureAwait(false); }, CancellationToken.None, TaskCreationOptions.None, TaskScheduler.Default);

                Task.WaitAny(new[] { stopRecognition.Task });

                await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
            }

            if (detected.Count == 0)
            {
                throw new TimeoutException("Did not get any language identification results back in time.");
            }

            var detectedByCount = detected.GroupBy(i => i);
            var mostFreq        = detectedByCount.OrderBy(t => t.Count()).LastOrDefault().Key;

            if (string.IsNullOrEmpty(mostFreq) || (!mostFreq.Equals(locale1, StringComparison.OrdinalIgnoreCase) && !mostFreq.Equals(locale2, StringComparison.OrdinalIgnoreCase)))
            {
                return locale1;
            }

            return mostFreq;
        }
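
SetTimeOutForRecognition is referenced above but not defined in this snippet (ConvertToWaveBytes is likewise assumed to decode the input bytes to PCM WAV). A plausible sketch, assuming a fixed timeout, simply completes the TaskCompletionSource after a delay so the WaitAny call cannot block forever:

        // Hypothetical timeout helper: unblocks the wait if no result arrives in time.
        private static async Task SetTimeOutForRecognition(TaskCompletionSource<int> stopRecognition)
        {
            await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false); // assumed timeout
            stopRecognition.TrySetResult(0);
        }
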
Example 3
        //private const string speechEndpoint = "https://YOUR_LOCATION.api.cognitive.microsoft.com/";

        //public async Task<IActionResult> OnGetAsync()
        //{
        //    return Page();
        //}

        public async Task<IActionResult> OnPostAsync()
        {
            var speechConfig = SpeechConfig.FromSubscription(speechKey, speechLocation);

            speechConfig.SpeechRecognitionLanguage = "ja-JP";

            byte[] readBytes;

            using var audioInputStream = AudioInputStream.CreatePushStream();
            using var reader           = new BinaryReader(VoiceFile.OpenReadStream());
            do
            {
                readBytes = reader.ReadBytes(1024);
                audioInputStream.Write(readBytes, readBytes.Length);
            } while (readBytes.Length > 0);
            audioInputStream.Close(); // signal end-of-stream so recognition can complete

            var audioConfig = AudioConfig.FromStreamInput(audioInputStream);

            using var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);
            var result = await speechRecognizer.RecognizeOnceAsync();

            if (result.Reason == ResultReason.RecognizedSpeech)
            {
                Result         = "結果:";
                RecognizedText = result.Text;
            }

            return Page();
        }
        /// <summary>
        /// Remote audio transcription of the given audioFile with Cognitive Services.
        /// </summary>
        public static AnalysisResult TranscribeAudio(ref AnalysisResult audioResponse, IFormFile audioFile)
        {
            // needed for speaker diarization to resolve at the word level
            SPEECH_CONFIG.RequestWordLevelTimestamps();

            var audioFormat128 = AudioStreamFormat.GetWaveFormatPCM(8000, 16, 1);
            var audioFormat256 = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);

            // load bytestream -> audio stream
            // load audio config from audio stream
            // initialize speech recognizer
            using (var br = new BinaryReader(audioFile.OpenReadStream()))
                using (var audioInputStream = AudioInputStream.CreatePushStream(audioFormat128))
                    using (var audioConfig = AudioConfig.FromStreamInput(audioInputStream))
                        using (var recognizer = new SpeechRecognizer(SPEECH_CONFIG, audioConfig))
                        {
                            long nbytes = audioFile.Length;
                            var  buff   = new List<byte>();

                            // read through bytes of audio
                            byte[] readBytes;
                            do
                            {
                                readBytes = br.ReadBytes(1024);
                                buff.AddRange(readBytes);
                                audioInputStream.Write(readBytes, readBytes.Length);
                            } while (readBytes.Length > 0);

                            // Close the stream so the recognizer sees end-of-audio.
                            audioInputStream.Close();

                            var transcript = ExecuteRecognizer(recognizer).Result;
                            audioResponse.Transcript = transcript;
                            return audioResponse;
                        }
        }
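
ExecuteRecognizer is not shown in this example. A minimal sketch, assuming it runs continuous recognition until the session stops and concatenates the final results (System.Text.StringBuilder assumed imported), could look like this:

        // Hypothetical recognizer loop: collects final results until the session ends.
        private static async Task<string> ExecuteRecognizer(SpeechRecognizer recognizer)
        {
            var transcript = new StringBuilder();
            var stopped    = new TaskCompletionSource<int>();

            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    transcript.Append(e.Result.Text).Append(' ');
                }
            };
            recognizer.SessionStopped += (s, e) => stopped.TrySetResult(0);

            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            await stopped.Task.ConfigureAwait(false);
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

            return transcript.ToString().Trim();
        }
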
        private void SetupTranscriptionAndTranslationService()
        {
            try
            {
                var lCognitiveKey    = _settings.AzureCognitiveKey;
                var lCognitiveRegion = _settings.AzureCognitiveRegion;

                _eventPublisher.Publish("MySTT Setup", $"Got region: {lCognitiveRegion}, key starting from: {lCognitiveKey??lCognitiveKey.Substring(0, lCognitiveKey.Length /2)}");

                this.mTransSpeechConfig = SpeechTranslationConfig.FromSubscription(lCognitiveKey, lCognitiveRegion);
                var fromLanguage = "en-US";
                var toLanguages  = new List<string> { "el-GR" };
                //var toLanguages = new List<string> { "ru-RU" };
                this.mTransSpeechConfig.SpeechRecognitionLanguage = fromLanguage;
                toLanguages.ForEach(this.mTransSpeechConfig.AddTargetLanguage);
                this.mInputStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(SAMPLESPERSECOND, BITSPERSAMPLE, NUMBEROFCHANNELS));

                this.mAudioConfig           = AudioConfig.FromStreamInput(this.mInputStream);
                this.mTranslationRecognizer = new TranslationRecognizer(this.mTransSpeechConfig, this.mAudioConfig);

                this.mTranslationRecognizer.Recognizing       += this.MSpeechRecognizer_Recognizing;
                this.mTranslationRecognizer.Recognized        += this.MSpeechRecognizer_Recognized;
                this.mTranslationRecognizer.SpeechEndDetected += this.MSpeechRecognizer_SpeechEndDetected;

                this.StartRecognisionIfNeeded();
            }
            catch (Exception ex)
            {
                _eventPublisher.Publish("MySTT Setup - Failed", $"Failed to initialize: {ex.Message}");
            }
        }
Example 6
        private async Task<PushAudioInputStream> CreatePushStreamAsync(Stream stream)
        {
            var read = 0;
            var recognitionStream = AudioInputStream.CreatePushStream();
            var buffer            = ArrayPool<byte>.Shared.Rent(80000);

            var sumRead = 0;

            try
            {
                while ((read = await stream.ReadAsync(buffer, 0, buffer.Length)) != 0)
                {
                    sumRead += read;
                    recognitionStream.Write(buffer, read);
                }
                recognitionStream.Close();
                if (sumRead == 0)
                {
                    return null;
                }
                return recognitionStream;
            }
            finally
            {
                ArrayPool<byte>.Shared.Return(buffer);
            }
        }
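
A short usage sketch (the file name and speechConfig are assumptions): a null return means the source stream was empty, so the caller should skip recognition.

        // Hypothetical caller, inside an async method: builds a recognizer from an arbitrary Stream.
        using var fileStream = File.OpenRead("audio.wav"); // assumed input file
        var pushStream = await CreatePushStreamAsync(fileStream);
        if (pushStream != null)
        {
            using var audioConfig = AudioConfig.FromStreamInput(pushStream);
            using var recognizer  = new SpeechRecognizer(speechConfig, audioConfig); // speechConfig assumed
            var result = await recognizer.RecognizeOnceAsync();
        }
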
Example 7
        /// <summary>
        /// Initializes a new instance of the <see cref="ContinuousSpeechRecognizer"/> class.
        /// </summary>
        /// <param name="pipeline">The pipeline in which to create the component.</param>
        /// <param name="subscriptionKey">The subscription key for the Azure speech resource.</param>
        /// <param name="region">The service region of the Azure speech resource.</param>
        public ContinuousSpeechRecognizer(Pipeline pipeline, string subscriptionKey, string region)
            : base(pipeline)
        {
            var config = SpeechConfig.FromSubscription(subscriptionKey, region);

            this.pushStream = AudioInputStream.CreatePushStream();
            this.audioInput = AudioConfig.FromStreamInput(this.pushStream);
            this.recognizer = new SpeechRecognizer(config, this.audioInput);
        }
        /// <summary>
        /// Constructs an <see cref="AudioConfig"/> from <see cref="Config"/>.
        /// Depending on the available services, this may either use the audio features built into the Speech SDK (such as <see cref="AudioConfig.FromDefaultMicrophoneInput"/>),
        /// or it may construct a <see cref="IStreamAudioSource"/> that accesses the requested <see cref="AudioDevice"/> with resampling and noise gates as required.
        /// </summary>
        /// <returns>An <see cref="AudioConfig"/> for the configured audio source, or null if no usable input is available.</returns>
        protected AudioConfig GetAudioConfig()
        {
            var streamSource = GetStreamAudioSource(Config.AudioSource);

            if (streamSource != null)
            {
                //use this stream source and convert to an Azure audio stream
                try
                {
                    var azureInput = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(
                                                                           (uint)streamSource.Format.SampleRate,
                                                                           (byte)streamSource.Format.BitsPerSample,
                                                                           (byte)streamSource.Format.ChannelCount));

                    byte[] bufferOptional = null;
                    streamSource.DataAvailable += (s, e) =>
                    {
                        azureInput.Write(e.Buffer.GetArray(ref bufferOptional), e.Buffer.Count);
                    };
                    streamSource.Stopped += (s, e) =>
                    {
                        if (e.Cause == StreamAudioSourceStoppedCause.Stopped)
                        {
                            //signal end-of-stream to Azure
                            azureInput.Close();
                        }
                    };

                    this.StreamAudioSource = streamSource;
                    return AudioConfig.FromStreamInput(azureInput);
                }
                catch (Exception ex)
                {
                    Logger.LogError(ex, $"Error while creating an Azure AudioConfig from an IStreamAudioSource. Format: SampleRate={streamSource.Format.SampleRate}, BitsPerSample={streamSource.Format.BitsPerSample}, Channels={streamSource.Format.ChannelCount}");
                    streamSource.Dispose();
                }
            }

            this.StreamAudioSource    = null;
            this.StreamAudioNoiseGate = null;

            //try and use the built-in audio engine
            if (Config.AudioSource is AudioDevice audioDevice)
            {
                if (audioDevice.UseDefaultAudioInputDevice)
                {
                    return AudioConfig.FromDefaultMicrophoneInput();
                }
            }

            return null;
        }
Example 9
    void Start()
    {
        if (outputText == null)
        {
            UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
        }
        else if (recoButton == null)
        {
            message = "recoButton property is null! Assign a UI Button to it.";
            UnityEngine.Debug.LogError(message);
        }
        else
        {
            // Continue with normal initialization, Text and Button objects are present.
#if PLATFORM_ANDROID
            // Request to use the microphone, cf.
            // https://docs.unity3d.com/Manual/android-RequestingPermissions.html
            message = "Waiting for mic permission";
            if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
            {
                Permission.RequestUserPermission(Permission.Microphone);
            }
#elif PLATFORM_IOS
            if (!Application.HasUserAuthorization(UserAuthorization.Microphone))
            {
                Application.RequestUserAuthorization(UserAuthorization.Microphone);
            }
#else
            micPermissionGranted = true;
            message = "Click button to recognize speech";
#endif

            grabacionCompleta = new StringBuilder(200);

            config = SpeechConfig.FromSubscription("b899f4a3bc2b4b30b3e690476b1af952", "westus");
            config.SpeechRecognitionLanguage = "es-ES";
            pushStream              = AudioInputStream.CreatePushStream();
            audioInput              = AudioConfig.FromStreamInput(pushStream);
            recognizer              = new SpeechRecognizer(config, audioInput);
            recognizer.Recognizing += RecognizingHandler;
            recognizer.Recognized  += RecognizedHandler;
            recognizer.Canceled    += CanceledHandler;

            recoButton.onClick.AddListener(ButtonClick);
            foreach (var device in Microphone.devices)
            {
                Debug.Log("DeviceName: " + device);
            }
            audioSource = GameObject.Find("MyAudioSource").GetComponent<AudioSource>();
        }
    }
Example 10
 void ConfigureSpeechRecognizer()
 {
     _speechConfig = SpeechConfig.FromSubscription(SubscriptionKey, SubscriptionRegion);
     _speechConfig.SpeechRecognitionLanguage = "es-US";
     _speechConfig.OutputFormat = OutputFormat.Detailed;
     _pushStream                    = AudioInputStream.CreatePushStream();
     _audioInput                    = AudioConfig.FromStreamInput(_pushStream);
     _speechRecognizer              = new SpeechRecognizer(_speechConfig, _audioInput);
     _speechRecognizer.Recognizing += SpeechRecognizingHandler;
     _speechRecognizer.Recognized  += SpeechRecognizedHandler;
     _speechRecognizer.Canceled    += SpeechCanceledHandler;
     _audioSource                   = GameObject.Find("AudioSource").GetComponent<AudioSource>();
     _audioSource.loop              = false;
     _audioSource.playOnAwake       = false;
 }
Example 11
    void Start()
    {
        if (outputText == null)
        {
            UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it.");
        }
        else if (recoButton == null)
        {
            _message = "recoButton property is null! Assign a UI Button to it.";
            UnityEngine.Debug.LogError(_message);
        }
        else
        {
            // Continue with normal initialization, Text and Button objects are present.
#if PLATFORM_ANDROID
            // Request to use the microphone, cf.
            // https://docs.unity3d.com/Manual/android-RequestingPermissions.html
            message = "Waiting for mic permission";
            if (!Permission.HasUserAuthorizedPermission(Permission.Microphone))
            {
                Permission.RequestUserPermission(Permission.Microphone);
            }
#elif PLATFORM_IOS
            if (!Application.HasUserAuthorization(UserAuthorization.Microphone))
            {
                Application.RequestUserAuthorization(UserAuthorization.Microphone);
            }
#else
            _micPermissionGranted = true;
            _message = "Click button to recognize speech";
#endif
            _config = SpeechTranslationConfig.FromSubscription(SubscriptionKey, SubscriptionRegion);
            _config.SpeechRecognitionLanguage = "es-US";
            _config.AddTargetLanguage("en-US");
            _pushStream              = AudioInputStream.CreatePushStream();
            _audioInput              = AudioConfig.FromStreamInput(_pushStream);
            _recognizer              = new TranslationRecognizer(_config, _audioInput);
            _recognizer.Recognizing += RecognizingHandler;
            _recognizer.Recognized  += RecognizedHandler;
            _recognizer.Canceled    += CanceledHandler;

            foreach (var device in Microphone.devices)
            {
                Debug.Log("DeviceName: " + device);
            }
            _audioSource = GameObject.Find("AudioSource").GetComponent<AudioSource>();
        }
    }
Example 12
    public async UniTask STTBytes(byte[] readBytes, int sampleRate, int bitRate, int channels)
    {
        var speechConfig = SpeechConfig.FromSubscription(subscription_key, region);

        speechConfig.SpeechRecognitionLanguage = location;
        var audioStreamFormat = AudioStreamFormat.GetWaveFormatPCM((uint)sampleRate, (byte)bitRate, (byte)channels);
        using var audioInputStream = AudioInputStream.CreatePushStream(audioStreamFormat);
        using var audioConfig      = AudioConfig.FromStreamInput(audioInputStream);
        using var recognizer       = new SpeechRecognizer(speechConfig, audioConfig);

        audioInputStream.Write(readBytes, readBytes.Length);
        audioInputStream.Close(); // signal end-of-audio so RecognizeOnceAsync can complete

        var result = await recognizer.RecognizeOnceAsync();

        Debug.Log($"Recognized Line : = {result.Text}");
    }
        public async Task Start()
        {
            var config = SpeechConfig.FromSubscription(_projectSettings.AzureSpeechServiceSubscriptionKey, _projectSettings.AzureSpeechServiceRegionName);

            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(8000, 16, 1);

            _inputStream = AudioInputStream.CreatePushStream(audioFormat);
            _audioInput  = AudioConfig.FromStreamInput(_inputStream);

            _recognizer = new SpeechRecognizer(config, _audioInput);
            _recognizer.SessionStarted += RecognizerStarted;
            _recognizer.Recognized     += RecognizerRecognized;
            _recognizer.Canceled       += RecognizerCancelled;

            await _recognizer.StartContinuousRecognitionAsync();
        }
Example 14
        private void Init(string from, string to)
        {
            this.toLanguage = to;

            Profile       = MediaEncodingProfile.CreateWav(AudioEncodingQuality.Low);
            Profile.Audio = AudioEncodingProperties.CreatePcm(16000, 1, 16);

            byte channels         = 1;
            byte bitsPerSample    = 16;
            uint samplesPerSecond = 16000; // or 8000
            var  audioFormat      = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels);

            // Init Push Stream

            pushStream = AudioInputStream.CreatePushStream(audioFormat);

            if (from == to)
            {
                var config = SpeechConfig.FromSubscription(apiKey, region);
                config.SpeechRecognitionLanguage = from;

                speechRecognizer = new SpeechRecognizer(config, AudioConfig.FromStreamInput(pushStream));

                speechRecognizer.Recognizing += RecognisingSpeechHandler;
                speechRecognizer.Recognized  += RecognisingSpeechHandler;

                speechRecognizer.SessionStarted += (sender, args) => this.RecognisionStarted?.Invoke();
                speechRecognizer.SessionStopped += (sender, args) => this.RecognisionStopped?.Invoke();
            }
            else
            {
                var config = SpeechTranslationConfig.FromSubscription(apiKey, region);
                config.SpeechRecognitionLanguage = from;
                config.AddTargetLanguage(to);

                translationRecognizer = new TranslationRecognizer(config, AudioConfig.FromStreamInput(pushStream));

                translationRecognizer.SessionStarted += (sender, args) => this.RecognisionStarted?.Invoke();
                translationRecognizer.SessionStopped += (sender, args) => this.RecognisionStopped?.Invoke();

                translationRecognizer.Recognizing += RecognisingTranslationHandler;
                translationRecognizer.Recognized  += RecognisingTranslationHandler;
            }
        }
Example 15
        async static Task FromStream(SpeechConfig speechConfig)
        {
            using var reader = new BinaryReader(File.OpenRead(DEMO_FILE));

            Console.WriteLine(reader.ToString());
            using var audioInputStream = AudioInputStream.CreatePushStream();
            using var audioConfig      = AudioConfig.FromStreamInput(audioInputStream);
            using var recognizer       = new SpeechRecognizer(speechConfig, audioConfig);

            byte[] readBytes;
            do
            {
                readBytes = reader.ReadBytes(1024);
                audioInputStream.Write(readBytes, readBytes.Length);
            } while (readBytes.Length > 0);
            audioInputStream.Close(); // signal end-of-stream so recognition can complete

            var result = await recognizer.RecognizeOnceAsync();

            Console.WriteLine($"RECOGNIZED: Text={result.Text}");
        }
Example 16
        public AzureSpeechRecognizer(string key, string region, WaveStream stream)
        {
            var speechConfig = SpeechConfig.FromSubscription(key, region);

            this.stream     = NormalizeStream(stream);
            this.pushStream = AudioInputStream.CreatePushStream();
            this.recognizer = new SpeechRecognizer(speechConfig, AudioConfig.FromStreamInput(this.pushStream));
            this.resultId   = Guid.NewGuid().ToString();
            this.lockObj    = new object();

            this.recognizer.Recognized += (snd, evt) =>
            {
                string id = null;
                lock (this.lockObj)
                {
                    id            = this.resultId;
                    this.resultId = Guid.NewGuid().ToString();
                }

                if (!string.IsNullOrWhiteSpace(evt.Result.Text))
                {
                    this.SpeechRecognized?.Invoke(this, new RecognitionEventArgs(evt, id));
                }
            };

            this.recognizer.Recognizing += (snd, evt) =>
            {
                string id = null;
                lock (this.lockObj)
                {
                    id = this.resultId;
                }

                this.SpeechPredicted?.Invoke(this, new RecognitionEventArgs(evt, id));
            };

            this.recognizer.Canceled += (snd, evt) =>
            {
                Debug.WriteLine("lost recognizer");
            };
        }
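
NormalizeStream is referenced in the constructor but not shown. A minimal sketch using NAudio (which also supplies WaveStream and WasapiCapture in these examples), assuming the goal is the 16 kHz, 16-bit, mono format the default push stream expects:

        // Hypothetical normalization: resample to 16 kHz, 16-bit, mono PCM.
        // (WaveFormatConversionStream is ACM-based and Windows-only.)
        private static WaveStream NormalizeStream(WaveStream input)
        {
            var targetFormat = new WaveFormat(16000, 16, 1);
            return new WaveFormatConversionStream(targetFormat, input);
        }
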
Example 17
        /// <summary>Speech to text: recognize from an in-memory stream.</summary>
        public static async Task<string> RecognizeFromStreamAsync(string inputFileName)
        {
            var config = SpeechConfig.FromSubscription(subscriptionKey, region);

            using var reader = new BinaryReader(File.OpenRead(inputFileName));

            using var audioInputStream = AudioInputStream.CreatePushStream();
            using var audioConfig      = AudioConfig.FromStreamInput(audioInputStream);
            using var recognizer       = new SpeechRecognizer(config, audioConfig);

            byte[] readBytes;
            do
            {
                readBytes = reader.ReadBytes(1024);
                audioInputStream.Write(readBytes, readBytes.Length);
            } while (readBytes.Length > 0);
            audioInputStream.Close(); // signal end-of-stream so recognition can complete

            var result = await recognizer.RecognizeOnceAsync();

            return result.Text;
        }
        /// <summary>
        /// Creates a recognizer with the baseline model and the selected language:
        /// creates a config with the subscription key and selected region;
        /// if the input source is an audio file, creates the recognizer with the audio file, otherwise with the default mic;
        /// waits on RunRecognition.
        /// </summary>
        private async Task CreateRecognizer(byte[] channel)
        {
            // TODO: support users specifying a different region.
            var config = SpeechConfig.FromSubscription(this.SubscriptionKey, this.Region);

            config.SpeechRecognitionLanguage = this.RecognitionLanguage;
            config.OutputFormat = OutputFormat.Detailed;

            SpeechRecognizer basicRecognizer;

            PushAudioInputStream pushStream = AudioInputStream.CreatePushStream();

            pushStream.Write(channel);
            pushStream.Close();
            using (var audioInput = AudioConfig.FromStreamInput(pushStream))
            {
                using (basicRecognizer = new SpeechRecognizer(config, audioInput))
                {
                    await this.RunRecognizer(basicRecognizer, stopBaseRecognitionTaskCompletionSource).ConfigureAwait(false);
                }
            }
        }
Example 19
        public async Task<string> AudioToTextAsync(byte[] pcm)
        {
            var guid = Guid.NewGuid();

            if (!Text.ContainsKey(guid))
            {
                Text[guid] = null;
            }

            // Build out the speech recognizer
            using (var pushStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetDefaultInputFormat()))
                using (var audioInput = AudioConfig.FromStreamInput(pushStream))
                    using (var recognizer = new SpeechRecognizer(SpeechConfig, audioInput))
                    {
                        // Subscribe to speech recognizer events.
                        recognizer.SessionStarted += OnSpeechRecognitionSessionStarted;
                        recognizer.Recognizing    += OnSpeechRecognizing;
                        recognizer.Recognized     += (s, e) => OnSpeechRecognized(s, e, guid);
                        recognizer.Canceled       += OnSpeechCanceled;
                        recognizer.SessionStopped += OnSpeechRecognitionSessionStopped;

                        // Start continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
                        await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                        // Send the pcm data to the speech recognizer
                        pushStream.Write(pcm);
                        pushStream.Close();

                        // Wait for completion.
                        // Use Task.WaitAny to keep the task rooted.
                        Task.WaitAny(StopRecognition.Task);

                        // Stop recognition.
                        await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

                        return Text[guid];
                    }
        }
Example 20
 private PushAudioInputStream CreateAudioInputStream()
 {
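     // With SamplesPerMillisecond = 16, for example, this creates a 16 kHz, 16-bit, mono PCM format.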
     return AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM((uint)SamplesPerMillisecond * 1000, 16, 1));
 }
Example 21
        public static async Task RecognitionWithPushAudioStreamAsync()
        {
            // Creates an instance of a speech config with specified subscription key and service region.
            // Replace with your own subscription key and service region (e.g., "westus").
            var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

            var stopRecognition = new TaskCompletionSource<int>();

            // Create a push stream
            using (var pushStream = AudioInputStream.CreatePushStream())
            {
                using (var audioInput = AudioConfig.FromStreamInput(pushStream))
                {
                    // Creates a speech recognizer using audio stream input.
                    using (var recognizer = new SpeechRecognizer(config, audioInput))
                    {
                        // Subscribes to events.
                        recognizer.Recognizing += (s, e) =>
                        {
                            Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}");
                        };

                        recognizer.Recognized += (s, e) =>
                        {
                            if (e.Result.Reason == ResultReason.RecognizedSpeech)
                            {
                                Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                            }
                            else if (e.Result.Reason == ResultReason.NoMatch)
                            {
                                Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                            }
                        };

                        recognizer.Canceled += (s, e) =>
                        {
                            Console.WriteLine($"CANCELED: Reason={e.Reason}");

                            if (e.Reason == CancellationReason.Error)
                            {
                                Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                                Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                                Console.WriteLine($"CANCELED: Did you update the subscription info?");
                            }

                            stopRecognition.TrySetResult(0);
                        };

                        recognizer.SessionStarted += (s, e) =>
                        {
                            Console.WriteLine("\nSession started event.");
                        };

                        recognizer.SessionStopped += (s, e) =>
                        {
                            Console.WriteLine("\nSession stopped event.");
                            Console.WriteLine("\nStop recognition.");
                            stopRecognition.TrySetResult(0);
                        };

                        // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
                        await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                        // open and read the wave file and push the buffers into the recognizer
                        using (BinaryAudioStreamReader reader = Helper.CreateWavReader(@"whatstheweatherlike.wav"))
                        {
                            byte[] buffer = new byte[1000];
                            while (true)
                            {
                                var readSamples = reader.Read(buffer, (uint)buffer.Length);
                                if (readSamples == 0)
                                {
                                    break;
                                }
                                pushStream.Write(buffer, readSamples);
                            }
                        }
                        pushStream.Close();

                        // Waits for completion.
                        // Use Task.WaitAny to keep the task rooted.
                        Task.WaitAny(new[] { stopRecognition.Task });

                        // Stops recognition.
                        await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
                    }
                }
            }
        }
Example 22
        public static async Task RecognitionWithPushAudioStreamAsync()
        {
            var capture = new WasapiCapture();
            // Creates an instance of a speech config with specified subscription key and service region.
            // Replace with your own subscription key and service region (e.g., "westus").            
            var config = SpeechConfig.FromSubscription("your key", "your region");

            var stopRecognition = new TaskCompletionSource<int>();

            // Create a push stream
            using (var pushStream = AudioInputStream.CreatePushStream())
            {
                using (var audioInput = AudioConfig.FromStreamInput(pushStream))
                {
                    // Creates a speech recognizer using audio stream input.
                    using (var recognizer = new SpeechRecognizer(config, audioInput))
                    {
                        Console.WriteLine("Say something...");

                        // Subscribes to events.
                        recognizer.Recognizing += (s, e) =>
                        {
                            Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}");
                        };

                        recognizer.Recognized += (s, e) =>
                        {
                            if (e.Result.Reason == ResultReason.RecognizedSpeech)
                            {
                                Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                                stopRecognition.TrySetResult(0);
                            }
                            else if (e.Result.Reason == ResultReason.NoMatch)
                            {
                                Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                            }
                        };

                        recognizer.Canceled += (s, e) =>
                        {
                            Console.WriteLine($"CANCELED: Reason={e.Reason}");

                            if (e.Reason == CancellationReason.Error)
                            {
                                Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                                Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                                Console.WriteLine($"CANCELED: Did you update the subscription info?");
                            }

                            stopRecognition.TrySetResult(0);
                        };

                        recognizer.SessionStarted += (s, e) =>
                        {
                            Console.WriteLine("\nSession started event.");
                        };

                        recognizer.SessionStopped += (s, e) =>
                        {
                            Console.WriteLine("\nSession stopped event.");
                            Console.WriteLine("\nStop recognition.");
                            stopRecognition.TrySetResult(0);
                        };

                        capture.DataAvailable += (s, e) =>
                        {
                            if (e.BytesRecorded != 0)
                            {
                                var floatArray = new float[e.BytesRecorded / 4];
                                Buffer.BlockCopy(e.Buffer, 0, floatArray, 0, e.BytesRecorded);

                                byte[] ba = ConvertFloatArrayToInt16ByteArray(floatArray);
                                pushStream.Write(ba); // try to push buffer here
                            }
                        };

                        // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
                        await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                        capture.StartRecording();

                        // Waits for completion.
                        // Use Task.WaitAny to keep the task rooted.
                        Task.WaitAny(new[] { stopRecognition.Task });

                        // Stops recognition.
                        await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
                        capture.StopRecording();
                    }
                }
            }
        }
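
ConvertFloatArrayToInt16ByteArray is referenced above but not shown. A minimal sketch, assuming WasapiCapture delivers 32-bit IEEE float samples in [-1, 1], converts them to little-endian 16-bit PCM. Note that CreatePushStream() above uses the default input format (16 kHz, 16-bit, mono), so the capture format must match or be resampled first.

        // Hypothetical conversion: 32-bit float samples -> 16-bit PCM bytes.
        private static byte[] ConvertFloatArrayToInt16ByteArray(float[] source)
        {
            var result = new byte[source.Length * sizeof(short)];
            for (int i = 0; i < source.Length; i++)
            {
                // Clamp to [-1, 1] before scaling to avoid overflow.
                var clamped = Math.Max(-1f, Math.Min(1f, source[i]));
                var sample  = (short)(clamped * short.MaxValue);
                BitConverter.GetBytes(sample).CopyTo(result, i * sizeof(short));
            }
            return result;
        }
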
        public static async Task<List<JsonResult>> TranscribeAsync(byte[] audio, SpeechConfig speechConfig, ILogger logger)
        {
            audio        = audio ?? throw new ArgumentNullException(nameof(audio));
            speechConfig = speechConfig ?? throw new ArgumentNullException(nameof(speechConfig));

            var jsonResults = new List<JsonResult>();

            var chunkId = Encoding.ASCII.GetString(audio.Take(4).ToArray());

            // Verify that first 4 bytes are RIFF in ascii:
            // (see https://docs.fileformat.com/audio/wav/ for details)
            if (chunkId != "RIFF")
            {
                throw new InvalidOperationException($"Expected file header containing RIFF, received {chunkId} instead.");
            }

            // Get bytes for sampleRate, bitsPerSample, and channels:
            var sampleRate    = BitConverter.ToUInt32(audio.Skip(24).Take(4).ToArray());
            var bitsPerSample = audio[34];
            var channels      = audio[22];

            var audioStreamFormat = AudioStreamFormat.GetWaveFormatPCM(sampleRate, bitsPerSample, channels);

            var stopRecognition = new TaskCompletionSource<int>();

            using var pushStream = AudioInputStream.CreatePushStream(audioStreamFormat);
            using (var audioInput = AudioConfig.FromStreamInput(pushStream))
            {
                using var recognizer = new SpeechRecognizer(
                          speechConfig,
                          audioInput);

                using var connection = Connection.FromRecognizer(recognizer);

                pushStream.Write(audio);
                pushStream.Close();

                logger.LogInformation("Starting speech recognition.");
                recognizer.Recognized += (s, e) =>
                {
                    if (e.Result.Reason == ResultReason.RecognizedSpeech)
                    {
                        var stringResult       = e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult);
                        var deserializedResult = JsonConvert.DeserializeObject<JsonResult>(stringResult);
                        jsonResults.Add(deserializedResult);
                    }
                };

                recognizer.SessionStarted += (s, e) =>
                {
                    logger.LogInformation($"Starting session. Session id: {e.SessionId}.");
                };

                recognizer.SessionStopped += (s, e) =>
                {
                    logger.LogInformation("Session stopped");
                    stopRecognition.TrySetResult(0);
                };

                recognizer.Canceled += (s, e) =>
                {
                    var cancellation = CancellationDetails.FromResult(e.Result);
                    logger.LogError($"Recognition canceled: Reason={cancellation.Reason}");

                    if (cancellation.Reason == CancellationReason.Error)
                    {
                        logger.LogError($"ErrorCode={cancellation.ErrorCode}");
                        logger.LogError($"ErrorDetails={cancellation.ErrorDetails}");

                        if (cancellation.ErrorCode != CancellationErrorCode.NoError)
                        {
                            throw new RealtimeTranscriptionException(cancellation.ErrorCode, cancellation.ErrorDetails);
                        }
                    }

                    stopRecognition.TrySetResult(0);
                };

                await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                await Task.WhenAll(stopRecognition.Task).ConfigureAwait(false);

                await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

                logger.LogInformation("Recognition stopped.");
            }

            return jsonResults;
        }
Example 24
        /// <summary>
        /// Sets up the initial state needed for Direct Line Speech, including creation of the
        /// underlying DialogServiceConnector and wiring of its events.
        /// </summary>
        /// <param name="keywordFile"> The keyword file to be loaded as part of initialization.</param>
        /// <returns> A task that completes once initialization is complete. </returns>
        public Task InitializeAsync(StorageFile keywordFile)
        {
            Contract.Requires(keywordFile != null);

            var configRefreshRequired = this.TryRefreshConfigValues();

            var refreshConnector = configRefreshRequired || (this.keywordFilePath != keywordFile.Path);

            if (LocalSettingsHelper.SetProperty != null)
            {
                this.enableKwsLogging = true;
            }

            if (this.enableKwsLogging)
            {
                refreshConnector      = true;
                this.enableKwsLogging = false;
            }

            if (refreshConnector)
            {
                var newConnectorConfiguration = this.CreateConfiguration();

                this.ConfirmationModel      = KeywordRecognitionModel.FromFile(keywordFile.Path);
                this.keywordFilePath        = keywordFile.Path;
                this.ConnectorConfiguration = newConnectorConfiguration;
                this.connectorInputStream   = AudioInputStream.CreatePushStream();

                this.connector?.Dispose();
                this.connector = new DialogServiceConnector(
                    this.ConnectorConfiguration,
                    AudioConfig.FromStreamInput(this.connectorInputStream));

                this.connector.SessionStarted += (s, e) => this.SessionStarted?.Invoke(e.SessionId);
                this.connector.SessionStopped += (s, e) => this.SessionStopped?.Invoke(e.SessionId);
                this.connector.Recognizing    += (s, e) =>
                {
                    switch (e.Result.Reason)
                    {
                    case ResultReason.RecognizingKeyword:
                        this.logger.Log(LogMessageLevel.SignalDetection, $"Local model recognized keyword \"{e.Result.Text}\"");
                        this.KeywordRecognizing?.Invoke(e.Result.Text);
                        this.secondStageConfirmed = true;
                        break;

                    case ResultReason.RecognizingSpeech:
                        this.logger.Log(LogMessageLevel.SignalDetection, $"Recognized speech in progress: \"{e.Result.Text}\"");
                        this.SpeechRecognizing?.Invoke(e.Result.Text);
                        break;

                    default:
                        throw new InvalidOperationException();
                    }
                };
                this.connector.Recognized += (s, e) =>
                {
                    KwsPerformanceLogger.KwsEventFireTime = TimeSpan.FromTicks(DateTime.Now.Ticks);
                    switch (e.Result.Reason)
                    {
                    case ResultReason.RecognizedKeyword:
                        var thirdStageStartTime = KwsPerformanceLogger.KwsStartTime.Ticks;
                        thirdStageStartTime = DateTime.Now.Ticks;
                        this.logger.Log(LogMessageLevel.SignalDetection, $"Cloud model recognized keyword \"{e.Result.Text}\"");
                        this.KeywordRecognized?.Invoke(e.Result.Text);
                        this.kwsPerformanceLogger.LogSignalReceived("SWKWS", "A", "3", KwsPerformanceLogger.KwsEventFireTime.Ticks, thirdStageStartTime, DateTime.Now.Ticks);
                        this.secondStageConfirmed = false;
                        break;

                    case ResultReason.RecognizedSpeech:
                        this.logger.Log(LogMessageLevel.SignalDetection, $"Recognized final speech: \"{e.Result.Text}\"");
                        this.SpeechRecognized?.Invoke(e.Result.Text);
                        break;

                    case ResultReason.NoMatch:
                        // If a KeywordRecognized handler is available, this is a final stage
                        // keyword verification rejection.
                        this.logger.Log(LogMessageLevel.SignalDetection, $"Cloud model rejected keyword");
                        if (this.secondStageConfirmed)
                        {
                            var thirdStageStartTimeRejected = KwsPerformanceLogger.KwsStartTime.Ticks;
                            thirdStageStartTimeRejected = DateTime.Now.Ticks;
                            this.kwsPerformanceLogger.LogSignalReceived("SWKWS", "R", "3", KwsPerformanceLogger.KwsEventFireTime.Ticks, thirdStageStartTimeRejected, DateTime.Now.Ticks);
                            this.secondStageConfirmed = false;
                        }

                        this.KeywordRecognized?.Invoke(null);
                        break;

                    default:
                        throw new InvalidOperationException();
                    }
                };
                this.connector.Canceled += (s, e) =>
                {
                    var code    = (int)e.ErrorCode;
                    var message = $"{e.Reason.ToString()}: {e.ErrorDetails}";
                    this.ErrorReceived?.Invoke(new DialogErrorInformation(code, message));
                };
                this.connector.ActivityReceived += (s, e) =>
                {
                    // Note: the contract of when to end a turn is unique to your dialog system. In this sample,
                    // it's assumed that receiving a message activity without audio marks the end of a turn. Your
                    // dialog system may have a different contract!
                    var wrapper = new ActivityWrapper(e.Activity);

                    if (wrapper.Type == ActivityWrapper.ActivityType.Event)
                    {
                        if (!this.startEventReceived)
                        {
                            this.startEventReceived = true;
                            return;
                        }
                        else
                        {
                            this.startEventReceived = false;
                        }
                    }

                    var payload = new DialogResponse(
                        messageBody: e.Activity,
                        messageMedia: e.HasAudio ? new DirectLineSpeechAudioOutputStream(e.Audio, LocalSettingsHelper.OutputFormat) : null,
                        shouldEndTurn: (e.Audio == null && wrapper.Type == ActivityWrapper.ActivityType.Message) || wrapper.Type == ActivityWrapper.ActivityType.Event,
                        shouldStartNewTurn: wrapper.InputHint == ActivityWrapper.InputHintType.ExpectingInput);

                    this.DialogResponseReceived?.Invoke(payload);
                };
            }

            return Task.FromResult(0);
        }
Example 25
        /// <summary>
        /// Sets up the initial state needed for Direct Line Speech, including creation of the
        /// underlying DialogServiceConnector and wiring of its events.
        /// </summary>
        /// <param name="keywordFile"> The keyword file to be loaded as part of initialization.</param>
        /// <returns> A task that completes once initialization is complete. </returns>
        public Task InitializeAsync(StorageFile keywordFile)
        {
            Contract.Requires(keywordFile != null);

            // Default values -- these can be updated
            this.ConnectorConfiguration = this.CreateConfiguration();
            this.ConfirmationModel      = KeywordRecognitionModel.FromFile(keywordFile.Path);

            this.connectorInputStream = AudioInputStream.CreatePushStream();
            this.connector            = new DialogServiceConnector(
                this.ConnectorConfiguration,
                AudioConfig.FromStreamInput(this.connectorInputStream));

            this.connector.SessionStarted += (s, e) => this.SessionStarted?.Invoke(e.SessionId);
            this.connector.SessionStopped += (s, e) => this.SessionStopped?.Invoke(e.SessionId);
            this.connector.Recognizing    += (s, e) =>
            {
                switch (e.Result.Reason)
                {
                case ResultReason.RecognizingKeyword:
                    this.logger.Log($"Local model recognized keyword \"{e.Result.Text}\"");
                    this.KeywordRecognizing?.Invoke(e.Result.Text);
                    break;

                case ResultReason.RecognizingSpeech:
                    this.logger.Log($"Recognized speech in progress: \"{e.Result.Text}\"");
                    this.SpeechRecognizing?.Invoke(e.Result.Text);
                    break;

                default:
                    throw new InvalidOperationException();
                }
            };
            this.connector.Recognized += (s, e) =>
            {
                switch (e.Result.Reason)
                {
                case ResultReason.RecognizedKeyword:
                    this.logger.Log($"Cloud model recognized keyword \"{e.Result.Text}\"");
                    this.KeywordRecognized?.Invoke(e.Result.Text);
                    break;

                case ResultReason.RecognizedSpeech:
                    this.logger.Log($"Recognized final speech: \"{e.Result.Text}\"");
                    this.SpeechRecognized?.Invoke(e.Result.Text);
                    break;

                case ResultReason.NoMatch:
                    // If a KeywordRecognized handler is available, this is a final stage
                    // keyword verification rejection.
                    this.logger.Log($"Cloud model rejected keyword");
                    this.KeywordRecognized?.Invoke(null);
                    break;

                default:
                    throw new InvalidOperationException();
                }
            };
            this.connector.Canceled += (s, e) =>
            {
                var code    = (int)e.ErrorCode;
                var message = $"{e.Reason.ToString()}: {e.ErrorDetails}";
                this.ErrorReceived?.Invoke(new DialogErrorInformation(code, message));
            };
            this.connector.ActivityReceived += (s, e) =>
            {
                // Note: the contract of when to end a turn is unique to your dialog system. In this sample,
                // it's assumed that receiving a message activity without audio marks the end of a turn. Your
                // dialog system may have a different contract!
                var wrapper = new ActivityWrapper(e.Activity);
                var payload = new DialogResponse(
                    messageBody: e.Activity,
                    messageMedia: e.HasAudio ? new DirectLineSpeechAudioOutputStream(e.Audio, LocalSettingsHelper.OutputFormat) : null,
                    shouldEndTurn: e.Audio == null && wrapper.Type == ActivityWrapper.ActivityType.Message,
                    shouldStartNewTurn: wrapper.InputHint == ActivityWrapper.InputHintType.ExpectingInput);
                this.logger.Log($"Connector activity received");
                this.DialogResponseReceived?.Invoke(payload);
            };

            return Task.FromResult(0);
        }
        // Translation using compressed file input.
        public static async Task TranslationWithFileCompressedInputAsync()
        {
            // <TranslationWithFileCompressedInputAsync>
            // Translation source language with compressed format.
            // Replace with a language of your choice.
            string fromLanguage = "en-US";

            // Creates an instance of a speech translation config with specified subscription key and service region.
            // Replace with your own subscription key and service region (e.g., "westus").
            var config = SpeechTranslationConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

            config.SpeechRecognitionLanguage = fromLanguage;

            // Translation target language(s).
            // Replace with language(s) of your choice.
            config.AddTargetLanguage("de");
            config.AddTargetLanguage("fr");

            var stopTranslation = new TaskCompletionSource<int>();

            // Creates a translation recognizer using file as audio input.
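            // NOTE: decoding compressed input such as MP3 generally requires GStreamer
            // to be available to the Speech SDK; see the SDK's compressed-audio docs.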
            using (var pushStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetCompressedFormat(AudioStreamContainerFormat.MP3)))
            {
                using (var audioInput = AudioConfig.FromStreamInput(pushStream))
                {
                    using (var recognizer = new TranslationRecognizer(config, audioInput))
                    {
                        // Subscribes to events.
                        recognizer.Recognizing += (s, e) =>
                        {
                            Console.WriteLine($"RECOGNIZING in '{fromLanguage}': Text={e.Result.Text}");
                            foreach (var element in e.Result.Translations)
                            {
                                Console.WriteLine($"    TRANSLATING into '{element.Key}': {element.Value}");
                            }
                        };

                        recognizer.Recognized += (s, e) =>
                        {
                            if (e.Result.Reason == ResultReason.TranslatedSpeech)
                            {
                                Console.WriteLine($"RECOGNIZED in '{fromLanguage}': Text={e.Result.Text}");
                                foreach (var element in e.Result.Translations)
                                {
                                    Console.WriteLine($"    TRANSLATED into '{element.Key}': {element.Value}");
                                }
                            }
                            else if (e.Result.Reason == ResultReason.RecognizedSpeech)
                            {
                                Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}");
                                Console.WriteLine($"    Speech not translated.");
                            }
                            else if (e.Result.Reason == ResultReason.NoMatch)
                            {
                                Console.WriteLine($"NOMATCH: Speech could not be recognized.");
                            }
                        };

                        recognizer.Canceled += (s, e) =>
                        {
                            Console.WriteLine($"CANCELED: Reason={e.Reason}");

                            if (e.Reason == CancellationReason.Error)
                            {
                                Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}");
                                Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}");
                                Console.WriteLine($"CANCELED: Did you update the subscription info?");
                            }

                            stopTranslation.TrySetResult(0);
                        };

                        recognizer.SpeechStartDetected += (s, e) =>
                        {
                            Console.WriteLine("\nSpeech start detected event.");
                        };

                        recognizer.SpeechEndDetected += (s, e) =>
                        {
                            Console.WriteLine("\nSpeech end detected event.");
                        };

                        recognizer.SessionStarted += (s, e) =>
                        {
                            Console.WriteLine("\nSession started event.");
                        };

                        recognizer.SessionStopped += (s, e) =>
                        {
                            Console.WriteLine("\nSession stopped event.");
                            Console.WriteLine($"\nStop translation.");
                            stopTranslation.TrySetResult(0);
                        };

                        // Starts continuous recognition. Use StopContinuousRecognitionAsync() to stop it.
                        Console.WriteLine("Start translation...");
                        await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                        // Replace with your own audio file name.
                        using (BinaryAudioStreamReader reader = Helper.CreateBinaryFileReader(@"whatstheweatherlike.mp3"))
                        {
                            byte[] buffer = new byte[1000];
                            while (true)
                            {
                                var readSamples = reader.Read(buffer, (uint)buffer.Length);
                                if (readSamples == 0)
                                {
                                    break;
                                }
                                pushStream.Write(buffer, readSamples);
                            }
                        }
                        pushStream.Close();
                        // Waits for completion.
                        // Use Task.WaitAny to keep the task rooted.
                        Task.WaitAny(new[] { stopTranslation.Task });

                        // Stops translation.
                        await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
                    }
                }
            }
            // </TranslationWithFileCompressedInputAsync>
        }
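For context, a minimal caller for the snippet above might look like the following. The method name TranslationWithFileCompressedInputAsync is only assumed from the closing snippet anchor, and the source language is illustrative.

        // Hypothetical caller for the sample above; the method name is assumed
        // from the </TranslationWithFileCompressedInputAsync> anchor.
        public static async Task Main()
        {
            // "en-US" is an illustrative recognition (source) language.
            await TranslationWithFileCompressedInputAsync("en-US");
        }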
        /// <summary>
        /// Initializes the connection to the Bot.
        /// </summary>
        /// <param name="settings">Application settings object, built from the input JSON file supplied as run-time argument.</param>
        public void InitConnector(AppSettings settings)
        {
            DialogServiceConfig config;

            this.BotReplyList = new List <BotReply>();
            this.stopWatch    = new Stopwatch();
            this.appsettings  = settings;

            if (!string.IsNullOrWhiteSpace(this.appsettings.CustomCommandsAppId))
            {
                // NOTE: Custom Commands is a preview Azure service.
                // Set the Custom Commands configuration object based on three items:
                // - The Custom Commands application ID.
                // - The Cognitive Services speech subscription key.
                // - The Azure region of the subscription key (e.g., "westus").
                config = CustomCommandsConfig.FromSubscription(this.appsettings.CustomCommandsAppId, this.appsettings.SpeechSubscriptionKey, this.appsettings.SpeechRegion);
            }
            else
            {
                // Set the Bot Framework configuration object based on two items:
                // - The Cognitive Services speech subscription key. It is needed for billing and is tied to the bot registration.
                // - The Azure region of the subscription key (e.g., "westus").
                config = BotFrameworkConfig.FromSubscription(this.appsettings.SpeechSubscriptionKey, this.appsettings.SpeechRegion);
            }

            if (this.appsettings.SpeechSDKLogEnabled)
            {
                // The Speech SDK can write verbose logs to a local file, which may be useful when reporting issues.
                config.SetProperty(PropertyId.Speech_LogFilename, $"{this.appsettings.OutputFolder}SpeechSDKLog-{DateTime.Now.ToString("yyyy-MM-dd-HH-mm-ss", CultureInfo.CurrentCulture)}.log");
            }

            if (!string.IsNullOrWhiteSpace(this.appsettings.SRLanguage))
            {
                // Set the speech recognition language. If not set, the default is "en-US".
                config.Language = this.appsettings.SRLanguage;
            }

            if (!string.IsNullOrWhiteSpace(this.appsettings.CustomSREndpointId))
            {
                // Set your Custom Speech endpoint ID here, as given to you by the speech portal https://speech.microsoft.com/portal.
                // Otherwise the standard speech endpoint will be used.
                config.SetServiceProperty("cid", this.appsettings.CustomSREndpointId, ServicePropertyChannel.UriQueryParameter);

                // Custom Speech does not currently support cloud keyword verification. Without this setting, the service
                // returns an error and the connection closes. Remove the line below once it is supported.
                config.SetProperty("KeywordConfig_EnableKeywordVerification", "false");
            }

            if (!string.IsNullOrWhiteSpace(this.appsettings.CustomVoiceDeploymentIds))
            {
                // Set one or more IDs associated with the custom TTS voice your bot will use.
                // The format of the string is one or more GUIDs separated by commas (no spaces). You get these GUIDs from
                // your custom TTS on the speech portal https://speech.microsoft.com/portal.
                config.SetProperty(PropertyId.Conversation_Custom_Voice_Deployment_Ids, this.appsettings.CustomVoiceDeploymentIds);
            }

            this.timeout = this.appsettings.Timeout;

            if (!string.IsNullOrWhiteSpace(this.appsettings.KeywordRecognitionModel))
            {
                this.kwsTable = KeywordRecognitionModel.FromFile(this.appsettings.KeywordRecognitionModel);
            }

            if (this.appsettings.SetPropertyId != null)
            {
                foreach (KeyValuePair <string, JToken> setPropertyIdPair in this.appsettings.SetPropertyId)
                {
                    config.SetProperty(setPropertyIdPair.Key, setPropertyIdPair.Value.ToString());
                }
            }

            if (this.appsettings.SetPropertyString != null)
            {
                foreach (KeyValuePair <string, JToken> setPropertyStringPair in this.appsettings.SetPropertyString)
                {
                    config.SetProperty(setPropertyStringPair.Key.ToString(CultureInfo.CurrentCulture), setPropertyStringPair.Value.ToString());
                }
            }

            if (this.appsettings.SetServiceProperty != null)
            {
                foreach (KeyValuePair <string, JToken> setServicePropertyPair in this.appsettings.SetServiceProperty)
                {
                    config.SetServiceProperty(setServicePropertyPair.Key.ToString(CultureInfo.CurrentCulture), setServicePropertyPair.Value.ToString(), ServicePropertyChannel.UriQueryParameter);
                }
            }

            if (this.appsettings.RealTimeAudio)
            {
                config.SetProperty("SPEECH-AudioThrottleAsPercentageOfRealTime", "100");
                config.SetProperty("SPEECH-TransmitLengthBeforeThrottleMs", "0");
            }

            if (this.connector != null)
            {
                // Dispose of the previous connector before creating a new one.
                this.connector.Dispose();
                this.connector = null;
            }

            this.pushAudioInputStream = AudioInputStream.CreatePushStream();
            this.connector            = new DialogServiceConnector(config, AudioConfig.FromStreamInput(this.pushAudioInputStream));

            if (this.appsettings.BotGreeting)
            {
                // Start the timer used to measure the latency of the bot greeting.
                this.stopWatch.Restart();
            }

            this.AttachHandlers();
        }
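AttachHandlers is not shown in this snippet. The sketch below is one plausible shape for it: the DialogServiceConnector event names are real Speech SDK members, but the handler bodies are illustrative assumptions, not the sample's actual code.

        // Assumed sketch of AttachHandlers: subscribe to DialogServiceConnector events.
        private void AttachHandlers()
        {
            this.connector.Recognizing += (s, e) =>
                Console.WriteLine($"RECOGNIZING: {e.Result.Text}");

            this.connector.Recognized += (s, e) =>
                Console.WriteLine($"RECOGNIZED: {e.Result.Text}");

            this.connector.ActivityReceived += (s, e) =>
            {
                // e.Activity is the bot's reply as a JSON string; the stopwatch started
                // above could be read here to measure bot-greeting latency.
                Console.WriteLine($"Activity received (HasAudio={e.HasAudio}): {e.Activity}");
            };

            this.connector.Canceled += (s, e) =>
                Console.WriteLine($"CANCELED: Reason={e.Reason}, Details={e.ErrorDetails}");
        }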
Example n. 28
        public async Task TranscribeConversationsAsync(IEnumerable <string> voiceSignatureStringUsers)
        {
            uint samplesPerSecond = 16000;
            byte bitsPerSample    = 16;
            byte channels         = 8; // 7 + 1 channels

            var config = SpeechConfig.FromSubscription(this.SubscriptionKey, this.Region);

            config.SetProperty("ConversationTranscriptionInRoomAndOnline", "true");
            var stopRecognition = new TaskCompletionSource <int>();

            using (var audioInput = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels)))
            {
                var meetingID = Guid.NewGuid().ToString();
                using (var conversation = await Conversation.CreateConversationAsync(config, meetingID))
                {
                        // Create a conversation transcriber using the audio stream input.
                    using (this.conversationTranscriber = new ConversationTranscriber(AudioConfig.FromStreamInput(audioInput)))
                    {
                        conversationTranscriber.Transcribing += (s, e) =>
                        {
                            this.SetText($"TRANSCRIBING: Text={e.Result.Text} SpeakerId={e.Result.UserId}");
                        };

                        conversationTranscriber.Transcribed += (s, e) =>
                        {
                            if (e.Result.Reason == ResultReason.RecognizedSpeech)
                            {
                                this.SetText($"TRANSCRIBED: Text={e.Result.Text} SpeakerId={e.Result.UserId}");
                            }
                            else if (e.Result.Reason == ResultReason.NoMatch)
                            {
                                this.SetText($"NOMATCH: Speech could not be recognized.");
                            }
                        };

                        conversationTranscriber.Canceled += (s, e) =>
                        {
                            this.SetText($"CANCELED: Reason={e.Reason}");

                            if (e.Reason == CancellationReason.Error)
                            {
                                this.SetText($"CANCELED: ErrorCode={e.ErrorCode}");
                                this.SetText($"CANCELED: ErrorDetails={e.ErrorDetails}");
                                this.SetText($"CANCELED: Did you update the subscription info?");
                                stopRecognition.TrySetResult(0);
                            }
                        };

                        conversationTranscriber.SessionStarted += (s, e) =>
                        {
                            this.SetText($"\nSession started event. SessionId={e.SessionId}");
                        };

                        conversationTranscriber.SessionStopped += (s, e) =>
                        {
                            this.SetText($"\nSession stopped event. SessionId={e.SessionId}");
                            this.SetText("\nStop recognition.");
                            stopRecognition.TrySetResult(0);
                        };

                        // Add participants to the conversation.
                        int i = 1;
                        foreach (var voiceSignatureStringUser in voiceSignatureStringUsers)
                        {
                            var speaker = Participant.From($"User{i++}", "en-US", voiceSignatureStringUser);
                            await conversation.AddParticipantAsync(speaker);
                        }

                        // Join the conversation and start transcribing.
                        await conversationTranscriber.JoinConversationAsync(conversation);

                        await conversationTranscriber.StartTranscribingAsync().ConfigureAwait(false);

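                        // Use a Microsoft Psi (Platform for Situated Intelligence) pipeline to
                        // capture live multi-channel audio, persist it to a local store, and
                        // forward each captured buffer into the Azure push stream.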
                        using (var p = Pipeline.Create())
                        {
                            var store   = PsiStore.Create(p, "Transcribe", @"D:\Temp");
                            var capture = new AudioCapture(p, WaveFormat.CreatePcm((int)samplesPerSecond, bitsPerSample, channels)).Write("Audio", store);
                            capture.Do(audio => audioInput.Write(audio.Data));
                            p.RunAsync();

                            // Wait for completion, then stop transcription.
                            await stopRecognition.Task;
                        }

                        await conversationTranscriber.StopTranscribingAsync().ConfigureAwait(false);
                    }
                }
            }
        }
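The voiceSignatureStringUsers argument carries one serialized voice-signature JSON string per expected speaker. A hedged sketch of a caller, assuming the signatures were generated earlier and saved to disk (the file names are placeholders):

        // Illustrative caller: load previously generated voice-signature JSON
        // strings (file names are assumptions) and start the transcription above.
        public async Task RunTranscriptionAsync()
        {
            var voiceSignatures = new[]
            {
                System.IO.File.ReadAllText("voiceSignatureUser1.json"),
                System.IO.File.ReadAllText("voiceSignatureUser2.json"),
            };

            await this.TranscribeConversationsAsync(voiceSignatures);
        }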
Example n. 29
        private static async Task <bool> MakeAudioConfigAsync(SpeechHandler handler)
        {
            // var audioConfig = AudioConfig.FromWavFileInput(@"D:\Users\ManabuTonosaki\OneDrive - tomarika\tono.wav");
            // var audioConfig = AudioConfig.FromDefaultMicrophoneInput();

            Debug.Assert(handler.Device != null);

            var wavein        = new WasapiLoopbackCapture(handler.Device);
            var waveoutFormat = new WaveFormat(16000, 16, 1);
            var lastSpeakDT   = DateTime.Now;
            var willStop      = DateTime.MaxValue;

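            // Note: this handler references handler.AudioInputStream, which is created
            // further below; that is safe because data only arrives after StartRecording()
            // is called, by which point the stream exists.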
            wavein.DataAvailable += (s, e) =>
            {
                if (e.BytesRecorded > 0)
                {
                    using var ms   = new MemoryStream(e.Buffer, 0, e.BytesRecorded);
                    using var rs   = new RawSourceWaveStream(ms, wavein.WaveFormat);
                    using var freq = new MediaFoundationResampler(rs, waveoutFormat.SampleRate);
                    var w16 = freq.ToSampleProvider().ToMono().ToWaveProvider16();
                    var len = w16.Read(handler.buf, 0, handler.buf.Length);
                    handler.AudioInputStream.Write(handler.buf, len);

                    lastSpeakDT = DateTime.Now;
                    willStop    = DateTime.MaxValue;
                }
                else
                {
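                    // While the device is silent, keep writing silence into the push
                    // stream so the recognizer's clock advances and pending results can
                    // be finalized; give up after roughly 10 seconds of silence.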
                    if (DateTime.Now < willStop)
                    {
                        if (willStop == DateTime.MaxValue)
                        {
                            willStop = DateTime.Now + TimeSpan.FromSeconds(10);
                        }
                        var silence = new SilenceProvider(waveoutFormat);
                        var len     = silence.Read(handler.buf, 0, waveoutFormat.BitsPerSample * waveoutFormat.SampleRate / 8 / 100); // bytes in 10 ms of audio
                        var cnt     = (int)((DateTime.Now - lastSpeakDT).TotalMilliseconds / 10);
                        for (var i = 0; i < cnt; i++)
                        {
                            handler.AudioInputStream.Write(handler.buf, len);
                        }
                        lastSpeakDT = DateTime.Now;
                    }
                }
            };

            var audioformat = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond: 16000, bitsPerSample: 16, channels: 1);

            handler.AudioInputStream = AudioInputStream.CreatePushStream(audioformat);
            handler.AudioConfig      = AudioConfig.FromStreamInput(handler.AudioInputStream);

            await Task.Delay(100);

            handler.StopRequested += (s, e) =>
            {
                wavein.StopRecording();
            };
            wavein.StartRecording();

            return(true);
        }
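Neither the recognizer nor the SpeechHandler type is shown here. As a rough sketch under those assumptions, the AudioConfig built above could be consumed like this (the subscription key and region are placeholders):

        // Assumed follow-up to the sample above: attach the loopback-backed
        // AudioConfig to a SpeechRecognizer for continuous recognition.
        private static async Task RecognizeFromHandlerAsync(SpeechHandler handler)
        {
            var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion");

            using var recognizer = new SpeechRecognizer(config, handler.AudioConfig);

            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine($"RECOGNIZED: {e.Result.Text}");
                }
            };

            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
        }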
Example n. 30
        private async Task <bool> MakeAudioConfigAsync(SpeechHandler handler)
        {
            Debug.Assert(handler.Device != null);

            // NAudio setup
            var wavein        = CreateCaptureInstance(handler.Device);
            var waveoutFormat = new WaveFormat(16000, 16, 1);

            wavein.StartRecording();

            // Azure Cognitive Services setup
            var audioformat = AudioStreamFormat.GetWaveFormatPCM((uint)waveoutFormat.SampleRate, (byte)waveoutFormat.BitsPerSample, (byte)waveoutFormat.Channels);

            handler.AudioInputStream = AudioInputStream.CreatePushStream(audioformat);
            handler.AudioConfig      = AudioConfig.FromStreamInput(handler.AudioInputStream);

            // Silence generation state
            DateTime preEvent    = DateTime.Now;
            var      silenceData = new byte[waveoutFormat.BlockAlign];

            // Application preparation
            Hot.SetWavFormat(DisplayName, waveoutFormat);   // for file saving

            // NAudio Voice event
            wavein.DataAvailable += (s, e) =>
            {
                if (e.BytesRecorded > 0)
                {
                    var now = DateTime.Now;
                    using (var ms = new MemoryStream())
                    {
                        var memoryWriter = new WaveFileWriter(ms, waveoutFormat);
                        ms.SetLength(0);    // Drop the WAV header that WaveFileWriter just wrote; only raw samples remain.

                        var samples = Resample(wavein.WaveFormat, e.Buffer, e.BytesRecorded, waveoutFormat);
                        foreach (var sample in samples)
                        {
                            memoryWriter.WriteSample(sample);
                        }
                        Hot.AddWavToAllQueue(DisplayName, ms.GetBuffer(), (int)ms.Length, now); // for file saving
                        handler.AudioInputStream.Write(ms.GetBuffer(), (int)ms.Length);         // for Azure Cognitive Speech to Text
                    }
                    try
                    {
                        Token.Add(TokenWavDataQueued, this);    // TODO: Token.Add can throw System.InvalidOperationException
                                                                // ("Collection was modified; enumeration operation may not execute.").
                                                                // Supposedly fixed in Tono.Gui.WinForm 1.1.2 but still occurs, hence the try-catch.
                    }
                    catch
                    {
                        // No action: the token above is a QoS 0 message, so the exception is deliberately swallowed here to suppress the error messages.
                    }
                    preEvent = DateTime.Now;
                }
                else
                {
                    if (_talkID != null)
                    {
                        var spms = (double)waveoutFormat.SampleRate / 1000; // samples per ms
                        var n    = (int)(spms * (DateTime.Now - preEvent).TotalMilliseconds);

                        for (var i = n; i >= 0; i--)
                        {
                            handler.AudioInputStream.Write(silenceData, silenceData.Length);    // Send silence to Azure so events keep firing in real time (otherwise the service waits for the next audio event, even during long pauses).
                        }
                    }
                    preEvent = DateTime.Now;
                }
            };

            handler.StopRequested += (s, e) =>
            {
                wavein.StopRecording();     // Stop NAudio recording
            };

            return(await Task.FromResult(true));
        }
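The Resample helper called from the DataAvailable handler is not part of the snippet. Below is a minimal sketch built from the same NAudio pieces used in the previous example; the signature is inferred from the call site, and resampling each buffer independently is good enough for a sketch, though a persistent resampler would avoid artifacts at buffer boundaries.

        // Assumed implementation of the Resample helper used above: convert one
        // captured buffer into mono float samples at the target sample rate.
        // ToMono() expects a mono or stereo source, which matches typical capture formats.
        private static List<float> Resample(WaveFormat inFormat, byte[] buffer, int bytesRecorded, WaveFormat outFormat)
        {
            using var ms        = new MemoryStream(buffer, 0, bytesRecorded);
            using var source    = new RawSourceWaveStream(ms, inFormat);
            using var resampler = new MediaFoundationResampler(source, outFormat.SampleRate);

            var mono    = resampler.ToSampleProvider().ToMono();
            var samples = new List<float>();
            var chunk   = new float[4096];
            int read;
            while ((read = mono.Read(chunk, 0, chunk.Length)) > 0)
            {
                for (var i = 0; i < read; i++)
                {
                    samples.Add(chunk[i]);
                }
            }

            return(samples);
        }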