/// <summary>
/// Runs speech-to-text on the selected Opus audio file streamed from a blob container in Azure Storage.
/// </summary>
/// <param name="opusBlob">Name of the Opus file.</param>
/// <param name="container">Azure blob container name.</param>
/// <returns>List of <see cref="Speech"/> objects containing the speech results.</returns>
public async Task<List<Speech>> RunRecognitionAsync(string opusBlob, string container)
{
    SpeechResult = new List<Speech>();
    var blobService = new BlobService();
    var blobClient = await blobService.GetBlobFromContainerAsync(opusBlob, container);

    using var audioInputStream = AudioInputStream.CreatePushStream();
    using var audioConfig = AudioConfig.FromStreamInput(audioInputStream);
    using (var recognizer = new SpeechRecognizer(_speechConfig, _languagesToDetect, audioConfig))
    {
        recognizer.Recognizing += Recognizing;
        recognizer.Recognized += Recognized;
        recognizer.SessionStarted += SessionStarted;
        recognizer.SessionStopped += SessionStopped;
        recognizer.Canceled += SessionCanceled;

        await InjectStreamIntoRecognizerAsync(audioInputStream, blobClient);

        await recognizer.StartContinuousRecognitionAsync();
        Task.WaitAny(new[] { _stopRecognition.Task });
        await recognizer.StopContinuousRecognitionAsync();
    }

    return SpeechResult;
}
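InjectStreamIntoRecognizerAsync is referenced above but not shown. A minimal sketch of what such a helper might look like, assuming blobClient is an Azure.Storage.Blobs BlobClient (the actual return type of GetBlobFromContainerAsync may differ):

private static async Task InjectStreamIntoRecognizerAsync(PushAudioInputStream audioInputStream, BlobClient blobClient)
{
    // Stream the blob contents and push them into the recognizer's audio stream.
    using var blobStream = await blobClient.OpenReadAsync();
    var buffer = new byte[4096];
    int bytesRead;
    while ((bytesRead = await blobStream.ReadAsync(buffer, 0, buffer.Length)) > 0)
    {
        audioInputStream.Write(buffer, bytesRead);
    }

    // Signal end-of-stream so the recognizer knows no more audio is coming.
    audioInputStream.Close();
}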
public async Task<string> DetectLanguage(byte[] audioBytes, string fileExtension, string locale1, string locale2)
{
    var wavBytes = ConvertToWaveBytes(audioBytes, fileExtension);
    var autoDetectSourceLanguageConfig = AutoDetectSourceLanguageConfig.FromLanguages(new string[] { locale1, locale2 });
    var config = SpeechConfig.FromSubscription(SubscriptionKey, SubscriptionRegion);
    var stopRecognition = new TaskCompletionSource<int>();
    var detected = new List<string>();

    using var pushStream = AudioInputStream.CreatePushStream();
    using (var audioInput = AudioConfig.FromStreamInput(pushStream))
    {
        using var recognizer = new SpeechRecognizer(config, autoDetectSourceLanguageConfig, audioInput);

        pushStream.Write(wavBytes);
        pushStream.Close();

        recognizer.Recognized += (s, e) =>
        {
            var autoDetectSourceLanguageResult = AutoDetectSourceLanguageResult.FromResult(e.Result);
            var detectedLanguage = autoDetectSourceLanguageResult.Language;
            detected.Add(detectedLanguage);
            if (detected.Count > UtteranceCount)
            {
                stopRecognition.TrySetResult(0);
            }
        };

        recognizer.SessionStopped += (s, e) =>
        {
            stopRecognition.TrySetResult(0);
        };

        await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

        var timeoutTask = Task.Factory.StartNew(
            async () => { await SetTimeOutForRecognition(stopRecognition).ConfigureAwait(false); },
            CancellationToken.None,
            TaskCreationOptions.None,
            TaskScheduler.Default);

        Task.WaitAny(new[] { stopRecognition.Task });

        // Continuous recognition was started above, so stop continuous recognition
        // (keyword recognition was never started).
        await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
    }

    if (detected.Count == 0)
    {
        throw new TimeoutException("Did not get any language identification results back in time.");
    }

    var mostFreq = detected
        .GroupBy(i => i)
        .OrderByDescending(g => g.Count())
        .First().Key;

    if (string.IsNullOrEmpty(mostFreq) ||
        (!mostFreq.Equals(locale1, StringComparison.OrdinalIgnoreCase) &&
         !mostFreq.Equals(locale2, StringComparison.OrdinalIgnoreCase)))
    {
        return locale1;
    }

    return mostFreq;
}
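SetTimeOutForRecognition and ConvertToWaveBytes are not shown above. A minimal sketch of the timeout helper, assuming a fixed timeout value (the real signature and duration may differ):

private static async Task SetTimeOutForRecognition(TaskCompletionSource<int> stopRecognition)
{
    // Give the service a bounded amount of time to return language identification results.
    await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false);

    // If recognition has not already been stopped, unblock the waiting caller.
    stopRecognition.TrySetResult(0);
}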
//private const string speechEndpoint = "https://YOUR_LOCATION.api.cognitive.microsoft.com/";

//public async Task<IActionResult> OnGetAsync()
//{
//    return Page();
//}

public async Task<IActionResult> OnPostAsync()
{
    var speechConfig = SpeechConfig.FromSubscription(speechKey, speechLocation);
    speechConfig.SpeechRecognitionLanguage = "ja-JP";

    byte[] readBytes;
    using var audioInputStream = AudioInputStream.CreatePushStream();
    using var reader = new BinaryReader(VoiceFile.OpenReadStream());
    do
    {
        readBytes = reader.ReadBytes(1024);
        audioInputStream.Write(readBytes, readBytes.Length);
    } while (readBytes.Length > 0);

    var audioConfig = AudioConfig.FromStreamInput(audioInputStream);
    using var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);
    var result = await speechRecognizer.RecognizeOnceAsync();
    if (result.Reason == ResultReason.RecognizedSpeech)
    {
        Result = "結果:"; // "Result:"
        RecognizedText = result.Text;
    }

    return Page();
}
/// <summary>
/// Remote audio transcription of the given audioFile with Cognitive Services.
/// </summary>
public static AnalysisResult TranscribeAudio(ref AnalysisResult audioResponse, IFormFile audioFile)
{
    // Needed for speaker diarization to resolve at the word level.
    SPEECH_CONFIG.RequestWordLevelTimestamps();

    var audioFormat128 = AudioStreamFormat.GetWaveFormatPCM(8000, 16, 1);
    var audioFormat256 = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);

    // Load byte stream -> audio stream,
    // load audio config from audio stream,
    // initialize speech recognizer.
    using (var br = new BinaryReader(audioFile.OpenReadStream()))
    using (var audioInputStream = AudioInputStream.CreatePushStream(audioFormat128))
    using (var audioConfig = AudioConfig.FromStreamInput(audioInputStream))
    using (var recognizer = new SpeechRecognizer(SPEECH_CONFIG, audioConfig))
    {
        long nbytes = audioFile.Length;
        var buff = new List<byte>();

        // Read through the bytes of audio and push them into the stream.
        byte[] readBytes;
        do
        {
            readBytes = br.ReadBytes(1024);
            buff.AddRange(readBytes);
            audioInputStream.Write(readBytes, readBytes.Length);
        } while (readBytes.Length > 0);

        var transcript = ExecuteRecognizer(recognizer).Result;
        audioResponse.Transcript = transcript;
        return audioResponse;
    }
}
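ExecuteRecognizer is referenced above but not shown. A minimal sketch of such a helper, assuming it accumulates final results from continuous recognition into a transcript (the names and exact behavior are assumptions):

private static async Task<string> ExecuteRecognizer(SpeechRecognizer recognizer)
{
    var transcript = new StringBuilder();
    var stopRecognition = new TaskCompletionSource<int>();

    // Append each final result to the transcript.
    recognizer.Recognized += (s, e) =>
    {
        if (e.Result.Reason == ResultReason.RecognizedSpeech)
        {
            transcript.Append(e.Result.Text).Append(' ');
        }
    };

    // Stop waiting once the session ends (end of pushed audio) or recognition is canceled.
    recognizer.SessionStopped += (s, e) => stopRecognition.TrySetResult(0);
    recognizer.Canceled += (s, e) => stopRecognition.TrySetResult(0);

    await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
    await stopRecognition.Task.ConfigureAwait(false);
    await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

    return transcript.ToString();
}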
private void SetupTranscriptionAndTranslationService()
{
    try
    {
        var lCognitiveKey = _settings.AzureCognitiveKey;
        var lCognitiveRegion = _settings.AzureCognitiveRegion;

        // Log only the first half of the key, never the full key.
        _eventPublisher.Publish("MySTT Setup", $"Got region: {lCognitiveRegion}, key starting from: {lCognitiveKey?.Substring(0, lCognitiveKey.Length / 2)}");

        this.mTransSpeechConfig = SpeechTranslationConfig.FromSubscription(lCognitiveKey, lCognitiveRegion);
        var fromLanguage = "en-US";
        var toLanguages = new List<string> { "el-GR" };
        //var toLanguages = new List<string> { "ru-RU" };
        this.mTransSpeechConfig.SpeechRecognitionLanguage = fromLanguage;
        toLanguages.ForEach(this.mTransSpeechConfig.AddTargetLanguage);

        this.mInputStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(SAMPLESPERSECOND, BITSPERSAMPLE, NUMBEROFCHANNELS));
        this.mAudioConfig = AudioConfig.FromStreamInput(this.mInputStream);

        this.mTranslationRecognizer = new TranslationRecognizer(this.mTransSpeechConfig, this.mAudioConfig);
        this.mTranslationRecognizer.Recognizing += this.MSpeechRecognizer_Recognizing;
        this.mTranslationRecognizer.Recognized += this.MSpeechRecognizer_Recognized;
        this.mTranslationRecognizer.SpeechEndDetected += this.MSpeechRecognizer_SpeechEndDetected;

        this.StartRecognisionIfNeeded();
    }
    catch (Exception ex)
    {
        _eventPublisher.Publish("MySTT Setup - Failed", $"Failed to initialize: {ex.Message}");
    }
}
private async Task<PushAudioInputStream> CreatePushStreamAsync(Stream stream)
{
    var read = 0;
    var recognitionStream = AudioInputStream.CreatePushStream();
    var buffer = ArrayPool<byte>.Shared.Rent(80000);
    var sumRead = 0;

    try
    {
        while ((read = await stream.ReadAsync(buffer, 0, buffer.Length)) != 0)
        {
            sumRead += read;
            recognitionStream.Write(buffer, read);
        }

        recognitionStream.Close();

        if (sumRead == 0)
        {
            return null;
        }

        return recognitionStream;
    }
    finally
    {
        ArrayPool<byte>.Shared.Return(buffer);
    }
}
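A usage sketch for CreatePushStreamAsync above, assuming the caller also has a SpeechConfig available (the method name and surrounding class are assumptions):

// Hypothetical caller: build a push stream from any System.IO.Stream and recognize once.
private async Task<string> RecognizeAsync(SpeechConfig speechConfig, Stream audioStream)
{
    var pushStream = await CreatePushStreamAsync(audioStream);
    if (pushStream == null)
    {
        return null; // the source stream was empty
    }

    using (pushStream)
    using (var audioConfig = AudioConfig.FromStreamInput(pushStream))
    using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
    {
        var result = await recognizer.RecognizeOnceAsync();
        return result.Text;
    }
}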
/// <summary>
/// Initializes a new instance of the <see cref="ContinuousSpeechRecognizer"/> class.
/// </summary>
/// <param name="pipeline">The pipeline in which to create the component.</param>
/// <param name="subscriptionKey">The subscription key for the Azure speech resource.</param>
/// <param name="region">The service region of the Azure speech resource.</param>
public ContinuousSpeechRecognizer(Pipeline pipeline, string subscriptionKey, string region)
    : base(pipeline)
{
    var config = SpeechConfig.FromSubscription(subscriptionKey, region);
    this.pushStream = AudioInputStream.CreatePushStream();
    this.audioInput = AudioConfig.FromStreamInput(this.pushStream);
    this.recognizer = new SpeechRecognizer(config, this.audioInput);
}
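The constructor above only wires up the recognizer; audio still has to be written into pushStream elsewhere in the component. A minimal sketch under the assumption that incoming audio arrives as raw 16 kHz, 16-bit mono PCM byte[] frames (the component's actual receiver type and lifecycle hooks may differ):

// Hypothetical helper: push one audio frame into the recognizer's stream.
private void OnAudioFrame(byte[] pcmFrame)
{
    this.pushStream.Write(pcmFrame, pcmFrame.Length);
}

// Hypothetical shutdown: closing the stream tells the service no more audio is coming.
private void OnPipelineStopping()
{
    this.pushStream.Close();
}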
/// <summary> /// Constructs an <see cref="AudioConfig"/> from <see cref="Config"/>. /// Depending on the available services, this may either use the audio features built into the Speech SDK (such as <see cref="AudioConfig.FromDefaultMicrophoneInput"/>), /// or it may construct a <see cref="IStreamAudioSource"/> that accesses the requested <see cref="AudioDevice"/> with resampling and noise gates as required. /// </summary> /// <returns></returns> protected AudioConfig GetAudioConfig() { var streamSource = GetStreamAudioSource(Config.AudioSource); if (streamSource != null) { //use this stream source and convert to an Azure audio stream try { var azureInput = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM( (uint)streamSource.Format.SampleRate, (byte)streamSource.Format.BitsPerSample, (byte)streamSource.Format.ChannelCount)); byte[] bufferOptional = null; streamSource.DataAvailable += (s, e) => { azureInput.Write(e.Buffer.GetArray(ref bufferOptional), e.Buffer.Count); }; streamSource.Stopped += (s, e) => { if (e.Cause == StreamAudioSourceStoppedCause.Stopped) { //signal end-of-stream to Azure azureInput.Close(); } }; this.StreamAudioSource = streamSource; return(AudioConfig.FromStreamInput(azureInput)); } catch (Exception ex) { Logger.LogError(ex, $"Error while creating an Azure AudioConfig from an IStreamAudioSource. Format: SampleRate={streamSource.Format.SampleRate}, BitsPerSample={streamSource.Format.BitsPerSample}, Channels={streamSource.Format.ChannelCount}"); streamSource.Dispose(); } } this.StreamAudioSource = null; this.StreamAudioNoiseGate = null; //try and use the built-in audio engine if (Config.AudioSource is AudioDevice audioDevice) { if (audioDevice.UseDefaultAudioInputDevice) { return(AudioConfig.FromDefaultMicrophoneInput()); } } return(null); }
void Start() { if (outputText == null) { UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it."); } else if (recoButton == null) { message = "recoButton property is null! Assign a UI Button to it."; UnityEngine.Debug.LogError(message); } else { // Continue with normal initialization, Text and Button objects are present. #if PLATFORM_ANDROID // Request to use the microphone, cf. // https://docs.unity3d.com/Manual/android-RequestingPermissions.html message = "Waiting for mic permission"; if (!Permission.HasUserAuthorizedPermission(Permission.Microphone)) { Permission.RequestUserPermission(Permission.Microphone); } #elif PLATFORM_IOS if (!Application.HasUserAuthorization(UserAuthorization.Microphone)) { Application.RequestUserAuthorization(UserAuthorization.Microphone); } #else micPermissionGranted = true; message = "Click button to recognize speech"; #endif grabacionCompleta = new StringBuilder(200); config = SpeechConfig.FromSubscription("b899f4a3bc2b4b30b3e690476b1af952", "westus"); config.SpeechRecognitionLanguage = "es-ES"; pushStream = AudioInputStream.CreatePushStream(); audioInput = AudioConfig.FromStreamInput(pushStream); recognizer = new SpeechRecognizer(config, audioInput); recognizer.Recognizing += RecognizingHandler; recognizer.Recognized += RecognizedHandler; recognizer.Canceled += CanceledHandler; recoButton.onClick.AddListener(ButtonClick); foreach (var device in Microphone.devices) { Debug.Log("DeviceName: " + device); } audioSource = GameObject.Find("MyAudioSource").GetComponent <AudioSource>(); } }
void ConfigureSpeechRecognizer()
{
    _speechConfig = SpeechConfig.FromSubscription(SubscriptionKey, SubscriptionRegion);
    _speechConfig.SpeechRecognitionLanguage = "es-US";
    _speechConfig.OutputFormat = OutputFormat.Detailed;

    _pushStream = AudioInputStream.CreatePushStream();
    _audioInput = AudioConfig.FromStreamInput(_pushStream);
    _speechRecognizer = new SpeechRecognizer(_speechConfig, _audioInput);

    _speechRecognizer.Recognizing += SpeechRecognizingHandler;
    _speechRecognizer.Recognized += SpeechRecognizedHandler;
    _speechRecognizer.Canceled += SpeechCanceledHandler;

    _audioSource = GameObject.Find("AudioSource").GetComponent<AudioSource>();
    _audioSource.loop = false;
    _audioSource.playOnAwake = false;
}
void Start() { if (outputText == null) { UnityEngine.Debug.LogError("outputText property is null! Assign a UI Text element to it."); } else if (recoButton == null) { _message = "recoButton property is null! Assign a UI Button to it."; UnityEngine.Debug.LogError(_message); } else { // Continue with normal initialization, Text and Button objects are present. #if PLATFORM_ANDROID // Request to use the microphone, cf. // https://docs.unity3d.com/Manual/android-RequestingPermissions.html message = "Waiting for mic permission"; if (!Permission.HasUserAuthorizedPermission(Permission.Microphone)) { Permission.RequestUserPermission(Permission.Microphone); } #elif PLATFORM_IOS if (!Application.HasUserAuthorization(UserAuthorization.Microphone)) { Application.RequestUserAuthorization(UserAuthorization.Microphone); } #else _micPermissionGranted = true; _message = "Click button to recognize speech"; #endif _config = SpeechTranslationConfig.FromSubscription(SubscriptionKey, SubscriptionRegion); _config.SpeechRecognitionLanguage = "es-US"; _config.AddTargetLanguage("en-US"); _pushStream = AudioInputStream.CreatePushStream(); _audioInput = AudioConfig.FromStreamInput(_pushStream); _recognizer = new TranslationRecognizer(_config, _audioInput); _recognizer.Recognizing += RecognizingHandler; _recognizer.Recognized += RecognizedHandler; _recognizer.Canceled += CanceledHandler; foreach (var device in Microphone.devices) { Debug.Log("DeviceName: " + device); } _audioSource = GameObject.Find("AudioSource").GetComponent <AudioSource>(); } }
public async UniTask STTBytes(byte[] readBytes, int sampleRate, int bitRate, int channels)
{
    var speechConfig = SpeechConfig.FromSubscription(subscription_key, region);
    speechConfig.SpeechRecognitionLanguage = location;

    var audioStreamFormat = AudioStreamFormat.GetWaveFormatPCM((uint)sampleRate, (byte)bitRate, (byte)channels);
    var audioInputStream = AudioInputStream.CreatePushStream(audioStreamFormat);
    var audioConfig = AudioConfig.FromStreamInput(audioInputStream);
    var recognizer = new SpeechRecognizer(speechConfig, audioConfig);

    audioInputStream.Write(readBytes, readBytes.Length);
    // Close the stream so the recognizer knows no more audio is coming;
    // otherwise RecognizeOnceAsync may keep waiting for further input.
    audioInputStream.Close();

    var result = await recognizer.RecognizeOnceAsync();
    Debug.Log($"Recognized Line : = {result.Text}");
}
public async Task Start()
{
    var config = SpeechConfig.FromSubscription(_projectSettings.AzureSpeechServiceSubscriptionKey, _projectSettings.AzureSpeechServiceRegionName);
    var audioFormat = AudioStreamFormat.GetWaveFormatPCM(8000, 16, 1);

    _inputStream = AudioInputStream.CreatePushStream(audioFormat);
    _audioInput = AudioConfig.FromStreamInput(_inputStream);
    _recognizer = new SpeechRecognizer(config, _audioInput);

    _recognizer.SessionStarted += RecognizerStarted;
    _recognizer.Recognized += RecognizerRecognized;
    _recognizer.Canceled += RecognizerCancelled;

    await _recognizer.StartContinuousRecognitionAsync();
}
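Once continuous recognition is running, the 8 kHz PCM audio (for example from a telephony stream) still has to be written into _inputStream. A minimal sketch of the write and stop paths; the method names are assumptions:

// Hypothetical: called for each incoming chunk of 8 kHz, 16-bit mono PCM audio.
public void WriteAudio(byte[] chunk)
{
    _inputStream.Write(chunk, chunk.Length);
}

// Hypothetical: flush the stream and stop recognition when the source ends.
public async Task Stop()
{
    _inputStream.Close();
    await _recognizer.StopContinuousRecognitionAsync();
}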
private void Init(string from, string to)
{
    this.toLanguage = to;

    Profile = MediaEncodingProfile.CreateWav(AudioEncodingQuality.Low);
    Profile.Audio = AudioEncodingProperties.CreatePcm(16000, 1, 16);

    byte channels = 1;
    byte bitsPerSample = 16;
    uint samplesPerSecond = 16000; // or 8000
    var audioFormat = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels);

    // Init push stream
    pushStream = AudioInputStream.CreatePushStream(audioFormat);

    if (from == to)
    {
        var config = SpeechConfig.FromSubscription(apiKey, region);
        config.SpeechRecognitionLanguage = from;

        speechRecognizer = new SpeechRecognizer(config, AudioConfig.FromStreamInput(pushStream));
        speechRecognizer.Recognizing += RecognisingSpeechHandler;
        speechRecognizer.Recognized += RecognisingSpeechHandler;
        speechRecognizer.SessionStarted += (sender, args) => this.RecognisionStarted?.Invoke();
        speechRecognizer.SessionStopped += (sender, args) => this.RecognisionStopped?.Invoke();
    }
    else
    {
        var config = SpeechTranslationConfig.FromSubscription(apiKey, region);
        config.SpeechRecognitionLanguage = from;
        config.AddTargetLanguage(to);

        translationRecognizer = new TranslationRecognizer(config, AudioConfig.FromStreamInput(pushStream));
        translationRecognizer.SessionStarted += (sender, args) => this.RecognisionStarted?.Invoke();
        translationRecognizer.SessionStopped += (sender, args) => this.RecognisionStopped?.Invoke();
        translationRecognizer.Recognizing += RecognisingTranslationHandler;
        translationRecognizer.Recognized += RecognisingTranslationHandler;
    }
}
static async Task FromStream(SpeechConfig speechConfig)
{
    using var reader = new BinaryReader(File.OpenRead(DEMO_FILE));
    Console.WriteLine(reader.ToString());

    using var audioInputStream = AudioInputStream.CreatePushStream();
    using var audioConfig = AudioConfig.FromStreamInput(audioInputStream);
    using var recognizer = new SpeechRecognizer(speechConfig, audioConfig);

    byte[] readBytes;
    do
    {
        readBytes = reader.ReadBytes(1024);
        audioInputStream.Write(readBytes, readBytes.Length);
    } while (readBytes.Length > 0);

    var result = await recognizer.RecognizeOnceAsync();
    Console.WriteLine($"RECOGNIZED: Text={result.Text}");
}
public AzureSpeechRecognizer(string key, string region, WaveStream stream)
{
    var speechConfig = SpeechConfig.FromSubscription(key, region);
    this.stream = NormalizeStream(stream);
    this.pushStream = AudioInputStream.CreatePushStream();
    this.recognizer = new SpeechRecognizer(speechConfig, AudioConfig.FromStreamInput(this.pushStream));
    this.resultId = Guid.NewGuid().ToString();
    this.lockObj = new object();

    this.recognizer.Recognized += (snd, evt) =>
    {
        string id = null;
        lock (this.lockObj)
        {
            id = this.resultId;
            this.resultId = Guid.NewGuid().ToString();
        }

        if (!string.IsNullOrWhiteSpace(evt.Result.Text))
        {
            this.SpeechRecognized?.Invoke(this, new RecognitionEventArgs(evt, id));
        }
    };

    this.recognizer.Recognizing += (snd, evt) =>
    {
        string id = null;
        lock (this.lockObj)
        {
            id = this.resultId;
        }

        this.SpeechPredicted?.Invoke(this, new RecognitionEventArgs(evt, id));
    };

    this.recognizer.Canceled += (snd, evt) =>
    {
        Debug.WriteLine("lost recognizer");
    };
}
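The constructor above never pushes the normalized WaveStream into pushStream, so presumably a start method does that elsewhere. A minimal sketch of such a method, assuming NAudio's WaveStream as the source; the method name and buffer size are assumptions:

// Hypothetical start method: pump the normalized wave stream into the push stream
// and run continuous recognition until the audio is exhausted.
public async Task StartAsync()
{
    await this.recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

    var buffer = new byte[4096];
    int read;
    while ((read = this.stream.Read(buffer, 0, buffer.Length)) > 0)
    {
        this.pushStream.Write(buffer, read);
    }

    // Signal end-of-stream, then stop recognition.
    this.pushStream.Close();
    await this.recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
}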
/// <summary>Speech to text: recognize from an in-memory stream.</summary>
public static async Task<string> RecognizeFromStreamAsync(string inputFileName)
{
    var config = SpeechConfig.FromSubscription(subscriptionKey, region);
    using var reader = new BinaryReader(File.OpenRead(inputFileName));

    using var audioInputStream = AudioInputStream.CreatePushStream();
    using var audioConfig = AudioConfig.FromStreamInput(audioInputStream);
    using var recognizer = new SpeechRecognizer(config, audioConfig);

    byte[] readBytes;
    do
    {
        readBytes = reader.ReadBytes(1024);
        audioInputStream.Write(readBytes, readBytes.Length);
    } while (readBytes.Length > 0);

    var result = await recognizer.RecognizeOnceAsync();
    return result.Text;
}
/// <summary>
/// Creates a recognizer with the baseline model and selected language:
/// creates a config with the subscription key and selected region;
/// if the input source is an audio file, creates the recognizer from the audio file, otherwise from the default mic;
/// waits on RunRecognition.
/// </summary>
private async Task CreateRecognizer(byte[] channel)
{
    // TODO: support specifying a different region.
    var config = SpeechConfig.FromSubscription(this.SubscriptionKey, this.Region);
    config.SpeechRecognitionLanguage = this.RecognitionLanguage;
    config.OutputFormat = OutputFormat.Detailed;

    SpeechRecognizer basicRecognizer;

    PushAudioInputStream pushStream = AudioInputStream.CreatePushStream();
    pushStream.Write(channel);
    pushStream.Close();

    using (var audioInput = AudioConfig.FromStreamInput(pushStream))
    {
        using (basicRecognizer = new SpeechRecognizer(config, audioInput))
        {
            await this.RunRecognizer(basicRecognizer, stopBaseRecognitionTaskCompletionSource).ConfigureAwait(false);
        }
    }
}
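RunRecognizer is not shown above. A minimal sketch, assuming it drives continuous recognition until the supplied completion source is signaled (the event wiring shown here is an assumption):

private async Task RunRecognizer(SpeechRecognizer recognizer, TaskCompletionSource<int> stopTaskSource)
{
    // Stop waiting once the session ends (the push stream above was already closed)
    // or recognition is canceled.
    recognizer.SessionStopped += (s, e) => stopTaskSource.TrySetResult(0);
    recognizer.Canceled += (s, e) => stopTaskSource.TrySetResult(0);

    await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
    await stopTaskSource.Task.ConfigureAwait(false);
    await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
}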
public async Task <string> AudioToTextAsync(byte[] pcm) { var guid = Guid.NewGuid(); if (!Text.ContainsKey(guid)) { Text[guid] = null; } // Build out the speech recognizer using (var pushStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetDefaultInputFormat())) using (var audioInput = AudioConfig.FromStreamInput(pushStream)) using (var recognizer = new SpeechRecognizer(SpeechConfig, audioInput)) { // Subscribe to speech recognizer events. recognizer.SessionStarted += OnSpeechRecognitionSessionStarted; recognizer.Recognizing += OnSpeechRecognizing; recognizer.Recognized += (s, e) => OnSpeechRecognized(s, e, guid); recognizer.Canceled += OnSpeechCanceled; recognizer.SessionStopped += OnSpeechRecognitionSessionStopped; // Start continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition. await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false); // Send the pcm data to the speech recognizer pushStream.Write(pcm); pushStream.Close(); // Wait for completion. // Use Task.WaitAny to keep the task rooted. Task.WaitAny(StopRecognition.Task); // Stop recognition. await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false); return(Text[guid]); } }
private PushAudioInputStream CreateAudioInputStream()
{
    return AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM((uint)SamplesPerMillisecond * 1000, 16, 1));
}
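A usage sketch for the helper above, assuming the caller has a SpeechConfig and a single 16-bit mono PCM buffer to transcribe (the method name and parameters are assumptions):

// Hypothetical usage of CreateAudioInputStream with a recognizer and a single PCM buffer.
private async Task<string> RecognizeBufferAsync(SpeechConfig speechConfig, byte[] pcmAudio)
{
    var pushStream = CreateAudioInputStream();
    using var audioConfig = AudioConfig.FromStreamInput(pushStream);
    using var recognizer = new SpeechRecognizer(speechConfig, audioConfig);

    // Push the audio and close the stream so the service knows the input is complete.
    pushStream.Write(pcmAudio, pcmAudio.Length);
    pushStream.Close();

    var result = await recognizer.RecognizeOnceAsync();
    return result.Text;
}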
public static async Task RecognitionWithPushAudioStreamAsync() { // Creates an instance of a speech config with specified subscription key and service region. // Replace with your own subscription key and service region (e.g., "westus"). var config = SpeechConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion"); var stopRecognition = new TaskCompletionSource <int>(); // Create a push stream using (var pushStream = AudioInputStream.CreatePushStream()) { using (var audioInput = AudioConfig.FromStreamInput(pushStream)) { // Creates a speech recognizer using audio stream input. using (var recognizer = new SpeechRecognizer(config, audioInput)) { // Subscribes to events. recognizer.Recognizing += (s, e) => { Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}"); }; recognizer.Recognized += (s, e) => { if (e.Result.Reason == ResultReason.RecognizedSpeech) { Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}"); } else if (e.Result.Reason == ResultReason.NoMatch) { Console.WriteLine($"NOMATCH: Speech could not be recognized."); } }; recognizer.Canceled += (s, e) => { Console.WriteLine($"CANCELED: Reason={e.Reason}"); if (e.Reason == CancellationReason.Error) { Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}"); Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}"); Console.WriteLine($"CANCELED: Did you update the subscription info?"); } stopRecognition.TrySetResult(0); }; recognizer.SessionStarted += (s, e) => { Console.WriteLine("\nSession started event."); }; recognizer.SessionStopped += (s, e) => { Console.WriteLine("\nSession stopped event."); Console.WriteLine("\nStop recognition."); stopRecognition.TrySetResult(0); }; // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition. await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false); // open and read the wave file and push the buffers into the recognizer using (BinaryAudioStreamReader reader = Helper.CreateWavReader(@"whatstheweatherlike.wav")) { byte[] buffer = new byte[1000]; while (true) { var readSamples = reader.Read(buffer, (uint)buffer.Length); if (readSamples == 0) { break; } pushStream.Write(buffer, readSamples); } } pushStream.Close(); // Waits for completion. // Use Task.WaitAny to keep the task rooted. Task.WaitAny(new[] { stopRecognition.Task }); // Stops recognition. await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false); } } } }
public static async Task RecognitionWithPushAudioStreamAsync() { var capture = new WasapiCapture(); // Creates an instance of a speech config with specified subscription key and service region. // Replace with your own subscription key and service region (e.g., "westus"). var config = SpeechConfig.FromSubscription("your key", "your region"); var stopRecognition = new TaskCompletionSource<int>(); // Create a push stream using (var pushStream = AudioInputStream.CreatePushStream()) { using (var audioInput = AudioConfig.FromStreamInput(pushStream)) { // Creates a speech recognizer using audio stream input. using (var recognizer = new SpeechRecognizer(config, audioInput)) { Console.WriteLine("Say something..."); // Subscribes to events. recognizer.Recognizing += (s, e) => { Console.WriteLine($"RECOGNIZING: Text={e.Result.Text}"); }; recognizer.Recognized += (s, e) => { if (e.Result.Reason == ResultReason.RecognizedSpeech) { Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}"); stopRecognition.TrySetResult(0); } else if (e.Result.Reason == ResultReason.NoMatch) { Console.WriteLine($"NOMATCH: Speech could not be recognized."); } }; recognizer.Canceled += (s, e) => { Console.WriteLine($"CANCELED: Reason={e.Reason}"); if (e.Reason == CancellationReason.Error) { Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}"); Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}"); Console.WriteLine($"CANCELED: Did you update the subscription info?"); } stopRecognition.TrySetResult(0); }; recognizer.SessionStarted += (s, e) => { Console.WriteLine("\nSession started event."); }; recognizer.SessionStopped += (s, e) => { Console.WriteLine("\nSession stopped event."); Console.WriteLine("\nStop recognition."); stopRecognition.TrySetResult(0); }; capture.DataAvailable += (s, e) => { if (e.BytesRecorded != 0) { var floatArray = new float[e.BytesRecorded / 4]; Buffer.BlockCopy(e.Buffer, 0, floatArray, 0, e.BytesRecorded); byte[] ba = ConvertFloatArrayToInt16ByteArray(floatArray); pushStream.Write(ba); // try to push buffer here } }; // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition. await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false); capture.StartRecording(); // Waits for completion. // Use Task.WaitAny to keep the task rooted. Task.WaitAny(new[] { stopRecognition.Task }); // Stops recognition. await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false); capture.StopRecording(); } } } }
public static async Task <List <JsonResult> > TranscribeAsync(byte[] audio, SpeechConfig speechConfig, ILogger logger) { audio = audio ?? throw new ArgumentNullException(nameof(audio)); speechConfig = speechConfig ?? throw new ArgumentNullException(nameof(speechConfig)); var jsonResults = new List <JsonResult>(); var chunkId = Encoding.ASCII.GetString(audio.Take(4).ToArray()); // Verify that first 4 bytes are RIFF in ascii: // (see https://docs.fileformat.com/audio/wav/ for details) if (chunkId != "RIFF") { throw new InvalidOperationException($"Expected file header containing RIFF, received {chunkId} instead."); } // Get bytes for sampleRate, bitsPerSample, and channels: var sampleRate = BitConverter.ToUInt32(audio.Skip(24).Take(4).ToArray()); var bitsPerSample = audio[34]; var channels = audio[22]; var audioStreamFormat = AudioStreamFormat.GetWaveFormatPCM(sampleRate, bitsPerSample, channels); var stopRecognition = new TaskCompletionSource <int>(); using var pushStream = AudioInputStream.CreatePushStream(audioStreamFormat); using (var audioInput = AudioConfig.FromStreamInput(pushStream)) { using var recognizer = new SpeechRecognizer( speechConfig, audioInput); using var connection = Connection.FromRecognizer(recognizer); pushStream.Write(audio); pushStream.Close(); logger.LogInformation("Starting speech recognition."); recognizer.Recognized += (s, e) => { if (e.Result.Reason == ResultReason.RecognizedSpeech) { var stringResult = e.Result.Properties.GetProperty(PropertyId.SpeechServiceResponse_JsonResult); var deserializedResult = JsonConvert.DeserializeObject <JsonResult>(stringResult); jsonResults.Add(deserializedResult); } }; recognizer.SessionStarted += (s, e) => { logger.LogInformation($"Starting session. Session id: {e.SessionId}."); }; recognizer.SessionStopped += (s, e) => { logger.LogInformation("Session stopped"); stopRecognition.TrySetResult(0); }; recognizer.Canceled += (s, e) => { var cancellation = CancellationDetails.FromResult(e.Result); logger.LogError($"Recognition canceled: Reason={cancellation.Reason}"); if (cancellation.Reason == CancellationReason.Error) { logger.LogError($"ErrorCode={cancellation.ErrorCode}"); logger.LogError($"ErrorDetails={cancellation.ErrorDetails}"); if (cancellation.ErrorCode != CancellationErrorCode.NoError) { throw new RealtimeTranscriptionException(cancellation.ErrorCode, cancellation.ErrorDetails); } } stopRecognition.TrySetResult(0); }; await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false); await Task.WhenAll(stopRecognition.Task).ConfigureAwait(false); await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false); logger.LogInformation("Recognition stopped."); } return(jsonResults); }
/// <summary> /// Sets up the initial state needed for Direct Line Speech, including creation of the /// underlying DialogServiceConnector and wiring of its events. /// </summary> /// <param name="keywordFile"> The keyword file to be loaded as part of initialization.</param> /// <returns> A task that completes once initialization is complete. </returns> public Task InitializeAsync(StorageFile keywordFile) { Contract.Requires(keywordFile != null); var configRefreshRequired = this.TryRefreshConfigValues(); var refreshConnector = configRefreshRequired || (this.keywordFilePath != keywordFile.Path); if (LocalSettingsHelper.SetProperty != null) { this.enableKwsLogging = true; } if (this.enableKwsLogging) { refreshConnector = true; this.enableKwsLogging = false; } if (refreshConnector) { var newConnectorConfiguration = this.CreateConfiguration(); this.ConfirmationModel = KeywordRecognitionModel.FromFile(keywordFile.Path); this.keywordFilePath = keywordFile.Path; this.ConnectorConfiguration = newConnectorConfiguration; this.connectorInputStream = AudioInputStream.CreatePushStream(); this.connector?.Dispose(); this.connector = new DialogServiceConnector( this.ConnectorConfiguration, AudioConfig.FromStreamInput(this.connectorInputStream)); this.connector.SessionStarted += (s, e) => this.SessionStarted?.Invoke(e.SessionId); this.connector.SessionStopped += (s, e) => this.SessionStopped?.Invoke(e.SessionId); this.connector.Recognizing += (s, e) => { switch (e.Result.Reason) { case ResultReason.RecognizingKeyword: this.logger.Log(LogMessageLevel.SignalDetection, $"Local model recognized keyword \"{e.Result.Text}\""); this.KeywordRecognizing?.Invoke(e.Result.Text); this.secondStageConfirmed = true; break; case ResultReason.RecognizingSpeech: this.logger.Log(LogMessageLevel.SignalDetection, $"Recognized speech in progress: \"{e.Result.Text}\""); this.SpeechRecognizing?.Invoke(e.Result.Text); break; default: throw new InvalidOperationException(); } }; this.connector.Recognized += (s, e) => { KwsPerformanceLogger.KwsEventFireTime = TimeSpan.FromTicks(DateTime.Now.Ticks); switch (e.Result.Reason) { case ResultReason.RecognizedKeyword: var thirdStageStartTime = KwsPerformanceLogger.KwsStartTime.Ticks; thirdStageStartTime = DateTime.Now.Ticks; this.logger.Log(LogMessageLevel.SignalDetection, $"Cloud model recognized keyword \"{e.Result.Text}\""); this.KeywordRecognized?.Invoke(e.Result.Text); this.kwsPerformanceLogger.LogSignalReceived("SWKWS", "A", "3", KwsPerformanceLogger.KwsEventFireTime.Ticks, thirdStageStartTime, DateTime.Now.Ticks); this.secondStageConfirmed = false; break; case ResultReason.RecognizedSpeech: this.logger.Log(LogMessageLevel.SignalDetection, $"Recognized final speech: \"{e.Result.Text}\""); this.SpeechRecognized?.Invoke(e.Result.Text); break; case ResultReason.NoMatch: // If a KeywordRecognized handler is available, this is a final stage // keyword verification rejection. 
this.logger.Log(LogMessageLevel.SignalDetection, $"Cloud model rejected keyword"); if (this.secondStageConfirmed) { var thirdStageStartTimeRejected = KwsPerformanceLogger.KwsStartTime.Ticks; thirdStageStartTimeRejected = DateTime.Now.Ticks; this.kwsPerformanceLogger.LogSignalReceived("SWKWS", "R", "3", KwsPerformanceLogger.KwsEventFireTime.Ticks, thirdStageStartTimeRejected, DateTime.Now.Ticks); this.secondStageConfirmed = false; } this.KeywordRecognized?.Invoke(null); break; default: throw new InvalidOperationException(); } }; this.connector.Canceled += (s, e) => { var code = (int)e.ErrorCode; var message = $"{e.Reason.ToString()}: {e.ErrorDetails}"; this.ErrorReceived?.Invoke(new DialogErrorInformation(code, message)); }; this.connector.ActivityReceived += (s, e) => { // Note: the contract of when to end a turn is unique to your dialog system. In this sample, // it's assumed that receiving a message activity without audio marks the end of a turn. Your // dialog system may have a different contract! var wrapper = new ActivityWrapper(e.Activity); if (wrapper.Type == ActivityWrapper.ActivityType.Event) { if (!this.startEventReceived) { this.startEventReceived = true; return; } else { this.startEventReceived = false; } } var payload = new DialogResponse( messageBody: e.Activity, messageMedia: e.HasAudio ? new DirectLineSpeechAudioOutputStream(e.Audio, LocalSettingsHelper.OutputFormat) : null, shouldEndTurn: (e.Audio == null && wrapper.Type == ActivityWrapper.ActivityType.Message) || wrapper.Type == ActivityWrapper.ActivityType.Event, shouldStartNewTurn: wrapper.InputHint == ActivityWrapper.InputHintType.ExpectingInput); this.DialogResponseReceived?.Invoke(payload); }; } return(Task.FromResult(0)); }
/// <summary> /// Sets up the initial state needed for Direct Line Speech, including creation of the /// underlying DialogServiceConnector and wiring of its events. /// </summary> /// <param name="keywordFile"> The keyword file to be loaded as part of initialization.</param> /// <returns> A task that completes once initialization is complete. </returns> public Task InitializeAsync(StorageFile keywordFile) { Contract.Requires(keywordFile != null); // Default values -- these can be updated this.ConnectorConfiguration = this.CreateConfiguration(); this.ConfirmationModel = KeywordRecognitionModel.FromFile(keywordFile.Path); this.connectorInputStream = AudioInputStream.CreatePushStream(); this.connector = new DialogServiceConnector( this.ConnectorConfiguration, AudioConfig.FromStreamInput(this.connectorInputStream)); this.connector.SessionStarted += (s, e) => this.SessionStarted?.Invoke(e.SessionId); this.connector.SessionStopped += (s, e) => this.SessionStopped?.Invoke(e.SessionId); this.connector.Recognizing += (s, e) => { switch (e.Result.Reason) { case ResultReason.RecognizingKeyword: this.logger.Log($"Local model recognized keyword \"{e.Result.Text}\""); this.KeywordRecognizing?.Invoke(e.Result.Text); break; case ResultReason.RecognizingSpeech: this.logger.Log($"Recognized speech in progress: \"{e.Result.Text}\""); this.SpeechRecognizing?.Invoke(e.Result.Text); break; default: throw new InvalidOperationException(); } }; this.connector.Recognized += (s, e) => { switch (e.Result.Reason) { case ResultReason.RecognizedKeyword: this.logger.Log($"Cloud model recognized keyword \"{e.Result.Text}\""); this.KeywordRecognized?.Invoke(e.Result.Text); break; case ResultReason.RecognizedSpeech: this.logger.Log($"Recognized final speech: \"{e.Result.Text}\""); this.SpeechRecognized?.Invoke(e.Result.Text); break; case ResultReason.NoMatch: // If a KeywordRecognized handler is available, this is a final stage // keyword verification rejection. this.logger.Log($"Cloud model rejected keyword"); this.KeywordRecognized?.Invoke(null); break; default: throw new InvalidOperationException(); } }; this.connector.Canceled += (s, e) => { var code = (int)e.ErrorCode; var message = $"{e.Reason.ToString()}: {e.ErrorDetails}"; this.ErrorReceived?.Invoke(new DialogErrorInformation(code, message)); }; this.connector.ActivityReceived += (s, e) => { // Note: the contract of when to end a turn is unique to your dialog system. In this sample, // it's assumed that receiving a message activity without audio marks the end of a turn. Your // dialog system may have a different contract! var wrapper = new ActivityWrapper(e.Activity); var payload = new DialogResponse( messageBody: e.Activity, messageMedia: e.HasAudio ? new DirectLineSpeechAudioOutputStream(e.Audio, LocalSettingsHelper.OutputFormat) : null, shouldEndTurn: e.Audio == null && wrapper.Type == ActivityWrapper.ActivityType.Message, shouldStartNewTurn: wrapper.InputHint == ActivityWrapper.InputHintType.ExpectingInput); this.logger.Log($"Connector activity received"); this.DialogResponseReceived?.Invoke(payload); }; return(Task.FromResult(0)); }
// Translation using compressed file input. public static async Task TranslationWithFileCompressedInputAsync() { // <TranslationWithFileCompressedInputAsync> // Translation source language with compressed format. // Replace with a language of your choice. string fromLanguage = "en-US"; // Creates an instance of a speech translation config with specified subscription key and service region. // Replace with your own subscription key and service region (e.g., "westus"). var config = SpeechTranslationConfig.FromSubscription("YourSubscriptionKey", "YourServiceRegion"); config.SpeechRecognitionLanguage = fromLanguage; // Translation target language(s). // Replace with language(s) of your choice. config.AddTargetLanguage("de"); config.AddTargetLanguage("fr"); var stopTranslation = new TaskCompletionSource <int>(); // Creates a translation recognizer using file as audio input. using (var pushStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetCompressedFormat(AudioStreamContainerFormat.MP3))) { using (var audioInput = AudioConfig.FromStreamInput(pushStream)) { using (var recognizer = new TranslationRecognizer(config, audioInput)) { // Subscribes to events. recognizer.Recognizing += (s, e) => { Console.WriteLine($"RECOGNIZING in '{fromLanguage}': Text={e.Result.Text}"); foreach (var element in e.Result.Translations) { Console.WriteLine($" TRANSLATING into '{element.Key}': {element.Value}"); } }; recognizer.Recognized += (s, e) => { if (e.Result.Reason == ResultReason.TranslatedSpeech) { Console.WriteLine($"RECOGNIZED in '{fromLanguage}': Text={e.Result.Text}"); foreach (var element in e.Result.Translations) { Console.WriteLine($" TRANSLATED into '{element.Key}': {element.Value}"); } } else if (e.Result.Reason == ResultReason.RecognizedSpeech) { Console.WriteLine($"RECOGNIZED: Text={e.Result.Text}"); Console.WriteLine($" Speech not translated."); } else if (e.Result.Reason == ResultReason.NoMatch) { Console.WriteLine($"NOMATCH: Speech could not be recognized."); } }; recognizer.Canceled += (s, e) => { Console.WriteLine($"CANCELED: Reason={e.Reason}"); if (e.Reason == CancellationReason.Error) { Console.WriteLine($"CANCELED: ErrorCode={e.ErrorCode}"); Console.WriteLine($"CANCELED: ErrorDetails={e.ErrorDetails}"); Console.WriteLine($"CANCELED: Did you update the subscription info?"); } stopTranslation.TrySetResult(0); }; recognizer.SpeechStartDetected += (s, e) => { Console.WriteLine("\nSpeech start detected event."); }; recognizer.SpeechEndDetected += (s, e) => { Console.WriteLine("\nSpeech end detected event."); }; recognizer.SessionStarted += (s, e) => { Console.WriteLine("\nSession started event."); }; recognizer.SessionStopped += (s, e) => { Console.WriteLine("\nSession stopped event."); Console.WriteLine($"\nStop translation."); stopTranslation.TrySetResult(0); }; // Starts continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition. Console.WriteLine("Start translation..."); await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false); // Replace with your own audio file name. using (BinaryAudioStreamReader reader = Helper.CreateBinaryFileReader(@"whatstheweatherlike.mp3")) { byte[] buffer = new byte[1000]; while (true) { var readSamples = reader.Read(buffer, (uint)buffer.Length); if (readSamples == 0) { break; } pushStream.Write(buffer, readSamples); } } pushStream.Close(); // Waits for completion. // Use Task.WaitAny to keep the task rooted. Task.WaitAny(new[] { stopTranslation.Task }); // Stops translation. 
await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false); } } } // </TranslationWithFileCompressedInputAsync> }
/// <summary> /// Initializes the connection to the Bot. /// </summary> /// <param name="settings">Application settings object, built from the input JSON file supplied as run-time argument.</param> public void InitConnector(AppSettings settings) { DialogServiceConfig config; this.BotReplyList = new List <BotReply>(); this.stopWatch = new Stopwatch(); this.appsettings = settings; if (!string.IsNullOrWhiteSpace(this.appsettings.CustomCommandsAppId)) { // NOTE: Custom commands is a preview Azure Service. // Set the custom commands configuration object based on three items: // - The Custom commands application ID // - Cognitive services speech subscription key. // - The Azure region of the subscription key(e.g. "westus"). config = CustomCommandsConfig.FromSubscription(this.appsettings.CustomCommandsAppId, this.appsettings.SpeechSubscriptionKey, this.appsettings.SpeechRegion); } else { // Set the bot framework configuration object based on two items: // - Cognitive services speech subscription key. It is needed for billing and is tied to the bot registration. // - The Azure region of the subscription key(e.g. "westus"). config = BotFrameworkConfig.FromSubscription(this.appsettings.SpeechSubscriptionKey, this.appsettings.SpeechRegion); } if (this.appsettings.SpeechSDKLogEnabled) { // Speech SDK has verbose logging to local file, which may be useful when reporting issues. config.SetProperty(PropertyId.Speech_LogFilename, $"{this.appsettings.OutputFolder}SpeechSDKLog-{DateTime.Now.ToString("yyyy-MM-dd-HH-mm-ss", CultureInfo.CurrentCulture)}.log"); } if (!string.IsNullOrWhiteSpace(this.appsettings.SRLanguage)) { // Set the speech recognition language. If not set, the default is "en-us". config.Language = this.appsettings.SRLanguage; } if (!string.IsNullOrWhiteSpace(this.appsettings.CustomSREndpointId)) { // Set your custom speech end-point id here, as given to you by the speech portal https://speech.microsoft.com/portal. // Otherwise the standard speech end-point will be used. config.SetServiceProperty("cid", this.appsettings.CustomSREndpointId, ServicePropertyChannel.UriQueryParameter); // Custom Speech does not support cloud Keyword Verification at the moment. If this is not done, there will be an error // from the service and connection will close. Remove line below when supported. config.SetProperty("KeywordConfig_EnableKeywordVerification", "false"); } if (!string.IsNullOrWhiteSpace(this.appsettings.CustomVoiceDeploymentIds)) { // Set one or more IDs associated with the custom TTS voice your bot will use. // The format of the string is one or more GUIDs separated by comma (no spaces). You get these GUIDs from // your custom TTS on the speech portal https://speech.microsoft.com/portal. 
config.SetProperty(PropertyId.Conversation_Custom_Voice_Deployment_Ids, this.appsettings.CustomVoiceDeploymentIds); } this.timeout = this.appsettings.Timeout; if (!string.IsNullOrWhiteSpace(this.appsettings.KeywordRecognitionModel)) { this.kwsTable = KeywordRecognitionModel.FromFile(this.appsettings.KeywordRecognitionModel); } if (this.appsettings.SetPropertyId != null) { foreach (KeyValuePair <string, JToken> setPropertyIdPair in this.appsettings.SetPropertyId) { config.SetProperty(setPropertyIdPair.Key, setPropertyIdPair.Value.ToString()); } } if (this.appsettings.SetPropertyString != null) { foreach (KeyValuePair <string, JToken> setPropertyStringPair in this.appsettings.SetPropertyString) { config.SetProperty(setPropertyStringPair.Key.ToString(CultureInfo.CurrentCulture), setPropertyStringPair.Value.ToString()); } } if (this.appsettings.SetServiceProperty != null) { foreach (KeyValuePair <string, JToken> setServicePropertyPair in this.appsettings.SetServiceProperty) { config.SetServiceProperty(setServicePropertyPair.Key.ToString(CultureInfo.CurrentCulture), setServicePropertyPair.Value.ToString(), ServicePropertyChannel.UriQueryParameter); } } if (this.appsettings.RealTimeAudio) { config.SetProperty("SPEECH-AudioThrottleAsPercentageOfRealTime", "100"); config.SetProperty("SPEECH-TransmitLengthBeforeThrottleMs", "0"); } if (this.connector != null) { // Then dispose the object this.connector.Dispose(); this.connector = null; } this.pushAudioInputStream = AudioInputStream.CreatePushStream(); this.connector = new DialogServiceConnector(config, AudioConfig.FromStreamInput(this.pushAudioInputStream)); if (this.appsettings.BotGreeting) { // Starting the timer to calculate latency for Bot Greeting. this.stopWatch.Restart(); } this.AttachHandlers(); }
public async Task TranscribeConversationsAsync(IEnumerable <string> voiceSignatureStringUsers) { uint samplesPerSecond = 16000; byte bitsPerSample = 16; byte channels = 8; // 7 + 1 channels var config = SpeechConfig.FromSubscription(this.SubscriptionKey, this.Region); config.SetProperty("ConversationTranscriptionInRoomAndOnline", "true"); var stopRecognition = new TaskCompletionSource <int>(); using (var audioInput = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels))) { var meetingID = Guid.NewGuid().ToString(); using (var conversation = await Conversation.CreateConversationAsync(config, meetingID)) { // create a conversation transcriber using audio stream input using (this.conversationTranscriber = new ConversationTranscriber(AudioConfig.FromStreamInput(audioInput))) { conversationTranscriber.Transcribing += (s, e) => { this.SetText($"TRANSCRIBING: Text={e.Result.Text} SpeakerId={e.Result.UserId}"); }; conversationTranscriber.Transcribed += (s, e) => { if (e.Result.Reason == ResultReason.RecognizedSpeech) { this.SetText($"TRANSCRIBED: Text={e.Result.Text} SpeakerId={e.Result.UserId}"); } else if (e.Result.Reason == ResultReason.NoMatch) { this.SetText($"NOMATCH: Speech could not be recognized."); } }; conversationTranscriber.Canceled += (s, e) => { this.SetText($"CANCELED: Reason={e.Reason}"); if (e.Reason == CancellationReason.Error) { this.SetText($"CANCELED: ErrorCode={e.ErrorCode}"); this.SetText($"CANCELED: ErrorDetails={e.ErrorDetails}"); this.SetText($"CANCELED: Did you update the subscription info?"); stopRecognition.TrySetResult(0); } }; conversationTranscriber.SessionStarted += (s, e) => { this.SetText($"\nSession started event. SessionId={e.SessionId}"); }; conversationTranscriber.SessionStopped += (s, e) => { this.SetText($"\nSession stopped event. SessionId={e.SessionId}"); this.SetText("\nStop recognition."); stopRecognition.TrySetResult(0); }; // Add participants to the conversation. int i = 1; foreach (var voiceSignatureStringUser in voiceSignatureStringUsers) { var speaker = Participant.From($"User{i++}", "en-US", voiceSignatureStringUser); await conversation.AddParticipantAsync(speaker); } // Join to the conversation and start transcribing await conversationTranscriber.JoinConversationAsync(conversation); await conversationTranscriber.StartTranscribingAsync().ConfigureAwait(false); using (var p = Pipeline.Create()) { var store = PsiStore.Create(p, "Transcribe", @"D:\Temp"); var capture = new AudioCapture(p, WaveFormat.CreatePcm((int)samplesPerSecond, bitsPerSample, channels)).Write("Audio", store); capture.Do(audio => audioInput.Write(audio.Data)); p.RunAsync(); // waits for completion, then stop transcription await stopRecognition.Task; } await conversationTranscriber.StopTranscribingAsync().ConfigureAwait(false); } } } }
private static async Task <bool> MakeAudioConfigAsync(SpeechHandler handler) { // var audioConfig = AudioConfig.FromWavFileInput(@"D:\Users\ManabuTonosaki\OneDrive - tomarika\tono.wav"); // var audioConfig = AudioConfig.FromDefaultMicrophoneInput(); Debug.Assert(handler.Device != null); var wavein = new WasapiLoopbackCapture(handler.Device); var waveoutFormat = new WaveFormat(16000, 16, 1); var lastSpeakDT = DateTime.Now; var willStop = DateTime.MaxValue; wavein.DataAvailable += (s, e) => { if (e.BytesRecorded > 0) { using var ms = new MemoryStream(e.Buffer, 0, e.BytesRecorded); using var rs = new RawSourceWaveStream(ms, wavein.WaveFormat); using var freq = new MediaFoundationResampler(rs, waveoutFormat.SampleRate); var w16 = freq.ToSampleProvider().ToMono().ToWaveProvider16(); var len = w16.Read(handler.buf, 0, handler.buf.Length); handler.AudioInputStream.Write(handler.buf, len); lastSpeakDT = DateTime.Now; willStop = DateTime.MaxValue; } else { if (DateTime.Now < willStop) { if (willStop == DateTime.MaxValue) { willStop = DateTime.Now + TimeSpan.FromSeconds(10); } var silence = new SilenceProvider(waveoutFormat); var len = silence.Read(handler.buf, 0, waveoutFormat.BitsPerSample * waveoutFormat.SampleRate / 8 / 100); // 10ms var cnt = (int)((DateTime.Now - lastSpeakDT).TotalMilliseconds / 10); for (var i = 0; i < cnt; i++) { handler.AudioInputStream.Write(handler.buf, len); } lastSpeakDT = DateTime.Now; } } }; var audioformat = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond: 16000, bitsPerSample: 16, channels: 1); handler.AudioInputStream = AudioInputStream.CreatePushStream(audioformat); handler.AudioConfig = AudioConfig.FromStreamInput(handler.AudioInputStream); await Task.Delay(100); handler.StopRequested += (s, e) => { wavein.StopRecording(); }; wavein.StartRecording(); return(true); }
private async Task <bool> MakeAudioConfigAsync(SpeechHandler handler) { Debug.Assert(handler.Device != null); // NAudio Setting var wavein = CreateCaptureInstance(handler.Device); var waveoutFormat = new WaveFormat(16000, 16, 1); wavein.StartRecording(); // Azure Cognitive Service Setting var audioformat = AudioStreamFormat.GetWaveFormatPCM((uint)waveoutFormat.SampleRate, (byte)waveoutFormat.BitsPerSample, (byte)waveoutFormat.Channels); handler.AudioInputStream = AudioInputStream.CreatePushStream(audioformat); handler.AudioConfig = AudioConfig.FromStreamInput(handler.AudioInputStream); // Silence Generate DateTime preEvent = DateTime.Now; var silenceData = new byte[waveoutFormat.BlockAlign]; // Appliation Preparation Hot.SetWavFormat(DisplayName, waveoutFormat); // for file saving // NAudio Voice event wavein.DataAvailable += (s, e) => { if (e.BytesRecorded > 0) { var now = DateTime.Now; using (var ms = new MemoryStream()) { var memoryWriter = new WaveFileWriter(ms, waveoutFormat); ms.SetLength(0); // Delete file header. var samples = Resample(wavein.WaveFormat, e.Buffer, e.BytesRecorded, waveoutFormat); foreach (var sample in samples) { memoryWriter.WriteSample(sample); } Hot.AddWavToAllQueue(DisplayName, ms.GetBuffer(), (int)ms.Length, now); // for file saving handler.AudioInputStream.Write(ms.GetBuffer(), (int)ms.Length); // for Azure Cognitive Speech to Text } try { Token.Add(TokenWavDataQueued, this); // TODO: Need Confirm it must be fixed with Tono.Gui.WinForm 1.1.2 - System.InvalidOperationException: 'Collection was modified; enumeration operation may not execute.' // It must be not fixed yet. so I added try-catch. } catch { // No Action because the above token is a QoS 0 message. But it's necessary to disappear exception messages that's why catch them here. } preEvent = DateTime.Now; } else { if (_talkID != null) { var spms = (double)waveoutFormat.SampleRate / 1000; // samples per ms var n = (int)(spms * (DateTime.Now - preEvent).TotalMilliseconds); for (var i = n; i >= 0; i--) { handler.AudioInputStream.Write(silenceData, silenceData.Length); // send silence to azure to get realtime event (othewise, azure will wait untile next event timing even if there is no event long time) } } preEvent = DateTime.Now; } }; handler.StopRequested += (s, e) => { wavein.StopRecording(); // Stop NAudio recording }; return(await Task.FromResult(true)); }