public void SendAudioPacket(string requestId, byte[] data)
{
    byte[] headerBytes = BuildAudioPacketHeader(requestId);
    byte[] headerHead = BuildAudioPacketHeaderHead(headerBytes);

    // The WebSocket Speech protocol docs state that PCM audio must be sampled at 16 kHz with
    // 16 bits per sample and one channel (riff-16khz-16bit-mono-pcm). In practice 48 kHz 16-bit
    // stereo has worked as well, but that needs more testing.
    var arr = headerHead.Concat(headerBytes).Concat(data).ToArray();
    var arrSeg = new ArraySegment<byte>(arr, 0, arr.Length);

    if (SpeechWebSocketClient.State != WebSocketState.Open)
    {
        return;
    }

    Debug.Log("Sending audio data sample from microphone.");
    // Fire-and-forget: this method is not async, so the send is started but not awaited here.
    SpeechWebSocketClient.SendAsync(arrSeg, WebSocketMessageType.Binary, true, CancellationToken.None);
    Debug.Log("Audio data packet from microphone queued for sending.");
}
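// BuildAudioPacketHeader and BuildAudioPacketHeaderHead aren't shown in this file. A minimal
// sketch of what they plausibly do, based on the Speech Service WebSocket protocol: a binary
// audio message is a 2-byte big-endian header-length prefix (the "header head"), followed by
// ASCII headers, followed by the audio payload. That prefix is why the file-upload loop below
// budgets "8192 - headerBytes.Length - 2" bytes for audio. The exact header set, timestamp
// format, and method bodies here are assumptions, not confirmed by this file.
private byte[] BuildAudioPacketHeader(string requestId)
{
    // Headers for a binary "audio" message; X-Timestamp is an ISO 8601 UTC timestamp.
    var headers =
        "Path: audio\r\n" +
        $"X-RequestId: {requestId}\r\n" +
        $"X-Timestamp: {DateTime.UtcNow:yyyy-MM-ddTHH:mm:ss.fffZ}\r\n" +
        "Content-Type: audio/x-wav\r\n";
    return Encoding.ASCII.GetBytes(headers);
}

private byte[] BuildAudioPacketHeaderHead(byte[] headerBytes)
{
    // 2-byte big-endian length prefix describing the size of the header block.
    var len = (ushort)headerBytes.Length;
    return new byte[] { (byte)(len >> 8), (byte)(len & 0xFF) };
}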
public async Task<bool> CreateSpeechRecognitionJobFromFile(string audioFilePath, string authenticationToken, string region)
{
    try
    {
        state = JobState.PreparingJob;
        SpeechWebSocketClient = await InitializeSpeechWebSocketClient(authenticationToken, region);

        var receiving = Receiving(SpeechWebSocketClient);
        var sending = Task.Run(async () =>
        {
            // Create a unique request ID; must be a UUID in "no-dash" format.
            var requestId = Guid.NewGuid().ToString("N");

            ArraySegment<byte> buffer = CreateSpeechConfigMessagePayloadBuffer(requestId);
            if (SpeechWebSocketClient.State != WebSocketState.Open)
            {
                return;
            }

            Debug.Log("Sending speech.config...");
            // Send speech.config to Speech Service
            await SpeechWebSocketClient.SendAsync(buffer, WebSocketMessageType.Text, true, CancellationToken.None);
            Debug.Log("speech.config sent successfully!");

            // SENDING AUDIO TO SPEECH SERVICE:
            // Speech-enabled client applications send audio to Speech Service by converting the audio
            // stream into a series of audio chunks. Each chunk carries a segment of the spoken audio
            // that's to be transcribed by the service. The maximum size of a single audio chunk is
            // 8,192 bytes. Audio stream messages are Binary WebSocket messages.
            Debug.Log($"Preparing to send audio file: {audioFilePath}");
            FileInfo audioFileInfo = new FileInfo(audioFilePath);
            state = JobState.ProcessingAudio;

            using (FileStream audioFileStream = audioFileInfo.OpenRead())
            {
                long cursor = 0;
                while (cursor < audioFileInfo.Length)
                {
                    byte[] headerBytes = BuildAudioPacketHeader(requestId);
                    byte[] headerHead = BuildAudioPacketHeaderHead(headerBytes);

                    // PCM audio must be sampled at 16 kHz with 16 bits per sample and one channel
                    // (riff-16khz-16bit-mono-pcm). Each packet is capped at 8,192 bytes, so the audio
                    // payload size is 8,192 minus the headers minus the 2-byte header-length prefix.
                    var byteLen = 8192 - headerBytes.Length - 2;
                    var fbuff = new byte[byteLen];
                    // Read may return fewer bytes than requested near the end of the file,
                    // so only the bytes actually read are sent.
                    var bytesRead = audioFileStream.Read(fbuff, 0, byteLen);
                    if (bytesRead <= 0)
                    {
                        break;
                    }

                    var arr = headerHead.Concat(headerBytes).Concat(fbuff.Take(bytesRead)).ToArray();
                    var arrSeg = new ArraySegment<byte>(arr, 0, arr.Length);

                    Debug.Log($"Sending audio data from position: {cursor}");
                    if (SpeechWebSocketClient.State != WebSocketState.Open)
                    {
                        return;
                    }

                    cursor += bytesRead;
                    await SpeechWebSocketClient.SendAsync(arrSeg, WebSocketMessageType.Binary, true, CancellationToken.None);
                    Debug.Log($"Audio data from file {audioFilePath} sent successfully!");
                }

                // A zero-length audio message tells the service that the audio stream has ended.
                await SendEmptyAudioMessageToWebSocketClient(SpeechWebSocketClient, requestId);
            }
        });

        // Wait for both tasks to complete.
        await Task.WhenAll(sending, receiving);
        if (sending.IsFaulted)
        {
            state = JobState.Error;
            throw sending.Exception;
        }
        if (receiving.IsFaulted)
        {
            state = JobState.Error;
            throw receiving.Exception;
        }

        return true;
    }
    catch (Exception ex)
    {
        state = JobState.Error;
        Debug.Log($"An exception occurred during creation of Speech Recognition job from audio file {audioFilePath}:" +
                  Environment.NewLine + ex.Message);
        return false;
    }
}
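// SendEmptyAudioMessageToWebSocketClient isn't shown in this file. Per the Speech Service
// WebSocket protocol, the client signals the end of audio by sending an audio message with a
// zero-length body. A minimal sketch, reusing the (assumed) header helpers from above and
// assuming the client is a System.Net.WebSockets.WebSocket:
private async Task SendEmptyAudioMessageToWebSocketClient(WebSocket client, string requestId)
{
    byte[] headerBytes = BuildAudioPacketHeader(requestId);
    byte[] headerHead = BuildAudioPacketHeaderHead(headerBytes);

    // Header-length prefix plus headers with no audio payload marks the end of the stream.
    var arr = headerHead.Concat(headerBytes).ToArray();
    if (client.State != WebSocketState.Open)
    {
        return;
    }
    await client.SendAsync(new ArraySegment<byte>(arr, 0, arr.Length), WebSocketMessageType.Binary, true, CancellationToken.None);
}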
/// <summary>
/// Prepares a new speech recognition job, sending the proper headers, including the audio data header.
/// </summary>
/// <param name="authenticationToken"></param>
/// <param name="region"></param>
/// <param name="resolution"></param>
/// <param name="channels"></param>
/// <param name="rate"></param>
/// <returns></returns>
public async Task<bool> CreateSpeechRecognitionJobFromVoice(string authenticationToken, string region, int resolution, int channels, int rate)
{
    try
    {
        state = JobState.PreparingJob;
        SpeechWebSocketClient = await InitializeSpeechWebSocketClient(authenticationToken, region);

        var receiving = Receiving(SpeechWebSocketClient);
        var sending = Task.Run(async () =>
        {
            // Create a unique request ID; must be a UUID in "no-dash" format.
            CurrentRequestId = Guid.NewGuid().ToString("N");

            ArraySegment<byte> buffer = CreateSpeechConfigMessagePayloadBuffer(CurrentRequestId);
            if (SpeechWebSocketClient.State != WebSocketState.Open)
            {
                return;
            }

            Debug.Log("Sending speech.config...");
            // Send speech.config to Speech Service
            await SpeechWebSocketClient.SendAsync(buffer, WebSocketMessageType.Text, true, CancellationToken.None);
            Debug.Log("speech.config sent successfully!");

            // SENDING AUDIO TO SPEECH SERVICE:
            // Speech-enabled client applications send audio to Speech Service by converting the audio
            // stream into a series of audio chunks. Each chunk carries a segment of the spoken audio
            // that's to be transcribed by the service. The maximum size of a single audio chunk is
            // 8,192 bytes. Audio stream messages are Binary WebSocket messages.

            // First, send an audio packet containing only the RIFF PCM (WAV) data header. Since we're
            // recording live we don't know how many samples will follow, so nbSamples is set to zero.
            // The WebSocket Speech protocol docs state that PCM audio must be sampled at 16 kHz with
            // 16 bits per sample and one channel (riff-16khz-16bit-mono-pcm). In practice 48 kHz 16-bit
            // stereo has worked as well, but that needs more testing.
            var wavHeader = BuildRiffWAVHeader(0, resolution, channels, rate);
            SendAudioPacket(CurrentRequestId, wavHeader);
            Debug.Log("First audio data packet with WAV header sent successfully!");

            Debug.Log("WebSocket client is now ready to receive audio packets from the microphone.");
            state = JobState.ReadyForAudioPackets;
        });

        // Wait for both tasks to complete.
        await Task.WhenAll(sending, receiving);
        if (sending.IsFaulted)
        {
            throw sending.Exception;
        }
        if (receiving.IsFaulted)
        {
            throw receiving.Exception;
        }

        return true;
    }
    catch (Exception ex)
    {
        Debug.Log("An exception occurred during creation of Speech Recognition job from microphone:" +
                  Environment.NewLine + ex.Message);
        return false;
    }
}
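// BuildRiffWAVHeader isn't shown in this file either. A minimal sketch of a standard 44-byte
// RIFF/PCM header matching the call above: nbSamples is 0 because the total length isn't known
// while streaming live, so the RIFF and data chunk sizes describe an empty data section.
// The signature matches the call site; the body is an assumption based on the WAV format.
private byte[] BuildRiffWAVHeader(int nbSamples, int resolution, int channels, int rate)
{
    int blockAlign = channels * resolution / 8;     // bytes per sample frame
    int dataSize = nbSamples * blockAlign;          // 0 when the sample count is unknown

    using (var ms = new MemoryStream())
    using (var writer = new BinaryWriter(ms))       // BinaryWriter is little-endian, as WAV requires
    {
        writer.Write(Encoding.ASCII.GetBytes("RIFF"));
        writer.Write(36 + dataSize);                // RIFF chunk size
        writer.Write(Encoding.ASCII.GetBytes("WAVE"));
        writer.Write(Encoding.ASCII.GetBytes("fmt "));
        writer.Write(16);                           // fmt sub-chunk size for PCM
        writer.Write((short)1);                     // audio format 1 = PCM
        writer.Write((short)channels);
        writer.Write(rate);                         // sample rate
        writer.Write(rate * blockAlign);            // byte rate
        writer.Write((short)blockAlign);
        writer.Write((short)resolution);            // bits per sample
        writer.Write(Encoding.ASCII.GetBytes("data"));
        writer.Write(dataSize);                     // data chunk size (0 while streaming)
        return ms.ToArray();
    }
}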