Example #1
        public async Task SendAudioPacket(string requestId, byte[] data)
        {
            // Build the audio message headers and the prefix that encodes their length.
            byte[] headerBytes = BuildAudioPacketHeader(requestId);
            byte[] headerHead  = BuildAudioPacketHeaderHead(headerBytes);

            // The WebSocket Speech protocol docs state that PCM audio must be sampled at 16 kHz with 16 bits per sample
            // and one channel (riff-16khz-16bit-mono-pcm), but 48 kHz 16-bit stereo works too; more testing is required.
            var arr    = headerHead.Concat(headerBytes).Concat(data).ToArray();
            var arrSeg = new ArraySegment<byte>(arr, 0, arr.Length);

            if (SpeechWebSocketClient.State != WebSocketState.Open)
            {
                return;
            }

            Debug.Log("Sending audio data sample from microphone.");
            await SpeechWebSocketClient.SendAsync(arrSeg, WebSocketMessageType.Binary, true, CancellationToken.None);
            Debug.Log("Audio data packet from microphone sent successfully!");
        }
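The helpers BuildAudioPacketHeader and BuildAudioPacketHeaderHead are not shown here. Judging from the "8192 - headerBytes.Length - 2" chunk-size calculation in Example #2, a plausible reading is a block of ASCII request headers preceded by a 2-byte big-endian length prefix. The sketch below uses header names from the Speech Service WebSocket protocol (Path, X-RequestId, X-Timestamp, Content-Type), but both method bodies are assumptions, not code from this project:

        // Hypothetical sketch: ASCII request headers for a binary audio message.
        // The header names are assumptions based on the Speech Service WebSocket protocol.
        private static byte[] BuildAudioPacketHeaderSketch(string requestId)
        {
            string headers =
                "Path: audio\r\n" +
                $"X-RequestId: {requestId}\r\n" +
                $"X-Timestamp: {DateTime.UtcNow:o}\r\n" +
                "Content-Type: audio/x-wav\r\n\r\n";
            return Encoding.ASCII.GetBytes(headers);
        }

        // Hypothetical sketch: 2-byte big-endian prefix carrying the header length.
        // This would account for the "- 2" in Example #2's chunk-size calculation.
        private static byte[] BuildAudioPacketHeaderHeadSketch(byte[] headerBytes)
        {
            return new byte[]
            {
                (byte)((headerBytes.Length >> 8) & 0xFF),
                (byte)(headerBytes.Length & 0xFF)
            };
        }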
Example #2
        public async Task<bool> CreateSpeechRecognitionJobFromFile(string audioFilePath, string authenticationToken, string region)
        {
            try
            {
                state = JobState.PreparingJob;

                SpeechWebSocketClient = await InitializeSpeechWebSocketClient(authenticationToken, region);

                var receiving = Receiving(SpeechWebSocketClient);

                var sending = Task.Run(async () =>
                {
                    // Create a unique request ID, must be a UUID in "no-dash" format
                    var requestId = Guid.NewGuid().ToString("N");

                    ArraySegment<byte> buffer = CreateSpeechConfigMessagePayloadBuffer(requestId);

                    if (SpeechWebSocketClient.State != WebSocketState.Open)
                    {
                        return;
                    }

                    Debug.Log("Sending speech.config...");
                    // Send speech.config to Speech Service
                    await SpeechWebSocketClient.SendAsync(buffer, WebSocketMessageType.Text, true, CancellationToken.None);
                    Debug.Log("speech.config sent successfully!");

                    // SENDING AUDIO TO SPEECH SERVICE:
                    // Speech-enabled client applications send audio to Speech Service by converting the audio stream
                    // into a series of audio chunks. Each chunk of audio carries a segment of the spoken audio that's
                    // to be transcribed by the service. The maximum size of a single audio chunk is 8,192 bytes.
                    // Audio stream messages are Binary WebSocket messages.
                    Debug.Log($"Preparing to send audio file: {audioFilePath}");
                    FileInfo audioFileInfo     = new FileInfo(audioFilePath);
                    FileStream audioFileStream = audioFileInfo.OpenRead();

                    state = JobState.ProcessingAudio;

                    byte[] headerBytes;
                    byte[] headerHead;
                    for (int cursor = 0; cursor < audioFileInfo.Length; cursor++)
                    {
                        headerBytes = BuildAudioPacketHeader(requestId);
                        headerHead  = BuildAudioPacketHeaderHead(headerBytes);

                        // PCM audio must be sampled at 16 kHz with 16 bits per sample and one channel (riff-16khz-16bit-mono-pcm).
                        var byteLen = 8192 - headerBytes.Length - 2;
                        var fbuff   = new byte[byteLen];
                        audioFileStream.Read(fbuff, 0, byteLen);

                        var arr    = headerHead.Concat(headerBytes).Concat(fbuff).ToArray();
                        var arrSeg = new ArraySegment <byte>(arr, 0, arr.Length);

                        Debug.Log($"Sending audio data from position: {cursor}");
                        if (SpeechWebSocketClient.State != WebSocketState.Open)
                        {
                            return;
                        }
                        cursor += byteLen;
                        var end = cursor >= audioFileInfo.Length;
                        await SpeechWebSocketClient.SendAsync(arrSeg, WebSocketMessageType.Binary, true, new CancellationToken());
                        Debug.Log($"Audio data from file {audioFilePath} sent successfully!");

                        var dt = Encoding.ASCII.GetString(arr);
                    }
                    await SendEmptyAudioMessageToWebSocketClient(SpeechWebSocketClient, requestId);
                    audioFileStream.Dispose();
                });

                // Wait for both tasks; an exception in either task propagates out of
                // Task.WhenAll and is handled by the catch block below.
                await Task.WhenAll(sending, receiving);

                return true;
            }
            catch (Exception ex)
            {
                state = JobState.Error;
                Debug.Log($"An exception occurred during creation of Speech Recognition job from audio file {audioFilePath}:"
                          + Environment.NewLine + ex.Message);
                return false;
            }
        }
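SendEmptyAudioMessageToWebSocketClient is also left undefined. In the Speech Service WebSocket protocol, the end of the audio stream is signaled by an audio message whose body is empty; a minimal sketch under that assumption, reusing the header helpers from Example #1, might look like this (the method body is an assumption):

        // Hypothetical sketch: an audio message with headers but a zero-length body,
        // which tells the service that no more audio is coming for this request.
        private async Task SendEmptyAudioMessageSketch(ClientWebSocket client, string requestId)
        {
            byte[] headerBytes = BuildAudioPacketHeader(requestId);
            byte[] headerHead  = BuildAudioPacketHeaderHead(headerBytes);

            var arr = headerHead.Concat(headerBytes).ToArray();
            await client.SendAsync(new ArraySegment<byte>(arr, 0, arr.Length),
                                   WebSocketMessageType.Binary, true, CancellationToken.None);
        }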
Example #3
        /// <summary>
        /// prepares a new speech recognition job, sending the proper headers including the audio data header
        /// </summary>
        /// <param name="authenticationToken"></param>
        /// <param name="region"></param>
        /// <param name="resolution"></param>
        /// <param name="channels"></param>
        /// <param name="rate"></param>
        /// <returns></returns>
        public async Task<bool> CreateSpeechRecognitionJobFromVoice(string authenticationToken, string region, int resolution, int channels, int rate)
        {
            try
            {
                state = JobState.PreparingJob;

                SpeechWebSocketClient = await InitializeSpeechWebSocketClient(authenticationToken, region);

                var receiving = Receiving(SpeechWebSocketClient);

                var sending = Task.Run(async () =>
                {
                    // Create a unique request ID, must be a UUID in "no-dash" format
                    CurrentRequestId = Guid.NewGuid().ToString("N");

                    ArraySegment<byte> buffer = CreateSpeechConfigMessagePayloadBuffer(CurrentRequestId);

                    if (SpeechWebSocketClient.State != WebSocketState.Open)
                    {
                        return;
                    }

                    Debug.Log("Sending speech.config...");
                    // Send speech.config to Speech Service
                    await SpeechWebSocketClient.SendAsync(buffer, WebSocketMessageType.Text, true, CancellationToken.None);
                    Debug.Log("speech.config sent successfully!");

                    // SENDING AUDIO TO SPEECH SERVICE:
                    // Speech-enabled client applications send audio to Speech Service by converting the audio stream
                    // into a series of audio chunks. Each chunk of audio carries a segment of the spoken audio that's
                    // to be transcribed by the service. The maximum size of a single audio chunk is 8,192 bytes.
                    // Audio stream messages are Binary WebSocket messages.
                    // First we need to send an audio packet with the RIFF PCM (WAV) data header. Note that we don't know how
                    // many samples we'll have since we are recording live, so we set nbsamples to zero.

                    // The WebSocket Speech protocol docs state that PCM audio must be sampled at 16 kHz with 16 bits per sample
                    // and one channel (riff-16khz-16bit-mono-pcm), but 48 kHz 16-bit stereo works too; more testing is required.
                    var wavHeader = BuildRiffWAVHeader(0, resolution, channels, rate);
                    await SendAudioPacket(CurrentRequestId, wavHeader);
                    Debug.Log("First audio data packet with WAV header sent successfully!");

                    Debug.Log("WebSocket Client is now ready to receive audio packets from the microphone.");

                    state = JobState.ReadyForAudioPackets;
                });

                // Wait for both tasks; an exception in either task propagates out of
                // Task.WhenAll and is handled by the catch block below.
                await Task.WhenAll(sending, receiving);

                return true;
            }
            catch (Exception ex)
            {
                Debug.Log($"An exception occurred during creation of Speech Recognition job from microphone:"
                          + Environment.NewLine + ex.Message);
                return(false);
            }
        }
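BuildRiffWAVHeader is likewise not shown. The comments imply a standard 44-byte RIFF/WAVE PCM header with the sample count set to zero for live capture; a minimal sketch under that assumption follows, with parameter meanings inferred from the call site (resolution = bits per sample, rate = samples per second). The method body is an illustration, not the project's actual implementation:

        // Hypothetical sketch: standard 44-byte RIFF/WAVE PCM header. With nbSamples == 0
        // (live capture), the RIFF and data chunk sizes are effectively left unknown.
        private static byte[] BuildRiffWAVHeaderSketch(int nbSamples, int resolution, int channels, int rate)
        {
            int blockAlign = channels * (resolution / 8);
            int byteRate   = rate * blockAlign;
            int dataSize   = nbSamples * blockAlign;      // 0 when recording live

            using (var ms = new MemoryStream())
            using (var w = new BinaryWriter(ms))
            {
                w.Write(Encoding.ASCII.GetBytes("RIFF"));
                w.Write(36 + dataSize);                   // RIFF chunk size
                w.Write(Encoding.ASCII.GetBytes("WAVE"));
                w.Write(Encoding.ASCII.GetBytes("fmt "));
                w.Write(16);                              // fmt subchunk size for PCM
                w.Write((short)1);                        // audio format: 1 = PCM
                w.Write((short)channels);
                w.Write(rate);
                w.Write(byteRate);
                w.Write((short)blockAlign);
                w.Write((short)resolution);               // bits per sample
                w.Write(Encoding.ASCII.GetBytes("data"));
                w.Write(dataSize);                        // data chunk size, 0 when unknown
                return ms.ToArray();
            }
        }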