        /// <summary>
        /// Starts this instance.
        /// </summary>
        private async Task _start()
        {
            await this._syncLock.WaitAsync().ConfigureAwait(false);

            if (!_isRunning)
            {
                _tokenSource = new CancellationTokenSource();
                _buffer      = new BufferBlock <SerializableAudioMediaBuffer>(new DataflowBlockOptions {
                    CancellationToken = this._tokenSource.Token
                });
                await Task.Factory.StartNew(this._process).ConfigureAwait(false);

                // Initialize speech recognizer.
                Debug.WriteLine("RecordingBot _start.");
                _audioStream = new VoiceAudioStream();
                var audioFormat  = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);
                var audioConfig  = AudioConfig.FromStreamInput(_audioStream, audioFormat);
                var speechConfig = SpeechConfig.FromSubscription("03f0f0daa33448ba9f9bf799d2e14d2a", "westus2");

                _speechClient              = new SpeechRecognizer(speechConfig, audioConfig);
                _speechClient.Recognized  += _speechClient_Recognized;
                _speechClient.Recognizing += _speechClient_Recognizing;
                _speechClient.Canceled    += _speechClient_Canceled;
                await _speechClient.StartContinuousRecognitionAsync();

                _isRunning = true;
            }
            this._syncLock.Release();
        }
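The _process delegate queued above is not shown here. A heavily hedged sketch of what such a drain loop might look like follows; the payload property on SerializableAudioMediaBuffer and the Write signature of VoiceAudioStream are assumptions made for illustration, not taken from the original.

        private async Task _process()
        {
            // Drain buffered media posted by the bot and feed it to the stream the recognizer reads from.
            while (await _buffer.OutputAvailableAsync(_tokenSource.Token).ConfigureAwait(false))
            {
                var mediaBuffer = await _buffer.ReceiveAsync(_tokenSource.Token).ConfigureAwait(false);
                // Assumed payload property; the real type may expose the PCM bytes differently.
                _audioStream.Write(mediaBuffer.Buffer, 0, mediaBuffer.Buffer.Length);
            }
        }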
        private static PullAudioInputStreamCallback OpenWavFileStream(string filename, out AudioStreamFormat format)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));

            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");

            // Chunk size
            long fileSize = reader.ReadInt32();

            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");

            // Tag: "fmt"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");

            // chunk format size
            var formatSize           = reader.ReadInt32();
            var unusedFormatTag      = reader.ReadUInt16();
            var channels             = reader.ReadUInt16();
            var samplesPerSecond     = reader.ReadUInt32();
            var unusedAvgBytesPerSec = reader.ReadUInt32();
            var unusedBlockAlign     = reader.ReadUInt16();
            var bitsPerSample        = reader.ReadUInt16();

            // So far we have read the 16 mandatory bytes of the format chunk; anything beyond that is cbSize and is skipped for now.
            if (formatSize > 16)
            {
                reader.ReadBytes((int)(formatSize - 16));
            }

            bool foundDataChunk = false;

            while (!foundDataChunk)
            {
                // Stop scanning if we run out of chunks before finding "data".
                if (reader.Read(data, 0, 4) < 4)
                {
                    break;
                }
                var chunkSize = reader.ReadInt32();
                if ((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a'))
                {
                    foundDataChunk = true;
                    break;
                }
                // Skip any other chunk (e.g. "LIST") and keep looking.
                reader.ReadBytes(chunkSize);
            }
            if (!foundDataChunk)
            {
                throw new System.ApplicationException($"{filename} does not contain a data chunk!");
            }

            // Now, we have the format in the format parameter and the
            // reader set to the start of the body, i.e., the raw sample data
            format = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels);
            return(new AudioStreamReader(reader));
        }
        /// <summary>
        /// Opens the given WAV file and returns an <see cref="AudioConfig"/> backed by its sample data.
        /// </summary>
        /// <param name="filename">The WAV file to read the audio data from.</param>
        public static AudioConfig OpenWavFile(string filename)
        {
            AudioStreamFormat format = null;
            var callback             = OpenWavFileStream(filename, out format);

            return(AudioConfig.FromStreamInput(callback, format));
        }
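A minimal usage sketch for OpenWavFile; the speechConfig parameter and the wrapper method name are assumptions, not part of the original snippet.

        public static async Task<string> RecognizeWavFileOnceAsync(SpeechConfig speechConfig, string filename)
        {
            // Build an AudioConfig from the WAV file and run a single recognition pass over it.
            var audioConfig = OpenWavFile(filename);
            using (var recognizer = new SpeechRecognizer(speechConfig, audioConfig))
            {
                var result = await recognizer.RecognizeOnceAsync();
                return result.Text;
            }
        }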
Example #4
    public static AudioConfig OpenWavFile(Stream stream)
    {
        BinaryReader      reader = new BinaryReader(stream);
        AudioStreamFormat format = readWaveHeader(reader);

        return(AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format));
    }
Example #5
        public async void AudioStart()
        {
            var audioStream  = new VoiceAudioStream();
            var audioFormat  = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);
            var audioConfig  = AudioConfig.FromStreamInput(audioStream, audioFormat);
            var speechConfig = SpeechConfig.FromSubscription(_config["SpeechApiKey"], _config["SpeechRegion"]);
            var speechClient = new SpeechRecognizer(speechConfig, audioConfig);
            var phraseList   = PhraseListGrammar.FromRecognizer(speechClient);

            foreach (var phrase in phrases)
            {
                phraseList.AddPhrase(phrase);
            }

            speechClient.Recognized += _speechClient_Recognized;

            string sessionId = speechClient.Properties.GetProperty(PropertyId.Speech_SessionId);

            var conn = new ConnectionInfo()
            {
                SessionId    = sessionId,
                AudioStream  = audioStream,
                SpeechClient = speechClient,
            };

            _connections.Add(Context.ConnectionId, conn);

            await speechClient.StartContinuousRecognitionAsync();

            Debug.WriteLine("Audio start message.");
        }
Example #6
        public static AudioStreamFormat readWaveHeader(BinaryReader reader)
        {
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");

            long fileSize = reader.ReadInt32();

            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");

            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");

            var formatSize       = reader.ReadInt32();
            var formatTag        = reader.ReadUInt16();
            var channels         = reader.ReadUInt16();
            var samplesPerSecond = reader.ReadUInt32();
            var avgBytesPerSec   = reader.ReadUInt32();
            var blockAlign       = reader.ReadUInt16();
            var bitsPerSample    = reader.ReadUInt16();

            if (formatSize > 16)
            {
                reader.ReadBytes((int)(formatSize - 16));
            }

            reader.Read(data, 0, 4);
            // Trace.Assert((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a'), "Wrong data tag in wav");

            int dataSize = reader.ReadInt32();

            return(AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels));
        }
        private void SetupTranscriptionAndTranslationService()
        {
            try
            {
                var lCognitiveKey    = _settings.AzureCognitiveKey;
                var lCognitiveRegion = _settings.AzureCognitiveRegion;

                _eventPublisher.Publish("MySTT Setup", $"Got region: {lCognitiveRegion}, key starting from: {lCognitiveKey??lCognitiveKey.Substring(0, lCognitiveKey.Length /2)}");

                this.mTransSpeechConfig = SpeechTranslationConfig.FromSubscription(lCognitiveKey, lCognitiveRegion);
                var fromLanguage = "en-US";
                var toLanguages  = new List <string> {
                    "el-GR"
                };
                //var toLanguages = new List<string> { "ru-RU" };
                this.mTransSpeechConfig.SpeechRecognitionLanguage = fromLanguage;
                toLanguages.ForEach(this.mTransSpeechConfig.AddTargetLanguage);
                this.mInputStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(SAMPLESPERSECOND, BITSPERSAMPLE, NUMBEROFCHANNELS));

                this.mAudioConfig           = AudioConfig.FromStreamInput(this.mInputStream);
                this.mTranslationRecognizer = new TranslationRecognizer(this.mTransSpeechConfig, this.mAudioConfig);

                this.mTranslationRecognizer.Recognizing       += this.MSpeechRecognizer_Recognizing;
                this.mTranslationRecognizer.Recognized        += this.MSpeechRecognizer_Recognized;
                this.mTranslationRecognizer.SpeechEndDetected += this.MSpeechRecognizer_SpeechEndDetected;

                this.StartRecognisionIfNeeded();
            }
            catch (Exception ex)
            {
                _eventPublisher.Publish("MySTT Setup - Failed", $"Failed to initialize: {ex.Message}");
            }
        }
        /// <summary>
        /// Remote audio transcription of the given audioFile with CognitiveServices
        /// </summary>
        public static AnalysisResult TranscribeAudio(ref AnalysisResult audioResponse, IFormFile audioFile)
        {
            // needed for speaker diarization to resolve at the word level
            SPEECH_CONFIG.RequestWordLevelTimestamps();

            var audioFormat128 = AudioStreamFormat.GetWaveFormatPCM(8000, 16, 1);
            var audioFormat256 = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);

            // load bytestream -> audio stream
            // load audio config from audio stream
            // initialize speech recognizer
            using (var br = new BinaryReader(audioFile.OpenReadStream()))
                using (var audioInputStream = AudioInputStream.CreatePushStream(audioFormat128))
                    using (var audioConfig = AudioConfig.FromStreamInput(audioInputStream))
                        using (var recognizer = new SpeechRecognizer(SPEECH_CONFIG, audioConfig))
                        {
                            long nbytes = audioFile.Length;
                            var  buff   = new List <byte>();

                            // read through bytes of audio
                            byte[] readBytes;
                            do
                            {
                                readBytes = br.ReadBytes(1024);
                                buff.AddRange(readBytes);
                                audioInputStream.Write(readBytes, readBytes.Length);
                            } while (readBytes.Length > 0);

                            var transcript = ExecuteRecognizer(recognizer).Result;
                            audioResponse.Transcript = transcript;
                            return(audioResponse);
                        }
        }
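ExecuteRecognizer is referenced above but not shown. The following is only a hedged assumption about its shape: it drives continuous recognition and accumulates text until the session stops.

        private static async Task<string> ExecuteRecognizer(SpeechRecognizer recognizer)
        {
            var transcript      = new System.Text.StringBuilder();
            var stopRecognition = new TaskCompletionSource<int>();

            recognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    transcript.Append(e.Result.Text).Append(' ');
                }
            };
            recognizer.SessionStopped += (s, e) => stopRecognition.TrySetResult(0);
            recognizer.Canceled       += (s, e) => stopRecognition.TrySetResult(0);

            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            await stopRecognition.Task.ConfigureAwait(false);
            await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

            return transcript.ToString().Trim();
        }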
Example #9
        public static BinaryAudioStreamReader CreateWavReader(string filename)
        {
            BinaryReader      reader = new BinaryReader(File.OpenRead(filename));
            AudioStreamFormat format = readWaveHeader(reader);

            return(new BinaryAudioStreamReader(reader));
        }
Example #10
        public static async Task <string> RecognizeSpeechFromUrlAsync(string url, string locale)
        {
            byte[] audioData = null;
            using (var wc = new System.Net.WebClient())
            {
                audioData = wc.DownloadData(url);
            }
            var stream = new MemoryStream(audioData);

            var speechApiKey    = Environment.GetEnvironmentVariable("SpeechApiKey");
            var speechApiRegion = Environment.GetEnvironmentVariable("SpeechApiRegion");

            var speechConfig = SpeechConfig.FromSubscription(speechApiKey, speechApiRegion);

            speechConfig.SpeechRecognitionLanguage = locale;

            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);
            var audioStream = new VoiceAudioStream(stream);
            var audioConfig = AudioConfig.FromStreamInput(audioStream, audioFormat);

            var recognizer = new SpeechRecognizer(speechConfig, audioConfig);
            var result     = await recognizer.RecognizeOnceAsync();

            return(result.Text);
        }
Example #11
        public static AudioConfig OpenWavFile(BinaryReader reader, AudioProcessingOptions audioProcessingOptions = null)
        {
            AudioStreamFormat format = readWaveHeader(reader);

            return((audioProcessingOptions == null)
                    ? AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format)
                    : AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format, audioProcessingOptions));
        }
Example #12
        public static BinaryAudioStreamReader CreateWavReader(string filename)
        {
            BinaryReader reader = new BinaryReader(File.OpenRead(filename));
            // Read the wave header so that it is not included in the subsequent audio reads.
            AudioStreamFormat format = ReadWaveHeader(reader);

            return(new BinaryAudioStreamReader(reader));
        }
Example #13
        public async void RegisterAttendeeAsync(string name, string myLanguage, string preferredLanguage)
        {
            Debug.WriteLine($"User {name}, Language: {myLanguage}, Connection {Context.ConnectionId} starting audio.");
            var config = _config.GetSection("SpeechAPI").Get <AppSettings>();

            bool exists = await InitializeAttendeeInfo(name, myLanguage, preferredLanguage);

            var audioStream = new VoiceAudioStream();
            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);
            var audioConfig = AudioConfig.FromStreamInput(audioStream, audioFormat);

            var speechKey    = config.SubscriptionKey;
            var speechRegion = config.Region;
            var url          = config.EndpointUri;

            Debug.WriteLine($"Key:{speechKey} | Region:{speechRegion}");

            var speechConfig = SpeechConfig.FromSubscription(speechKey, speechRegion);

            speechConfig.SpeechRecognitionLanguage = preferredLanguage;
            speechConfig.OutputFormat = OutputFormat.Simple;

            var speechClient = new SpeechRecognizer(speechConfig, audioConfig);

            speechClient.Recognized     += _speechClient_Recognized;
            speechClient.Recognizing    += _speechClient_Recognizing;
            speechClient.Canceled       += _speechClient_Canceled;
            speechClient.SessionStarted += _speechClient_SessionStarted;
            string sessionId = speechClient.Properties.GetProperty(PropertyId.Speech_SessionId);

            //Maintains only one API connection per language
            SpeechAPIConnection conn = null;

            if (_connections.ContainsKey(preferredLanguage))
            {
                conn           = _connections[preferredLanguage];
                conn.SessionId = sessionId;
            }
            else
            {
                conn = new SpeechAPIConnection()
                {
                    SessionId   = sessionId,
                    AudioStream = audioStream,
                    Recognizer  = speechClient,
                    Language    = preferredLanguage
                };
                _connections[preferredLanguage] = conn;
            }

            Debug.WriteLine($"Connection for {preferredLanguage} added | SessionId:{sessionId}");

            await SendToAttendeeAsync(_attendeeInfo.GetAttendeeByConnectionID(Context.ConnectionId), $"Welcome:{name}");

            await speechClient.StartContinuousRecognitionAsync();

            Debug.WriteLine("Audio start message.");
        }
Example #14
        public static AudioStreamFormat readWaveHeader(BinaryReader reader)
        {
            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");

            // Chunk size
            long fileSize = reader.ReadInt32();

            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");

            // Tag: "fmt"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");

            // chunk format size
            int    formatSize       = reader.ReadInt32();
            ushort formatTag        = reader.ReadUInt16();
            ushort channels         = reader.ReadUInt16();
            uint   samplesPerSecond = reader.ReadUInt32();
            uint   avgBytesPerSec   = reader.ReadUInt32();
            ushort blockAlign       = reader.ReadUInt16();
            ushort bitsPerSample    = reader.ReadUInt16();

            // So far we have read the 16 mandatory bytes of the format chunk; anything beyond that is cbSize and is skipped for now.
            if (formatSize > 16)
            {
                reader.ReadBytes((int)(formatSize - 16));
            }

            // Handle optional LIST chunk.
            // tag: "LIST"
            reader.Read(data, 0, 4);
            if (data[0] == 'L' && data[1] == 'I' && data[2] == 'S' && data[3] == 'T')
            {
                uint listChunkSize = reader.ReadUInt32();
                reader.ReadBytes((int)listChunkSize);
                reader.Read(data, 0, 4);
            }

            // Second Chunk, data
            // tag: "data"
            Trace.Assert((data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a'), "Wrong data tag in wav");
            // data chunk size
            int dataSize = reader.ReadInt32();

            // now, we have the format in the format parameter and the
            // reader set to the start of the body, i.e., the raw sample data
            return(AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels));
        }
Example #15
        private SpeechSynthesizer BuildAzureSpeechSynthesizer()
        {
            // Create an audio config to tell Azure Speech SDK to return speech output as a memory stream
            // using its default output format (16kHz, 16bit, mono).
            var audioConfig =
                AudioConfig.FromStreamOutput(
                    AudioOutputStream.CreatePullStream(AudioStreamFormat.GetDefaultOutputFormat()));

            // Create an instance of the Azure Speech SDK speech synthesizer
            return(new SpeechSynthesizer(SpeechConfig, audioConfig));
        }
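For context, a hedged sketch of how a synthesizer wired to a pull stream could be consumed. Keeping a reference to the pull stream (unlike the method above, which does not return it) and the method name are assumptions made for illustration.

        private async Task SynthesizeToPullStreamAsync(string text)
        {
            var pullStream = AudioOutputStream.CreatePullStream(AudioStreamFormat.GetDefaultOutputFormat());
            using (var audioConfig = AudioConfig.FromStreamOutput(pullStream))
                using (var synthesizer = new SpeechSynthesizer(SpeechConfig, audioConfig))
                {
                    await synthesizer.SpeakTextAsync(text);
                }

            // Drain the synthesized PCM from the pull stream in chunks.
            var  buffer    = new byte[4096];
            uint bytesRead;
            while ((bytesRead = pullStream.Read(buffer)) > 0)
            {
                // Hand the chunk to a player or file writer here.
            }
        }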
Example #16
        public AudioSegment(byte[] audioData, long startOffset, long endOffset,
                            uint sampleRate = SAMPLE_RATE, byte bitsPerSample = BITS_PER_SAMPLE, byte channels = CHANNELS)
        {
            MemoryStream      tempStream   = new MemoryStream(audioData);
            AudioStreamFormat streamFormat = AudioStreamFormat.GetWaveFormatPCM(sampleRate, bitsPerSample, channels);

            AudioStream = AudioInputStream.CreatePullStream(new BinaryAudioStreamReader(tempStream), streamFormat);

            AudioData   = audioData;
            StartOffset = startOffset;
            EndOffset   = endOffset;
        }
Example #17
        public static AudioStreamFormat ReadWaveHeader(this BinaryReader reader)
        {
            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'R') && (data[1] == 'I') && (data[2] == 'F') && (data[3] == 'F'), "Wrong wav header");

            // Chunk size
            long fileSize = reader.ReadInt32();

            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'W') && (data[1] == 'A') && (data[2] == 'V') && (data[3] == 'E'), "Wrong wav tag in wav header");

            // Tag: "fmt"
            reader.Read(data, 0, 4);
            Trace.Assert((data[0] == 'f') && (data[1] == 'm') && (data[2] == 't') && (data[3] == ' '), "Wrong format tag in wav header");

            // chunk format size
            var formatSize       = reader.ReadInt32();
            var formatTag        = reader.ReadUInt16();
            var channels         = reader.ReadUInt16();
            var samplesPerSecond = reader.ReadUInt32();
            var avgBytesPerSec   = reader.ReadUInt32();
            var blockAlign       = reader.ReadUInt16();
            var bitsPerSample    = reader.ReadUInt16();

            // So far we have read the 16 mandatory bytes of the format chunk; anything beyond that is cbSize and is skipped for now.
            if (formatSize > 16)
            {
                reader.ReadBytes((int)(formatSize - 16));
            }

            // Second Chunk, data
            // tag: data.
            reader.Read(data, 0, 4);
            // Accept either a "data" chunk or a "LIST" chunk here; anything else means a malformed header.
            // (In the original, "is false" bound only to the last comparison, so the check never did what was intended.)
            bool isDataTag = (data[0] == 'd') && (data[1] == 'a') && (data[2] == 't') && (data[3] == 'a');
            bool isListTag = (data[0] == 'L') && (data[1] == 'I') && (data[2] == 'S') && (data[3] == 'T');
            if (!isDataTag && !isListTag)
            {
                throw new Exception("Wrong data tag in wav");
            }
            // data chunk size
            int dataSize = reader.ReadInt32();

            // now, we have the format in the format parameter and the
            // reader set to the start of the body, i.e., the raw sample data
            return(AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels));
        }
        /// <summary>
        /// Constructs an <see cref="AudioConfig"/> from <see cref="Config"/>.
        /// Depending on the available services, this may either use the audio features built into the Speech SDK (such as <see cref="AudioConfig.FromDefaultMicrophoneInput"/>),
        /// or it may construct a <see cref="IStreamAudioSource"/> that accesses the requested <see cref="AudioDevice"/> with resampling and noise gates as required.
        /// </summary>
        /// <returns></returns>
        protected AudioConfig GetAudioConfig()
        {
            var streamSource = GetStreamAudioSource(Config.AudioSource);

            if (streamSource != null)
            {
                //use this stream source and convert to an Azure audio stream
                try
                {
                    var azureInput = AudioInputStream.CreatePushStream(AudioStreamFormat.GetWaveFormatPCM(
                                                                           (uint)streamSource.Format.SampleRate,
                                                                           (byte)streamSource.Format.BitsPerSample,
                                                                           (byte)streamSource.Format.ChannelCount));

                    byte[] bufferOptional = null;
                    streamSource.DataAvailable += (s, e) =>
                    {
                        azureInput.Write(e.Buffer.GetArray(ref bufferOptional), e.Buffer.Count);
                    };
                    streamSource.Stopped += (s, e) =>
                    {
                        if (e.Cause == StreamAudioSourceStoppedCause.Stopped)
                        {
                            //signal end-of-stream to Azure
                            azureInput.Close();
                        }
                    };

                    this.StreamAudioSource = streamSource;
                    return(AudioConfig.FromStreamInput(azureInput));
                }
                catch (Exception ex)
                {
                    Logger.LogError(ex, $"Error while creating an Azure AudioConfig from an IStreamAudioSource. Format: SampleRate={streamSource.Format.SampleRate}, BitsPerSample={streamSource.Format.BitsPerSample}, Channels={streamSource.Format.ChannelCount}");
                    streamSource.Dispose();
                }
            }

            this.StreamAudioSource    = null;
            this.StreamAudioNoiseGate = null;

            //try and use the built-in audio engine
            if (Config.AudioSource is AudioDevice audioDevice)
            {
                if (audioDevice.UseDefaultAudioInputDevice)
                {
                    return(AudioConfig.FromDefaultMicrophoneInput());
                }
            }

            return(null);
        }
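A hedged sketch of a caller for GetAudioConfig; the speechConfig parameter, the method name, and the logging call are assumptions rather than parts of the original class.

        protected async Task StartRecognitionSketchAsync(SpeechConfig speechConfig)
        {
            var audioConfig = GetAudioConfig();
            if (audioConfig == null)
            {
                return;
            }

            // Wire the configured audio source into a recognizer and log whatever it hears.
            var recognizer = new SpeechRecognizer(speechConfig, audioConfig);
            recognizer.Recognized += (s, e) => Logger.LogInformation(e.Result.Text);
            await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
        }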
        // Allows OverlordBot to listen for a specific wake word before it starts recognizing. Currently not used, although the setup is all in place.
        // This is due to weird state-transition errors that have not yet been debugged. The possible benefit is fewer calls to the Speech endpoint,
        // but it is unclear whether that is worth further investigation.
        //private readonly KeywordRecognitionModel _wakeWord;

        public SpeechRecognitionListener(BufferedWaveProvider bufferedWaveProvider, ConcurrentQueue <byte[]> responseQueue, RadioInformation radioInfo)
        {
            radioInfo.TransmissionQueue = responseQueue;
            _botType   = radioInfo.botType;
            _frequency = radioInfo.freq;
            _callsign  = radioInfo.callsign;

            _logClientId = radioInfo.name;

            switch (radioInfo.botType)
            {
            case "ATC":
                Controller = new AtcController
                {
                    Callsign = radioInfo.callsign,
                    Voice    = radioInfo.voice,
                    Radio    = radioInfo
                };
                break;

            case "AWACS":
                Controller = new AwacsController
                {
                    Callsign = radioInfo.callsign,
                    Voice    = radioInfo.voice,
                    Radio    = radioInfo
                };
                break;

            default:
                Controller = new MuteController
                {
                    Callsign = radioInfo.callsign,
                    Voice    = null,
                    Radio    = null
                };
                break;
            }

            var encoder = OpusEncoder.Create(AudioManager.InputSampleRate, 1, Application.Voip);

            encoder.ForwardErrorCorrection = false;
            encoder.FrameByteCount(AudioManager.SegmentFrames);

            var streamReader = new BufferedWaveProviderStreamReader(bufferedWaveProvider);

            _audioConfig = AudioConfig.FromStreamInput(streamReader, AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1));

            //_wakeWord = KeywordRecognitionModel.FromFile($"Overlord/WakeWords/{callsign}.table");
        }
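If the commented-out wake word were ever enabled, the flow would look roughly like this sketch; the method name is an assumption, and the original never starts keyword recognition.

        private async Task ListenForWakeWordAsync(SpeechConfig speechConfig, KeywordRecognitionModel wakeWord)
        {
            var recognizer = new SpeechRecognizer(speechConfig, _audioConfig);
            // Starts on-device keyword spotting; audio goes to the Speech endpoint only after the keyword is detected.
            await recognizer.StartKeywordRecognitionAsync(wakeWord);
            // ... handle Recognized events for the transmission here ...
            await recognizer.StopKeywordRecognitionAsync();
        }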
        public async Task Start()
        {
            var config = SpeechConfig.FromSubscription(_projectSettings.AzureSpeechServiceSubscriptionKey, _projectSettings.AzureSpeechServiceRegionName);

            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(8000, 16, 1);

            _inputStream = AudioInputStream.CreatePushStream(audioFormat);
            _audioInput  = AudioConfig.FromStreamInput(_inputStream);

            _recognizer = new SpeechRecognizer(config, _audioInput);
            _recognizer.SessionStarted += RecognizerStarted;
            _recognizer.Recognized     += RecognizerRecognized;
            _recognizer.Canceled       += RecognizerCancelled;

            await _recognizer.StartContinuousRecognitionAsync();
        }
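Something still has to feed audio into the push stream created in Start(). A hedged sketch of the producer side follows; the method names and the source of the PCM chunks are assumptions.

        public void WriteAudioChunk(byte[] pcmChunk)
        {
            // Push raw PCM (8 kHz, 16-bit, mono, matching the format above) into the recognizer's stream.
            _inputStream.Write(pcmChunk, pcmChunk.Length);
        }

        public void EndAudio()
        {
            // Signal end-of-stream so the recognizer can finish the session.
            _inputStream.Close();
        }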
Example #21
    public async UniTask STTBytes(byte[] readBytes, int sampleRate, int bitRate, int channels)
    {
        var speechConfig = SpeechConfig.FromSubscription(subscription_key, region);

        speechConfig.SpeechRecognitionLanguage = location;
        var audioStreamFormat = AudioStreamFormat.GetWaveFormatPCM((uint)sampleRate, (byte)bitRate, (byte)channels);
        var audioInputStream  = AudioInputStream.CreatePushStream(audioStreamFormat);
        var audioConfig       = AudioConfig.FromStreamInput(audioInputStream);
        var recognizer        = new SpeechRecognizer(speechConfig, audioConfig);

        audioInputStream.Write(readBytes, readBytes.Length);
        // Close the push stream so the recognizer sees end-of-stream instead of waiting for more audio.
        audioInputStream.Close();

        var result = await recognizer.RecognizeOnceAsync();

        Debug.Log($"Recognized Line : = {result.Text}");
    }
Example #22
        public static AudioConfig DownloadWavFile(BinaryReader reader)
        {
            // Tag "RIFF"
            char[] data = new char[4];
            reader.Read(data, 0, 4);

            // Chunk size
            long fileSize = reader.ReadInt32();

            // Subchunk, Wave Header
            // Subchunk, Format
            // Tag: "WAVE"
            reader.Read(data, 0, 4);

            // Tag: "fmt"
            reader.Read(data, 0, 4);

            // chunk format size
            var formatSize       = reader.ReadInt32();
            var formatTag        = reader.ReadUInt16();
            var channels         = reader.ReadUInt16();
            var samplesPerSecond = reader.ReadUInt32();
            var avgBytesPerSec   = reader.ReadUInt32();
            var blockAlign       = reader.ReadUInt16();
            var bitsPerSample    = reader.ReadUInt16();

            // So far we have read the 16 mandatory bytes of the format chunk; anything beyond that is cbSize and is skipped for now.
            if (formatSize > 16)
            {
                reader.ReadBytes(formatSize - 16);
            }

            // Second Chunk, data
            // tag: data.
            reader.Read(data, 0, 4);

            // data chunk size
            int dataSize = reader.ReadInt32();

            // now, we have the format in the format parameter and the
            // reader set to the start of the body, i.e., the raw sample data
            AudioStreamFormat format = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, (byte)bitsPerSample, (byte)channels);

            return(AudioConfig.FromStreamInput(new BinaryAudioStreamReader(reader), format));
        }
Example #23
        private void Init(string from, string to)
        {
            this.toLanguage = to;

            Profile       = MediaEncodingProfile.CreateWav(AudioEncodingQuality.Low);
            Profile.Audio = AudioEncodingProperties.CreatePcm(16000, 1, 16);

            byte channels         = 1;
            byte bitsPerSample    = 16;
            uint samplesPerSecond = 16000; // or 8000
            var  audioFormat      = AudioStreamFormat.GetWaveFormatPCM(samplesPerSecond, bitsPerSample, channels);

            // Init Push Stream

            pushStream = AudioInputStream.CreatePushStream(audioFormat);

            if (from == to)
            {
                var config = SpeechConfig.FromSubscription(apiKey, region);
                config.SpeechRecognitionLanguage = from;

                speechRecognizer = new SpeechRecognizer(config, AudioConfig.FromStreamInput(pushStream));

                speechRecognizer.Recognizing += RecognisingSpeechHandler;
                speechRecognizer.Recognized  += RecognisingSpeechHandler;

                speechRecognizer.SessionStarted += (sender, args) => this.RecognisionStarted?.Invoke();
                speechRecognizer.SessionStopped += (sender, args) => this.RecognisionStopped?.Invoke();
            }
            else
            {
                var config = SpeechTranslationConfig.FromSubscription(apiKey, region);
                config.SpeechRecognitionLanguage = from;
                config.AddTargetLanguage(to);

                translationRecognizer = new TranslationRecognizer(config, AudioConfig.FromStreamInput(pushStream));

                translationRecognizer.SessionStarted += (sender, args) => this.RecognisionStarted?.Invoke();
                translationRecognizer.SessionStopped += (sender, args) => this.RecognisionStopped?.Invoke();

                translationRecognizer.Recognizing += RecognisingTranslationHandler;
                translationRecognizer.Recognized  += RecognisingTranslationHandler;
            }
        }
Example #24
        public static async Task <string> RecognizeSpeechFromBytesAsync(byte[] bytes, string locale)
        {
            MemoryStream stream = new MemoryStream(bytes);

            var speechApiKey    = Environment.GetEnvironmentVariable("SpeechApiKey");
            var speechApiRegion = Environment.GetEnvironmentVariable("SpeechApiRegion");

            var speechConfig = SpeechConfig.FromSubscription(speechApiKey, speechApiRegion);

            speechConfig.SpeechRecognitionLanguage = locale;

            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1);
            var audioStream = new VoiceAudioStream(stream);
            var audioConfig = AudioConfig.FromStreamInput(audioStream);

            var recognizer = new SpeechRecognizer(speechConfig, audioConfig);
            var result     = await recognizer.RecognizeOnceAsync();

            return(result.Text);
        }
        public async Task TranscribeSpeechFromAudioStream(SpeechConfig config, string person, int startSecond = 0, int endSecond = 0)
        {
            var audioFormat = AudioStreamFormat.GetWaveFormatPCM(SamplesPerSecond, BitsPerSample, Channels);

            using (var waveFileReader = new WaveFileReader(_wavFilePath))
            {
                var pullAudioInputStreamCallback = new VoiceAudioStream();
                TrimWavFile(waveFileReader, pullAudioInputStreamCallback, BytesPerSecond * startSecond,
                            BytesPerSecond * endSecond);
                var speechToText = new SpeechToTextRecognizer(person, _streamWriter);
                using (var audioConfig = AudioConfig.FromStreamInput(pullAudioInputStreamCallback, audioFormat))
                {
                    using (var basicRecognizer = new SpeechRecognizer(config, audioConfig))
                    {
                        await speechToText.RunRecognizer(basicRecognizer, RecognizerType.Base,
                                                         _stopBaseRecognitionTaskCompletionSource).ConfigureAwait(false);
                    }
                }
            }
        }
Example #26
        public async Task <IAudioClip> Synthesize(string text)
        {
            var stream = AudioOutputStream.CreatePullStream(AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1));

            //Generate voice data into stream
            using (var streamConfig = AudioConfig.FromStreamOutput(stream))
                using (var synthesizer = new SpeechSynthesizer(_config, streamConfig))
                {
                    using (var result = await synthesizer.SpeakTextAsync(text))
                    {
                        if (result.Reason == ResultReason.Canceled)
                        {
                            var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                            throw new TaskCanceledException($"{cancellation.Reason}: {cancellation.ErrorDetails}");
                        }
                    }
                }

            //Create a clip which consumes this audio data
            return(new AudioOutputStreamClip($"TTS:`{text}`", stream, new WaveFormat(16000, 16, 1)));
        }
Example #27
 //
 // Create AudioConfig
 //
 private AudioConfig AudioConfigFromUserConfig()
 {
     if (this.userConfig.inputFilePath is string inputFilePathValue)
     {
         if (!this.userConfig.useCompressedAudio)
         {
             return(Helper.OpenWavFile(inputFilePathValue, AudioProcessingOptions.Create(0)));
         }
         else
         {
             var reader = new BinaryReader(File.OpenRead(inputFilePathValue));
             var format = AudioStreamFormat.GetCompressedFormat(userConfig.compressedAudioFormat);
             var stream = new PullAudioInputStream(new BinaryAudioStreamReader(reader), format);
             return(AudioConfig.FromStreamInput(stream));
         }
     }
     else
     {
         return(AudioConfig.FromDefaultMicrophoneInput());
     }
 }
Example #28
        /// <summary>
        /// Creates a class-level speech recognizer for a specific language using Azure credentials
        /// and hooks up lifecycle and recognition events.
        /// </summary>
        void CreateSpeechRecognizer()
        {
            if (string.IsNullOrEmpty(SpeechServiceAPIKey))
            {
                finalString = "You forgot to obtain Cognitive Services Speech credentials and insert them into this app." + Environment.NewLine +
                              "See the README file and/or the instructions in the Awake() function for more info before proceeding.";
                errorString = "ERROR: Missing service credentials";
                UnityEngine.Debug.LogFormat(errorString);
                return;
            }
            UnityEngine.Debug.LogFormat("Creating Speech Recognizer.");
            // finalString = "Initializing speech recognition, please wait...";
            finalString = "Start: ";

            if (recognizer == null)
            {
                SpeechConfig sconfig = SpeechConfig.FromSubscription("b9bdc34702c1439589daf92475e8f827", "westus2");
                sconfig.SpeechRecognitionLanguage = fromLanguage;

                audioStream = new MicToAudioStream();
                AudioConfig aconfig = AudioConfig.FromStreamInput(audioStream, AudioStreamFormat.GetWaveFormatPCM(16000, 16, 1));

                recognizer = new SpeechRecognizer(sconfig, aconfig);

                if (recognizer != null)
                {
                    // Subscribes to speech events.
                    recognizer.Recognizing         += RecognizingHandler;
                    recognizer.Recognized          += RecognizedHandler;
                    recognizer.SpeechStartDetected += SpeechStartDetectedHandler;
                    recognizer.SpeechEndDetected   += SpeechEndDetectedHandler;
                    recognizer.Canceled            += CanceledHandler;
                    recognizer.SessionStarted      += SessionStartedHandler;
                    recognizer.SessionStopped      += SessionStoppedHandler;
                }
            }
            UnityEngine.Debug.LogFormat("CreateSpeechRecognizer exit");
        }
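CreateSpeechRecognizer only wires the recognizer up. A hedged sketch of the start/stop companions that would typically sit alongside it in the same Unity script; the method names are assumptions.

        private async void StartContinuousRecognition()
        {
            if (recognizer != null)
            {
                await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            }
        }

        private async void StopContinuousRecognition()
        {
            if (recognizer != null)
            {
                await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
            }
        }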
Example #29
        public async Task <string> AudioToTextAsync(byte[] pcm)
        {
            var guid = Guid.NewGuid();

            if (!Text.ContainsKey(guid))
            {
                Text[guid] = null;
            }

            // Build out the speech recognizer
            using (var pushStream = AudioInputStream.CreatePushStream(AudioStreamFormat.GetDefaultInputFormat()))
                using (var audioInput = AudioConfig.FromStreamInput(pushStream))
                    using (var recognizer = new SpeechRecognizer(SpeechConfig, audioInput))
                    {
                        // Subscribe to speech recognizer events.
                        recognizer.SessionStarted += OnSpeechRecognitionSessionStarted;
                        recognizer.Recognizing    += OnSpeechRecognizing;
                        recognizer.Recognized     += (s, e) => OnSpeechRecognized(s, e, guid);
                        recognizer.Canceled       += OnSpeechCanceled;
                        recognizer.SessionStopped += OnSpeechRecognitionSessionStopped;

                        // Start continuous recognition. Uses StopContinuousRecognitionAsync() to stop recognition.
                        await recognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                        // Send the pcm data to the speech recognizer
                        pushStream.Write(pcm);
                        pushStream.Close();

                        // Wait for completion.
                        // Use Task.WaitAny to keep the task rooted.
                        Task.WaitAny(StopRecognition.Task);

                        // Stop recognition.
                        await recognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

                        return(Text[guid]);
                    }
        }
Example #30
        /// <summary>
        /// Common routine for transcribing an audio file.
        /// </summary>
        /// <param name="apiKey">The subscription key.</param>
        /// <param name="region">The region of the resource.</param>
        /// <param name="reader">BinaryReader instance for reading the input stream.</param>
        /// <returns>A Task returning the transcribed speech.</returns>
        private async Task <string> TranscribeAudioCommonAsync(Secret apiKey, string region, BinaryReader reader)
        {
            string transcript = null;

            using (BinaryAudioStreamReader streamReader = new BinaryAudioStreamReader(reader))
            {
                AudioStreamFormat audioStreamFormat = ReadWaveHeader(reader);
                AudioConfig       audioConfig       = AudioConfig.FromStreamInput(streamReader, audioStreamFormat);
                SpeechConfig      speechConfig      = SpeechConfig.FromSubscription(apiKey.Value, region);

                _speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);

                _speechRecognizer.Recognized     += Recognized;
                _speechRecognizer.Canceled       += Canceled;
                _speechRecognizer.SessionStopped += SessionStopped;
                _speechRecognizer.Canceled       += SessionStopped;

                await _speechRecognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);

                Task.WaitAny(new[] { _stopRecognition.Task });

                await _speechRecognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);

                if (!string.IsNullOrWhiteSpace(_cancellationDetails))
                {
                    throw new TranscriberCanceledException($"Azure Speech cancellation error: {_cancellationDetails}");
                }

                transcript = _transcriptBuilder.ToString();

                if (string.IsNullOrWhiteSpace(transcript))
                {
                    throw new TranscriberEmptyTranscriptException("Azure Speech returned blank transcript!");
                }
            }

            return(transcript);
        }
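A hedged usage sketch for the common routine above; the method name, the file path, and how the Secret key is obtained are assumptions.

        private async Task<string> TranscribeLocalWavSketchAsync(Secret apiKey, string region)
        {
            using (var reader = new BinaryReader(File.OpenRead("meeting.wav")))
            {
                // The reader starts at the beginning of the file; TranscribeAudioCommonAsync reads the WAV header itself.
                return await TranscribeAudioCommonAsync(apiKey, region, reader).ConfigureAwait(false);
            }
        }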