Code Example #1
File: MainWindow.cs Project: danbohus/psi-samples
        private void MainWindow_Shown(object sender, EventArgs e)
        {
            // Create the \psi pipeline
            this.pipeline = Pipeline.Create();

            // Create the webcam component
            var webcam = new MediaCapture(this.pipeline, 640, 480, "/dev/video0", PixelFormatId.YUYV);

            // Create the audio capture component
            var audio = new AudioCapture(this.pipeline, new AudioCaptureConfiguration {
                DeviceName = "plughw:0,0", Format = WaveFormat.Create16kHz1Channel16BitPcm()
            });

            // Create an acoustic features extractor component and pipe the audio to it
            var acousticFeatures = new AcousticFeaturesExtractor(this.pipeline);

            audio.PipeTo(acousticFeatures);

            // Fuse the webcam images with the audio log energy level
            var webcamWithAudioEnergy = webcam.Join(acousticFeatures.LogEnergy, RelativeTimeInterval.Past());

            // Overlay the audio energy on the webcam image and display it in the window.
            // The "Do" operator is executed on each fused webcam and audio energy sample.
            webcamWithAudioEnergy.Do(
                frame =>
                {
                    // Update the window with the latest frame
                    this.DrawFrame(frame);
                },
                DeliveryPolicy.LatestMessage);

            // Start the pipeline running
            this.pipeline.RunAsync();
        }
Code Example #2
        private async Task<string> GetVoiceSignatureString()
        {
            var audioStream = new MemoryStream();
            var writer      = new WaveDataWriterClass(audioStream, WaveFormat.Create16kHz1Channel16BitPcm());

            using (var p = Pipeline.Create())
            {
                var capture = new AudioCapture(p, WaveFormat.Create16kHz1Channel16BitPcm());
                capture.Do(audio => writer.Write(audio.Data.DeepClone()));
                p.RunAsync();
                await Task.Delay(5000);

                writer.Flush();
            }

            var content = new ByteArrayContent(audioStream.GetBuffer(), 0, (int)audioStream.Length);
            var client  = new HttpClient();

            client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", this.SubscriptionKey);
            var response = await client.PostAsync($"https://signature.{this.Region}.cts.speech.microsoft.com/api/v1/Signature/GenerateVoiceSignatureFromByteArray", content);

            var jsonData = await response.Content.ReadAsStringAsync();

            var result = JsonConvert.DeserializeObject<VoiceSignature>(jsonData);

            return JsonConvert.SerializeObject(result.Signature);
        }
Code Example #3
        private static IProducer<AudioBuffer> SetupAudioInput(Pipeline pipeline, string inputLogPath, ref DateTime startTime)
        {
            IProducer<AudioBuffer> audioInput = null;

            if (inputLogPath != null)
            {
                // Open the MicrophoneAudio stream from the last saved log
                var store = Store.Open(pipeline, AppName, inputLogPath);
                audioInput = store.OpenStream <AudioBuffer>($"{AppName}.MicrophoneAudio");

                // Get the originating time of the start of the data in the store. We will use this
                // to set the correct start time in the visualizer (if live visualization is on).
                startTime = store.OriginatingTimeInterval.Left;
            }
            else
            {
                // Create the AudioSource component to capture audio from the default device in 16 kHz 1-channel
                // PCM format as required by both the voice activity detector and speech recognition components.
                audioInput = new AudioSource(pipeline, new AudioSourceConfiguration()
                {
                    OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm()
                });
            }

            return audioInput;
        }
Code Example #4
        /// <summary>
        /// Builds and runs a webcam pipeline and records the data to a Psi store.
        /// </summary>
        /// <param name="pathToStore">The path to directory where store should be saved.</param>
        public static void RecordAudioVideo(string pathToStore)
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Create store
                var store = PsiStore.Create(pipeline, ApplicationName, pathToStore);

                // Create our webcam
                var webcam = new MediaCapture(pipeline, 1920, 1080, 30);

                // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());

                var images = webcam.Out.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;

                // Attach the webcam's image output to the store. We will write the images to the store as compressed JPEGs.
                images.Write("Image", store, true, DeliveryPolicy.LatestMessage);

                // Attach the audio input to the store
                audioInput.Out.Write("Audio", store, true, DeliveryPolicy.LatestMessage);

                // Run the pipeline
                pipeline.RunAsync();

                Console.WriteLine("Press any key to finish recording");
                Console.ReadKey();
            }
        }
Code Example #5
        /// <summary>
        /// Initializes a new instance of the <see cref="SystemVoiceActivityDetectorConfiguration"/> class.
        /// </summary>
        public SystemVoiceActivityDetectorConfiguration()
        {
            this.Language         = "en-us";
            this.Grammars         = null;
            this.BufferLengthInMs = 1000;

            // These values affect the latency of results from the VAD. Due to inherent delay
            // between the time audio is sent to the internal recognition engine and when the
            // engine detects that speech is present and makes a state transition, we need to
            // add these offsets to the computed time at which the state transition occurs to
            // ensure proper alignment between the audio and VAD result. A negative value
            // will shift the result earlier in time to account for this delay. However, this
            // will also contribute to the latency of the VAD output, so we should tune this
            // to be as close to zero as possible while still maintaining correctness. Values
            // between -50 ms and -150 ms appear to be reasonable.
            this.VoiceActivityStartOffsetMs = -150;
            this.VoiceActivityEndOffsetMs   = -150;

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();

            // Modify these values to improve VAD responsiveness. The EndSilenceTimeoutMs and
            // EndSilenceTimeoutAmbiguousMs parameters seem to matter most. Initialized to the
            // default values as specified in the documentation here:
            // https://docs.microsoft.com/en-us/dotnet/api/system.speech.recognition.speechrecognitionengine?view=netframework-4.8#properties
            this.InitialSilenceTimeoutMs      = 0;
            this.BabbleTimeoutMs              = 0;
            this.EndSilenceTimeoutAmbiguousMs = 500;
            this.EndSilenceTimeoutMs          = 150;
        }
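
The offset discussion in the comment above is the main tuning knob this configuration exposes. As an illustrative sketch only (it assumes SystemVoiceActivityDetector has a (pipeline, configuration) constructor, mirroring how SystemSpeechRecognizer is constructed in Code Example #27), the offsets could be overridden when the detector is created:

        // Sketch: trade alignment against latency by overriding the default offsets.
        // Assumes the (pipeline, configuration) constructor pattern used by other \psi
        // components on this page; 'pipeline' is an existing Pipeline instance.
        var vad = new SystemVoiceActivityDetector(pipeline, new SystemVoiceActivityDetectorConfiguration()
        {
            VoiceActivityStartOffsetMs = -100,
            VoiceActivityEndOffsetMs = -100,
        });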
Code Example #6
        /// <summary>
        /// Initializes a new instance of the <see cref="SystemSpeechRecognizerConfiguration"/> class.
        /// </summary>
        public SystemSpeechRecognizerConfiguration()
        {
            this.Language         = "en-us";
            this.Grammars         = null;
            this.BufferLengthInMs = 1000;

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #7
        /// <summary>
        /// Initializes a new instance of the <see cref="AzureSpeechRecognizerConfiguration"/> class.
        /// </summary>
        public AzureSpeechRecognizerConfiguration()
        {
            this.Language        = "en-us";
            this.SubscriptionKey = null; // This must be set to the key associated with your account
            this.Region          = null; // This must be set to the region associated to the key

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #8
        /// <summary>
        /// Initializes a new instance of the <see cref="AcousticFeaturesExtractorConfiguration"/> class.
        /// </summary>
        public AcousticFeaturesExtractorConfiguration()
        {
            // Default parameters for acoustic features computation
            this.computeFFT      = false;
            this.computeFFTPower = false;

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #9
        public void AudioBuffer_Empty()
        {
            AudioBuffer buffer = new AudioBuffer(0, WaveFormat.Create16kHz1Channel16BitPcm());

            Assert.AreEqual(0, buffer.Length);
            Assert.AreEqual(0, buffer.Data.Length);
            Assert.AreEqual(WaveFormat.Create16kHz1Channel16BitPcm(), buffer.Format);
            Assert.AreEqual(TimeSpan.Zero, buffer.Duration);
            CollectionAssert.AreEqual(new double[0], this.GetSamples(buffer).ToArray());
        }
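
The AudioBuffer tests on this page call a GetSamples helper that is not included in any of the snippets. A minimal sketch consistent with the assertions (decoding 16-bit little-endian mono PCM into doubles) is shown below; the actual test-class implementation may differ.

        // Hypothetical stand-in for the GetSamples helper used by these tests; assumes
        // 16-bit mono PCM as produced by WaveFormat.Create16kHz1Channel16BitPcm().
        private IEnumerable<double> GetSamples(AudioBuffer buffer)
        {
            for (int i = 0; i < buffer.Data.Length; i += 2)
            {
                yield return BitConverter.ToInt16(buffer.Data, i);
            }
        }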
Code Example #10
        public void AudioBuffer_16kHz1Channel16BitPcm1Sample()
        {
            AudioBuffer buffer = new AudioBuffer(BitConverter.GetBytes((short)-12345), WaveFormat.Create16kHz1Channel16BitPcm());

            Assert.AreEqual(-12345, BitConverter.ToInt16(buffer.Data, 0));
            Assert.AreEqual(2, buffer.Length);
            Assert.AreEqual(2, buffer.Data.Length);
            Assert.AreEqual(WaveFormat.Create16kHz1Channel16BitPcm(), buffer.Format);
            Assert.AreEqual(TimeSpan.FromTicks(10000000L / 16000), buffer.Duration);
            CollectionAssert.AreEqual(new double[] { -12345 }, this.GetSamples(buffer).ToArray());
        }
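
The Duration assertion above is plain arithmetic: a TimeSpan tick is 100 ns, so one sample at 16 kHz spans 10,000,000 / 16,000 = 625 ticks (62.5 µs), and the two data bytes form exactly one 16-bit mono sample. A short, illustrative restatement of that calculation:

        // Duration arithmetic for 16 kHz, 16-bit, mono PCM (illustrative sketch, not test code).
        const long ticksPerSecond = 10000000L;  // TimeSpan tick = 100 ns
        const int sampleRate = 16000;
        const int bytesPerSample = 2;           // 16-bit mono

        int sampleCount = 2 / bytesPerSample;   // the buffer holds one sample
        var duration = TimeSpan.FromTicks(sampleCount * (ticksPerSecond / sampleRate)); // 625 ticks = 62.5 µs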
Code Example #11
        /// <summary>
        /// Initializes a new instance of the <see cref="SystemVoiceActivityDetectorConfiguration"/> class.
        /// </summary>
        public SystemVoiceActivityDetectorConfiguration()
        {
            this.Language                   = "en-us";
            this.Grammars                   = null;
            this.BufferLengthInMs           = 1000;
            this.VoiceActivityStartOffsetMs = -150;
            this.VoiceActivityEndOffsetMs   = -150;

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #12
        /// <summary>
        /// Initializes a new instance of the <see cref="AudioPlayerConfiguration"/> class.
        /// </summary>
        public AudioPlayerConfiguration()
        {
            this.DeviceName          = string.Empty;
            this.TargetLatencyInMs   = 20;
            this.BufferLengthSeconds = 0.1;
            this.AudioLevel          = -1;
            this.Gain = 1.0f;

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #13
File: Program.cs Project: bookesse/psi
        /// <summary>
        /// Builds and runs a speech recognition pipeline using the Azure speech recognizer. Requires a valid Cognitive Services
        /// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account.
        /// </summary>
        /// <remarks>
        /// If you are getting a <see cref="System.InvalidOperationException"/> with the message 'AzureSpeechRecognizer returned
        /// OnConversationError with error code: LoginFailed. Original error text: Transport error', this most likely is due to
        /// an invalid subscription key. Please check your Azure portal at https://portal.azure.com and ensure that you have
        /// added a subscription to the Azure Speech API on your account.
        /// </remarks>
        public static void RunAzureSpeech()
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                // PCM format as required by both the voice activity detector and speech recognition components.
                IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, new AudioCaptureConfiguration()
                {
                    DeviceName = "plughw:0,0", Format = WaveFormat.Create16kHz1Channel16BitPcm()
                });

                // Perform voice activity detection using the voice activity detector component
                var vad = new SimpleVoiceActivityDetector(pipeline);
                audioInput.PipeTo(vad);

                // Create Azure speech recognizer component
                var recognizer = new AzureSpeechRecognizer(pipeline, new AzureSpeechRecognizerConfiguration()
                {
                    SubscriptionKey = Program.azureSubscriptionKey, Region = Program.azureRegion
                });

                // The input audio to the Azure speech recognizer needs to be annotated with a voice activity flag.
                // This can be constructed by using the Psi Join() operator to combine the audio and VAD streams.
                var annotatedAudio = audioInput.Join(vad);

                // Subscribe the recognizer to the annotated audio
                annotatedAudio.PipeTo(recognizer);

                // Partial and final speech recognition results are posted on the same stream. Here
                // we use Psi's Where() operator to filter out only the final recognition results.
                var finalResults = recognizer.Out.Where(result => result.IsFinal);

                // Print the recognized text of the final recognition result to the console.
                finalResults.Do(result => Console.WriteLine(result.Text));

                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Run the pipeline
                pipeline.RunAsync();

                // Azure speech transcribes speech to text
                Console.WriteLine("Say anything");

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
Code Example #14
        /// <summary>
        /// Initializes a new instance of the <see cref="SystemSpeechSynthesizerConfiguration"/> class.
        /// </summary>
        public SystemSpeechSynthesizerConfiguration()
        {
            this.Voice        = "Microsoft Zira Desktop";
            this.PersistAudio = false;
            this.UseDefaultAudioPlaybackDevice = false;
            this.BufferLengthInMs = 1000;
            this.ProsodyRate      = 1.0;
            this.ProsodyPitch     = "default";
            this.ProsodyVolume    = "default";

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #15
File: Program.cs Project: areilly711/psi
        /// <summary>
        /// Builds and runs a webcam pipeline and records the data to a Psi store
        /// </summary>
        /// <param name="pathToStore">The path to directory where store should be saved.</param>
        public static void RecordAudioVideo(string pathToStore)
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                var visualizationClient = new VisualizationClient();

                // Register an event handler to catch pipeline errors
                pipeline.PipelineCompletionEvent += PipelineCompletionEvent;

                // Clear all data if the visualizer is already open
                visualizationClient.ClearAll();

                // Set the visualization client to visualize live data
                visualizationClient.SetLiveMode();

                // Create store
                Data.Exporter store = Store.Create(pipeline, ApplicationName, pathToStore);

                // Create our webcam
                MediaCapture webcam = new MediaCapture(pipeline, 1920, 1080, 30);

                // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, new AudioCaptureConfiguration()
                {
                    OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm()
                });

                var images = webcam.Out.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;

                // Attach the webcam's image output to the store. We will write the images to the store as compressed JPEGs.
                Store.Write(images, "Image", store, true, DeliveryPolicy.LatestMessage);

                // Attach the audio input to the store
                Store.Write(audioInput.Out, "Audio", store, true, DeliveryPolicy.LatestMessage);

                // Create a XY panel in PsiStudio to display the images
                visualizationClient.AddXYPanel();
                images.Show(visualizationClient);

                // Create a timeline panel in PsiStudio to display the audio waveform
                visualizationClient.AddTimelinePanel();
                audioInput.Out.Show(visualizationClient);

                // Run the pipeline
                pipeline.RunAsync();

                Console.WriteLine("Press any key to finish recording");
                Console.ReadKey();
            }
        }
Code Example #16
        public void AudioBuffer_ReframeByDuration()
        {
            var audioFormat = WaveFormat.Create16kHz1Channel16BitPcm();
            var audioData   = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };

            int inputCount     = 100;
            int inputSize      = 10;
            var inputInterval  = TimeSpan.FromTicks(inputSize * 10000 / 32); // 10 bytes @ 32 kBytes/sec
            var outputInterval = TimeSpan.FromMilliseconds(10);              // 10 ms
            int outputSize     = (int)(outputInterval.TotalSeconds * audioFormat.AvgBytesPerSec);

            var output    = new List<(AudioBuffer, DateTime)>();
            var startTime = DateTime.MinValue;

            using (var p = Pipeline.Create())
            {
                // input stream of 10-byte audio buffers
                var audio = Generators.Repeat(p, new AudioBuffer(audioData, audioFormat), inputCount, inputInterval);

                // reframe output stream as 10 ms audio buffers
                var reframed = audio.Reframe(outputInterval);

                // capture outputs and start time for verification
                reframed.Do((x, e) => output.Add((x.DeepClone(), e.OriginatingTime)));
                audio.First().Do((x, e) => startTime = e.OriginatingTime - inputInterval);

                p.Run();
            }

            // verify no. of reframed output buffers
            Assert.AreEqual(inputCount * inputSize / outputSize, output.Count);

            foreach (var (buffer, dt) in output)
            {
                // verify output audio buffer originating times
                startTime += outputInterval;
                Assert.AreEqual(startTime, dt);

                // verify audio format remains the same
                Assert.AreEqual(audioFormat, buffer.Format);

                // verify the output audio bytes by constructing the expected output from a concatenation of the input data
                var expectedOutput = Enumerable.Repeat(audioData, outputSize / inputSize).SelectMany(x => x).Concat(audioData.Take(outputSize % inputSize));
                CollectionAssert.AreEqual(expectedOutput.ToArray(), buffer.Data);

                // shift the input data to account for any partial bytes when constructing the expected output above
                audioData = audioData.Skip(outputSize % inputSize).Concat(audioData.Take(outputSize % inputSize)).ToArray();
            }
        }
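
The expected count in the first assertion reduces to byte arithmetic: 100 input buffers of 10 bytes give 1,000 bytes in total, a 10 ms output buffer at 32,000 bytes/sec holds 320 bytes, so exactly 3 complete output buffers are produced and the remainder is never emitted as a full buffer. A sketch of that calculation, separate from the test itself:

        // Sketch of the arithmetic behind Assert.AreEqual(inputCount * inputSize / outputSize, output.Count).
        var format = WaveFormat.Create16kHz1Channel16BitPcm();          // AvgBytesPerSec == 32000
        int bytesPerOutput = (int)(TimeSpan.FromMilliseconds(10).TotalSeconds * format.AvgBytesPerSec); // 320
        int totalInputBytes = 100 * 10;                                 // inputCount * inputSize
        int expectedBuffers = totalInputBytes / bytesPerOutput;         // 3 complete 10 ms buffers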
Code Example #17
        public void AudioBuffer_ReframeSmaller()
        {
            var audioFormat = WaveFormat.Create16kHz1Channel16BitPcm();
            var audioData   = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };

            int inputCount     = 100;
            int inputSize      = 10;
            int outputSize     = 6;
            var inputInterval  = TimeSpan.FromTicks(inputSize * 10000 / 32);  // 10 bytes @ 32 kBytes/sec
            var outputInterval = TimeSpan.FromTicks(outputSize * 10000 / 32); // 6 bytes @ 32 kBytes/sec

            var output    = new List<(AudioBuffer, DateTime)>();
            var startTime = DateTime.MinValue;

            using (var p = Pipeline.Create())
            {
                // input stream of 10-byte audio buffers
                var audio = Generators.Repeat(p, new AudioBuffer(audioData, audioFormat), inputCount, inputInterval);

                // reframe output stream as 6-byte audio buffers
                var reframed = audio.Reframe(outputSize);

                // capture outputs and start time for verification
                reframed.Do((x, e) => output.Add((x.DeepClone(), e.OriginatingTime)));
                audio.First().Do((x, e) => startTime = e.OriginatingTime - inputInterval);

                p.Run();
            }

            // verify no. of reframed output buffers
            Assert.AreEqual(inputCount * inputSize / outputSize, output.Count);

            foreach (var (buffer, dt) in output)
            {
                // verify output audio buffer originating times
                startTime += outputInterval;
                Assert.AreEqual(startTime, dt);

                // verify audio format remains the same
                Assert.AreEqual(audioFormat, buffer.Format);

                // verify that the output audio bytes match the first [outputSize] bytes of the input data
                CollectionAssert.AreEqual(audioData.Take(outputSize).ToArray(), buffer.Data);

                // shift the input data to be aligned with the start of the next expected output buffer
                audioData = audioData.Skip(outputSize).Concat(audioData.Take(outputSize)).ToArray();
            }
        }
Code Example #18
        public void AudioBuffer_16kHz1Channel16BitPcm3Samples()
        {
            short[] rawValues = new short[] { -32768, 32767, 12345 };
            byte[]  rawBytes  = rawValues.SelectMany(x => BitConverter.GetBytes(x)).ToArray();

            AudioBuffer buffer = new AudioBuffer(rawBytes, WaveFormat.Create16kHz1Channel16BitPcm());

            Assert.AreEqual(-32768, BitConverter.ToInt16(buffer.Data, 0));
            Assert.AreEqual(32767, BitConverter.ToInt16(buffer.Data, 2));
            Assert.AreEqual(12345, BitConverter.ToInt16(buffer.Data, 4));
            Assert.AreEqual(6, buffer.Length);
            Assert.AreEqual(6, buffer.Data.Length);
            Assert.AreEqual(WaveFormat.Create16kHz1Channel16BitPcm(), buffer.Format);
            Assert.AreEqual(TimeSpan.FromTicks(rawValues.Length * (10000000L / 16000)), buffer.Duration);
            CollectionAssert.AreEqual(new double[] { -32768, 32767, 12345 }, this.GetSamples(buffer).ToArray());
        }
Code Example #19
        public void WaveFormat_Create16kHz1Channel16BitPcm()
        {
            // Define "native" WAVEFORMATEX structure for PCM
            byte[] formatBytes = new byte[]
            {
                0x01, 0x00,             // FormatTag = 1
                0x01, 0x00,             // Channels = 1
                0x80, 0x3e, 0x00, 0x00, // SamplesPerSec = 16000
                0x00, 0x7d, 0x00, 0x00, // AvgBytesPerSec = 32000
                0x02, 0x00,             // BlockAlign = 2
                0x10, 0x00,             // BitsPerSample = 16
                0x00, 0x00,             // ExtraSize = 0
            };

            // Create equivalent managed WaveFormat object
            WaveFormat format = WaveFormat.Create16kHz1Channel16BitPcm();

            // Verify against expected
            this.MarshalAndVerify(format, formatBytes);
        }
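
The derived fields in the byte layout above follow from the standard PCM relationships BlockAlign = Channels * BitsPerSample / 8 and AvgBytesPerSec = SamplesPerSec * BlockAlign; this is generic WAVEFORMATEX arithmetic, not a \psi-specific API. A small illustrative check:

        // Standard PCM relationships behind the formatBytes values (illustrative sketch).
        int channels = 1;
        int samplesPerSec = 16000;                       // 0x80, 0x3e little-endian
        int bitsPerSample = 16;

        int blockAlign = channels * bitsPerSample / 8;   // 2 bytes per sample frame
        int avgBytesPerSec = samplesPerSec * blockAlign; // 32000, i.e. 0x00, 0x7d little-endian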
Code Example #20
        static void Main(string[] args)
        {
            using (Pipeline pipeline = Pipeline.Create())
            {
                WaveFormat waveFormat = WaveFormat.Create16kHz1Channel16BitPcm();

                IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, new AudioCaptureConfiguration()
                {
                    OutputFormat = waveFormat
                });
                DataFaucet<AudioBuffer> df = new DataFaucet<AudioBuffer>(pipeline);
                audioInput.PipeTo(df);
                AggregateDump dump = new AggregateDump(pipeline);
                df.PipeTo(dump);
                GoogleASR gsr = new GoogleASR(pipeline, "en");                   //gsr for google speech recognition
                dump.PipeTo(gsr);
                GoogleTranslate gt = new GoogleTranslate(pipeline, "en", "de");  //gt for google translate
                gsr.PipeTo(gt);
                GoogleSpeak gs = new GoogleSpeak(pipeline, waveFormat, "de-DE"); //gs for google speak
                gt.PipeTo(gs);
                AudioOutput aOut = new AudioOutput(pipeline);                    //aOut for audio out
                gs.PipeTo(aOut);

                ActiveMQ rasa = new ActiveMQ(pipeline, "rasa.PSI", "rasa.PYTHON");
                gsr.PipeTo(rasa);

                GUI    gui    = new GUI(df, dump, gsr, gt);
                Thread thread = new Thread(() =>
                {
                    gui.ShowDialog();
                });
                thread.Start();

                pipeline.RunAsync();

                Console.ReadKey(true);
            }
        }
Code Example #21
        /// <summary>
        /// Initializes a new instance of the <see cref="AcousticFeaturesExtractorConfiguration"/> class.
        /// </summary>
        public AcousticFeaturesExtractorConfiguration()
        {
            // Default parameters for acoustic features computation
            this.FrameDurationInSeconds = 0.025f;
            this.FrameRateInHz          = 100.0f;
            this.AddDither                    = true;
            this.DitherScaleFactor            = 1.0f;
            this.StartFrequency               = 250.0f;
            this.EndFrequency                 = 7000.0f;
            this.LowEndFrequency              = 3000.0f;
            this.HighStartFrequency           = 2500.0f;
            this.EntropyBandwidth             = 2500.0f;
            this.ComputeLogEnergy             = true;
            this.ComputeZeroCrossingRate      = true;
            this.ComputeFrequencyDomainEnergy = true;
            this.ComputeLowFrequencyEnergy    = true;
            this.ComputeHighFrequencyEnergy   = true;
            this.ComputeSpectralEntropy       = true;
            this.ComputeFFT                   = false;
            this.ComputeFFTPower              = false;

            // Defaults to 16 kHz, 16-bit, 1-channel PCM samples
            this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        }
Code Example #22
File: Program.cs Project: llfuller/psi
        /// <summary>
        /// Builds and runs a speech recognition pipeline using the Azure speech service. Requires a valid Cognitive Services
        /// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/get-started.
        /// </summary>
        public static void RunAzureSpeech()
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                // PCM format as required by the speech recognition component.
                var audio = new AudioCapture(pipeline, new AudioCaptureConfiguration {
                    DeviceName = deviceName, Format = WaveFormat.Create16kHz1Channel16BitPcm()
                });

                // Create the speech recognizer component
                var recognizer = new ContinuousSpeechRecognizer(pipeline, azureSubscriptionKey, azureRegion);

                // Subscribe the recognizer to the annotated audio
                audio.PipeTo(recognizer);

                // Print the recognized text of the final recognition result to the console.
                recognizer.Out.Do((result, e) => Console.WriteLine($"{e.OriginatingTime.TimeOfDay}: {result}"));

                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Run the pipeline
                pipeline.RunAsync();

                // Azure speech transcribes speech to text
                Console.WriteLine("Say anything");

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
Code Example #23
 public void AudioBuffer_HasValidData()
 {
     Assert.IsFalse(default(AudioBuffer).HasValidData);
     Assert.IsTrue(new AudioBuffer(new byte[2], WaveFormat.Create16kHz1Channel16BitPcm()).HasValidData);
 }
Code Example #24
File: Program.cs Project: llfuller/psi
        /// <summary>
        /// Builds and runs a speech recognition pipeline using the Azure speech recognizer. Requires a valid Cognitive Services
        /// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account.
        /// </summary>
        /// <remarks>
        /// If you are getting a <see cref="System.InvalidOperationException"/> with the message 'AzureSpeechRecognizer returned
        /// OnConversationError with error code: LoginFailed. Original error text: Transport error', this most likely is due to
        /// an invalid subscription key. Please check your Azure portal at https://portal.azure.com and ensure that you have
        /// added a subscription to the Azure Speech API on your account.
        /// </remarks>
        /// <param name="outputLogPath">The path under which to write log data.</param>
        /// <param name="inputLogPath">The path from which to read audio input data.</param>
        public static void RunAzureSpeech(string outputLogPath = null, string inputLogPath = null)
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Use either live audio from the microphone or audio from a previously saved log
                IProducer<AudioBuffer> audioInput = null;
                if (inputLogPath != null)
                {
                    // Open the MicrophoneAudio stream from the last saved log
                    var store = PsiStore.Open(pipeline, Program.AppName, inputLogPath);
                    audioInput = store.OpenStream <AudioBuffer>($"{Program.AppName}.MicrophoneAudio");
                }
                else
                {
                    // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                    // PCM format as required by both the voice activity detector and speech recognition components.
                    audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());
                }

                // Perform voice activity detection using the voice activity detector component
                var vad = new SystemVoiceActivityDetector(pipeline);
                audioInput.PipeTo(vad);

                // Create Azure speech recognizer component
                var recognizer = new AzureSpeechRecognizer(pipeline, new AzureSpeechRecognizerConfiguration()
                {
                    SubscriptionKey = Program.azureSubscriptionKey, Region = Program.azureRegion
                });

                // The input audio to the Azure speech recognizer needs to be annotated with a voice activity flag.
                // This can be constructed by using the Psi Join() operator to combine the audio and VAD streams.
                var annotatedAudio = audioInput.Join(vad);

                // Subscribe the recognizer to the annotated audio
                annotatedAudio.PipeTo(recognizer);

                // Partial and final speech recognition results are posted on the same stream. Here
                // we use Psi's Where() operator to filter out only the final recognition results.
                var finalResults = recognizer.Out.Where(result => result.IsFinal);

                // Print the recognized text of the final recognition result to the console.
                finalResults.Do(result => Console.WriteLine(result.Text));

                // Create a data store to log the data to if necessary. A data store is necessary
                // only if output logging is enabled.
                var dataStore = CreateDataStore(pipeline, outputLogPath);

                // For disk logging only
                if (dataStore != null)
                {
                    // Log the microphone audio and recognition results
                    audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
                    finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
                    vad.Write($"{Program.AppName}.VoiceActivity", dataStore);
                }

                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Run the pipeline
                pipeline.RunAsync();

                // Azure speech transcribes speech to text
                Console.WriteLine("Say anything");

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
Code Example #25
 /// <summary>
 /// Initializes a new instance of the <see cref="AudioCaptureConfiguration"/> class.
 /// </summary>
 /// <remarks>Defaults to 16kHz, 1 channel, 16-bit PCM.</remarks>
 public AudioCaptureConfiguration()
     : this("plughw:0,0", WaveFormat.Create16kHz1Channel16BitPcm())
 {
 }
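
The parameterless constructor above chains to a (device name, format) constructor. For comparison, the same device name and format can be set with the object-initializer style used in Code Examples #1, #13 and #22 (a sketch; whether the two forms are fully interchangeable depends on what else the chained constructor initializes):

     // Sketch: explicit configuration of the same defaults. 'pipeline' is assumed to be
     // an existing Pipeline instance; property names are the ones used elsewhere on this page.
     var config = new AudioCaptureConfiguration()
     {
         DeviceName = "plughw:0,0",
         Format = WaveFormat.Create16kHz1Channel16BitPcm(),
     };
     var audioCapture = new AudioCapture(pipeline, config);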
Code Example #26
        /// <summary>
        /// Builds and runs a speech recognition pipeline using the .NET System.Speech recognizer and a set of fixed grammars.
        /// </summary>
        /// <param name="outputLogPath">The path under which to write log data.</param>
        /// <param name="inputLogPath">The path from which to read audio input data.</param>
        /// <param name="showLiveVisualization">A flag indicating whether to display live data in PsiStudio as the pipeline is running.</param>
        public static void RunSystemSpeech(
            string outputLogPath = null,
            string inputLogPath = null,
            bool showLiveVisualization = true,
            string facilitatorIP = "localhost",
            int facilitatorPort = 9000,
            int localPort = 8090)
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Needed only for live visualization
                DateTime startTime = DateTime.Now;

                // Use either live audio from the microphone or audio from a previously saved log
                IProducer<AudioBuffer> audioInput = null;
                if (inputLogPath != null)
                {
                    // Open the MicrophoneAudio stream from the last saved log
                    var store = Store.Open(pipeline, Program.AppName, inputLogPath);
                    audioInput = store.OpenStream <AudioBuffer>($"{Program.AppName}.MicrophoneAudio");

                    // Get the originating time of the start of the data in the store. We will use this
                    // to set the correct start time in the visualizer (if live visualization is on).
                    startTime = store.OriginatingTimeInterval.Left;
                }
                else
                {
                    // Create the AudioSource component to capture audio from the default device in 16 kHz 1-channel
                    // PCM format as required by both the voice activity detector and speech recognition components.
                    audioInput = new AudioSource(pipeline, new AudioSourceConfiguration()
                    {
                        OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm()
                    });
                }

                // Create System.Speech recognizer component
                var recognizer = CreateSpeechRecognizer(pipeline);

                // Subscribe the recognizer to the input audio
                audioInput.PipeTo(recognizer);

                // Partial and final speech recognition results are posted on the same stream. Here
                // we use Psi's Where() operator to filter out only the final recognition results.
                var finalResults = recognizer.Out.Where(result => result.IsFinal);
                finalResults.Do(x => Console.WriteLine(x));
                //KioskUI.KioskUI ui = new KioskUI.KioskUI(pipeline);
                //SystemSpeechSynthesizer speechSynth = CreateSpeechSynthesizer(pipeline);
                KioskInputTextPreProcessor preproc = new NU.Kqml.KioskInputTextPreProcessor(pipeline, (SystemSpeechRecognizer)recognizer);

                finalResults.PipeTo(preproc.In);
                preproc.Out.Do(x => Console.WriteLine($"Processed: {x}"));

                //preproc.Out.PipeTo(ui.UserInput);
                if (facilitatorIP != "none")
                {
                    python = new SocketStringConsumer(pipeline, facilitatorIP, facilitatorPort, localPort);
                    //preproc.Out.PipeTo(ui.UserInput);
                    //python.Out.PipeTo(ui.CompResponse);
                    //python.Out.PipeTo(speechSynth);
                }
                else
                {
                    //preproc.Out.PipeTo(ui.CompResponse);
                    //preproc.Out.PipeTo(speechSynth);
                }
                //speechSynth.SpeakCompleted.Do(x => preproc.setAccepting());

                // Create a data store to log the data to if necessary. A data store is necessary
                // only if output logging or live visualization are enabled.
                var dataStore = CreateDataStore(pipeline, outputLogPath, showLiveVisualization);

                // For disk logging or live visualization only
                if (dataStore != null)
                {
                    // Log the microphone audio and recognition results
                    audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
                    finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
                }

                // Register an event handler to catch pipeline errors
                pipeline.PipelineCompletionEvent += PipelineCompletionEvent;

                // Run the pipeline
                pipeline.RunAsync();

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
Code Example #27
File: Program.cs Project: llfuller/psi
        /// <summary>
        /// Builds and runs a speech recognition pipeline using the .NET System.Speech recognizer and a set of fixed grammars.
        /// </summary>
        /// <param name="outputLogPath">The path under which to write log data.</param>
        /// <param name="inputLogPath">The path from which to read audio input data.</param>
        public static void RunSystemSpeech(string outputLogPath = null, string inputLogPath = null)
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Use either live audio from the microphone or audio from a previously saved log
                IProducer<AudioBuffer> audioInput = null;
                if (inputLogPath != null)
                {
                    // Open the MicrophoneAudio stream from the last saved log
                    var store = PsiStore.Open(pipeline, Program.AppName, inputLogPath);
                    audioInput = store.OpenStream <AudioBuffer>($"{Program.AppName}.MicrophoneAudio");
                }
                else
                {
                    // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                    // PCM format as required by both the voice activity detector and speech recognition components.
                    audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());
                }

                // Create System.Speech recognizer component
                var recognizer = new SystemSpeechRecognizer(
                    pipeline,
                    new SystemSpeechRecognizerConfiguration()
                {
                    Language = "en-US",
                    Grammars = new GrammarInfo[]
                    {
                        new GrammarInfo()
                        {
                            Name = Program.AppName, FileName = "SampleGrammar.grxml"
                        },
                    },
                });

                // Subscribe the recognizer to the input audio
                audioInput.PipeTo(recognizer);

                // Partial and final speech recognition results are posted on the same stream. Here
                // we use Psi's Where() operator to filter out only the final recognition results.
                var finalResults = recognizer.Out.Where(result => result.IsFinal);

                // Print the final recognition result to the console.
                finalResults.Do(result =>
                {
                    Console.WriteLine($"{result.Text} (confidence: {result.Confidence})");
                });

                // Create a data store to log the data to if necessary. A data store is necessary
                // only if output logging is enabled.
                var dataStore = CreateDataStore(pipeline, outputLogPath);

                // For disk logging only
                if (dataStore != null)
                {
                    // Log the microphone audio and recognition results
                    audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
                    finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
                }

                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Run the pipeline
                pipeline.RunAsync();

                // The file SampleGrammar.grxml defines a grammar to transcribe numbers
                Console.WriteLine("Say any number between 0 and 100");

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
Code Example #28
 private void ButtonCreate16kHz1Channel16BitPcm_Click(object sender, RoutedEventArgs e)
 {
     Update(WaveFormat.Create16kHz1Channel16BitPcm());
 }
Code Example #29
        /// <summary>
        /// This is the main code for our Multimodal Speech Detection demo.
        /// </summary>
        private void PerformMultiModalSpeechDetection()
        {
            Console.WriteLine("Initializing Psi.");

            bool detected = false;

            // First create our \Psi pipeline
            using (var pipeline = Pipeline.Create("MultiModalSpeechDetection"))
            {
                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Next create our Kinect sensor. We will be using the color images, face tracking, and audio from the Kinect sensor
                var kinectSensorConfig = new KinectSensorConfiguration();
                kinectSensorConfig.OutputColor  = true;
                kinectSensorConfig.OutputAudio  = true;
                kinectSensorConfig.OutputBodies = true; // In order to detect faces using Kinect you must also enable detection of bodies
                var kinectSensor       = new KinectSensor(pipeline, kinectSensorConfig);
                var kinectFaceDetector = new Microsoft.Psi.Kinect.Face.KinectFaceDetector(pipeline, kinectSensor, Microsoft.Psi.Kinect.Face.KinectFaceDetectorConfiguration.Default);

                // Create our Voice Activation Detector
                var speechDetector = new SystemVoiceActivityDetector(pipeline);
                var convertedAudio = kinectSensor.Audio.Resample(WaveFormat.Create16kHz1Channel16BitPcm());
                convertedAudio.PipeTo(speechDetector);

                // Use the Kinect's face track to determine if the mouth is opened
                var mouthOpenAsFloat = kinectFaceDetector.Faces.Where(faces => faces.Count > 0).Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
                {
                    if (!detected)
                    {
                        detected = true;
                        Console.WriteLine("Found your face");
                    }

                    bool open = (list[0] != null) ? list[0].FaceProperties[Microsoft.Kinect.Face.FaceProperty.MouthOpen] == Microsoft.Kinect.DetectionResult.Yes : false;
                    return open ? 1.0 : 0.0;
                });

                // Next take the "mouthOpen" value and create a hold on that value (so that we don't see 1,0,1,0,1 but instead would see 1,1,1,1,0.8,0.6,0.4)
                var mouthOpen = mouthOpenAsFloat.Hold(0.1);

                // Next join the results of the speechDetector with the mouthOpen generator and only select samples where
                // we have detected speech and that the mouth was open.
                var mouthAndSpeechDetector = speechDetector.Join(mouthOpen, hundredMs).Select((t, e) => t.Item1 && t.Item2);

                // Convert our speech into text
                var speechRecognition = convertedAudio.SpeechToText(mouthAndSpeechDetector);
                speechRecognition.Do((s, t) =>
                {
                    if (s.Item1.Length > 0)
                    {
                        Console.WriteLine("You said: " + s.Item1);
                    }
                });

                // Create a stream of landmarks (points) from the face detector
                var facePoints = new List<Tuple<System.Windows.Point, string>>();
                var landmarks  = kinectFaceDetector.Faces.Where(faces => faces.Count > 0).Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
                {
                    facePoints.Clear();
                    System.Windows.Point pt1 = new System.Windows.Point(
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeLeft].X,
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeLeft].Y);
                    facePoints.Add(Tuple.Create(pt1, string.Empty));

                    System.Windows.Point pt2 = new System.Windows.Point(
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeRight].X,
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeRight].Y);
                    facePoints.Add(Tuple.Create(pt2, string.Empty));

                    System.Windows.Point pt3 = new System.Windows.Point(
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerLeft].X,
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerLeft].Y);
                    facePoints.Add(Tuple.Create(pt3, string.Empty));

                    System.Windows.Point pt4 = new System.Windows.Point(
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerRight].X,
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerRight].Y);
                    facePoints.Add(Tuple.Create(pt4, string.Empty));

                    System.Windows.Point pt5 = new System.Windows.Point(
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.Nose].X,
                        list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.Nose].Y);
                    facePoints.Add(Tuple.Create(pt5, string.Empty));
                    return facePoints;
                });

                // ********************************************************************
                // Finally create a Live Visualizer using PsiStudio.
                // We must persist our streams to a store in order for Live Viz to work properly
                // ********************************************************************

                // Create store for the data. Live Visualizer can only read data from a store.
                var pathToStore = Environment.GetFolderPath(Environment.SpecialFolder.MyVideos);
                Microsoft.Psi.Data.Exporter store = Store.Create(pipeline, ApplicationName, pathToStore);

                mouthOpen.Select(v => v ? 1d : 0d).Write("MouthOpen", store);

                speechDetector.Select(v => v ? 1d : 0d).Write("VAD", store);

                mouthAndSpeechDetector.Write("Join(MouthOpen,VAD)", store);

                kinectSensor.Audio.Write("Audio", store);

                var images = kinectSensor.ColorImage.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;
                Store.Write(images, "Images", store, true, DeliveryPolicy.LatestMessage);

                landmarks.Write("FaceLandmarks", store);

                // Run the pipeline
                pipeline.RunAsync();

                Console.WriteLine("Press any key to finish recording");
                Console.ReadKey();
            }
        }
Code Example #30
 /// <summary>
 /// Initializes a new instance of the <see cref="AudioCaptureConfiguration"/> class.
 /// </summary>
 /// <remarks>Defaults to 16kHz, 1 channel, 16-bit PCM.</remarks>
 /// <param name="name">Device name (e.g. "plughw:0,0").</param>
 public AudioCaptureConfiguration(string name)
     : this(name, WaveFormat.Create16kHz1Channel16BitPcm())
 {
 }