Example #1
0
        /// <summary>
        /// Builds the \psi pipeline that fuses webcam frames with the audio log energy,
        /// draws every fused sample in the window, and starts the pipeline asynchronously.
        /// </summary>
        /// <param name="sender">The event source (not used).</param>
        /// <param name="e">The event arguments (not used).</param>
        /// <remarks>Presumably wired to the main window's Shown event - verify against the window setup code.</remarks>
        private void MainWindow_Shown(object sender, EventArgs e)
        {
            this.pipeline = Pipeline.Create();

            // Video source: 640x480 YUYV capture from /dev/video0.
            var camera = new MediaCapture(this.pipeline, 640, 480, "/dev/video0", PixelFormatId.YUYV);

            // Audio source: the "plughw:0,0" device, using the format returned by
            // WaveFormat.Create16kHz1Channel16BitPcm().
            var microphoneSettings = new AudioCaptureConfiguration
            {
                DeviceName = "plughw:0,0",
                Format = WaveFormat.Create16kHz1Channel16BitPcm(),
            };
            var microphone = new AudioCapture(this.pipeline, microphoneSettings);

            // Feed the captured audio into an acoustic features extractor.
            var featureExtractor = new AcousticFeaturesExtractor(this.pipeline);
            microphone.PipeTo(featureExtractor);

            // Pair each webcam image with an audio log energy value, using
            // RelativeTimeInterval.Past() as the matching interval for the join.
            var logEnergy = featureExtractor.LogEnergy;
            var matchInterval = RelativeTimeInterval.Past();
            var framesWithEnergy = camera.Join(logEnergy, matchInterval);

            // "Do" runs for every fused (image, energy) sample; each one is handed to
            // DrawFrame so the window is refreshed with the overlaid energy level.
            framesWithEnergy.Do(fused => this.DrawFrame(fused), DeliveryPolicy.LatestMessage);

            // Kick off the pipeline without blocking the caller.
            this.pipeline.RunAsync();
        }
Example #2
0
        /// <summary>
        /// Initializes a new instance of the <see cref="SimpleVoiceActivityDetector"/> class.
        /// </summary>
        /// <param name="pipeline">The pipeline to add the component to.</param>
        /// <param name="configuration">
        /// The component configuration. When null, a default
        /// <see cref="SimpleVoiceActivityDetectorConfiguration"/> is created and used.
        /// </param>
        public SimpleVoiceActivityDetector(Pipeline pipeline, SimpleVoiceActivityDetectorConfiguration configuration = null)
        {
            this.configuration = configuration ?? new SimpleVoiceActivityDetectorConfiguration();

            // Connector receiving the input audio, which must be 16kHz 1-channel PCM.
            this.audioInputConnector = pipeline.CreateConnector<AudioBuffer>(nameof(this.audioInputConnector));

            // NOTE(review): this emitter is replaced by the assignment to Out at the end of the
            // constructor - confirm whether creating it here is still required.
            this.Out = pipeline.CreateEmitter<bool>(this, nameof(this.Out));

            // Only the log energy feature is currently used for voice activity detection.
            var extractorSettings = new AcousticFeaturesExtractorConfiguration
            {
                FrameDurationInSeconds = (float)this.configuration.FrameDuration,
                FrameRateInHz = (float)this.configuration.FrameRate,
                ComputeLogEnergy = true,
            };

            // Route the input audio through the acoustic features extractor.
            this.acousticFeaturesExtractor = new AcousticFeaturesExtractor(pipeline, extractorSettings);
            this.audioInputConnector.PipeTo(this.acousticFeaturesExtractor);

            // Threshold each log energy value: 1 when above the configured threshold, otherwise 0.
            var energyStream = this.acousticFeaturesExtractor.LogEnergy;
            var aboveThreshold = energyStream.Select(
                energy => (energy > this.configuration.LogEnergyThreshold) ? 1.0f : 0.0f);

            // Both voice activity and silence are detected over sliding windows of frames;
            // convert the configured window lengths into whole frame counts (rounded up).
            int voiceWindowFrames = (int)Math.Ceiling(
                this.configuration.VoiceActivityDetectionWindow * this.configuration.FrameRate);
            int silenceWindowFrames = (int)Math.Ceiling(
                this.configuration.SilenceDetectionWindow * this.configuration.FrameRate);

            // Front-end voice activity detection uses a forward-looking Window() so the result carries
            // the timestamp of the first frame in the window. Silence detection during voice activity
            // uses a backward-looking window so the result carries the last frame's timestamp.
            int lastVoiceFrameOffset = voiceWindowFrames - 1;
            int firstSilenceFrameOffset = -(silenceWindowFrames - 1);
            var voiceAverage = aboveThreshold.Window(0, lastVoiceFrameOffset).Average();
            var silenceAverage = aboveThreshold.Window(firstSilenceFrameOffset, 0).Average();

            // Use the Aggregate operator to update the state (isSpeaking) based on the current state:
            // while speaking, remain so until the silence window average drops to zero;
            // while silent, switch to speaking only when the voice window average equals one.
            var speakingState = voiceAverage.Join(silenceAverage).Aggregate(
                false,
                (isSpeaking, averages) =>
                {
                    if (isSpeaking)
                    {
                        return averages.Item2 != 0;
                    }

                    return averages.Item1 == 1.0;
                });

            // Re-align the result with the timestamps of the original audio buffers, since the audio
            // was split into fixed length frames while computing the acoustic features.
            var alignedWithAudio = this.audioInputConnector.Join(speakingState, TimeSpan.MaxValue);
            this.Out = alignedWithAudio.Select(pair => pair.Item2).Out;
        }