private void MainWindow_Shown(object sender, EventArgs e)
{
    // Build the \psi pipeline that hosts all of the components below.
    this.pipeline = Pipeline.Create();

    // Camera component that reads 640x480 YUYV frames from the default video device.
    var camera = new MediaCapture(this.pipeline, 640, 480, "/dev/video0", PixelFormatId.YUYV);

    // Microphone component producing 16kHz, 1-channel, 16-bit PCM audio.
    var microphone = new AudioCapture(
        this.pipeline,
        new AudioCaptureConfiguration
        {
            DeviceName = "plughw:0,0",
            Format = WaveFormat.Create16kHz1Channel16BitPcm(),
        });

    // Extract acoustic features from the captured audio.
    var features = new AcousticFeaturesExtractor(this.pipeline);
    microphone.PipeTo(features);

    // Pair each camera frame with the nearest preceding audio log-energy value.
    var framesWithEnergy = camera.Join(features.LogEnergy, RelativeTimeInterval.Past());

    // Render every fused sample in the window; the LatestMessage delivery policy
    // lets the pipeline skip ahead if the UI cannot keep up with the stream rate.
    framesWithEnergy.Do(
        frame => this.DrawFrame(frame),
        DeliveryPolicy.LatestMessage);

    // Start the pipeline running without blocking the UI thread.
    this.pipeline.RunAsync();
}
/// <summary>
/// Initializes a new instance of the <see cref="SimpleVoiceActivityDetector"/> class.
/// </summary>
/// <param name="pipeline">The pipeline to add the component to.</param>
/// <param name="configuration">The component configuration; when null, default settings are used.</param>
public SimpleVoiceActivityDetector(Pipeline pipeline, SimpleVoiceActivityDetectorConfiguration configuration = null)
{
    this.configuration = configuration ?? new SimpleVoiceActivityDetectorConfiguration();

    // The input audio - must be 16kHz 1-channel PCM
    this.audioInputConnector = pipeline.CreateConnector<AudioBuffer>(nameof(this.audioInputConnector));
    this.Out = pipeline.CreateEmitter<bool>(this, nameof(this.Out));

    // Currently using only the log energy feature for voice activity detection
    var acousticFeaturesExtractorConfiguration = new AcousticFeaturesExtractorConfiguration()
    {
        FrameDurationInSeconds = (float)this.configuration.FrameDuration,
        FrameRateInHz = (float)this.configuration.FrameRate,
        ComputeLogEnergy = true,
    };

    // Pipe the input audio to the acoustic features extractor component
    this.acousticFeaturesExtractor = new AcousticFeaturesExtractor(pipeline, acousticFeaturesExtractorConfiguration);
    this.audioInputConnector.PipeTo(this.acousticFeaturesExtractor);

    // Use a simple threshold for detection: map each frame's log energy to 1.0f
    // (above threshold) or 0 (at or below threshold).
    var logEnergy = this.acousticFeaturesExtractor.LogEnergy;
    var logEnergyThreshold = logEnergy.Select(e => (e > this.configuration.LogEnergyThreshold) ? 1.0f : 0);

    // We use a sliding window of frames for both detection of voice activity and silence;
    // window sizes are computed (rounded up) from the configured durations and frame rate.
    int voiceActivityDetectionFrames = (int)Math.Ceiling(this.configuration.VoiceActivityDetectionWindow * this.configuration.FrameRate);
    int silenceDetectionFrames = (int)Math.Ceiling(this.configuration.SilenceDetectionWindow * this.configuration.FrameRate);

    // For front-end voice activity detection, we use a forward-looking Window() operator as this
    // will use the timestamp of the first frame in the window. For detection of silence during
    // voice activity, we want to use the last frame's timestamp, hence the backward-looking window.
    var voiceActivityDetected = logEnergyThreshold.Window(0, voiceActivityDetectionFrames - 1).Average();
    var silenceDetected = logEnergyThreshold.Window(-(silenceDetectionFrames - 1), 0).Average();

    // Use the Aggregate operator to update the state (isSpeaking) based on the current state:
    // while speaking, stay speaking as long as the silence-window average is non-zero; while
    // silent, start speaking only when every frame in the detection window is above threshold.
    var vad = voiceActivityDetected.Join(silenceDetected).Aggregate(
        false,
        (isSpeaking, v) => isSpeaking ? v.Item2 != 0 : v.Item1 == 1.0);

    // Sync the output to the timestamps of the original audio frames (since we split the audio into fixed
    // length frames during the computation of the acoustic features).
    this.Out = this.audioInputConnector.Join(vad, TimeSpan.MaxValue).Select(a => a.Item2).Out;
}