private void MainWindow_Shown(object sender, EventArgs e)
{
    // Create the \psi pipeline
    this.pipeline = Pipeline.Create();

    // Create the webcam component
    var webcam = new MediaCapture(this.pipeline, 640, 480, "/dev/video0", PixelFormatId.YUYV);

    // Create the audio capture component
    var audio = new AudioCapture(this.pipeline, new AudioCaptureConfiguration { DeviceName = "plughw:0,0", Format = WaveFormat.Create16kHz1Channel16BitPcm() });

    // Create an acoustic features extractor component and pipe the audio to it
    var acousticFeatures = new AcousticFeaturesExtractor(this.pipeline);
    audio.PipeTo(acousticFeatures);

    // Fuse the webcam images with the audio log energy level
    var webcamWithAudioEnergy = webcam.Join(acousticFeatures.LogEnergy, RelativeTimeInterval.Past());

    // Overlay the audio energy on the webcam image and display it in the window.
    // The "Do" operator is executed on each fused webcam and audio energy sample.
    webcamWithAudioEnergy.Do(
        frame =>
        {
            // Update the window with the latest frame
            this.DrawFrame(frame);
        },
        DeliveryPolicy.LatestMessage);

    // Start the pipeline running
    this.pipeline.RunAsync();
}
private async Task<string> GetVoiceSignatureString()
{
    var audioStream = new MemoryStream();
    var writer = new WaveDataWriterClass(audioStream, WaveFormat.Create16kHz1Channel16BitPcm());

    // Capture 5 seconds of audio from the default device and write it to the memory stream
    using (var p = Pipeline.Create())
    {
        var capture = new AudioCapture(p, WaveFormat.Create16kHz1Channel16BitPcm());
        capture.Do(audio => writer.Write(audio.Data.DeepClone()));
        p.RunAsync();
        await Task.Delay(5000);
        writer.Flush();
    }

    // Post the captured audio to the voice signature service and return the serialized signature
    var content = new ByteArrayContent(audioStream.GetBuffer(), 0, (int)audioStream.Length);
    var client = new HttpClient();
    client.DefaultRequestHeaders.Add("Ocp-Apim-Subscription-Key", this.SubscriptionKey);
    var response = await client.PostAsync($"https://signature.{this.Region}.cts.speech.microsoft.com/api/v1/Signature/GenerateVoiceSignatureFromByteArray", content);
    var jsonData = await response.Content.ReadAsStringAsync();
    var result = JsonConvert.DeserializeObject<VoiceSignature>(jsonData);
    return JsonConvert.SerializeObject(result.Signature);
}
private static IProducer<AudioBuffer> SetupAudioInput(Pipeline pipeline, string inputLogPath, ref DateTime startTime)
{
    IProducer<AudioBuffer> audioInput = null;
    if (inputLogPath != null)
    {
        // Open the MicrophoneAudio stream from the last saved log
        var store = Store.Open(pipeline, AppName, inputLogPath);
        audioInput = store.OpenStream<AudioBuffer>($"{AppName}.MicrophoneAudio");

        // Get the originating time of the start of the data in the store. We will use this
        // to set the correct start time in the visualizer (if live visualization is on).
        startTime = store.OriginatingTimeInterval.Left;
    }
    else
    {
        // Create the AudioSource component to capture audio from the default device in 16 kHz 1-channel
        // PCM format as required by both the voice activity detector and speech recognition components.
        audioInput = new AudioSource(pipeline, new AudioSourceConfiguration() { OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm() });
    }

    return audioInput;
}
/// <summary>
/// Builds and runs a webcam pipeline and records the data to a Psi store.
/// </summary>
/// <param name="pathToStore">The path to directory where store should be saved.</param>
public static void RecordAudioVideo(string pathToStore)
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Create store
        var store = PsiStore.Create(pipeline, ApplicationName, pathToStore);

        // Create our webcam
        var webcam = new MediaCapture(pipeline, 1920, 1080, 30);

        // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
        IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());

        var images = webcam.Out.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;

        // Attach the webcam's image output to the store. We will write the images to the store as compressed JPEGs.
        images.Write("Image", store, true, DeliveryPolicy.LatestMessage);

        // Attach the audio input to the store
        audioInput.Out.Write("Audio", store, true, DeliveryPolicy.LatestMessage);

        // Run the pipeline
        pipeline.RunAsync();

        Console.WriteLine("Press any key to finish recording");
        Console.ReadKey();
    }
}
/// <summary> /// Initializes a new instance of the <see cref="SystemVoiceActivityDetectorConfiguration"/> class. /// </summary> public SystemVoiceActivityDetectorConfiguration() { this.Language = "en-us"; this.Grammars = null; this.BufferLengthInMs = 1000; // These values affect the latency of results from the VAD. Due to inherent delay // between the time audio is sent to the internal recognition engine and when the // engine detects that speech is present and makes a state transition, we need to // add these offsets to the computed time at which the state transition occurs to // ensure proper alignment between the audio and VAD result. A negative value // will shift the result earlier in time to account for this delay. However, this // will also contribute to the latency of the VAD output, so we should tune this // to be as close to zero as possible while still maintaining correctness. Values // of between -50ms and -150ms appear to be reasonable. this.VoiceActivityStartOffsetMs = -150; this.VoiceActivityEndOffsetMs = -150; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); // Modify these values to improve VAD responsiveness. The EndSilenceTimeoutMs and // EndSilenceTimeoutAmbiguousMs parameters seems to matter most. Initialized to the // default values as specified in the documentation here: // https://docs.microsoft.com/en-us/dotnet/api/system.speech.recognition.speechrecognitionengine?view=netframework-4.8#properties this.InitialSilenceTimeoutMs = 0; this.BabbleTimeoutMs = 0; this.EndSilenceTimeoutAmbiguousMs = 500; this.EndSilenceTimeoutMs = 150; }
/// <summary> /// Initializes a new instance of the <see cref="SystemSpeechRecognizerConfiguration"/> class. /// </summary> public SystemSpeechRecognizerConfiguration() { this.Language = "en-us"; this.Grammars = null; this.BufferLengthInMs = 1000; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
/// <summary> /// Initializes a new instance of the <see cref="AzureSpeechRecognizerConfiguration"/> class. /// </summary> public AzureSpeechRecognizerConfiguration() { this.Language = "en-us"; this.SubscriptionKey = null; // This must be set to the key associated with your account this.Region = null; // This must be set to the region associated to the key // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
/// <summary> /// Initializes a new instance of the <see cref="AcousticFeaturesExtractorConfiguration"/> class. /// </summary> public AcousticFeaturesExtractorConfiguration() { // Default parameters for acoustic features computation this.computeFFT = false; this.computeFFTPower = false; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
public void AudioBuffer_Empty()
{
    AudioBuffer buffer = new AudioBuffer(0, WaveFormat.Create16kHz1Channel16BitPcm());
    Assert.AreEqual(0, buffer.Length);
    Assert.AreEqual(0, buffer.Data.Length);
    Assert.AreEqual(WaveFormat.Create16kHz1Channel16BitPcm(), buffer.Format);
    Assert.AreEqual(TimeSpan.Zero, buffer.Duration);
    CollectionAssert.AreEqual(new double[0], this.GetSamples(buffer).ToArray());
}
public void AudioBuffer_16kHz1Channel16BitPcm1Sample()
{
    AudioBuffer buffer = new AudioBuffer(BitConverter.GetBytes((short)-12345), WaveFormat.Create16kHz1Channel16BitPcm());
    Assert.AreEqual(-12345, BitConverter.ToInt16(buffer.Data, 0));
    Assert.AreEqual(2, buffer.Length);
    Assert.AreEqual(2, buffer.Data.Length);
    Assert.AreEqual(WaveFormat.Create16kHz1Channel16BitPcm(), buffer.Format);
    Assert.AreEqual(TimeSpan.FromTicks(10000000L / 16000), buffer.Duration);
    CollectionAssert.AreEqual(new double[] { -12345 }, this.GetSamples(buffer).ToArray());
}
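// The buffer tests above and below call a GetSamples helper that is not shown in these snippets.
// The following is a hypothetical sketch of such a helper, assuming only the AudioBuffer.Data and
// WaveFormat.BitsPerSample members used elsewhere in this section; it handles just the 16-bit PCM
// case these tests exercise (requires System and System.Collections.Generic).
private IEnumerable<double> GetSamples(AudioBuffer buffer)
{
    int bytesPerSample = buffer.Format.BitsPerSample / 8;
    for (int offset = 0; offset + bytesPerSample <= buffer.Data.Length; offset += bytesPerSample)
    {
        // 16-bit PCM samples are little-endian signed shorts
        yield return BitConverter.ToInt16(buffer.Data, offset);
    }
}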
/// <summary> /// Initializes a new instance of the <see cref="SystemVoiceActivityDetectorConfiguration"/> class. /// </summary> public SystemVoiceActivityDetectorConfiguration() { this.Language = "en-us"; this.Grammars = null; this.BufferLengthInMs = 1000; this.VoiceActivityStartOffsetMs = -150; this.VoiceActivityEndOffsetMs = -150; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
/// <summary> /// Initializes a new instance of the <see cref="AudioPlayerConfiguration"/> class. /// </summary> public AudioPlayerConfiguration() { this.DeviceName = string.Empty; this.TargetLatencyInMs = 20; this.BufferLengthSeconds = 0.1; this.AudioLevel = -1; this.Gain = 1.0f; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
/// <summary>
/// Builds and runs a speech recognition pipeline using the Azure speech recognizer. Requires a valid Cognitive Services
/// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account.
/// </summary>
/// <remarks>
/// If you are getting a <see cref="System.InvalidOperationException"/> with the message 'AzureSpeechRecognizer returned
/// OnConversationError with error code: LoginFailed. Original error text: Transport error', this is most likely due to
/// an invalid subscription key. Please check your Azure portal at https://portal.azure.com and ensure that you have
/// added a subscription to the Azure Speech API on your account.
/// </remarks>
public static void RunAzureSpeech()
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Create the AudioCapture component to capture audio from the default ALSA device in 16 kHz 1-channel
        // PCM format as required by both the voice activity detector and speech recognition components.
        IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, new AudioCaptureConfiguration() { DeviceName = "plughw:0,0", Format = WaveFormat.Create16kHz1Channel16BitPcm() });

        // Perform voice activity detection using the voice activity detector component
        var vad = new SimpleVoiceActivityDetector(pipeline);
        audioInput.PipeTo(vad);

        // Create Azure speech recognizer component
        var recognizer = new AzureSpeechRecognizer(pipeline, new AzureSpeechRecognizerConfiguration() { SubscriptionKey = Program.azureSubscriptionKey, Region = Program.azureRegion });

        // The input audio to the Azure speech recognizer needs to be annotated with a voice activity flag.
        // This can be constructed by using the Psi Join() operator to combine the audio and VAD streams.
        var annotatedAudio = audioInput.Join(vad);

        // Subscribe the recognizer to the annotated audio
        annotatedAudio.PipeTo(recognizer);

        // Partial and final speech recognition results are posted on the same stream. Here
        // we use Psi's Where() operator to filter out only the final recognition results.
        var finalResults = recognizer.Out.Where(result => result.IsFinal);

        // Print the recognized text of the final recognition result to the console.
        finalResults.Do(result => Console.WriteLine(result.Text));

        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Run the pipeline
        pipeline.RunAsync();

        // Azure speech transcribes speech to text
        Console.WriteLine("Say anything");
        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
/// <summary> /// Initializes a new instance of the <see cref="SystemSpeechSynthesizerConfiguration"/> class. /// </summary> public SystemSpeechSynthesizerConfiguration() { this.Voice = "Microsoft Zira Desktop"; this.PersistAudio = false; this.UseDefaultAudioPlaybackDevice = false; this.BufferLengthInMs = 1000; this.ProsodyRate = 1.0; this.ProsodyPitch = "default"; this.ProsodyVolume = "default"; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
/// <summary>
/// Builds and runs a webcam pipeline and records the data to a Psi store.
/// </summary>
/// <param name="pathToStore">The path to directory where store should be saved.</param>
public static void RecordAudioVideo(string pathToStore)
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        var visualizationClient = new VisualizationClient();

        // Register an event handler to catch pipeline errors
        pipeline.PipelineCompletionEvent += PipelineCompletionEvent;

        // Clear all data if the visualizer is already open
        visualizationClient.ClearAll();

        // Set the visualization client to visualize live data
        visualizationClient.SetLiveMode();

        // Create store
        Data.Exporter store = Store.Create(pipeline, ApplicationName, pathToStore);

        // Create our webcam
        MediaCapture webcam = new MediaCapture(pipeline, 1920, 1080, 30);

        // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
        IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, new AudioCaptureConfiguration() { OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm() });

        var images = webcam.Out.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;

        // Attach the webcam's image output to the store. We will write the images to the store as compressed JPEGs.
        Store.Write(images, "Image", store, true, DeliveryPolicy.LatestMessage);

        // Attach the audio input to the store
        Store.Write(audioInput.Out, "Audio", store, true, DeliveryPolicy.LatestMessage);

        // Create an XY panel in PsiStudio to display the images
        visualizationClient.AddXYPanel();
        images.Show(visualizationClient);

        // Create a timeline panel in PsiStudio to display the audio waveform
        visualizationClient.AddTimelinePanel();
        audioInput.Out.Show(visualizationClient);

        // Run the pipeline
        pipeline.RunAsync();

        Console.WriteLine("Press any key to finish recording");
        Console.ReadKey();
    }
}
public void AudioBuffer_ReframeByDuration()
{
    var audioFormat = WaveFormat.Create16kHz1Channel16BitPcm();
    var audioData = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    int inputCount = 100;
    int inputSize = 10;
    var inputInterval = TimeSpan.FromTicks(inputSize * 10000 / 32); // 10 bytes @ 32 kBytes/sec
    var outputInterval = TimeSpan.FromMilliseconds(10); // 10 ms
    int outputSize = (int)(outputInterval.TotalSeconds * audioFormat.AvgBytesPerSec);
    var output = new List<(AudioBuffer, DateTime)>();
    var startTime = DateTime.MinValue;

    using (var p = Pipeline.Create())
    {
        // input stream of 10-byte audio buffers
        var audio = Generators.Repeat(p, new AudioBuffer(audioData, audioFormat), inputCount, inputInterval);

        // reframe output stream as 10 ms audio buffers
        var reframed = audio.Reframe(outputInterval);

        // capture outputs and start time for verification
        reframed.Do((x, e) => output.Add((x.DeepClone(), e.OriginatingTime)));
        audio.First().Do((x, e) => startTime = e.OriginatingTime - inputInterval);

        p.Run();
    }

    // verify no. of reframed output buffers
    Assert.AreEqual(inputCount * inputSize / outputSize, output.Count);

    foreach (var (buffer, dt) in output)
    {
        // verify output audio buffer originating times
        startTime += outputInterval;
        Assert.AreEqual(startTime, dt);

        // verify audio format remains the same
        Assert.AreEqual(audioFormat, buffer.Format);

        // verify the output audio bytes by constructing the expected output from a concatenation of the input data
        var expectedOutput = Enumerable.Repeat(audioData, outputSize / inputSize).SelectMany(x => x).Concat(audioData.Take(outputSize % inputSize));
        CollectionAssert.AreEqual(expectedOutput.ToArray(), buffer.Data);

        // shift the input data to account for any partial bytes when constructing the expected output above
        audioData = audioData.Skip(outputSize % inputSize).Concat(audioData.Take(outputSize % inputSize)).ToArray();
    }
}
public void AudioBuffer_ReframeSmaller()
{
    var audioFormat = WaveFormat.Create16kHz1Channel16BitPcm();
    var audioData = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
    int inputCount = 100;
    int inputSize = 10;
    int outputSize = 6;
    var inputInterval = TimeSpan.FromTicks(inputSize * 10000 / 32); // 10 bytes @ 32 kBytes/sec
    var outputInterval = TimeSpan.FromTicks(outputSize * 10000 / 32); // 6 bytes @ 32 kBytes/sec
    var output = new List<(AudioBuffer, DateTime)>();
    var startTime = DateTime.MinValue;

    using (var p = Pipeline.Create())
    {
        // input stream of 10-byte audio buffers
        var audio = Generators.Repeat(p, new AudioBuffer(audioData, audioFormat), inputCount, inputInterval);

        // reframe output stream as 6-byte audio buffers
        var reframed = audio.Reframe(outputSize);

        // capture outputs and start time for verification
        reframed.Do((x, e) => output.Add((x.DeepClone(), e.OriginatingTime)));
        audio.First().Do((x, e) => startTime = e.OriginatingTime - inputInterval);

        p.Run();
    }

    // verify no. of reframed output buffers
    Assert.AreEqual(inputCount * inputSize / outputSize, output.Count);

    foreach (var (buffer, dt) in output)
    {
        // verify output audio buffer originating times
        startTime += outputInterval;
        Assert.AreEqual(startTime, dt);

        // verify audio format remains the same
        Assert.AreEqual(audioFormat, buffer.Format);

        // verify that the output audio bytes match the first [outputSize] bytes of the input data
        CollectionAssert.AreEqual(audioData.Take(outputSize).ToArray(), buffer.Data);

        // shift the input data to be aligned with the start of the next expected output buffer
        audioData = audioData.Skip(outputSize).Concat(audioData.Take(outputSize)).ToArray();
    }
}
public void AudioBuffer_16kHz1Channel16BitPcm3Samples()
{
    short[] rawValues = new short[] { -32768, 32767, 12345 };
    byte[] rawBytes = rawValues.SelectMany(x => BitConverter.GetBytes(x)).ToArray();
    AudioBuffer buffer = new AudioBuffer(rawBytes, WaveFormat.Create16kHz1Channel16BitPcm());
    Assert.AreEqual(-32768, BitConverter.ToInt16(buffer.Data, 0));
    Assert.AreEqual(32767, BitConverter.ToInt16(buffer.Data, 2));
    Assert.AreEqual(12345, BitConverter.ToInt16(buffer.Data, 4));
    Assert.AreEqual(6, buffer.Length);
    Assert.AreEqual(6, buffer.Data.Length);
    Assert.AreEqual(WaveFormat.Create16kHz1Channel16BitPcm(), buffer.Format);
    Assert.AreEqual(TimeSpan.FromTicks(rawValues.Length * (10000000L / 16000)), buffer.Duration);
    CollectionAssert.AreEqual(new double[] { -32768, 32767, 12345 }, this.GetSamples(buffer).ToArray());
}
public void WaveFormat_Create16kHz1Channel16BitPcm()
{
    // Define "native" WAVEFORMATEX structure for PCM
    byte[] formatBytes = new byte[]
    {
        0x01, 0x00,             // FormatTag = 1
        0x01, 0x00,             // Channels = 1
        0x80, 0x3e, 0x00, 0x00, // SamplesPerSec = 16000
        0x00, 0x7d, 0x00, 0x00, // AvgBytesPerSec = 32000
        0x02, 0x00,             // BlockAlign = 2
        0x10, 0x00,             // BitsPerSample = 16
        0x00, 0x00,             // ExtraSize = 0
    };

    // Create equivalent managed WaveFormat object
    WaveFormat format = WaveFormat.Create16kHz1Channel16BitPcm();

    // Verify against expected
    this.MarshalAndVerify(format, formatBytes);
}
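// The test above relies on a MarshalAndVerify helper that is not shown in these snippets. Below is
// a hypothetical stand-in illustrating what it checks: the managed WaveFormat, serialized in
// WAVEFORMATEX field order, should match the expected native bytes. It uses only the public
// properties referenced elsewhere in this section (FormatTag, Channels, SamplesPerSec,
// AvgBytesPerSec, BlockAlign, BitsPerSample, ExtraSize); the real helper may marshal differently.
private void MarshalAndVerify(WaveFormat format, byte[] expectedBytes)
{
    using (var stream = new MemoryStream())
    using (var writer = new BinaryWriter(stream))
    {
        writer.Write((ushort)format.FormatTag);     // wFormatTag
        writer.Write((ushort)format.Channels);      // nChannels
        writer.Write((uint)format.SamplesPerSec);   // nSamplesPerSec
        writer.Write((uint)format.AvgBytesPerSec);  // nAvgBytesPerSec
        writer.Write((ushort)format.BlockAlign);    // nBlockAlign
        writer.Write((ushort)format.BitsPerSample); // wBitsPerSample
        writer.Write((ushort)format.ExtraSize);     // cbSize
        writer.Flush();

        CollectionAssert.AreEqual(expectedBytes, stream.ToArray());
    }
}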
static void Main(string[] args)
{
    using (Pipeline pipeline = Pipeline.Create())
    {
        WaveFormat waveFormat = WaveFormat.Create16kHz1Channel16BitPcm();
        IProducer<AudioBuffer> audioInput = new AudioCapture(pipeline, new AudioCaptureConfiguration() { OutputFormat = waveFormat });

        DataFaucet<AudioBuffer> df = new DataFaucet<AudioBuffer>(pipeline);
        audioInput.PipeTo(df);

        AggregateDump dump = new AggregateDump(pipeline);
        df.PipeTo(dump);

        GoogleASR gsr = new GoogleASR(pipeline, "en"); // gsr for Google speech recognition
        dump.PipeTo(gsr);

        GoogleTranslate gt = new GoogleTranslate(pipeline, "en", "de"); // gt for Google translate
        gsr.PipeTo(gt);

        GoogleSpeak gs = new GoogleSpeak(pipeline, waveFormat, "de-DE"); // gs for Google speak
        gt.PipeTo(gs);

        AudioOutput aOut = new AudioOutput(pipeline); // aOut for audio out
        gs.PipeTo(aOut);

        ActiveMQ rasa = new ActiveMQ(pipeline, "rasa.PSI", "rasa.PYTHON");
        gsr.PipeTo(rasa);

        GUI gui = new GUI(df, dump, gsr, gt);
        Thread thread = new Thread(() =>
        {
            gui.ShowDialog();
        });
        thread.Start();

        pipeline.RunAsync();
        Console.ReadKey(true);
    }
}
/// <summary> /// Initializes a new instance of the <see cref="AcousticFeaturesExtractorConfiguration"/> class. /// </summary> public AcousticFeaturesExtractorConfiguration() { // Default parameters for acoustic features computation this.FrameDurationInSeconds = 0.025f; this.FrameRateInHz = 100.0f; this.AddDither = true; this.DitherScaleFactor = 1.0f; this.StartFrequency = 250.0f; this.EndFrequency = 7000.0f; this.LowEndFrequency = 3000.0f; this.HighStartFrequency = 2500.0f; this.EntropyBandwidth = 2500.0f; this.ComputeLogEnergy = true; this.ComputeZeroCrossingRate = true; this.ComputeFrequencyDomainEnergy = true; this.ComputeLowFrequencyEnergy = true; this.ComputeHighFrequencyEnergy = true; this.ComputeSpectralEntropy = true; this.ComputeFFT = false; this.ComputeFFTPower = false; // Defaults to 16 kHz, 16-bit, 1-channel PCM samples this.InputFormat = WaveFormat.Create16kHz1Channel16BitPcm(); }
/// <summary>
/// Builds and runs a speech recognition pipeline using the Azure speech service. Requires a valid Cognitive Services
/// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/get-started.
/// </summary>
public static void RunAzureSpeech()
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Create the AudioCapture component to capture audio from the specified device in 16 kHz 1-channel
        // PCM format as required by the speech recognition component.
        var audio = new AudioCapture(pipeline, new AudioCaptureConfiguration { DeviceName = deviceName, Format = WaveFormat.Create16kHz1Channel16BitPcm() });

        // Create the speech recognizer component
        var recognizer = new ContinuousSpeechRecognizer(pipeline, azureSubscriptionKey, azureRegion);

        // Subscribe the recognizer to the audio
        audio.PipeTo(recognizer);

        // Print the recognized text of the final recognition result to the console.
        recognizer.Out.Do((result, e) => Console.WriteLine($"{e.OriginatingTime.TimeOfDay}: {result}"));

        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Run the pipeline
        pipeline.RunAsync();

        // Azure speech transcribes speech to text
        Console.WriteLine("Say anything");
        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
public void AudioBuffer_HasValidData()
{
    Assert.IsFalse(default(AudioBuffer).HasValidData);
    Assert.IsTrue(new AudioBuffer(new byte[2], WaveFormat.Create16kHz1Channel16BitPcm()).HasValidData);
}
/// <summary>
/// Builds and runs a speech recognition pipeline using the Azure speech recognizer. Requires a valid Cognitive Services
/// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account.
/// </summary>
/// <remarks>
/// If you are getting a <see cref="System.InvalidOperationException"/> with the message 'AzureSpeechRecognizer returned
/// OnConversationError with error code: LoginFailed. Original error text: Transport error', this is most likely due to
/// an invalid subscription key. Please check your Azure portal at https://portal.azure.com and ensure that you have
/// added a subscription to the Azure Speech API on your account.
/// </remarks>
/// <param name="outputLogPath">The path under which to write log data.</param>
/// <param name="inputLogPath">The path from which to read audio input data.</param>
public static void RunAzureSpeech(string outputLogPath = null, string inputLogPath = null)
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Use either live audio from the microphone or audio from a previously saved log
        IProducer<AudioBuffer> audioInput = null;
        if (inputLogPath != null)
        {
            // Open the MicrophoneAudio stream from the last saved log
            var store = PsiStore.Open(pipeline, Program.AppName, inputLogPath);
            audioInput = store.OpenStream<AudioBuffer>($"{Program.AppName}.MicrophoneAudio");
        }
        else
        {
            // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
            // PCM format as required by both the voice activity detector and speech recognition components.
            audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());
        }

        // Perform voice activity detection using the voice activity detector component
        var vad = new SystemVoiceActivityDetector(pipeline);
        audioInput.PipeTo(vad);

        // Create Azure speech recognizer component
        var recognizer = new AzureSpeechRecognizer(pipeline, new AzureSpeechRecognizerConfiguration() { SubscriptionKey = Program.azureSubscriptionKey, Region = Program.azureRegion });

        // The input audio to the Azure speech recognizer needs to be annotated with a voice activity flag.
        // This can be constructed by using the Psi Join() operator to combine the audio and VAD streams.
        var annotatedAudio = audioInput.Join(vad);

        // Subscribe the recognizer to the annotated audio
        annotatedAudio.PipeTo(recognizer);

        // Partial and final speech recognition results are posted on the same stream. Here
        // we use Psi's Where() operator to filter out only the final recognition results.
        var finalResults = recognizer.Out.Where(result => result.IsFinal);

        // Print the recognized text of the final recognition result to the console.
        finalResults.Do(result => Console.WriteLine(result.Text));

        // Create a data store to log the data to if necessary. A data store is necessary
        // only if output logging is enabled.
        var dataStore = CreateDataStore(pipeline, outputLogPath);

        // For disk logging only
        if (dataStore != null)
        {
            // Log the microphone audio and recognition results
            audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
            finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
            vad.Write($"{Program.AppName}.VoiceActivity", dataStore);
        }

        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Run the pipeline
        pipeline.RunAsync();

        // Azure speech transcribes speech to text
        Console.WriteLine("Say anything");
        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
/// <summary> /// Initializes a new instance of the <see cref="AudioCaptureConfiguration"/> class. /// </summary> /// <remarks>Defaults to 16kHz, 1 channel, 16-bit PCM.</remarks> public AudioCaptureConfiguration() : this("plughw:0,0", WaveFormat.Create16kHz1Channel16BitPcm()) { }
/// <summary>
/// Builds and runs a speech recognition pipeline using the .NET System.Speech recognizer and a set of fixed grammars.
/// </summary>
/// <param name="outputLogPath">The path under which to write log data.</param>
/// <param name="inputLogPath">The path from which to read audio input data.</param>
/// <param name="showLiveVisualization">A flag indicating whether to display live data in PsiStudio as the pipeline is running.</param>
/// <param name="facilitatorIP">The IP address of the facilitator to connect to, or "none" to skip the socket connection.</param>
/// <param name="facilitatorPort">The port on which the facilitator listens.</param>
/// <param name="localPort">The local port to use for the socket connection.</param>
public static void RunSystemSpeech(string outputLogPath = null, string inputLogPath = null, bool showLiveVisualization = true, string facilitatorIP = "localhost", int facilitatorPort = 9000, int localPort = 8090)
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Needed only for live visualization
        DateTime startTime = DateTime.Now;

        // Use either live audio from the microphone or audio from a previously saved log
        IProducer<AudioBuffer> audioInput = null;
        if (inputLogPath != null)
        {
            // Open the MicrophoneAudio stream from the last saved log
            var store = Store.Open(pipeline, Program.AppName, inputLogPath);
            audioInput = store.OpenStream<AudioBuffer>($"{Program.AppName}.MicrophoneAudio");

            // Get the originating time of the start of the data in the store. We will use this
            // to set the correct start time in the visualizer (if live visualization is on).
            startTime = store.OriginatingTimeInterval.Left;
        }
        else
        {
            // Create the AudioSource component to capture audio from the default device in 16 kHz 1-channel
            // PCM format as required by both the voice activity detector and speech recognition components.
            audioInput = new AudioSource(pipeline, new AudioSourceConfiguration() { OutputFormat = WaveFormat.Create16kHz1Channel16BitPcm() });
        }

        // Create System.Speech recognizer component
        var recognizer = CreateSpeechRecognizer(pipeline);

        // Subscribe the recognizer to the input audio
        audioInput.PipeTo(recognizer);

        // Partial and final speech recognition results are posted on the same stream. Here
        // we use Psi's Where() operator to filter out only the final recognition results.
        var finalResults = recognizer.Out.Where(result => result.IsFinal);
        finalResults.Do(x => Console.WriteLine(x));

        //KioskUI.KioskUI ui = new KioskUI.KioskUI(pipeline);
        //SystemSpeechSynthesizer speechSynth = CreateSpeechSynthesizer(pipeline);

        // Pre-process the final recognition results before sending them on
        KioskInputTextPreProcessor preproc = new NU.Kqml.KioskInputTextPreProcessor(pipeline, (SystemSpeechRecognizer)recognizer);
        finalResults.PipeTo(preproc.In);
        preproc.Out.Do(x => Console.WriteLine($"Processed: {x}"));
        //preproc.Out.PipeTo(ui.UserInput);

        if (facilitatorIP != "none")
        {
            python = new SocketStringConsumer(pipeline, facilitatorIP, facilitatorPort, localPort);
            //preproc.Out.PipeTo(ui.UserInput);
            //python.Out.PipeTo(ui.CompResponse);
            //python.Out.PipeTo(speechSynth);
        }
        else
        {
            //preproc.Out.PipeTo(ui.CompResponse);
            //preproc.Out.PipeTo(speechSynth);
        }

        //speechSynth.SpeakCompleted.Do(x => preproc.setAccepting());

        // Create a data store to log the data to if necessary. A data store is necessary
        // only if output logging or live visualization are enabled.
        var dataStore = CreateDataStore(pipeline, outputLogPath, showLiveVisualization);

        // For disk logging or live visualization only
        if (dataStore != null)
        {
            // Log the microphone audio and recognition results
            audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
            finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
        }

        // Register an event handler to catch pipeline errors
        pipeline.PipelineCompletionEvent += PipelineCompletionEvent;

        // Run the pipeline
        pipeline.RunAsync();

        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
/// <summary>
/// Builds and runs a speech recognition pipeline using the .NET System.Speech recognizer and a set of fixed grammars.
/// </summary>
/// <param name="outputLogPath">The path under which to write log data.</param>
/// <param name="inputLogPath">The path from which to read audio input data.</param>
public static void RunSystemSpeech(string outputLogPath = null, string inputLogPath = null)
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Use either live audio from the microphone or audio from a previously saved log
        IProducer<AudioBuffer> audioInput = null;
        if (inputLogPath != null)
        {
            // Open the MicrophoneAudio stream from the last saved log
            var store = PsiStore.Open(pipeline, Program.AppName, inputLogPath);
            audioInput = store.OpenStream<AudioBuffer>($"{Program.AppName}.MicrophoneAudio");
        }
        else
        {
            // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
            // PCM format as required by both the voice activity detector and speech recognition components.
            audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());
        }

        // Create System.Speech recognizer component
        var recognizer = new SystemSpeechRecognizer(
            pipeline,
            new SystemSpeechRecognizerConfiguration()
            {
                Language = "en-US",
                Grammars = new GrammarInfo[]
                {
                    new GrammarInfo() { Name = Program.AppName, FileName = "SampleGrammar.grxml" },
                },
            });

        // Subscribe the recognizer to the input audio
        audioInput.PipeTo(recognizer);

        // Partial and final speech recognition results are posted on the same stream. Here
        // we use Psi's Where() operator to filter out only the final recognition results.
        var finalResults = recognizer.Out.Where(result => result.IsFinal);

        // Print the final recognition result to the console.
        finalResults.Do(result =>
        {
            Console.WriteLine($"{result.Text} (confidence: {result.Confidence})");
        });

        // Create a data store to log the data to if necessary. A data store is necessary
        // only if output logging is enabled.
        var dataStore = CreateDataStore(pipeline, outputLogPath);

        // For disk logging only
        if (dataStore != null)
        {
            // Log the microphone audio and recognition results
            audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
            finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
        }

        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Run the pipeline
        pipeline.RunAsync();

        // The file SampleGrammar.grxml defines a grammar to transcribe numbers
        Console.WriteLine("Say any number between 0 and 100");
        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
private void ButtonCreate16kHz1Channel16BitPcm_Click(object sender, RoutedEventArgs e)
{
    Update(WaveFormat.Create16kHz1Channel16BitPcm());
}
/// <summary>
/// This is the main code for our Multimodal Speech Detection demo.
/// </summary>
private void PerformMultiModalSpeechDetection()
{
    Console.WriteLine("Initializing Psi.");

    bool detected = false;

    // First create our \Psi pipeline
    using (var pipeline = Pipeline.Create("MultiModalSpeechDetection"))
    {
        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Next create our Kinect sensor. We will be using the color images, face tracking, and audio from the Kinect sensor
        var kinectSensorConfig = new KinectSensorConfiguration();
        kinectSensorConfig.OutputColor = true;
        kinectSensorConfig.OutputAudio = true;
        kinectSensorConfig.OutputBodies = true; // In order to detect faces using Kinect you must also enable detection of bodies
        var kinectSensor = new KinectSensor(pipeline, kinectSensorConfig);
        var kinectFaceDetector = new Microsoft.Psi.Kinect.Face.KinectFaceDetector(pipeline, kinectSensor, Microsoft.Psi.Kinect.Face.KinectFaceDetectorConfiguration.Default);

        // Create our Voice Activation Detector
        var speechDetector = new SystemVoiceActivityDetector(pipeline);
        var convertedAudio = kinectSensor.Audio.Resample(WaveFormat.Create16kHz1Channel16BitPcm());
        convertedAudio.PipeTo(speechDetector);

        // Use the Kinect's face track to determine if the mouth is opened
        var mouthOpenAsFloat = kinectFaceDetector.Faces.Where(faces => faces.Count > 0).Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
        {
            if (!detected)
            {
                detected = true;
                Console.WriteLine("Found your face");
            }

            bool open = (list[0] != null) ? list[0].FaceProperties[Microsoft.Kinect.Face.FaceProperty.MouthOpen] == Microsoft.Kinect.DetectionResult.Yes : false;
            return open ? 1.0 : 0.0;
        });

        // Next take the "mouthOpen" value and create a hold on that value (so that we don't see 1,0,1,0,1 but instead would see 1,1,1,1,0.8,0.6,0.4)
        var mouthOpen = mouthOpenAsFloat.Hold(0.1);

        // Next join the results of the speechDetector with the mouthOpen generator and only select samples where
        // we have detected speech and that the mouth was open.
        var mouthAndSpeechDetector = speechDetector.Join(mouthOpen, hundredMs).Select((t, e) => t.Item1 && t.Item2);

        // Convert our speech into text
        var speechRecognition = convertedAudio.SpeechToText(mouthAndSpeechDetector);
        speechRecognition.Do((s, t) =>
        {
            if (s.Item1.Length > 0)
            {
                Console.WriteLine("You said: " + s.Item1);
            }
        });

        // Create a stream of landmarks (points) from the face detector
        var facePoints = new List<Tuple<System.Windows.Point, string>>();
        var landmarks = kinectFaceDetector.Faces.Where(faces => faces.Count > 0).Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
        {
            facePoints.Clear();

            System.Windows.Point pt1 = new System.Windows.Point(
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeLeft].X,
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeLeft].Y);
            facePoints.Add(Tuple.Create(pt1, string.Empty));

            System.Windows.Point pt2 = new System.Windows.Point(
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeRight].X,
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.EyeRight].Y);
            facePoints.Add(Tuple.Create(pt2, string.Empty));

            System.Windows.Point pt3 = new System.Windows.Point(
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerLeft].X,
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerLeft].Y);
            facePoints.Add(Tuple.Create(pt3, string.Empty));

            System.Windows.Point pt4 = new System.Windows.Point(
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerRight].X,
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.MouthCornerRight].Y);
            facePoints.Add(Tuple.Create(pt4, string.Empty));

            System.Windows.Point pt5 = new System.Windows.Point(
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.Nose].X,
                list[0].FacePointsInColorSpace[Microsoft.Kinect.Face.FacePointType.Nose].Y);
            facePoints.Add(Tuple.Create(pt5, string.Empty));

            return facePoints;
        });

        // ********************************************************************
        // Finally create a Live Visualizer using PsiStudio.
        // We must persist our streams to a store in order for Live Viz to work properly
        // ********************************************************************

        // Create store for the data. Live Visualizer can only read data from a store.
        var pathToStore = Environment.GetFolderPath(Environment.SpecialFolder.MyVideos);
        Microsoft.Psi.Data.Exporter store = Store.Create(pipeline, ApplicationName, pathToStore);

        mouthOpen.Select(v => v ? 1d : 0d).Write("MouthOpen", store);
        speechDetector.Select(v => v ? 1d : 0d).Write("VAD", store);
        mouthAndSpeechDetector.Write("Join(MouthOpen,VAD)", store);
        kinectSensor.Audio.Write("Audio", store);

        var images = kinectSensor.ColorImage.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;
        Store.Write(images, "Images", store, true, DeliveryPolicy.LatestMessage);

        landmarks.Write("FaceLandmarks", store);

        // Run the pipeline
        pipeline.RunAsync();

        Console.WriteLine("Press any key to finish recording");
        Console.ReadKey();
    }
}
/// <summary> /// Initializes a new instance of the <see cref="AudioCaptureConfiguration"/> class. /// </summary> /// <remarks>Defaults to 16kHz, 1 channel, 16-bit PCM.</remarks> /// <param name="name">Device name (e.g. "plughw:0,0").</param> public AudioCaptureConfiguration(string name) : this(name, WaveFormat.Create16kHz1Channel16BitPcm()) { }