public DeepSpeechRecognizer(Pipeline pipeline)
    : base(pipeline, nameof(DeepSpeechRecognizer))
{
    // Create the input connector for incoming audio
    this.audioIn = this.CreateInputConnectorFrom<AudioBuffer>(pipeline, nameof(this.AudioIn));

    // Define the output connector for recognized text
    var textOut = this.CreateOutputConnectorTo<string>(pipeline, nameof(this.TextOut));
    this.TextOut = textOut.Out;

    // Create the sub-recognizer
    var recognizer = new DeepSpeechSubRecognizer(this);

    // Create the voice activity detector and feed it the incoming audio
    var voiceDet = new SystemVoiceActivityDetector(this);
    this.audioIn.Out.PipeTo(voiceDet);

    // Join each audio buffer with the nearest voice activity flag (looking into the
    // near future), so the sub-recognizer receives (audio, isSpeech) pairs
    var voiceAudio = this.audioIn.Out.Join(voiceDet.Out, Reproducible.Nearest<bool>(RelativeTimeInterval.Future()));
    voiceAudio.PipeTo(recognizer.In);
    recognizer.PipeTo(textOut);
}
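// For context, a minimal sketch of how this composite recognizer might be wired into a
// pipeline. It assumes the class exposes the AudioIn receiver and TextOut emitter named
// above, and that live audio comes from a standard AudioCapture microphone component;
// the console scaffolding is illustrative only, not part of the sample.
using (var pipeline = Pipeline.Create())
{
    // Capture microphone audio in the 16 kHz, 1-channel PCM format the VAD expects
    var microphone = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());

    // Wire the audio into the composite recognizer defined above
    var recognizer = new DeepSpeechRecognizer(pipeline);
    microphone.PipeTo(recognizer.AudioIn);

    // Print each recognized utterance as it arrives
    recognizer.TextOut.Do(text => Console.WriteLine($"Recognized: {text}"));

    pipeline.RunAsync();
    Console.ReadKey(true);
}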
/// <summary>
/// This is the main code for our Multimodal Speech Detection demo.
/// </summary>
private void PerformMultiModalSpeechDetection()
{
    Console.WriteLine("Initializing Psi.");
    bool detected = false;

    // First create our \psi pipeline
    using (var pipeline = Pipeline.Create("MultiModalSpeechDetection"))
    {
        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Next create our Kinect sensor. We will be using the color images, face tracking,
        // and audio from the Kinect sensor.
        var kinectSensorConfig = new KinectSensorConfiguration
        {
            OutputColor = true,
            OutputAudio = true,

            // In order to detect faces with the Kinect we must also enable detection of bodies
            OutputBodies = true,
        };
        var kinectSensor = new KinectSensor(pipeline, kinectSensorConfig);
        var kinectFaceDetector = new Microsoft.Psi.Kinect.Face.KinectFaceDetector(
            pipeline,
            kinectSensor,
            Microsoft.Psi.Kinect.Face.KinectFaceDetectorConfiguration.Default);

        // Create our voice activity detector and feed it resampled Kinect audio
        var speechDetector = new SystemVoiceActivityDetector(pipeline);
        var convertedAudio = kinectSensor.Audio.Resample(WaveFormat.Create16kHz1Channel16BitPcm());
        convertedAudio.PipeTo(speechDetector);

        // Use the Kinect's face tracking to determine whether the mouth is open
        var mouthOpenAsFloat = kinectFaceDetector.Faces
            .Where(faces => faces.Count > 0)
            .Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
            {
                if (!detected)
                {
                    detected = true;
                    Console.WriteLine("Found your face");
                }

                bool open = (list[0] != null) &&
                    (list[0].FaceProperties[Microsoft.Kinect.Face.FaceProperty.MouthOpen] == Microsoft.Kinect.DetectionResult.Yes);
                return open ? 1.0 : 0.0;
            });

        // Next take the "mouthOpen" value and apply a hold on it (so that instead of
        // 1,0,1,0,1 we see a decaying 1,1,1,1,0.8,0.6,0.4)
        var mouthOpen = mouthOpenAsFloat.Hold(0.1);

        // Next join the results of the speechDetector with the mouthOpen stream, selecting only
        // samples where we detected speech while the mouth was open. (hundredMs is a join
        // tolerance defined elsewhere in the sample.)
        var mouthAndSpeechDetector = speechDetector
            .Join(mouthOpen, hundredMs)
            .Select((t, e) => t.Item1 && t.Item2);

        // Convert our speech into text
        var speechRecognition = convertedAudio.SpeechToText(mouthAndSpeechDetector);
        speechRecognition.Do((s, t) =>
        {
            if (s.Item1.Length > 0)
            {
                Console.WriteLine("You said: " + s.Item1);
            }
        });

        // Create a stream of landmarks (points) from the face detector
        var facePoints = new List<Tuple<System.Windows.Point, string>>();
        var landmarks = kinectFaceDetector.Faces
            .Where(faces => faces.Count > 0)
            .Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
            {
                facePoints.Clear();

                // Project the five tracked landmarks into color space
                foreach (var pointType in new[]
                {
                    Microsoft.Kinect.Face.FacePointType.EyeLeft,
                    Microsoft.Kinect.Face.FacePointType.EyeRight,
                    Microsoft.Kinect.Face.FacePointType.MouthCornerLeft,
                    Microsoft.Kinect.Face.FacePointType.MouthCornerRight,
                    Microsoft.Kinect.Face.FacePointType.Nose,
                })
                {
                    var pt = new System.Windows.Point(
                        list[0].FacePointsInColorSpace[pointType].X,
                        list[0].FacePointsInColorSpace[pointType].Y);
                    facePoints.Add(Tuple.Create(pt, string.Empty));
                }

                return facePoints;
            });

        // ********************************************************************
        // Finally create a live visualizer using PsiStudio.
        // We must persist our streams to a store for live visualization to work properly.
        // ********************************************************************

        // Create a store for the data. The live visualizer can only read data from a store.
        var pathToStore = Environment.GetFolderPath(Environment.SpecialFolder.MyVideos);
        Microsoft.Psi.Data.Exporter store = Store.Create(pipeline, ApplicationName, pathToStore);

        mouthOpen.Select(v => v ? 1d : 0d).Write("MouthOpen", store);
        speechDetector.Select(v => v ? 1d : 0d).Write("VAD", store);
        mouthAndSpeechDetector.Write("Join(MouthOpen,VAD)", store);
        kinectSensor.Audio.Write("Audio", store);

        var images = kinectSensor.ColorImage.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;
        Store.Write(images, "Images", store, true, DeliveryPolicy.LatestMessage);
        landmarks.Write("FaceLandmarks", store);

        // Run the pipeline
        pipeline.RunAsync();

        Console.WriteLine("Press any key to finish recording");
        Console.ReadKey();
    }
}
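// The two pipeline event handlers registered above are not shown in this excerpt.
// A minimal sketch of what they might look like follows; the bodies are assumptions,
// but the signatures match the Microsoft.Psi event argument types used here.
private void Pipeline_PipelineException(object sender, PipelineExceptionNotHandledEventArgs e)
{
    // Surface unhandled component errors rather than failing silently
    Console.WriteLine($"Pipeline error: {e.Exception.Message}");
}

private void Pipeline_PipelineCompleted(object sender, PipelineCompletedEventArgs e)
{
    Console.WriteLine("Pipeline execution completed.");
}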
/// <summary>
/// Builds and runs a speech recognition pipeline using the Azure speech recognizer. Requires a valid Cognitive Services
/// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account.
/// </summary>
/// <remarks>
/// If you are getting a <see cref="System.InvalidOperationException"/> with the message 'AzureSpeechRecognizer returned
/// OnConversationError with error code: LoginFailed. Original error text: Transport error', this is most likely due to
/// an invalid subscription key. Please check your Azure portal at https://portal.azure.com and ensure that you have
/// added a subscription to the Azure Speech API on your account.
/// </remarks>
/// <param name="outputLogPath">The path under which to write log data.</param>
/// <param name="inputLogPath">The path from which to read audio input data.</param>
public static void RunAzureSpeech(string outputLogPath = null, string inputLogPath = null)
{
    // Create the pipeline object.
    using (Pipeline pipeline = Pipeline.Create())
    {
        // Use either live audio from the microphone or audio from a previously saved log
        IProducer<AudioBuffer> audioInput = null;
        if (inputLogPath != null)
        {
            // Open the MicrophoneAudio stream from the last saved log
            var store = PsiStore.Open(pipeline, Program.AppName, inputLogPath);
            audioInput = store.OpenStream<AudioBuffer>($"{Program.AppName}.MicrophoneAudio");
        }
        else
        {
            // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
            // PCM format, as required by both the voice activity detector and speech recognition components.
            audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());
        }

        // Perform voice activity detection using the voice activity detector component
        var vad = new SystemVoiceActivityDetector(pipeline);
        audioInput.PipeTo(vad);

        // Create the Azure speech recognizer component
        var recognizer = new AzureSpeechRecognizer(
            pipeline,
            new AzureSpeechRecognizerConfiguration()
            {
                SubscriptionKey = Program.azureSubscriptionKey,
                Region = Program.azureRegion,
            });

        // The input audio to the Azure speech recognizer needs to be annotated with a voice activity flag.
        // This can be constructed by using the \psi Join() operator to combine the audio and VAD streams.
        var annotatedAudio = audioInput.Join(vad);

        // Subscribe the recognizer to the annotated audio
        annotatedAudio.PipeTo(recognizer);

        // Partial and final speech recognition results are posted on the same stream. Here
        // we use \psi's Where() operator to filter out only the final recognition results.
        var finalResults = recognizer.Out.Where(result => result.IsFinal);

        // Print the recognized text of each final recognition result to the console.
        finalResults.Do(result => Console.WriteLine(result.Text));

        // Create a data store to log the data to, if necessary. A data store is needed
        // only when output logging is enabled.
        var dataStore = CreateDataStore(pipeline, outputLogPath);

        if (dataStore != null)
        {
            // Log the microphone audio and recognition results
            audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
            finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
            vad.Write($"{Program.AppName}.VoiceActivity", dataStore);
        }

        // Register an event handler to catch pipeline errors
        pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

        // Register an event handler to be notified when the pipeline completes
        pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

        // Run the pipeline
        pipeline.RunAsync();

        // The Azure speech recognizer transcribes speech to text
        Console.WriteLine("Say anything");
        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
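// The CreateDataStore helper used above is not shown in this excerpt. A minimal
// sketch, under the assumption that it returns null when logging is disabled and
// otherwise creates a \psi store at the requested path:
private static Microsoft.Psi.Data.Exporter CreateDataStore(Pipeline pipeline, string outputLogPath)
{
    return string.IsNullOrEmpty(outputLogPath)
        ? null
        : PsiStore.Create(pipeline, Program.AppName, outputLogPath);
}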
public static void Main(string[] args)
{
    bool detected = false;
    bool usingKqml = true;
    string facilitatorIP = args[0];
    int facilitatorPort = int.Parse(args[1]);
    int localPort = int.Parse(args[2]);

    InitTimer();
    Console.WriteLine("Starting Kinect-based Kiosk. Verify that the Kinect is set up before continuing.");

    using (Pipeline pipeline = Pipeline.Create())
    {
        // Components
        var kinectSensor = new Microsoft.Psi.Kinect.v1.KinectSensor(pipeline);
        var faceTracker = new Microsoft.Psi.Kinect.v1.SkeletonFaceTracker(pipeline, kinectSensor.kinectSensor);
        var speechDetector = new SystemVoiceActivityDetector(pipeline);
        var recognizer = Speech.Program.CreateSpeechRecognizer(pipeline);
        var merger = new Speech.SpeechMerger(pipeline);
        var synthesizer = Speech.Program.CreateSpeechSynthesizer(pipeline);
        NU.Kqml.SocketStringConsumer kqml = null;
        var preproc = new NU.Kqml.KioskInputTextPreProcessor(pipeline, (SystemSpeechRecognizer)recognizer);
        var ui = new KioskUI.KioskUI(pipeline);

        // Wire the components together
        var joinedFrames = kinectSensor.ColorImage.Join(kinectSensor.DepthImage).Join(kinectSensor.Skeletons);
        joinedFrames.PipeTo(faceTracker);

        var mouthOpenAsFloat = faceTracker.FaceDetected.Select((bool x) =>
        {
            if (!detected)
            {
                Console.WriteLine("Face found");
                detected = true;
            }

            return x ? 1.0 : 0.0;
        });

        // Hold faceDetected true for a while, even after the face is gone
        var faceDetected = mouthOpenAsFloat.Hold(0.1, 0.05);
        faceDetected.PipeTo(ui.FaceDetected);

        // Send audio to the recognizer only while a face is detected and the synthesizer
        // is ready to accept more input
        kinectSensor.Audio
            .Join(faceDetected, _300ms)
            .Pair(synthesizer.StateChanged)
            .Where(result => result.Item2 && result.Item3.State == System.Speech.Synthesis.SynthesizerState.Ready)
            .Select(pair => pair.Item1)
            .PipeTo(recognizer);

        // Get the final results of speech recognition
        // (TODO: should also filter on Item2 with a Where(), but skipping for now)
        var finalResults = recognizer.Out.Where(result => result.IsFinal);
        var recognitionResult = finalResults.Select(r =>
        {
            var ssrResult = r as IStreamingSpeechRecognitionResult;
            Console.WriteLine($"{ssrResult.Text} (confidence: {ssrResult.Confidence})");
            return ssrResult;
        });

        if (usingKqml)
        {
            Console.WriteLine("Setting up connection to Companion");
            Console.WriteLine("Your Companion IP address is: " + facilitatorIP);
            Console.WriteLine("Your Companion port is: " + facilitatorPort);
            Console.WriteLine("Your local port is: " + localPort);

            // Set up the interface to Companion
            kqml = new NU.Kqml.SocketStringConsumer(pipeline, facilitatorIP, facilitatorPort, localPort);

            // Send user input to the preprocessor
            recognitionResult.PipeTo(preproc.In);

            // Filter out trivial (null) preprocessor results
            var non_trivial_result = preproc.Out.Where(x => x != null);

            preproc.AutoResponse.PipeTo(merger.OtherIn);

            // Send processed user input to Companion and the UI
            non_trivial_result.PipeTo(kqml.In);
            non_trivial_result.PipeTo(ui.UserInput);
            non_trivial_result.PipeTo(merger.LastOut);

            // Get the response from Companion and forward it to the UI and the synthesizer
            kqml.Out.Do(x => Console.WriteLine(x));
            kqml.Out.PipeTo(merger.In);
            merger.Out.PipeTo(ui.CompResponse);
            merger.Out.PipeTo(synthesizer);

            // When speaking completes, ready to accept more input
            // (e.g., synthesizer.SpeakCompleted.Delay(_500ms).Do(x => setAccepting());)
        }
        else
        {
            Console.WriteLine("Status: Not using KQML");
            recognitionResult.PipeTo(preproc.In);

            // Toggle the accepting flag based on whether the preprocessor produced a result
            var non_trivial_result = preproc.Out.Where(x =>
            {
                if (x == null)
                {
                    setAccepting();
                    return false;
                }
                else
                {
                    setNotAccepting();
                    return true;
                }
            });
            non_trivial_result.PipeTo(ui.UserInput);

            // Echo the input back as the response after a short artificial delay
            var delayed = non_trivial_result.Select(result =>
            {
                Thread.Sleep(3000);
                return result;
            });
            delayed.PipeTo(ui.CompResponse);
            delayed.PipeTo(synthesizer);

            // When speaking completes, ready to accept more input
            synthesizer.SpeakCompleted.Do(x => setAccepting());
        }

        // Optional: set up PsiStudio visualizations by persisting streams via SetupDataStore (disabled here)

        // Register an event handler to catch pipeline errors
        pipeline.PipelineCompletionEvent += PipelineCompletionEvent;

        // Run the pipeline
        pipeline.RunAsync();

        Console.WriteLine("Press any key to exit...");
        Console.ReadKey(true);
    }
}
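// Several helpers referenced above (setAccepting, setNotAccepting, and the _300ms join
// tolerance) are defined elsewhere in the sample. A minimal sketch of what they might
// look like; only the names come from the code above, the bodies are assumptions:
private static readonly TimeSpan _300ms = TimeSpan.FromMilliseconds(300);
private static volatile bool isAccepting = true;

private static void setAccepting() => isAccepting = true;

private static void setNotAccepting() => isAccepting = false;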