public DeepSpeechRecognizer(Pipeline pipeline) : base(pipeline, nameof(DeepSpeechRecognizer))
        {
            // Create the input connector
            this.audioIn = this.CreateInputConnectorFrom<AudioBuffer>(pipeline, nameof(this.AudioIn));

            // Define the output
            var textOut = this.CreateOutputConnectorTo<string>(pipeline, nameof(this.TextOut));

            this.TextOut = textOut.Out;

            // Create the DeepSpeech sub-recognizer
            var recognizer = new DeepSpeechSubRecognizer(this);

            // Create the voice activity detector
            var voiceDet = new SystemVoiceActivityDetector(this);

            this.audioIn.Out.PipeTo(voiceDet);
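
            // Fuse the audio with the voice-activity stream: for each audio buffer, take the
            // nearest voice-activity message whose originating time is at or after the buffer's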
            var voiceAudio = this.audioIn.Out.Join(voiceDet.Out, Reproducible.Nearest<bool>(RelativeTimeInterval.Future()));

            voiceAudio.PipeTo(recognizer.In);

            recognizer.PipeTo(textOut);
        }
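
        // Illustrative usage sketch (not part of the original listing): wires the component above
        // into a small live pipeline. It assumes the class exposes the audioIn connector as an
        // AudioIn receiver (see nameof(this.AudioIn) above) and that the DeepSpeech model files
        // required by the sub-recognizer are available.
        private static void RunDeepSpeechRecognizerSketch()
        {
            using (var pipeline = Pipeline.Create("DeepSpeechSketch"))
            {
                // Capture 16 kHz, 1-channel, 16-bit PCM audio from the default microphone
                var audio = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());

                // Wire the audio into the recognizer and print each recognized utterance
                var recognizer = new DeepSpeechRecognizer(pipeline);
                audio.PipeTo(recognizer.AudioIn);
                recognizer.TextOut.Do(text => Console.WriteLine(text));

                pipeline.RunAsync();
                Console.WriteLine("Press any key to stop...");
                Console.ReadKey(true);
            }
        }
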
        /// <summary>
        /// This is the main code for our Multimodal Speech Detection demo.
        /// </summary>
        private void PerformMultiModalSpeechDetection()
        {
            Console.WriteLine("Initializing Psi.");

            bool detected = false;

            // First create our \Psi pipeline
            using (var pipeline = Pipeline.Create("MultiModalSpeechDetection"))
            {
                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Next create our Kinect sensor. We will be using the color images, face tracking, and audio from the Kinect sensor
                var kinectSensorConfig = new KinectSensorConfiguration();
                kinectSensorConfig.OutputColor  = true;
                kinectSensorConfig.OutputAudio  = true;
                kinectSensorConfig.OutputBodies = true; // In order to detect faces using Kinect you must also enable detection of bodies
                var kinectSensor       = new KinectSensor(pipeline, kinectSensorConfig);
                var kinectFaceDetector = new Microsoft.Psi.Kinect.Face.KinectFaceDetector(pipeline, kinectSensor, Microsoft.Psi.Kinect.Face.KinectFaceDetectorConfiguration.Default);

                // Create our voice activity detector
                var speechDetector = new SystemVoiceActivityDetector(pipeline);
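                // Both the voice activity detector and the speech recognizer expect 16 kHz,
                // 1-channel, 16-bit PCM audio, so resample the Kinect audio before piping it in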
                var convertedAudio = kinectSensor.Audio.Resample(WaveFormat.Create16kHz1Channel16BitPcm());
                convertedAudio.PipeTo(speechDetector);

                // Use the Kinect's face tracker to determine whether the mouth is open
                var mouthOpenAsFloat = kinectFaceDetector.Faces.Where(faces => faces.Count > 0).Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
                {
                    if (!detected)
                    {
                        detected = true;
                        Console.WriteLine("Found your face");
                    }

                    bool open = list[0] != null && list[0].FaceProperties[Microsoft.Kinect.Face.FaceProperty.MouthOpen] == Microsoft.Kinect.DetectionResult.Yes;
                    return open ? 1.0 : 0.0;
                });

                // Next take the "mouthOpen" value and apply a hold to it, so that instead of a raw
                // 1,0,1,0,1 sequence we see a decaying signal such as 1,1,1,1,0.8,0.6,0.4
                var mouthOpen = mouthOpenAsFloat.Hold(0.1);

                // Next join the results of the speechDetector with the mouthOpen generator and only select samples where
                // we have detected speech and that the mouth was open.
                var mouthAndSpeechDetector = speechDetector.Join(mouthOpen, hundredMs).Select((t, e) => t.Item1 && t.Item2);

                // Convert our speech into text
                var speechRecognition = convertedAudio.SpeechToText(mouthAndSpeechDetector);
                speechRecognition.Do((s, t) =>
                {
                    if (s.Item1.Length > 0)
                    {
                        Console.WriteLine("You said: " + s.Item1);
                    }
                });

                // Create a stream of landmarks (points) from the face detector
                var facePoints = new List<Tuple<System.Windows.Point, string>>();
                var landmarks  = kinectFaceDetector.Faces.Where(faces => faces.Count > 0).Select((List<Microsoft.Psi.Kinect.Face.KinectFace> list) =>
                {
                    facePoints.Clear();
                    foreach (var pointType in new[]
                    {
                        Microsoft.Kinect.Face.FacePointType.EyeLeft,
                        Microsoft.Kinect.Face.FacePointType.EyeRight,
                        Microsoft.Kinect.Face.FacePointType.MouthCornerLeft,
                        Microsoft.Kinect.Face.FacePointType.MouthCornerRight,
                        Microsoft.Kinect.Face.FacePointType.Nose,
                    })
                    {
                        var point = new System.Windows.Point(
                            list[0].FacePointsInColorSpace[pointType].X,
                            list[0].FacePointsInColorSpace[pointType].Y);
                        facePoints.Add(Tuple.Create(point, string.Empty));
                    }

                    return facePoints;
                });

                // ********************************************************************
                // Finally create a Live Visualizer using PsiStudio.
                // We must persist our streams to a store in order for Live Viz to work properly
                // ********************************************************************

                // Create store for the data. Live Visualizer can only read data from a store.
                var pathToStore = Environment.GetFolderPath(Environment.SpecialFolder.MyVideos);
                Microsoft.Psi.Data.Exporter store = Store.Create(pipeline, ApplicationName, pathToStore);

                mouthOpen.Select(v => v ? 1d : 0d).Write("MouthOpen", store);

                speechDetector.Select(v => v ? 1d : 0d).Write("VAD", store);

                mouthAndSpeechDetector.Write("Join(MouthOpen,VAD)", store);

                kinectSensor.Audio.Write("Audio", store);

                var images = kinectSensor.ColorImage.EncodeJpeg(90, DeliveryPolicy.LatestMessage).Out;
                Store.Write(images, "Images", store, true, DeliveryPolicy.LatestMessage);

                landmarks.Write("FaceLandmarks", store);

                // Run the pipeline
                pipeline.RunAsync();

                Console.WriteLine("Press any key to finish recording");
                Console.ReadKey();
            }
        }
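
        // Minimal sketches of the pipeline event handlers registered above; their bodies are not
        // shown in the original listing. These assume the standard \Psi event argument types.
        private static void Pipeline_PipelineException(object sender, PipelineExceptionNotHandledEventArgs e)
        {
            // Surface any unhandled component exception so the demo does not fail silently
            Console.WriteLine($"Pipeline error: {e.Exception}");
        }

        private static void Pipeline_PipelineCompleted(object sender, PipelineCompletedEventArgs e)
        {
            // Notify the user that the pipeline has shut down
            Console.WriteLine("Pipeline execution completed.");
        }
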
        /// <summary>
        /// Builds and runs a speech recognition pipeline using the Azure speech recognizer. Requires a valid Cognitive Services
        /// subscription key. See https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account.
        /// </summary>
        /// <remarks>
        /// If you are getting a <see cref="System.InvalidOperationException"/> with the message 'AzureSpeechRecognizer returned
        /// OnConversationError with error code: LoginFailed. Original error text: Transport error', this most likely is due to
        /// an invalid subscription key. Please check your Azure portal at https://portal.azure.com and ensure that you have
        /// added a subscription to the Azure Speech API on your account.
        /// </remarks>
        /// <param name="outputLogPath">The path under which to write log data.</param>
        /// <param name="inputLogPath">The path from which to read audio input data.</param>
        public static void RunAzureSpeech(string outputLogPath = null, string inputLogPath = null)
        {
            // Create the pipeline object.
            using (Pipeline pipeline = Pipeline.Create())
            {
                // Use either live audio from the microphone or audio from a previously saved log
                IProducer <AudioBuffer> audioInput = null;
                if (inputLogPath != null)
                {
                    // Open the MicrophoneAudio stream from the last saved log
                    var store = PsiStore.Open(pipeline, Program.AppName, inputLogPath);
                    audioInput = store.OpenStream <AudioBuffer>($"{Program.AppName}.MicrophoneAudio");
                }
                else
                {
                    // Create the AudioCapture component to capture audio from the default device in 16 kHz 1-channel
                    // PCM format as required by both the voice activity detector and speech recognition components.
                    audioInput = new AudioCapture(pipeline, WaveFormat.Create16kHz1Channel16BitPcm());
                }

                // Perform voice activity detection using the voice activity detector component
                var vad = new SystemVoiceActivityDetector(pipeline);
                audioInput.PipeTo(vad);

                // Create Azure speech recognizer component
                var recognizer = new AzureSpeechRecognizer(pipeline, new AzureSpeechRecognizerConfiguration()
                {
                    SubscriptionKey = Program.azureSubscriptionKey, Region = Program.azureRegion
                });

                // The input audio to the Azure speech recognizer needs to be annotated with a voice activity flag.
                // This can be constructed by using the Psi Join() operator to combine the audio and VAD streams.
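                // The joined stream carries (AudioBuffer, bool) tuples, which is the input format
                // the Azure speech recognizer expects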
                var annotatedAudio = audioInput.Join(vad);

                // Subscribe the recognizer to the annotated audio
                annotatedAudio.PipeTo(recognizer);

                // Partial and final speech recognition results are posted on the same stream. Here
                // we use Psi's Where() operator to filter out only the final recognition results.
                var finalResults = recognizer.Out.Where(result => result.IsFinal);

                // Print the recognized text of the final recognition result to the console.
                finalResults.Do(result => Console.WriteLine(result.Text));

                // Create a data store to log the data to if necessary. A data store is necessary
                // only if output logging is enabled.
                var dataStore = CreateDataStore(pipeline, outputLogPath);

                // For disk logging only
                if (dataStore != null)
                {
                    // Log the microphone audio and recognition results
                    audioInput.Write($"{Program.AppName}.MicrophoneAudio", dataStore);
                    finalResults.Write($"{Program.AppName}.FinalRecognitionResults", dataStore);
                    vad.Write($"{Program.AppName}.VoiceActivity", dataStore);
                }

                // Register an event handler to catch pipeline errors
                pipeline.PipelineExceptionNotHandled += Pipeline_PipelineException;

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompleted += Pipeline_PipelineCompleted;

                // Run the pipeline
                pipeline.RunAsync();

                // Azure speech transcribes speech to text
                Console.WriteLine("Say anything");

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
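
        // A sketch of the CreateDataStore helper referenced above; the original is not shown in
        // this listing. It assumes the helper simply creates a \Psi store under outputLogPath
        // when output logging is requested and returns null otherwise.
        private static Microsoft.Psi.Data.Exporter CreateDataStore(Pipeline pipeline, string outputLogPath)
        {
            // No store is needed when output logging is disabled
            if (string.IsNullOrEmpty(outputLogPath))
            {
                return null;
            }

            // Create a store named after the application under the requested path
            return PsiStore.Create(pipeline, Program.AppName, outputLogPath);
        }
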
        public static void Main(string[] args)
        {
            bool   detected        = false;
            bool   usingKqml       = true;
            if (args.Length < 3)
            {
                Console.WriteLine("Usage: <facilitatorIP> <facilitatorPort> <localPort>");
                return;
            }

            string facilitatorIP   = args[0];
            int    facilitatorPort = int.Parse(args[1]);
            int    localPort       = int.Parse(args[2]);

            InitTimer();

            Console.WriteLine("Starting Kinect-based Kiosk.  Verify that Kinect is setup before continuing");

            using (Pipeline pipeline = Pipeline.Create())
            {
                // Components
                Microsoft.Psi.Kinect.v1.KinectSensor kinectSensor = new Microsoft.Psi.Kinect.v1.KinectSensor(pipeline);

                Microsoft.Psi.Kinect.v1.SkeletonFaceTracker faceTracker = new Microsoft.Psi.Kinect.v1.SkeletonFaceTracker(pipeline, kinectSensor.kinectSensor);

                var speechDetector = new SystemVoiceActivityDetector(pipeline);

                var recognizer = Speech.Program.CreateSpeechRecognizer(pipeline);

                var merger = new Speech.SpeechMerger(pipeline);

                var synthesizer = Speech.Program.CreateSpeechSynthesizer(pipeline);

                NU.Kqml.SocketStringConsumer kqml = null;

                NU.Kqml.KioskInputTextPreProcessor preproc = new NU.Kqml.KioskInputTextPreProcessor(pipeline, (SystemSpeechRecognizer)recognizer);

                KioskUI.KioskUI ui = new KioskUI.KioskUI(pipeline);

                // Wiring together the components
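                // The face tracker consumes time-aligned color, depth, and skeleton frames, so join them first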
                var joinedFrames = kinectSensor.ColorImage.Join(kinectSensor.DepthImage).Join(kinectSensor.Skeletons);

                joinedFrames.PipeTo(faceTracker);

                var faceDetectedAsFloat = faceTracker.FaceDetected.Select((bool x) =>
                {
                    if (!detected)
                    {
                        Console.WriteLine("Face found");
                        detected = true;
                    }
                    return(x ? 1.0 : 0.0);
                });

                // Hold the detection value for a while, even after the face is gone
                var faceDetected = faceDetectedAsFloat.Hold(0.1, 0.05);
                faceDetected.PipeTo(ui.FaceDetected);

                // Send audio to the recognizer only while a face is detected and the speech
                // synthesizer is idle, i.e. the kiosk is ready to accept more input
                kinectSensor.Audio
                    .Join(faceDetected, _300ms)
                    .Pair(synthesizer.StateChanged)
                    .Where(result => result.Item2 && result.Item3.State == System.Speech.Synthesis.SynthesizerState.Ready)
                    .Select(pair => pair.Item1)
                    .PipeTo(recognizer);

                // Get final results of speech recognition
                var finalResults = recognizer.Out.Where(result => result.IsFinal);

                var recognitionResult = finalResults.Select(r =>  // Need to add a Where Item2, but skipping for now
                {
                    var ssrResult = r as IStreamingSpeechRecognitionResult;
                    Console.WriteLine($"{ssrResult.Text} (confidence: {ssrResult.Confidence})");
                    return(ssrResult);
                });

                if (usingKqml)
                {
                    Console.WriteLine("Setting up connection to Companion");
                    Console.WriteLine("Your Companion IP address is: " + facilitatorIP);
                    Console.WriteLine("Your Companion port is: " + facilitatorPort);
                    Console.WriteLine("Your local port is: " + localPort);

                    // Set up the interface to Companion
                    kqml = new NU.Kqml.SocketStringConsumer(pipeline, facilitatorIP, facilitatorPort, localPort);

                    // Send user input to preprocess
                    recognitionResult.PipeTo(preproc.In);

                    // Set accepting flag based on preprocessor output
                    var non_trivial_result = preproc.Out.Where(x => {
                        if (x == null)
                        {
                            //setAccepting();
                            return(false);
                        }
                        else
                        {
                            //setNotAccepting();
                            return(true);
                        }
                    });

                    preproc.AutoResponse.PipeTo(merger.OtherIn);

                    // Send processed user input to Companion and UI
                    non_trivial_result.PipeTo(kqml.In);
                    non_trivial_result.PipeTo(ui.UserInput);
                    non_trivial_result.PipeTo(merger.LastOut);

                    // Get response from Companion and forward to UI and synthesizer
                    kqml.Out.Do(x => Console.WriteLine(x));
                    kqml.Out.PipeTo(merger.In);
                    merger.Out.PipeTo(ui.CompResponse);
                    merger.Out.PipeTo(synthesizer);

                    // When speaking complete, ready to accept more input
                    //synthesizer.SpeakCompleted.Delay(_500ms).Do(x => setAccepting());
                }
                else
                {
                    Console.WriteLine("Status: Not using KQML");

                    recognitionResult.PipeTo(preproc.In);

                    var non_trivial_result = preproc.Out.Where(x => {
                        if (x == null)
                        {
                            setAccepting();
                            return(false);
                        }
                        else
                        {
                            setNotAccepting();
                            return(true);
                        }
                    });
                    non_trivial_result.PipeTo(ui.UserInput);
                    // Crudely delay the response by blocking inside the Select before forwarding it
                    var delayed = non_trivial_result.Select(result =>
                    {
                        Thread.Sleep(3000);
                        return result;
                    });
                    delayed.PipeTo(ui.CompResponse);
                    delayed.PipeTo(synthesizer);
                    synthesizer.SpeakCompleted.Do(x => setAccepting());
                }

                // Setup psi studio visualizations
                //SetupDataStore(pipeline, @"..\..\..\Videos\" + AppName, "", true, kinectSensor, faceTracker, finalResults);

                // Register an event handler to be notified when the pipeline completes
                pipeline.PipelineCompletionEvent += PipelineCompletionEvent;

                // Run the pipeline
                pipeline.RunAsync();

                Console.WriteLine("Press any key to exit...");
                Console.ReadKey(true);
            }
        }
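
        // Sketches of the helper state referenced in Main; the originals are not shown in this
        // listing. These assume a simple "accepting input" flag plus the join/delay tolerances
        // used above (InitTimer is omitted here because its behavior is not shown).
        private static readonly TimeSpan _300ms = TimeSpan.FromMilliseconds(300);
        private static readonly TimeSpan _500ms = TimeSpan.FromMilliseconds(500);
        private static bool isAccepting = true;

        private static void setAccepting()
        {
            // Allow new user input to flow to the recognizer
            isAccepting = true;
        }

        private static void setNotAccepting()
        {
            // Block new user input while the kiosk is responding
            isAccepting = false;
        }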