Пример #1
0
        /// <summary>
        ///     Learns an Extraction.Text program from a single example, round-trips it through
        ///     serialization, and runs the deserialized copy on a new input.
        /// </summary>
        private static void SerializeProgram()
        {
            var learner = new RegionSession();
            StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

            // Training example: "Carrie Dodson 100" => "Dodson" (positions 7 to 13).
            StringRegion expected = trainingText.Slice(7, 13);
            learner.AddConstraints(new RegionExample(trainingText, expected));

            RegionProgram learned = learner.Learn();
            if (learned == null)
            {
                Console.Error.WriteLine("Error: Learning fails!");
                return;
            }

            // Serialize the learned program and load it back before running it.
            RegionProgram restored = Loader.Instance.Region.Load(learned.Serialize());

            // The restored program is expected to extract "Robledo" here.
            StringRegion probe = RegionSession.CreateStringRegion("Leonard Robledo 75");
            StringRegion extracted = restored.Run(probe);
            if (extracted == null)
            {
                Console.Error.WriteLine("Error: Extracting fails!");
                return;
            }

            Console.WriteLine("\"{0}\" => \"{1}\"", probe, extracted);
        }
Пример #2
0
        /// <summary>
        ///     Learns a program that extracts one region from a file, then applies it to a new input.
        /// </summary>
        private static void LearnRegion()
        {
            var learner = new RegionSession();
            StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

            // A single example suffices because one region is extracted from one file.
            // A position is a location between two characters, counted from 0 at the start of the file,
            // and an example is given by its start and end positions.
            // Here: "Carrie Dodson 100" => "Dodson".
            StringRegion expected = trainingText.Slice(7, 13);
            learner.AddConstraints(new RegionExample(trainingText, expected));

            RegionProgram learned = learner.Learn();
            if (learned == null)
            {
                Console.Error.WriteLine("Error: Learning fails!");
                return;
            }

            // The learned program is expected to extract "Robledo" here.
            StringRegion probe = RegionSession.CreateStringRegion("Leonard Robledo 75");
            StringRegion extracted = learned.Run(probe);
            if (extracted == null)
            {
                Console.Error.WriteLine("Error: Extracting fails!");
                return;
            }

            Console.WriteLine("\"{0}\" => \"{1}\"", probe, extracted);
        }
Пример #3
0
        /// <summary>
        ///     Learns the three top-ranked region programs and prints what each one extracts
        ///     from a few inputs, showing how to reach lower-ranked programs.
        /// </summary>
        private static void LearnTop3RegionPrograms()
        {
            var learner = new RegionSession();
            StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

            // Training example: "Carrie Dodson 100" => "100" (positions 14 to 17).
            StringRegion expected = trainingText.Slice(14, 17);
            learner.AddConstraints(new RegionExample(trainingText, expected));

            IEnumerable<RegionProgram> candidates = learner.LearnTopK(3);

            int rank = 0;

            // The training input plus two inputs the programs were not trained on.
            StringRegion[] probes =
            {
                trainingText,
                RegionSession.CreateStringRegion("Leonard Robledo NA"),
                RegionSession.CreateStringRegion("Margaret Cook 320")
            };

            foreach (RegionProgram candidate in candidates)
            {
                rank++;
                Console.WriteLine("Program {0}:", rank);

                foreach (StringRegion probe in probes)
                {
                    StringRegion extracted = candidate.Run(probe);
                    string line = "null";
                    if (extracted != null)
                    {
                        line = extracted.Value;
                    }
                    Console.WriteLine(line);
                }
            }
        }
Пример #4
0
        /// <summary>
        ///     Learns the full set of region programs consistent with the examples (advanced feature),
        ///     then ranks that set and prints what the top three programs extract from a few inputs.
        /// </summary>
        private static void LearnAllRegionPrograms()
        {
            var learner = new RegionSession();
            StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

            // Training example: "Carrie Dodson 100" => "100" (positions 14 to 17).
            StringRegion expected = trainingText.Slice(14, 17);
            learner.AddConstraints(new RegionExample(trainingText, expected));

            // Take the whole program set, then pick the three best by the learner's score feature.
            ProgramSet everyProgram = learner.LearnAll().ProgramSet;
            IEnumerable<ProgramNode> bestNodes = everyProgram.TopK(RegionLearner.Instance.ScoreFeature, 3);

            int rank = 0;

            // The training input plus two inputs the programs were not trained on.
            StringRegion[] probes =
            {
                trainingText,
                RegionSession.CreateStringRegion("Leonard Robledo NA"),
                RegionSession.CreateStringRegion("Margaret Cook 320")
            };

            foreach (ProgramNode node in bestNodes)
            {
                rank++;
                Console.WriteLine("Program {0}:", rank);

                // Wrap the raw program node so it can be run on string regions.
                var runnable = new RegionProgram(node, ReferenceKind.Parent);
                foreach (StringRegion probe in probes)
                {
                    StringRegion extracted = runnable.Run(probe);
                    if (extracted == null)
                    {
                        Console.WriteLine("null");
                    }
                    else
                    {
                        Console.WriteLine(extracted.Value);
                    }
                }
            }
        }
Пример #5
0
        /// <summary>
        ///     Learns a program that extracts a region using, as its reference, another region that
        ///     appears after it (a succeeding sibling region). Demonstrates sibling referencing.
        /// </summary>
        private static void LearnRegionReferencingSucceedingSibling()
        {
            var learner = new RegionSession();
            StringRegion document =
                RegionSession.CreateStringRegion("Carrie Dodson 100\nLeonard Robledo 75\nMargaret Cook 320");

            // One region per line of the document, and the trailing number of each line.
            StringRegion[] records = { document.Slice(0, 17), document.Slice(18, 36), document.Slice(37, 54) };
            StringRegion[] numbers = { document.Slice(14, 17), document.Slice(34, 36), document.Slice(51, 54) };

            // Goal: extract the first name with respect to the number that follows it.
            var carrieExample  = new RegionExample(numbers[0], records[0].Slice(0, 6));   // "100" => "Carrie"
            var leonardExample = new RegionExample(numbers[1], records[1].Slice(18, 25)); // "75" => "Leonard"
            learner.AddConstraints(carrieExample, leonardExample);

            RegionProgram learned = learner.Learn();
            if (learned == null)
            {
                Console.Error.WriteLine("Error: Learning fails!");
                return;
            }

            // Run the program on every number, including the one that was not used as an example.
            foreach (StringRegion reference in numbers)
            {
                StringRegion extracted = learned.Run(reference);
                string text = extracted?.Value ?? "null";
                Console.WriteLine("\"{0}\" => \"{1}\"", reference, text);
            }
        }
Пример #6
0
        /// <summary>
        ///     Learns a program to extract a region while supplying extra, unlabeled references
        ///     that help the learner pick the intended program.
        /// </summary>
        private static void LearnRegionWithAdditionalReferences()
        {
            var learner = new RegionSession();
            StringRegion document =
                RegionSession.CreateStringRegion("Carrie Dodson 100\nLeonard Robledo 75\nMargaret Cook ***");

            // One region per line of the document.
            StringRegion[] records = { document.Slice(0, 17), document.Slice(18, 36), document.Slice(37, 54) };

            // Goal: extract "100", "75", and "***".
            // The only labeled example is "Carrie Dodson 100" => "100".
            StringRegion firstRecord = records[0];
            learner.AddConstraints(new RegionExample(firstRecord, firstRecord.Slice(14, 17)));

            // Additional references let Extraction.Text observe how candidate programs behave on unseen data.
            // Without them it may learn a program that extracts the first number; with them it can see that
            // such a program does not apply to the third record "Margaret Cook ***" and promote a more
            // applicable program instead.
            learner.AddInputs(records.Skip(1));

            RegionProgram learned = learner.Learn();
            if (learned == null)
            {
                Console.Error.WriteLine("Error: Learning fails!");
                return;
            }

            foreach (StringRegion record in records)
            {
                StringRegion extracted = learned.Run(record);
                string text = extracted?.Value ?? "null";
                Console.WriteLine("\"{0}\" => \"{1}\"", record, text);
            }
        }
Пример #7
0
        /// <summary>
        ///     Learns a program to extract a region from a mix of positive and negative examples.
        ///     Demonstrates how negative examples are supplied.
        /// </summary>
        private static void LearnRegionWithNegativeExamples()
        {
            var learner = new RegionSession();
            StringRegion document =
                RegionSession.CreateStringRegion("Carrie Dodson 100\nLeonard Robledo NA\nMargaret Cook 320");

            // One region per line of the document.
            StringRegion[] records = { document.Slice(0, 17), document.Slice(18, 36), document.Slice(37, 54) };

            // Goal: extract "100" and "320", and nothing from the "NA" record.
            var wanted   = new RegionExample(records[0], records[0].Slice(14, 17)); // "Carrie Dodson 100" => "100"
            var unwanted = new RegionNegativeExample(records[1], records[1]);       // no extraction in "Leonard Robledo NA"
            learner.AddConstraints(wanted, unwanted);

            // Extraction.Text looks for a program whose output does not OVERLAP with any negative example.
            RegionProgram learned = learner.Learn();
            if (learned == null)
            {
                Console.Error.WriteLine("Error: Learning fails!");
                return;
            }

            foreach (StringRegion record in records)
            {
                StringRegion extracted = learned.Run(record);
                string text = extracted?.Value ?? "null";
                Console.WriteLine("\"{0}\" => \"{1}\"", record, text);
            }
        }
Пример #8
0
        /// <summary>
        ///     Learns a program to extract a single region from two examples that live in two different files.
        ///     Learning across files works like learning from several examples in one file.
        /// </summary>
        private static void LearnRegionUsingMultipleFiles()
        {
            var learner = new RegionSession();
            StringRegion firstFile  = RegionSession.CreateStringRegion("Carrie Dodson 100");
            StringRegion secondFile = RegionSession.CreateStringRegion("Leonard Robledo 75");

            // One example per file, both selecting the second word.
            var firstExample  = new RegionExample(firstFile, firstFile.Slice(7, 13));   // "Carrie Dodson 100" => "Dodson"
            var secondExample = new RegionExample(secondFile, secondFile.Slice(8, 15)); // "Leonard Robledo 75" => "Robledo"
            learner.AddConstraints(firstExample, secondExample);

            RegionProgram learned = learner.Learn();
            if (learned == null)
            {
                Console.Error.WriteLine("Error: Learning fails!");
                return;
            }

            // The learned program is expected to extract "Cook" here.
            StringRegion probe = RegionSession.CreateStringRegion("Margaret Cook 320");
            StringRegion extracted = learned.Run(probe);
            if (extracted == null)
            {
                Console.Error.WriteLine("Error: Extracting fails!");
                return;
            }

            Console.WriteLine("\"{0}\" => \"{1}\"", probe, extracted);
        }
        /// <summary>
        ///     Learns a region-extraction program from labeled examples and wraps it in a
        ///     <see cref="StructureExtractor"/>.
        /// </summary>
        /// <param name="examples">
        ///     Labeled examples. <c>Item1</c> is the text; <c>Item2</c> and <c>Item3</c> are the start and end
        ///     positions passed to <c>Slice</c> to mark the field to extract.
        /// </param>
        /// <param name="noneLabeledExamples">
        ///     Optional unlabeled texts that are added to the session as additional inputs.
        /// </param>
        /// <returns>A <see cref="StructureExtractor"/> built from the learned program.</returns>
        /// <exception cref="AggregateException"><paramref name="examples"/> is null or empty.</exception>
        /// <exception cref="InvalidOperationException">The session did not produce a program.</exception>
        public static async Task<StructureExtractor> TrainExtractorAsync(IEnumerable<Tuple<string, uint, uint>> examples, IEnumerable<string> noneLabeledExamples = null)
        {
            // Materialize once: the sequence is both tested for emptiness and iterated, and a lazy
            // or one-shot source must not be enumerated twice.
            List<Tuple<string, uint, uint>> labeled = examples?.ToList();
            if (labeled == null || labeled.Count == 0)
            {
                // NOTE(review): ArgumentException would be the conventional type for this check;
                // AggregateException is kept so that existing catch blocks keep matching.
                throw new AggregateException($"{nameof(examples)} must not be null or empty");
            }

            var regionSession = new RegionSession();

            foreach (var example in labeled)
            {
                var stringRegion = new StringRegion(example.Item1, Semantics.Tokens);
                var field        = stringRegion.Slice(example.Item2, example.Item3);
                regionSession.AddConstraints(new RegionExample(stringRegion, field));
            }

            // Same single-enumeration rule for the optional unlabeled inputs.
            List<string> unlabeled = noneLabeledExamples?.ToList();
            if (unlabeled != null && unlabeled.Count > 0)
            {
                regionSession.AddInputs(unlabeled.Select(e => new StringRegion(e, Semantics.Tokens)));
            }

            var program = await regionSession.LearnAsync();

            if (null == program)
            {
                // More specific than a bare Exception, and still caught by catch (Exception).
                throw new InvalidOperationException("No program found");
            }

            return new StructureExtractor(program);
        }
        /// <summary>
        ///     Learns a single region-extraction program from the given examples and returns it in serialized form.
        /// </summary>
        /// <param name="textExtractExamples">
        ///     Texts with their selections. Only the first selection of each text is used as an example;
        ///     texts without a selection contribute no example.
        /// </param>
        /// <returns>The serialized learned program.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="textExtractExamples"/> is null.</exception>
        /// <exception cref="System.InvalidOperationException">No program could be learned from the examples.</exception>
        public string LearnSingle(List<TextExtractExample> textExtractExamples)
        {
            if (textExtractExamples == null)
            {
                throw new System.ArgumentNullException("textExtractExamples");
            }

            var session        = new RegionSession();
            var regionExamples = new List<RegionExample>();

            foreach (var textExtractExample in textExtractExamples)
            {
                var inputRegion = RegionSession.CreateStringRegion(textExtractExample.text);

                // At most one example is added per string region. FirstOrDefault (rather than First)
                // lets a text with no selections be skipped by the null check below instead of throwing.
                var textExtractSelection = textExtractExample.selections?.FirstOrDefault();
                if (textExtractSelection != null)
                {
                    var exampleRegion = inputRegion.Slice((uint)textExtractSelection.startPos, (uint)textExtractSelection.endPos);
                    regionExamples.Add(new RegionExample(inputRegion, exampleRegion));
                }
            }

            session.AddConstraints(regionExamples);
            var program = session.Learn();

            // Learn() can yield null when no program satisfies the examples; report that clearly
            // instead of failing with a NullReferenceException on Serialize().
            if (program == null)
            {
                throw new System.InvalidOperationException("Learning failed: no program satisfies the given examples.");
            }

            return program.Serialize();
        }
Пример #11
0
        /// <summary>
        ///     Learns one extraction program per field from examples in the first two files, then runs
        ///     those programs on the remaining files and writes the results to <c>..\..\output.txt</c>.
        ///     Learning multiple regions is the same task as learning a single region, repeated per field.
        /// </summary>
        /// <param name="paths">
        ///     Paths of the input files. The first two are used for training, the rest for testing.
        /// </param>
        /// <param name="regionsToLearn">
        ///     For each training file (same order as <paramref name="paths"/>), a map from field name to the
        ///     text expected to be extracted for that field. The field names are taken from the first map.
        /// </param>
        /// <exception cref="InvalidOperationException">
        ///     An expected field value does not occur in its training file.
        /// </exception>
        private static void LearnMultipleRegionsUsingMultipleFiles(List<string> paths, List<Dictionary<string, string>> regionsToLearn)
        {
            // Number of leading documents that carry labeled examples.
            const int trainingDocumentCount = 2;

            List<StringRegion> inputs = paths
                .Select(path => RegionSession.CreateStringRegion(File.ReadAllText(path)))
                .ToList();

            List<string> fieldsToLearn = regionsToLearn[0].Keys.ToList();
            var sessionPerField = new Dictionary<string, RegionSession>();

            // Build one session per field, with one example from each training document.
            foreach (string field in fieldsToLearn)
            {
                var session = new RegionSession();
                for (int i = 0; i < trainingDocumentCount; i++)
                {
                    string output = regionsToLearn[i][field];

                    // Fail with a message that names the field and file when the expected text is absent,
                    // instead of the opaque error raised by reading .Value on an empty result.
                    var located = inputs[i].IndexOfRelative(output);
                    if (!located.HasValue)
                    {
                        throw new InvalidOperationException(
                            "Expected value for field '" + field + "' was not found in training file: " + paths[i]);
                    }

                    uint start = located.Value;
                    uint end   = (uint)(start + output.Length);

                    session.AddConstraints(new RegionExample(inputs[i], inputs[i].Slice(start, end)));
                }
                sessionPerField.Add(field, session);
            }

            // Learn a program for every field; fields that fail to learn are reported and left out.
            var programPerField = new Dictionary<string, RegionProgram>();
            foreach (var fieldSessionPair in sessionPerField)
            {
                RegionProgram program = fieldSessionPair.Value.Learn();
                if (program == null)
                {
                    Console.Error.WriteLine("Error: Learning fails for Field : " + fieldSessionPair.Key);
                }
                else
                {
                    programPerField.Add(fieldSessionPair.Key, program);
                }
            }

            // Testing: run every learned program on each non-training document.
            // The using block flushes and closes the file even if a program run throws.
            using (var outputWriter = new StreamWriter(@"..\..\output.txt"))
            {
                outputWriter.WriteLine(string.Join("\t|\t", programPerField.Keys));
                for (int i = trainingDocumentCount; i < inputs.Count; i++)
                {
                    StringRegion document = inputs[i];

                    // A program that extracts nothing yields null, which string.Join writes as empty.
                    List<string> values = programPerField.Values
                        .Select(program => program.Run(document)?.Value)
                        .ToList();

                    outputWriter.WriteLine(string.Join("\t|\t\t", values));
                }
            }
        }