/// <summary>
/// Learns an Extraction.Text region program from a single example, round-trips it
/// through serialization, and runs the deserialized copy on a new input.
/// </summary>
private static void SerializeProgram()
{
    var learningSession = new RegionSession();
    StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

    // Single example: "Carrie Dodson 100" => "Dodson" (positions 7 to 13).
    var example = new RegionExample(trainingText, trainingText.Slice(7, 13));
    learningSession.AddConstraints(example);

    RegionProgram learned = learningSession.Learn();
    if (learned == null)
    {
        Console.Error.WriteLine("Error: Learning fails!");
        return;
    }

    // Round trip: program -> string -> program.
    string programText = learned.Serialize();
    RegionProgram restored = Loader.Instance.Region.Load(programText);

    // The restored program is expected to pick "Robledo" out of this input.
    StringRegion probe = RegionSession.CreateStringRegion("Leonard Robledo 75");
    StringRegion extracted = restored.Run(probe);
    if (extracted == null)
    {
        Console.Error.WriteLine("Error: Extracting fails!");
        return;
    }

    Console.WriteLine("\"{0}\" => \"{1}\"", probe, extracted);
}
/// <summary>
/// Learns a single-region extraction program from one example and applies it to a new input.
/// </summary>
private static void LearnRegion()
{
    var learningSession = new RegionSession();
    StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

    // One region is extracted from one file, so a single example is supplied.
    // A position is a location between two characters, with 0 at the beginning of the file;
    // an example is pinned down by its start and end positions.
    StringRegion wanted = trainingText.Slice(7, 13); // "Dodson"
    learningSession.AddConstraints(new RegionExample(trainingText, wanted));

    RegionProgram learned = learningSession.Learn();
    if (learned == null)
    {
        Console.Error.WriteLine("Error: Learning fails!");
        return;
    }

    // The learned program is expected to pick "Robledo" out of this input.
    StringRegion probe = RegionSession.CreateStringRegion("Leonard Robledo 75");
    StringRegion extracted = learned.Run(probe);
    if (extracted == null)
    {
        Console.Error.WriteLine("Error: Extracting fails!");
        return;
    }

    Console.WriteLine("\"{0}\" => \"{1}\"", probe, extracted);
}
/// <summary>
/// Learns the three top-ranked region programs for one example and prints what each
/// of them extracts from a few inputs.
/// Demonstrates access to lower-ranked programs.
/// </summary>
private static void LearnTop3RegionPrograms()
{
    var learningSession = new RegionSession();
    StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

    // Positions 14 to 17 of "Carrie Dodson 100" cover the trailing "100".
    StringRegion wanted = trainingText.Slice(14, 17);
    learningSession.AddConstraints(new RegionExample(trainingText, wanted));

    IEnumerable<RegionProgram> bestThree = learningSession.LearnTopK(3);

    var rank = 0;
    StringRegion[] probes =
    {
        trainingText,
        RegionSession.CreateStringRegion("Leonard Robledo NA"),
        RegionSession.CreateStringRegion("Margaret Cook 320")
    };

    foreach (RegionProgram candidate in bestThree)
    {
        rank++;
        Console.WriteLine("Program {0}:", rank);

        foreach (StringRegion probe in probes)
        {
            var result = candidate.Run(probe);
            if (result != null)
            {
                Console.WriteLine(result.Value);
            }
            else
            {
                Console.WriteLine("null");
            }
        }
    }
}
/// <summary>
/// Learns the full set of region programs consistent with the example (advanced feature),
/// then ranks that set and prints what the three best programs extract from a few inputs.
/// Demonstrates access to the entire program set.
/// </summary>
private static void LearnAllRegionPrograms()
{
    var learningSession = new RegionSession();
    StringRegion trainingText = RegionSession.CreateStringRegion("Carrie Dodson 100");

    // Positions 14 to 17 of "Carrie Dodson 100" cover the trailing "100".
    StringRegion wanted = trainingText.Slice(14, 17);
    learningSession.AddConstraints(new RegionExample(trainingText, wanted));

    // Take the whole program set and rank it with the learner's score feature.
    ProgramSet everyProgram = learningSession.LearnAll().ProgramSet;
    IEnumerable<ProgramNode> bestNodes = everyProgram.TopK(RegionLearner.Instance.ScoreFeature, 3);

    var rank = 0;
    StringRegion[] probes =
    {
        trainingText,
        RegionSession.CreateStringRegion("Leonard Robledo NA"),
        RegionSession.CreateStringRegion("Margaret Cook 320")
    };

    foreach (ProgramNode node in bestNodes)
    {
        rank++;
        Console.WriteLine("Program {0}:", rank);

        // A raw program node has to be wrapped in a RegionProgram before it can be run.
        var runnable = new RegionProgram(node, ReferenceKind.Parent);
        foreach (StringRegion probe in probes)
        {
            StringRegion result = runnable.Run(probe);
            if (result == null)
            {
                Console.WriteLine("null");
            }
            else
            {
                Console.WriteLine(result.Value);
            }
        }
    }
}
/// <summary>
/// Learns a program that extracts a region by using another region that appears after it
/// as the reference (a succeeding sibling region).
/// Demonstrates how sibling referencing works.
/// </summary>
private static void LearnRegionReferencingSucceedingSibling()
{
    var learningSession = new RegionSession();
    StringRegion document = RegionSession.CreateStringRegion("Carrie Dodson 100\nLeonard Robledo 75\nMargaret Cook 320");

    // One record per line, plus the number at the end of each record.
    StringRegion[] records = { document.Slice(0, 17), document.Slice(18, 36), document.Slice(37, 54) };
    StringRegion[] numbers = { document.Slice(14, 17), document.Slice(34, 36), document.Slice(51, 54) };

    // Goal: extract the first name, using the number as the reference region.
    // NOTE(review): the Slice positions below appear to be relative to the whole document
    // (records[1] starts at 18 and is sliced with 18..25) - confirm against the SDK.
    var firstExample = new RegionExample(numbers[0], records[0].Slice(0, 6));    // reference "100" => "Carrie"
    var secondExample = new RegionExample(numbers[1], records[1].Slice(18, 25)); // reference "75" => "Leonard"
    learningSession.AddConstraints(firstExample, secondExample);

    RegionProgram learned = learningSession.Learn();
    if (learned == null)
    {
        Console.Error.WriteLine("Error: Learning fails!");
        return;
    }

    for (int k = 0; k < numbers.Length; k++)
    {
        StringRegion reference = numbers[k];
        StringRegion hit = learned.Run(reference);
        string shown = hit?.Value ?? "null";
        Console.WriteLine("\"{0}\" => \"{1}\"", reference, shown);
    }
}
/// <summary>
/// Learns a region program from one example while also supplying unlabeled records
/// as extra inputs to help the learner pick the intended program.
/// Demonstrates the use of additional references.
/// </summary>
private static void LearnRegionWithAdditionalReferences()
{
    var learningSession = new RegionSession();
    StringRegion document = RegionSession.CreateStringRegion("Carrie Dodson 100\nLeonard Robledo 75\nMargaret Cook ***");
    StringRegion[] records = { document.Slice(0, 17), document.Slice(18, 36), document.Slice(37, 54) };

    // Goal: extract "100", "75", and "***".
    StringRegion firstRecord = records[0];
    learningSession.AddConstraints(new RegionExample(firstRecord, firstRecord.Slice(14, 17))); // "Carrie Dodson 100" => "100"

    // Additional references let Extraction.Text observe how candidate programs behave on unseen data.
    // Without them it may learn a program that extracts the first number; with them it can see that
    // such a program does not apply to the third record "Margaret Cook ***" and promote a more
    // applicable program instead.
    learningSession.AddInputs(records.Skip(1));

    RegionProgram learned = learningSession.Learn();
    if (learned == null)
    {
        Console.Error.WriteLine("Error: Learning fails!");
        return;
    }

    for (int k = 0; k < records.Length; k++)
    {
        StringRegion record = records[k];
        StringRegion hit = learned.Run(record);
        string shown = hit?.Value ?? "null";
        Console.WriteLine("\"{0}\" => \"{1}\"", record, shown);
    }
}
/// <summary>
/// Learns a region program from one positive example and one negative example.
/// Demonstrates the use of negative examples.
/// </summary>
private static void LearnRegionWithNegativeExamples()
{
    var learningSession = new RegionSession();
    StringRegion document = RegionSession.CreateStringRegion("Carrie Dodson 100\nLeonard Robledo NA\nMargaret Cook 320");
    StringRegion[] records = { document.Slice(0, 17), document.Slice(18, 36), document.Slice(37, 54) };

    // Goal: extract "100" and "320", and nothing from the "NA" record.
    var positive = new RegionExample(records[0], records[0].Slice(14, 17)); // "Carrie Dodson 100" => "100"
    var negative = new RegionNegativeExample(records[1], records[1]);       // no extraction in "Leonard Robledo NA"
    learningSession.AddConstraints(positive, negative);

    // Extraction.Text will find a program whose output does not OVERLAP with any of the negative examples.
    RegionProgram learned = learningSession.Learn();
    if (learned == null)
    {
        Console.Error.WriteLine("Error: Learning fails!");
        return;
    }

    for (int k = 0; k < records.Length; k++)
    {
        StringRegion record = records[k];
        StringRegion hit = learned.Run(record);
        string shown = hit?.Value ?? "null";
        Console.WriteLine("\"{0}\" => \"{1}\"", record, shown);
    }
}
/// <summary>
/// Learns a single-region program from two examples that come from two different files.
/// Learning across files works the same way as learning from several examples in one file.
/// Demonstrates how to learn with examples from different files.
/// </summary>
private static void LearnRegionUsingMultipleFiles()
{
    var learningSession = new RegionSession();
    StringRegion firstFile = RegionSession.CreateStringRegion("Carrie Dodson 100");
    StringRegion secondFile = RegionSession.CreateStringRegion("Leonard Robledo 75");

    var firstExample = new RegionExample(firstFile, firstFile.Slice(7, 13));     // "Carrie Dodson 100" => "Dodson"
    var secondExample = new RegionExample(secondFile, secondFile.Slice(8, 15)); // "Leonard Robledo 75" => "Robledo"
    learningSession.AddConstraints(firstExample, secondExample);

    RegionProgram learned = learningSession.Learn();
    if (learned == null)
    {
        Console.Error.WriteLine("Error: Learning fails!");
        return;
    }

    // The learned program is expected to pick "Cook" out of this input.
    StringRegion probe = RegionSession.CreateStringRegion("Margaret Cook 320");
    StringRegion extracted = learned.Run(probe);
    if (extracted == null)
    {
        Console.Error.WriteLine("Error: Extracting fails!");
        return;
    }

    Console.WriteLine("\"{0}\" => \"{1}\"", probe, extracted);
}
/// <summary>
/// Learns a region-extraction program from labeled examples and wraps it in a
/// <see cref="StructureExtractor"/>.
/// </summary>
/// <param name="examples">
/// Labeled examples as (text, start, end) tuples. The text becomes the input region and
/// start/end are passed to <c>Slice</c> to mark the field to extract. Must be non-null and non-empty.
/// </param>
/// <param name="noneLabeledExamples">
/// Optional unlabeled texts; when present and non-empty they are added to the session as extra inputs.
/// </param>
/// <returns>A <see cref="StructureExtractor"/> built from the learned program.</returns>
/// <exception cref="AggregateException"><paramref name="examples"/> is null or empty.</exception>
/// <exception cref="Exception">Learning produced no program.</exception>
public static async Task<StructureExtractor> TrainExtractorAsync(IEnumerable<Tuple<string, uint, uint>> examples, IEnumerable<string> noneLabeledExamples = null)
{
    // Materialize once: the sequence is both tested for emptiness and iterated, and
    // enumerating a lazy or one-shot IEnumerable twice can repeat work or yield different items.
    List<Tuple<string, uint, uint>> labeled = examples?.ToList();
    if (labeled == null || labeled.Count == 0)
    {
        // Exception type kept as-is so existing callers that catch it keep working.
        throw new AggregateException($"{nameof(examples)} must not be null or empty");
    }

    var regionSession = new RegionSession();
    foreach (var example in labeled)
    {
        var stringRegion = new StringRegion(example.Item1, Semantics.Tokens);
        var field = stringRegion.Slice(example.Item2, example.Item3);
        regionSession.AddConstraints(new RegionExample(stringRegion, field));
    }

    // Same single-enumeration treatment for the optional unlabeled inputs.
    List<string> unlabeled = noneLabeledExamples?.ToList();
    if (unlabeled != null && unlabeled.Count > 0)
    {
        regionSession.AddInputs(unlabeled.Select(e => new StringRegion(e, Semantics.Tokens)));
    }

    var program = await regionSession.LearnAsync();
    if (null == program)
    {
        throw new Exception("No program found");
    }

    return new StructureExtractor(program);
}
/// <summary>
/// Learns a region-extraction program from the given examples and returns it in serialized form.
/// Only the first selection of each example is used, so at most one region example is
/// contributed per input text; examples without a selection are skipped.
/// </summary>
/// <param name="textExtractExamples">Texts together with their selected regions.</param>
/// <returns>The learned program as produced by its <c>Serialize()</c> method.</returns>
/// <exception cref="ArgumentNullException"><paramref name="textExtractExamples"/> is null.</exception>
/// <exception cref="InvalidOperationException">No program could be learned from the examples.</exception>
public string LearnSingle(List<TextExtractExample> textExtractExamples)
{
    if (textExtractExamples == null)
    {
        throw new ArgumentNullException(nameof(textExtractExamples));
    }

    var session = new RegionSession();
    var regionExamples = new List<RegionExample>();
    foreach (var textExtractExample in textExtractExamples)
    {
        var inputRegion = RegionSession.CreateStringRegion(textExtractExample.text);

        // FirstOrDefault rather than First: an example with no selections must be skipped,
        // not abort the whole call with an exception.
        var textExtractSelection = textExtractExample.selections?.FirstOrDefault();

        // at most only one example is added per string region
        if (textExtractSelection != null)
        {
            var exampleRegion = inputRegion.Slice((uint)textExtractSelection.startPos, (uint)textExtractSelection.endPos);
            regionExamples.Add(new RegionExample(inputRegion, exampleRegion));
        }
    }

    session.AddConstraints(regionExamples);

    var program = session.Learn();
    if (program == null)
    {
        // Learn() can come back empty; fail with a clear message instead of a NullReferenceException.
        throw new InvalidOperationException("No program found");
    }

    return program.Serialize();
}
/// <summary>
/// Learns one extraction program per field, using the first two files as training documents,
/// then runs every learned program on the remaining files and writes the results to
/// <c>..\..\output.txt</c>.
/// Learning multiple regions is the same as learning a single region, repeated once per field.
/// </summary>
/// <param name="paths">Paths of the input files; the first two are used for training, the rest for testing.</param>
/// <param name="regionsToLearn">
/// For each training document (same order as <paramref name="paths"/>), a map from field name to the
/// text of that field in the document. The field names are taken from the first entry.
/// </param>
private static void LearnMultipleRegionsUsingMultipleFiles(List<string> paths, List<Dictionary<string, string>> regionsToLearn)
{
    List<StringRegion> inputs = new List<StringRegion>();
    foreach (string path in paths)
    {
        inputs.Add(RegionSession.CreateStringRegion(File.ReadAllText(path)));
    }

    int trainingDocumentCount = 2;
    List<string> fieldsToLearn = regionsToLearn[0].Keys.ToList();

    // One learning session per field, each fed one example per training document.
    Dictionary<string, RegionSession> sessionPerField = new Dictionary<string, RegionSession>();
    foreach (string field in fieldsToLearn)
    {
        RegionSession session = new RegionSession();
        bool allExamplesLocated = true;
        for (int i = 0; i < trainingDocumentCount; i++)
        {
            string output = regionsToLearn[i][field];

            // The labeled text may be absent from the document; check before reading the
            // position instead of failing on an empty result.
            var located = inputs[i].IndexOfRelative(output);
            if (!located.HasValue)
            {
                Console.Error.WriteLine("Error: Example text not found in training document " + i + " for Field : " + field);
                allExamplesLocated = false;
                break;
            }

            uint start = located.Value;
            uint end = (uint)(start + output.Length);
            RegionExample example = new RegionExample(inputs[i], inputs[i].Slice(start, end));
            session.AddConstraints(example);
        }

        // A field with a missing example is skipped rather than learned from partial data.
        if (allExamplesLocated)
        {
            sessionPerField.Add(field, session);
        }
    }

    Dictionary<string, RegionProgram> programPerField = new Dictionary<string, RegionProgram>();
    foreach (var fieldSessionPair in sessionPerField)
    {
        RegionProgram program = fieldSessionPair.Value.Learn();
        if (program == null)
        {
            Console.Error.WriteLine("Error: Learning fails for Field : " + fieldSessionPair.Key);
        }
        else
        {
            programPerField.Add(fieldSessionPair.Key, program);
        }
    }

    // Testing: run each learned program on the non-training documents.
    // The using block flushes and closes the writer even if a run or write throws.
    using (StreamWriter outputWriter = new StreamWriter(@"..\..\output.txt"))
    {
        outputWriter.WriteLine(string.Join("\t|\t", programPerField.Keys));
        for (int i = trainingDocumentCount; i < inputs.Count; i++)
        {
            List<string> values = new List<string>();
            foreach (var fieldProgramPair in programPerField)
            {
                // A null entry (nothing extracted) is written as an empty column by string.Join.
                string value = fieldProgramPair.Value.Run(inputs[i])?.Value;
                values.Add(value);
            }

            outputWriter.WriteLine(string.Join("\t|\t\t", values));
        }
    }
}