/// <summary>
/// Binarizes every tree of <paramref name="treebank"/> and collects the results in a list.
/// </summary>
/// <remarks>
/// The treebank is first run through a composite transformer (a simple tree binarizer followed
/// by a basic-category transformer). Each resulting tree is then converted to core labels,
/// gets head annotations percolated using a <c>BinaryHeadFinder</c>, and has its leaves
/// indexed starting at 1.
/// </remarks>
/// <param name="treebank">The treebank whose trees are to be binarized.</param>
/// <param name="op">Options providing the head finder and the language pack.</param>
/// <returns>The processed trees, in the iteration order of the transformed treebank.</returns>
public static IList<Tree> BinarizeTreebank(Treebank treebank, Options op) {
    // Assemble the transformation pipeline: binarize first, then reduce to basic categories.
    TreeBinarizer treeBinarizer = TreeBinarizer.SimpleTreeBinarizer(op.tlpParams.HeadFinder(), op.tlpParams.TreebankLanguagePack());
    BasicCategoryTreeTransformer basicCategories = new BasicCategoryTreeTransformer(op.Langpack());
    CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
    pipeline.AddTransformer(treeBinarizer);
    pipeline.AddTransformer(basicCategories);

    Treebank transformed = treebank.Transform(pipeline);

    IHeadFinder headFinder = new BinaryHeadFinder(op.tlpParams.HeadFinder());
    IList<Tree> result = Generics.NewArrayList();
    foreach (Tree current in transformed) {
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(current);
        current.PercolateHeadAnnotations(headFinder);

        // Leaves are indexed from 1 because downstream tools expect 1-based indices;
        // uses internal to the srparser have to renormalize the indices themselves.
        current.IndexLeaves(1, true);

        result.Add(current);
    }
    return result;
}
/// <summary>
/// Builds the training tree transformer by delegating to
/// <c>LexicalizedParser.BuildTrainTransformer</c>.
/// </summary>
/// <param name="op">Options forwarded unchanged to the delegate.</param>
/// <returns>The transformer produced by <c>LexicalizedParser.BuildTrainTransformer</c>.</returns>
public static ITreeTransformer BuildTrainTransformer(Options op) {
    return LexicalizedParser.BuildTrainTransformer(op);
}
/// <summary>
/// Annotates and binarizes the training, secondary and tuning treebanks using one shared
/// transformer pipeline.
/// </summary>
/// <remarks>
/// The pipeline consists of, in order: the optional pre-transformer, the optional
/// <c>CollinsPuncTransformer</c>, a <c>TreeAnnotatorAndBinarizer</c>, and the optional
/// word-function leaf transformer. Before the treebanks are transformed, the selective-split
/// options may overwrite <c>op.trainOptions.splitters</c> and
/// <c>op.trainOptions.postSplitters</c>, and the hSelSplit option triggers a preliminary
/// counting pass over the training data.
/// </remarks>
/// <param name="trainTreebank">Primary training treebank; it is dereferenced unconditionally.</param>
/// <param name="secondaryTreebank">Optional secondary training treebank; may be null.</param>
/// <param name="tuneTreebank">Optional tuning treebank; may be null.</param>
/// <param name="op">Parser options; read throughout and mutated as described in the remarks.</param>
/// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.
/// The second and third elements are null when the corresponding input was null.</returns>
public static Triple<Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op) {
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();

    if (op.testOptions.verbose) {
        PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
        pwErr.Print("Training ");
        pwErr.Println(trainTreebank.TextualSummary(tlp));
        if (secondaryTreebank != null) {
            pwErr.Print("Secondary training ");
            pwErr.Println(secondaryTreebank.TextualSummary(tlp));
        }
    }

    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null) {
        trainTransformer.AddTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc) {
        CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
        trainTransformer.AddTransformer(collinsPuncTransformer);
    }

    log.Info("Binarizing trees...");
    Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    } else {
        // Left-to-right mode additionally supplies a LeftHeadFinder to the binarizer.
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    trainTransformer.AddTransformer(binarizer);

    if (op.wordFunction != null) {
        ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
        trainTransformer.AddTransformer(wordFunctionTransformer);
    }

    // The split statistics below are gathered over train + secondary data together.
    Treebank wholeTreebank;
    if (secondaryTreebank == null) {
        wholeTreebank = trainTreebank;
    } else {
        wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }

    if (op.trainOptions.selectiveSplit) {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp, op);
        if (op.testOptions.verbose) {
            // Declared as List<string> so that Sort is available; ordinal comparison keeps
            // the logged order independent of the current culture.
            List<string> sortedSplitters = new List<string>(op.trainOptions.splitters);
            sortedSplitters.Sort(System.StringComparer.Ordinal);
            // Concatenating a List<string> directly would log its type name, so the
            // elements are joined explicitly.
            log.Info("Parent split categories: [" + string.Join(", ", sortedSplitters) + "]");
        }
    }

    if (op.trainOptions.selectivePostSplit) {
        // Do all the transformations once just to learn selective splits on annotated categories
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        wholeTreebank = wholeTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
        if (op.testOptions.verbose) {
            // Join the elements explicitly so the categories themselves are logged.
            log.Info("Parent post annotation split categories: [" + string.Join(", ", op.trainOptions.postSplitters) + "]");
        }
    }

    if (op.trainOptions.hSelSplit) {
        // We run through all the trees once just to gather counts for hSelSplit!
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        try {
            foreach (Tree tree in wholeTreebank) {
                trainTransformer.TransformTree(tree);
            }
        } finally {
            // Restore the temporarily overridden settings even if a transform throws, so
            // the caller's options and the binarizer are not left in the counting state.
            binarizer.SetDoSelectiveSplit(true);
            op.trainOptions.printTreeTransformations = ptt;
        }
    }

    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.Transform(trainTransformer);
    if (secondaryTreebank != null) {
        secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
    }

    if (op.trainOptions.printAnnotatedStateCounts) {
        binarizer.PrintStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts) {
        binarizer.PrintRuleCounts();
    }

    // The tune treebank is transformed only after the optional count printing above.
    if (tuneTreebank != null) {
        tuneTreebank = tuneTreebank.Transform(trainTransformer);
    }

    if (op.testOptions.verbose) {
        binarizer.DumpStats();
    }

    return new Triple<Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank);
}