public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op) { this.trainOptions = op.trainOptions; if (doSubcategorization) { annotator = new TreeAnnotator(annotationHF, tlpParams, op); } else { annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF); } binarizer = new TreeBinarizer(binarizationHF, tlpParams.TreebankLanguagePack(), insideFactor, trainOptions.markovFactor, trainOptions.markovOrder, trainOptions.CompactGrammar() > 0, trainOptions.CompactGrammar() > 1, trainOptions.HselCut, trainOptions .markFinalStates, trainOptions.simpleBinarizedLabels, trainOptions.noRebinarization); if (trainOptions.selectivePostSplit) { postSplitter = new PostSplitter(tlpParams, op); } else { postSplitter = null; } this.tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory()); this.tlp = tlpParams.TreebankLanguagePack(); this.forceCNF = forceCNF; if (trainOptions.printAnnotatedRuleCounts) { annotatedRuleCounts = new ClassicCounter <Tree>(); } else { annotatedRuleCounts = null; } if (trainOptions.printAnnotatedStateCounts) { annotatedStateCounts = new ClassicCounter <string>(); } else { annotatedStateCounts = null; } }
/// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns> public static Triple <Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op) { // setup tree transforms ITreebankLangParserParams tlpParams = op.tlpParams; ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack(); if (op.testOptions.verbose) { PrintWriter pwErr = tlpParams.Pw(System.Console.Error); pwErr.Print("Training "); pwErr.Println(trainTreebank.TextualSummary(tlp)); if (secondaryTreebank != null) { pwErr.Print("Secondary training "); pwErr.Println(secondaryTreebank.TextualSummary(tlp)); } } CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer(); if (op.trainOptions.preTransformer != null) { trainTransformer.AddTransformer(op.trainOptions.preTransformer); } if (op.trainOptions.collinsPunc) { CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp); trainTransformer.AddTransformer(collinsPuncTransformer); } log.Info("Binarizing trees..."); Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer; if (!op.trainOptions.leftToRight) { binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op); } else { binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op); } trainTransformer.AddTransformer(binarizer); if (op.wordFunction != null) { ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction); trainTransformer.AddTransformer(wordFunctionTransformer); } Treebank wholeTreebank; if (secondaryTreebank == null) { wholeTreebank = trainTreebank; } else { wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank); } if (op.trainOptions.selectiveSplit) { op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp); RemoveDeleteSplittersFromSplitters(tlp, op); if (op.testOptions.verbose) { IList <string> list = new List <string>(op.trainOptions.splitters); list.Sort(); log.Info("Parent split categories: " + list); } } if (op.trainOptions.selectivePostSplit) { // Do all the transformations once just to learn selective splits on annotated categories ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op); wholeTreebank = wholeTreebank.Transform(myTransformer); op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp); if (op.testOptions.verbose) { log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters); } } if (op.trainOptions.hSelSplit) { // We run through all the trees once just to gather counts for hSelSplit! int ptt = op.trainOptions.printTreeTransformations; op.trainOptions.printTreeTransformations = 0; binarizer.SetDoSelectiveSplit(false); foreach (Tree tree in wholeTreebank) { trainTransformer.TransformTree(tree); } binarizer.SetDoSelectiveSplit(true); op.trainOptions.printTreeTransformations = ptt; } // we've done all the setup now. here's where the train treebank is transformed. trainTreebank = trainTreebank.Transform(trainTransformer); if (secondaryTreebank != null) { secondaryTreebank = secondaryTreebank.Transform(trainTransformer); } if (op.trainOptions.printAnnotatedStateCounts) { binarizer.PrintStateCounts(); } if (op.trainOptions.printAnnotatedRuleCounts) { binarizer.PrintRuleCounts(); } if (tuneTreebank != null) { tuneTreebank = tuneTreebank.Transform(trainTransformer); } if (op.testOptions.verbose) { binarizer.DumpStats(); } return(new Triple <Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank)); }