/// <summary>
/// Rebuilds <paramref name="tree"/> with basic-category labels, optionally dropping
/// punctuation preterminals, splicing out a leading "TOPP" child and skipping unary roots.
/// </summary>
/// <param name="tree">The node to transform; its children may be modified in place when a TOPP child is spliced out.</param>
/// <returns>
/// A newly built tree, or <c>null</c> when the node is deleted (an ignored punctuation
/// preterminal, or a node all of whose children were deleted).
/// </returns>
public virtual Tree TransformTree(Tree tree)
{
    ILabel l = tree.Label();
    if (tree.IsLeaf())
    {
        return tf.NewLeaf(l);
    }
    string s = l.Value();
    s = tlpp.TreebankLanguagePack().BasicCategory(s);
    if (deletePunct)
    {
        // this is broken as it's not the right thing to do when there
        // is any tag ambiguity -- and there is for ' (POS/''). Sentences
        // can then have more or less words. It's also unnecessary for EVALB,
        // since it ignores punctuation anyway
        if (tree.IsPreTerminal() && tlpp.TreebankLanguagePack().IsEvalBIgnoredPunctuationTag(s))
        {
            return null;
        }
    }
    // TEMPORARY: eliminate the TOPP constituent
    if (tree.Children()[0].Label().Value().Equals("TOPP"))
    {
        log.Info("Found a TOPP");
        tree.SetChildren(tree.Children()[0].Children());
    }
    // Negra has lots of non-unary roots; delete unary roots
    if (tlpp.TreebankLanguagePack().IsStartSymbol(s) && tree.NumChildren() == 1)
    {
        // NB: This deletes the boundary symbol, which is in the tree!
        return TransformTree(tree.GetChild(0));
    }
    IList<Tree> children = new List<Tree>();
    // The child count is captured here, after the TOPP splice above may have replaced the children.
    for (int cNum = 0, numC = tree.NumChildren(); cNum < numC; cNum++)
    {
        Tree child = tree.GetChild(cNum);
        Tree newChild = TransformTree(child);
        if (newChild != null)
        {
            children.Add(newChild);
        }
    }
    if (children.Count == 0)
    {
        return null;
    }
    return tf.NewTreeNode(new StringLabel(s), children);
}
/// <summary>
/// Creates the annotator, reading the parser-params class name from the
/// "<paramref name="annotatorName"/>.tlppClass" property (falling back to
/// <c>DefaultTlppClass</c>) and building a simple tree binarizer from it.
/// </summary>
/// <param name="annotatorName">Prefix used to look up this annotator's properties.</param>
/// <param name="props">Property set holding the configuration.</param>
public BinarizerAnnotator(string annotatorName, Properties props)
{
    string classNameKey = annotatorName + ".tlppClass";
    this.tlppClass = props.GetProperty(classNameKey, DefaultTlppClass);

    // Instantiate the language-specific parser params reflectively from the class name.
    ITreebankLangParserParams langParams = ReflectionLoading.LoadByReflection(tlppClass);

    this.binarizer = TreeBinarizer.SimpleTreeBinarizer(
        langParams.HeadFinder(),
        langParams.TreebankLanguagePack());
}
/// <summary>
/// Command-line entry point: loads the treebank paths given on the command line and prints
/// every tree, either with its top start-symbol bracket removed (option "b") or with a
/// start-symbol bracket added when it is missing.
/// </summary>
/// <param name="args">Option flags followed by the treebank path(s) to load.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(Usage());
        System.Environment.Exit(-1);
    }

    Properties options = StringUtils.ArgsToProperties(args, ArgDefs());
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    ITreebankLangParserParams tlpp = language.@params;

    // NOTE(review): the encoding is read from the same "l" key as the language above, so
    // passing a language also sets the encoding to that language name. This looks like the
    // wrong key; confirm against ArgDefs()/Usage() before changing it.
    string encoding = options.GetProperty("l", "UTF-8");
    bool removeBracket = PropertiesUtils.GetBool(options, "b", false);
    tlpp.SetInputEncoding(encoding);
    tlpp.SetOutputEncoding(encoding);

    DiskTreebank tb = tlpp.DiskTreebank();
    string[] files = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (files.Length != 0)
    {
        foreach (string filename in files)
        {
            tb.LoadPath(filename);
        }
    }
    else
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }

    PrintWriter pwo = tlpp.Pw();
    string startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
    ITreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    foreach (Tree t in tb)
    {
        // A foreach iteration variable is read-only in C#, so work on a local copy.
        Tree outTree = t;
        if (removeBracket)
        {
            if (outTree.Value().Equals(startSymbol))
            {
                outTree = outTree.FirstChild();
            }
        }
        else if (!outTree.Value().Equals(startSymbol))
        {
            //Add a bracket if it isn't already there
            outTree = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(outTree));
        }
        pwo.Println(outTree.ToString());
        nTrees++;
    }
    pwo.Close();
    System.Console.Error.Printf("Processed %d trees.%n", nTrees);
}
/// <summary>
/// Wires up the annotator, binarizer, optional post-splitter and optional rule/state
/// counters according to the supplied training options.
/// </summary>
/// <param name="annotationHF">Head finder handed to the annotator.</param>
/// <param name="binarizationHF">Head finder handed to the binarizer.</param>
/// <param name="tlpParams">Language-specific parser parameters.</param>
/// <param name="forceCNF">Stored on this instance as the <c>forceCNF</c> flag.</param>
/// <param name="insideFactor">Passed through to the binarizer.</param>
/// <param name="doSubcategorization">When true a full <c>TreeAnnotator</c> is used; otherwise a null annotator.</param>
/// <param name="op">Parser options; its <c>trainOptions</c> drive the configuration.</param>
public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
{
    this.trainOptions = op.trainOptions;

    if (!doSubcategorization)
    {
        annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
    }
    else
    {
        annotator = new TreeAnnotator(annotationHF, tlpParams, op);
    }

    binarizer = new TreeBinarizer(
        binarizationHF,
        tlpParams.TreebankLanguagePack(),
        insideFactor,
        trainOptions.markovFactor,
        trainOptions.markovOrder,
        trainOptions.CompactGrammar() > 0,
        trainOptions.CompactGrammar() > 1,
        trainOptions.HselCut,
        trainOptions.markFinalStates,
        trainOptions.simpleBinarizedLabels,
        trainOptions.noRebinarization);

    // The post-splitter exists only when selective post-splitting is requested.
    postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;

    this.tf = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
    this.tlp = tlpParams.TreebankLanguagePack();
    this.forceCNF = forceCNF;

    // Counters are allocated only when the corresponding print option is on.
    annotatedRuleCounts = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter<Tree>() : null;
    annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter<string>() : null;
}
/// <summary>
/// Builds a dependency grammar with an explicit tag projection, initialising the
/// argument/stop counters and the smoothing hyperparameters supplied by
/// <paramref name="tlpParams"/>.
/// </summary>
public MLEDependencyGrammar(ITagProjection tagProjection, ITreebankLangParserParams tlpParams, bool directional, bool useDistance, bool useCoarseDistance, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : base(tlpParams.TreebankLanguagePack(), tagProjection, directional, useDistance, useCoarseDistance, op, wordIndex, tagIndex)
{
    // Notes carried over with the field declarations:
    //   reduced tag space
    //   public double distanceDecay = 0.0;
    //   extra smoothing hyperparameters for tag projection backoff. Only used if useSmoothTagProjection is true.
    //   back off Bayesian m-estimate of aTW given aT to aPTW given aPT
    //   back off Bayesian m-estimate of aTW_hTd to aPTW_hPTd (?? guessed, not tuned)
    //   back off Bayesian m-estimate of aT_hTd to aPT_hPTd (?? guessed, not tuned)
    //   back off word prediction from tag to projected tag (only used if useUnigramWordSmoothing is true)
    this.useSmoothTagProjection = op.useSmoothTagProjection;
    this.useUnigramWordSmoothing = op.useUnigramWordSmoothing;

    this.argCounter = new ClassicCounter<IntDependency>();
    this.stopCounter = new ClassicCounter<IntDependency>();

    // The language pack supplies four smoothing values, consumed positionally.
    double[] hyperParams = tlpParams.MLEDependencyGrammarSmoothingParams();
    this.smooth_aT_hTWd = hyperParams[0];
    this.smooth_aTW_hTWd = hyperParams[1];
    this.smooth_stop = hyperParams[2];
    this.interp = hyperParams[3];

    // cdm added Jan 2007 to play with dep grammar smoothing. Integrate this better if we keep it!
    this.smoothTP = new BasicCategoryTagProjection(tlpParams.TreebankLanguagePack());
}
/// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
/// <remarks>
/// This main method doesn't yet handle as many flags as one would like.
/// The flags handled here are:
/// -tlp TreebankLanguagePack, -tlpp TreebankLangParserParams, -insideFactor,
/// -markovOrder, -simpleLabels and -noRebinarization.
/// Every tree in the treebank is printed in its original and binarized form.
/// </remarks>
/// <param name="args">
/// Command line arguments: flags as above, followed by treebankPath
/// </param>
public static void Main(string[] args)
{
    ITreebankLangParserParams tlpp = null;
    // TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    // TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
    // Looks like it must build CategoryWordTagFactory!!
    ITreeReaderFactory trf = null;
    string fileExt = "mrg";
    IHeadFinder hf = new ModCollinsHeadFinder();
    ITreebankLanguagePack tlp = new PennTreebankLanguagePack();
    bool insideFactor = false;
    bool mf = false;
    int mo = 1;
    bool uwl = false;
    bool uat = false;
    double sst = 20.0;
    bool mfs = false;
    bool simpleLabels = false;
    bool noRebinarization = false;

    int i = 0;
    while (i < args.Length && args[i].StartsWith("-"))
    {
        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlp") && i + 1 < args.Length)
        {
            try
            {
                tlp = (ITreebankLanguagePack)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + args[i + 1]);
                // Wrap with a message and keep the cause as the inner exception.
                throw new Exception("Couldn't instantiate: " + args[i + 1], e);
            }
            i++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlpp") && i + 1 < args.Length)
        {
            try
            {
                tlpp = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + args[i + 1]);
                throw new Exception("Couldn't instantiate: " + args[i + 1], e);
            }
            i++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-insideFactor"))
        {
            insideFactor = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-markovOrder") && i + 1 < args.Length)
        {
            i++;
            mo = System.Convert.ToInt32(args[i]);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-simpleLabels"))
        {
            simpleLabels = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noRebinarization"))
        {
            noRebinarization = true;
        }
        else
        {
            log.Info("Unknown option:" + args[i]);
        }
        i++;
    }

    // After the flags there must be one remaining argument: the treebank path.
    if (i >= args.Length)
    {
        log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
        System.Environment.Exit(0);
    }

    Treebank treebank;
    if (tlpp != null)
    {
        // The parser params override the language pack, file extension and head finder.
        treebank = tlpp.MemoryTreebank();
        tlp = tlpp.TreebankLanguagePack();
        fileExt = tlp.TreebankFileExtension();
        hf = tlpp.HeadFinder();
    }
    else
    {
        treebank = new DiskTreebank(trf);
    }
    treebank.LoadPath(args[i], fileExt, true);

    ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);
    foreach (Tree t in treebank)
    {
        Tree newT = tt.TransformTree(t);
        System.Console.Out.WriteLine("Original tree:");
        t.PennPrint();
        System.Console.Out.WriteLine("Binarized tree:");
        newT.PennPrint();
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Binarizes the training treebank (optionally applying the Collins punctuation transform
/// and gathering selective-split statistics first) and trains a
/// <c>ChineseLexiconAndWordSegmenter</c> on the resulting trees.
/// </summary>
/// <param name="trainTreebank">Trees to train on.</param>
/// <param name="op">Parser options; <c>op.trainOptions.splitters</c> and <c>postSplitters</c> may be overwritten here.</param>
/// <param name="wordIndex">Word index passed to the lexicon factory.</param>
/// <param name="tagIndex">Tag index passed to the lexicon factory.</param>
/// <returns>The trained lexicon/segmenter.</returns>
private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    // DateTime.Now, not new DateTime(): the default-constructed value is 0001-01-01, not the current time.
    System.Console.Out.WriteLine("Currently " + DateTime.Now);
    // printOptions(true, op);
    Timing.StartTime();

    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose)
    {
        System.Console.Out.Write("Training ");
        System.Console.Out.WriteLine(trainTreebank.TextualSummary());
    }

    System.Console.Out.Write("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    else
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }

    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc)
    {
        collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
    }

    IList<Tree> binaryTrainTrees = new List<Tree>();
    // List<Tree> binaryTuneTrees = new ArrayList<Tree>();

    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent split categories: " + op.trainOptions.splitters);
        }
    }

    if (op.trainOptions.selectivePostSplit)
    {
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        Treebank annotatedTB = trainTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }

    if (op.trainOptions.hSelSplit)
    {
        // First pass with selective splitting off; the transformed trees are discarded.
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            // foreach iteration variables are read-only in C#, so transform a local copy.
            Tree current = tree;
            if (op.trainOptions.collinsPunc)
            {
                current = collinsPuncTransformer.TransformTree(current);
            }
            binarizer.TransformTree(current);
        }
        binarizer.SetDoSelectiveSplit(true);
    }

    foreach (Tree tree in trainTreebank)
    {
        Tree current = tree;
        if (op.trainOptions.collinsPunc)
        {
            current = collinsPuncTransformer.TransformTree(current);
        }
        current = binarizer.TransformTree(current);
        binaryTrainTrees.Add(current);
    }
    Timing.Tick("done.");

    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }

    System.Console.Out.Write("Extracting Lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
    clex.InitializeTraining(binaryTrainTrees.Count);
    clex.Train(binaryTrainTrees);
    clex.FinishTraining();
    Timing.Tick("done.");
    return clex;
}
/// <summary>
/// Annotates and binarizes the given treebanks with a shared composite transformer
/// (optional pre-transformer, Collins punctuation transform, annotator/binarizer and
/// word-function transformer), learning selective split categories first when enabled.
/// </summary>
/// <param name="trainTreebank">Primary training treebank.</param>
/// <param name="secondaryTreebank">Optional secondary training treebank; may be null.</param>
/// <param name="tuneTreebank">Optional tuning treebank; may be null.</param>
/// <param name="op">Parser options; <c>op.trainOptions.splitters</c> and <c>postSplitters</c> may be overwritten here.</param>
/// <returns>A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank.</returns>
public static Triple<Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
{
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();

    if (op.testOptions.verbose)
    {
        PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
        pwErr.Print("Training ");
        pwErr.Println(trainTreebank.TextualSummary(tlp));
        if (secondaryTreebank != null)
        {
            pwErr.Print("Secondary training ");
            pwErr.Println(secondaryTreebank.TextualSummary(tlp));
        }
    }

    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null)
    {
        trainTransformer.AddTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc)
    {
        CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
        trainTransformer.AddTransformer(collinsPuncTransformer);
    }

    log.Info("Binarizing trees...");
    Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    else
    {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    trainTransformer.AddTransformer(binarizer);

    if (op.wordFunction != null)
    {
        ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
        trainTransformer.AddTransformer(wordFunctionTransformer);
    }

    Treebank wholeTreebank;
    if (secondaryTreebank == null)
    {
        wholeTreebank = trainTreebank;
    }
    else
    {
        wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }

    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp, op);
        if (op.testOptions.verbose)
        {
            // Sort a concrete List<string> (IList<T> has no Sort) with ordinal ordering, and
            // join the elements explicitly: concatenating the list itself prints its type name.
            List<string> sortedSplitters = new List<string>(op.trainOptions.splitters);
            sortedSplitters.Sort(System.StringComparer.Ordinal);
            log.Info("Parent split categories: [" + string.Join(", ", sortedSplitters) + "]");
        }
    }

    if (op.trainOptions.selectivePostSplit)
    {
        // Do all the transformations once just to learn selective splits on annotated categories
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        wholeTreebank = wholeTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }

    if (op.trainOptions.hSelSplit)
    {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Tree-transformation printing is switched off for this pass and restored afterwards.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in wholeTreebank)
        {
            trainTransformer.TransformTree(tree);
        }
        binarizer.SetDoSelectiveSplit(true);
        op.trainOptions.printTreeTransformations = ptt;
    }

    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.Transform(trainTransformer);
    if (secondaryTreebank != null)
    {
        secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
    }
    if (op.trainOptions.printAnnotatedStateCounts)
    {
        binarizer.PrintStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts)
    {
        binarizer.PrintRuleCounts();
    }
    if (tuneTreebank != null)
    {
        tuneTreebank = tuneTreebank.Transform(trainTransformer);
    }
    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }
    return new Triple<Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank);
}
/// <summary>
/// Annotates and binarizes every tree of <paramref name="trainTreebank"/>, keeping only
/// the trees whose yield length minus one does not exceed <c>trainLengthLimit</c>.
/// </summary>
/// <param name="trainTreebank">Trees to transform.</param>
/// <returns>The binarized trees that pass the length filter.</returns>
public virtual IList<Tree> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank)
{
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();
    if (Verbose)
    {
        log.Info("\n\n" + trainTreebank.TextualSummary(tlp));
    }

    log.Info("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    Timing.Tick("done.");

    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp);
        if (op.testOptions.verbose)
        {
            // Sort a concrete List<string> (IList<T> has no Sort) with ordinal ordering, and
            // join the elements explicitly: concatenating the list itself prints its type name.
            List<string> sortedSplitters = new List<string>(op.trainOptions.splitters);
            sortedSplitters.Sort(System.StringComparer.Ordinal);
            log.Info("Parent split categories: [" + string.Join(", ", sortedSplitters) + "]");
        }
    }

    //    if (op.trainOptions.selectivePostSplit) {
    //      // Do all the transformations once just to learn selective splits on annotated categories
    //      TreeTransformer myTransformer = new TreeAnnotator(tlpParams.headFinder(), tlpParams);
    //      Treebank annotatedTB = trainTreebank.transform(myTransformer);
    //      op.trainOptions.postSplitters = ParentAnnotationStats.getSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
    //      if (op.testOptions.verbose) {
    //        log.info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
    //      }
    //    }

    if (op.trainOptions.hSelSplit)
    {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Tree-transformation printing is switched off for this pass and restored afterwards.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            binarizer.TransformTree(tree);
        }
        binarizer.SetDoSelectiveSplit(true);
        op.trainOptions.printTreeTransformations = ptt;
    }

    //Tree transformation
    IList<Tree> binaryTrainTrees = new List<Tree>();
    foreach (Tree tree in trainTreebank)
    {
        // foreach iteration variables are read-only in C#, so keep the result in a local.
        Tree binarized = binarizer.TransformTree(tree);
        if (binarized.Yield().Count - 1 <= trainLengthLimit)
        {
            binaryTrainTrees.Add(binarized);
        }
    }

    // WSGDEBUG: Lot's of stuff on the grammar
    //    if(VERBOSE) {
    //      binarizer.printStateCounts();
    //      binarizer.printRuleCounts();
    //      binarizer.dumpStats();
    //    }
    return binaryTrainTrees;
}
/// <summary>
/// Convenience constructor that chooses the tag projection from a flag and delegates to
/// the constructor taking an explicit <c>ITagProjection</c>.
/// </summary>
/// <param name="tlpParams">Language-specific parser parameters.</param>
/// <param name="directional">Forwarded unchanged.</param>
/// <param name="distance">Forwarded as <c>useDistance</c>.</param>
/// <param name="coarseDistance">Forwarded as <c>useCoarseDistance</c>.</param>
/// <param name="basicCategoryTagsInDependencyGrammar">
/// When true a <c>BasicCategoryTagProjection</c> built from the language pack is used;
/// otherwise a <c>TestTagProjection</c>.
/// </param>
/// <param name="op">Parser options.</param>
/// <param name="wordIndex">Word index.</param>
/// <param name="tagIndex">Tag index.</param>
public MLEDependencyGrammar(ITreebankLangParserParams tlpParams, bool directional, bool distance, bool coarseDistance, bool basicCategoryTagsInDependencyGrammar, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
    : this(
        // The cast gives the conditional a common type; the two branch types are otherwise unrelated.
        basicCategoryTagsInDependencyGrammar
            ? (ITagProjection)new BasicCategoryTagProjection(tlpParams.TreebankLanguagePack())
            : new TestTagProjection(),
        tlpParams, directional, distance, coarseDistance, op, wordIndex, tagIndex)
{
}
// initial value is -0xDEADBEEF (actually positive because of 2s complement)
// Don't change this; set with -v
/// <summary>
/// Creates the tree printer used for output, built from this object's
/// <c>outputFormat</c> and <c>outputFormatOptions</c> together with the language pack and
/// head finders supplied by <paramref name="tlpParams"/>.
/// </summary>
/// <param name="tlpParams">The treebank parser params</param>
/// <returns>A suitable tree printing object</returns>
public virtual Edu.Stanford.Nlp.Trees.TreePrint TreePrint(ITreebankLangParserParams tlpParams)
{
    return new Edu.Stanford.Nlp.Trees.TreePrint(
        outputFormat,
        outputFormatOptions,
        tlpParams.TreebankLanguagePack(),
        tlpParams.HeadFinder(),
        tlpParams.TypedDependencyHeadFinder());
}
/// <summary>
/// Command-line entry point: loads a gold and a guess treebank, pairs their trees in
/// order, evaluates each pair with <c>CollinsDepEval</c> and prints the results.
/// </summary>
/// <param name="args">Option flags followed by the gold file and the guess file.</param>
public static void Main(string[] args)
{
    if (args.Length < MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }

    Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());
    bool verbose = PropertiesUtils.GetBool(options, "v", false);
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    int maxGoldYield = PropertiesUtils.GetInt(options, "g", int.MaxValue);
    int maxGuessYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);

    string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+");
    if (parsedArgs.Length != MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }
    File goldFile = new File(parsedArgs[0]);
    File guessFile = new File(parsedArgs[1]);

    ITreebankLangParserParams tlpp = language.@params;
    PrintWriter pwOut = tlpp.Pw();

    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());

    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());

    Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval depEval = new Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval("CollinsDep", true, tlpp.HeadFinder(), tlpp.TreebankLanguagePack().StartSymbol());
    ITreeTransformer tc = tlpp.Collinizer();

    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    using (IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator())
    {
        foreach (Tree guess in guessTreebank)
        {
            Tree evalGuess = tc.TransformTree(guess);
            if (guess.Yield().Count > maxGuessYield)
            {
                skippedGuessTrees++;
                continue;
            }

            bool doneEval = false;
            // doneEval is tested first on purpose: MoveNext() advances the enumerator, so
            // calling it after a successful evaluation would silently drop the next gold tree.
            while (!doneEval && goldItr.MoveNext())
            {
                Tree gold = goldItr.Current;
                Tree evalGold = tc.TransformTree(gold);
                goldLineId++;

                if (gold.Yield().Count > maxGoldYield)
                {
                    continue;
                }
                if (evalGold.Yield().Count != evalGuess.Yield().Count)
                {
                    pwOut.Println("Yield mismatch at gold line " + goldLineId);
                    skippedGuessTrees++;
                    break; //Default evalb behavior -- skip this guess tree
                }

                depEval.Evaluate(evalGuess, evalGold, verbose ? pwOut : null);
                doneEval = true; //Move to the next guess parse
            }
        }
    }

    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees\n", ((maxGuessYield < int.MaxValue) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    }
    depEval.Display(true, pwOut);
    pwOut.Close();
}
/// <summary>Do the category splitting of the tree passed in.</summary>
/// <remarks>
/// This is initially called on the root node of a tree, and it recursively
/// calls itself on children. A depth first left-to-right traversal is
/// done whereby a tree node's children are first transformed and then
/// the parent is transformed. At the time of calling, the original root
/// always sits above the current node. This routine can be assumed to,
/// and does, change the tree passed in: it destructively modifies tree nodes,
/// and makes new tree structure when it needs to.
/// </remarks>
/// <param name="t">The tree node to subcategorize.</param>
/// <param name="root">
/// The root of the tree. It must contain <paramref name="t"/>; otherwise the parent
/// lookup yields null and this code fails with a null reference.
/// </param>
/// <returns>The annotated tree.</returns>
private Tree TransformTreeHelper(Tree t, Tree root)
{
    if (t == null)
    {
        // handle null
        return null;
    }
    if (t.IsLeaf())
    {
        //No need to change the label
        return t;
    }

    string cat = t.Label().Value();
    Tree parent;
    string parentStr;
    string grandParentStr;
    if (root == null || t.Equals(root))
    {
        parent = null;
        parentStr = string.Empty;
    }
    else
    {
        parent = t.Parent(root);
        parentStr = parent.Label().Value();
    }
    if (parent == null || parent.Equals(root))
    {
        grandParentStr = string.Empty;
    }
    else
    {
        grandParentStr = parent.Parent(root).Label().Value();
    }
    string baseParentStr = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
    string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

    if (t.IsPreTerminal())
    {
        // handle tags
        Tree childResult = TransformTreeHelper(t.Children()[0], null); // recurse
        string word = childResult.Value(); // would be nicer if Word/CWT ??
        if (!trainOptions.noTagSplit)
        {
            if (trainOptions.tagPA)
            {
                string test = cat + "^" + baseParentStr;
                if (!trainOptions.tagSelectiveSplit || trainOptions.splitters.Contains(test))
                {
                    cat = test;
                }
            }
            // parent is null when t is the root, so guard before dereferencing it.
            if (trainOptions.markUnaryTags && parent != null && parent.NumChildren() == 1)
            {
                cat = cat + "^U";
            }
        }
        // otherwise, leave the tags alone!

        // For a tag node the tag annotation is the (possibly split) category itself.
        ApplyAnnotatedLabel(t, cat, word, cat);
        t.SetChild(0, childResult); // just in case word is changed
        if (trainOptions.noTagSplit)
        {
            return t;
        }
        // language-specific transforms
        return tlpParams.TransformTree(t, root);
    } // end isPreTerminal()

    // handle phrasal categories
    Tree[] kids = t.Children();
    for (int childNum = 0; childNum < kids.Length; childNum++)
    {
        Tree child = kids[childNum];
        Tree childResult = TransformTreeHelper(child, root); // recursive call
        t.SetChild(childNum, childResult);
    }

    Tree headChild = hf.DetermineHead(t);
    if (headChild == null || headChild.Label() == null)
    {
        throw new Exception("TreeAnnotator: null head found for tree [suggesting incomplete/wrong HeadFinder]:\n" + t);
    }
    ILabel headLabel = headChild.Label();
    if (!(headLabel is IHasWord))
    {
        throw new Exception("TreeAnnotator: Head label lacks a Word annotation!");
    }
    if (!(headLabel is IHasTag))
    {
        throw new Exception("TreeAnnotator: Head label lacks a Tag annotation!");
    }
    string headWord = ((IHasWord)headLabel).Word();
    string tag = ((IHasTag)headLabel).Tag();

    // String baseTag = tlpParams.treebankLanguagePack().basicCategory(tag);
    string baseCat = tlpParams.TreebankLanguagePack().BasicCategory(cat);

    /* Sister annotation. Potential problem: if multiple sisters are
     * strong indicators for a single category's expansions.  This
     * happens concretely in the Chinese Treebank when NP (object)
     * has left sisters VV and AS.  Could lead to too much
     * sparseness.  The ideal solution would be to give the
     * splitting list an ordering, and take only the highest (~most
     * informative/reliable) sister annotation.
     */
    if (trainOptions.sisterAnnotate && !trainOptions.smoothing && baseParentStr.Length > 0)
    {
        IList<string> leftSis = ListBasicCategories(SisterAnnotationStats.LeftSisterLabels(t, parent));
        IList<string> rightSis = ListBasicCategories(SisterAnnotationStats.RightSisterLabels(t, parent));
        IList<string> leftAnn = new List<string>();
        IList<string> rightAnn = new List<string>();
        foreach (string sis in leftSis)
        {
            leftAnn.Add(baseCat + "=l=" + tlpParams.TreebankLanguagePack().BasicCategory(sis));
        }
        foreach (string sis in rightSis)
        {
            rightAnn.Add(baseCat + "=r=" + tlpParams.TreebankLanguagePack().BasicCategory(sis));
        }
        // Only the first matching sister splitter is applied.
        foreach (string annCat in trainOptions.sisterSplitters)
        {
            if (leftAnn.Contains(annCat) || rightAnn.Contains(annCat))
            {
                cat = cat + annCat.ReplaceAll("^" + baseCat, string.Empty);
                break;
            }
        }
    }

    if (trainOptions.Pa && !trainOptions.smoothing && baseParentStr.Length > 0)
    {
        string cat2 = baseCat + "^" + baseParentStr;
        if (!trainOptions.selectiveSplit || trainOptions.splitters.Contains(cat2))
        {
            cat = cat + "^" + baseParentStr;
        }
    }

    if (trainOptions.gPA && !trainOptions.smoothing && grandParentStr.Length > 0)
    {
        if (trainOptions.selectiveSplit)
        {
            string cat2 = baseCat + "^" + baseParentStr + "~" + baseGrandParentStr;
            // Grandparent annotation is only added on top of an existing parent annotation.
            if (cat.Contains("^") && trainOptions.splitters.Contains(cat2))
            {
                cat = cat + "~" + baseGrandParentStr;
            }
        }
        else
        {
            cat = cat + "~" + baseGrandParentStr;
        }
    }

    if (trainOptions.markUnary > 0)
    {
        if (trainOptions.markUnary == 1 && kids.Length == 1 && kids[0].Depth() >= 2)
        {
            cat = cat + "-U";
        }
        else if (trainOptions.markUnary == 2 && parent != null && parent.NumChildren() == 1 && t.Depth() >= 2)
        {
            cat = cat + "-u";
        }
    }
    if (trainOptions.rightRec && RightRec(t, baseCat))
    {
        cat = cat + "-R";
    }
    if (trainOptions.leftRec && LeftRec(t, baseCat))
    {
        cat = cat + "-L";
    }
    if (trainOptions.splitPrePreT && t.IsPrePreTerminal())
    {
        cat = cat + "-PPT";
    }

    ApplyAnnotatedLabel(t, cat, headWord, tag);
    return tlpParams.TransformTree(t, root);
}

/// <summary>
/// Replaces the label of <paramref name="t"/> with a copy whose value and, where the
/// label type supports them, category, word and tag are set to the given strings.
/// </summary>
private static void ApplyAnnotatedLabel(Tree t, string cat, string word, string tag)
{
    // Label label = new CategoryWordTag(cat, word, tag);
    ILabel label = t.Label().LabelFactory().NewLabel(t.Label());
    label.SetValue(cat);
    if (label is IHasCategory)
    {
        ((IHasCategory)label).SetCategory(cat);
    }
    if (label is IHasWord)
    {
        ((IHasWord)label).SetWord(word);
    }
    if (label is IHasTag)
    {
        ((IHasTag)label).SetTag(tag);
    }
    t.SetLabel(label);
}
/// <summary>
/// Builds a post-split copy of the subtree rooted at <paramref name="t"/>: leaves become
/// new <c>Word</c> leaves, and every other node becomes a <c>CategoryWordTag</c> node whose
/// category may carry parent ('^') and grandparent ('~') annotations, depending on the
/// training options. Each non-leaf label is also counted in <c>nonTerms</c>.
/// </summary>
/// <param name="t">The node to copy and annotate.</param>
/// <param name="root">The root of the tree containing <paramref name="t"/>, used for parent lookups.</param>
/// <param name="tf">Factory used to create the new nodes.</param>
/// <returns>The newly built subtree.</returns>
public virtual Tree TransformTreeHelper(Tree t, Tree root, ITreeFactory tf)
{
    // Parent context: empty when t is the root (or no root is given).
    Tree parent = null;
    string parentStr = string.Empty;
    if (root != null && !t.Equals(root))
    {
        parent = t.Parent(root);
        parentStr = parent.Label().Value();
    }

    // Grandparent context: empty when there is no parent or the parent is the root.
    string grandParentStr = string.Empty;
    if (parent != null && !parent.Equals(root))
    {
        Tree grandParent = parent.Parent(root);
        grandParentStr = grandParent.Label().Value();
    }

    string cat = t.Label().Value();
    string baseParentStr = tlpParams.TreebankLanguagePack().BasicCategory(parentStr);
    string baseGrandParentStr = tlpParams.TreebankLanguagePack().BasicCategory(grandParentStr);

    if (t.IsLeaf())
    {
        return tf.NewLeaf(new Word(t.Label().Value()));
    }

    string headWord = t.HeadTerminal(hf).Value();
    bool isTag = t.IsPreTerminal();

    // Tags and phrasal categories are both counted; only phrasal categories are annotated.
    nonTerms.IncrementCount(t.Label().Value());
    if (!isTag)
    {
        if (trainOptions.postPA && !trainOptions.smoothing && baseParentStr.Length > 0)
        {
            string withParent = cat + '^' + (trainOptions.postSplitWithBaseCategory ? baseParentStr : parentStr);
            if (!trainOptions.selectivePostSplit || trainOptions.postSplitters.Contains(withParent))
            {
                cat = withParent;
            }
        }
        if (trainOptions.postGPA && !trainOptions.smoothing && grandParentStr.Length > 0)
        {
            string withGrandParent = cat + '~' + (trainOptions.postSplitWithBaseCategory ? baseGrandParentStr : grandParentStr);
            // Under selective splitting the grandparent mark requires an existing '^' mark
            // and a listed split category; otherwise it is always applied.
            if (!trainOptions.selectivePostSplit || (cat.Contains("^") && trainOptions.postSplitters.Contains(withGrandParent)))
            {
                cat = withGrandParent;
            }
        }
    }

    Tree annotated = tf.NewTreeNode(new CategoryWordTag(cat, headWord, cat), Collections.EmptyList<Tree>());

    Tree[] kids = t.Children();
    List<Tree> transformedKids = new List<Tree>();
    for (int k = 0; k < kids.Length; k++)
    {
        transformedKids.Add(TransformTreeHelper(kids[k], root, tf));
    }
    annotated.SetChildren(transformedKids);
    return annotated;
}