/// <summary>
/// Annotates every tree of a test treebank and, for each local tree with at least two
/// children, prints its rule (from <c>LocalTreeToRule</c>) and its score (from
/// <c>ComputeLocalTreeScore</c>) to standard output.
/// </summary>
/// <remarks>
/// Overwrites <c>op.trainOptions.splitters</c> and <c>op.trainOptions.sisterSplitters</c>
/// before processing any tree.
/// </remarks>
/// <param name="pd">Parser handed through to <c>ComputeLocalTreeScore</c>.</param>
/// <param name="tlpParams">Supplies the head finder and parameters for the <c>TreeAnnotator</c>.</param>
/// <param name="testTreebank">Trees to annotate and score.</param>
/// <param name="treebankRoot">Passed to <c>ParentAnnotationStats.GetEnglishSplitCategories</c>.</param>
/// <param name="stateIndex">State index handed through to <c>ComputeLocalTreeScore</c>.</param>
private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex<string> stateIndex)
{
    Timing.StartTime();
    ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
    // CDM: Aug 2004: With new implementation of treebank split categories,
    // I've hardwired this to load English ones.  Otherwise need training data.
    op.trainOptions.splitters = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
    op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
    foreach (Tree goldTree in testTreebank)
    {
        // A foreach iteration variable is read-only in C#, so the annotated tree
        // needs a local of its own.
        Tree annotatedTree = annotator.TransformTree(goldTree);
        foreach (Tree localTree in annotatedTree)
        {
            // Only local trees with at least two children are scored.
            if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
            {
                continue;
            }
            // now try to use the grammar to score this local tree
            System.Console.Out.WriteLine(LocalTreeToRule(localTree));
            double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
            System.Console.Out.WriteLine("score: " + score);
        }
    }
}
/// <summary>
/// Builds the annotator from its configuration properties.
/// </summary>
/// <remarks>
/// The parser-params class name is read from the property named
/// <c>annotatorName + ".tlppClass"</c>, falling back to <c>DefaultTlppClass</c>. That class
/// is loaded through <c>ReflectionLoading.LoadByReflection</c>, and its head finder and
/// treebank language pack are used to create the binarizer.
/// </remarks>
/// <param name="annotatorName">Prefix of the property key that names the parser-params class.</param>
/// <param name="props">Properties to read the class name from.</param>
public BinarizerAnnotator(string annotatorName, Properties props)
{
    string tlppClassKey = annotatorName + ".tlppClass";
    this.tlppClass = props.GetProperty(tlppClassKey, DefaultTlppClass);

    ITreebankLangParserParams langParams = ReflectionLoading.LoadByReflection(tlppClass);
    this.binarizer = TreeBinarizer.SimpleTreeBinarizer(
        langParams.HeadFinder(),
        langParams.TreebankLanguagePack());
}
/// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
/// <remarks>
/// This main method doesn't yet handle as many flags as one would like.
/// But it does have:
/// <ul>
/// <li>-tlp TreebankLanguagePack</li>
/// <li>-tlpp TreebankLangParserParams</li>
/// <li>-insideFactor</li>
/// <li>-markovOrder</li>
/// <li>-simpleLabels</li>
/// <li>-noRebinarization</li>
/// </ul>
/// Unrecognised flags are logged and skipped. Every tree of the loaded treebank is printed
/// to standard output twice: as read, and after binarization.
/// </remarks>
/// <param name="args">
/// Command line arguments: flags as above, followed by treebankPath
/// </param>
public static void Main(string[] args)
{
    ITreebankLangParserParams tlpp = null;
    // Looks like it must build CategoryWordTagFactory!!
    // NOTE(review): trf is never assigned, so the DiskTreebank below is built with a null factory.
    ITreeReaderFactory trf = null;
    string fileExt = "mrg";
    IHeadFinder hf = new ModCollinsHeadFinder();
    ITreebankLanguagePack tlp = new PennTreebankLanguagePack();
    bool insideFactor = false;
    bool mf = false;
    int mo = 1;
    bool uwl = false;
    bool uat = false;
    double sst = 20.0;
    bool mfs = false;
    bool simpleLabels = false;
    bool noRebinarization = false;

    int i = 0;
    while (i < args.Length && args[i].StartsWith("-", System.StringComparison.Ordinal))
    {
        string flag = args[i];
        // Flags that take a value are only recognised when a value actually follows them.
        bool hasValue = i + 1 < args.Length;
        if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tlp") && hasValue)
        {
            i++;
            tlp = InstantiateByName<ITreebankLanguagePack>(args[i]);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tlpp") && hasValue)
        {
            i++;
            tlpp = InstantiateByName<ITreebankLangParserParams>(args[i]);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-insideFactor"))
        {
            insideFactor = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-markovOrder") && hasValue)
        {
            i++;
            mo = System.Convert.ToInt32(args[i]);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-simpleLabels"))
        {
            simpleLabels = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-noRebinarization"))
        {
            noRebinarization = true;
        }
        else
        {
            log.Info("Unknown option:" + flag);
        }
        i++;
    }

    // The first non-flag argument is the treebank path; without it there is nothing to do.
    if (i >= args.Length)
    {
        log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
        System.Environment.Exit(0);
    }

    Treebank treebank;
    if (tlpp != null)
    {
        // Parser params, when given, override the language pack, file extension and head finder.
        treebank = tlpp.MemoryTreebank();
        tlp = tlpp.TreebankLanguagePack();
        fileExt = tlp.TreebankFileExtension();
        hf = tlpp.HeadFinder();
    }
    else
    {
        treebank = new DiskTreebank(trf);
    }
    treebank.LoadPath(args[i], fileExt, true);

    ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);
    foreach (Tree t in treebank)
    {
        Tree newT = tt.TransformTree(t);
        System.Console.Out.WriteLine("Original tree:");
        t.PennPrint();
        System.Console.Out.WriteLine("Binarized tree:");
        newT.PennPrint();
        System.Console.Out.WriteLine();
    }
}

/// <summary>
/// Creates an instance of the named class through its parameterless constructor and casts it
/// to <typeparamref name="T"/>.
/// </summary>
/// <param name="className">Name of the class to instantiate.</param>
/// <returns>The new instance.</returns>
/// <exception cref="InvalidOperationException">
/// The class could not be resolved, created or cast; the original failure is the inner exception.
/// </exception>
private static T InstantiateByName<T>(string className)
{
    try
    {
        return (T)System.Activator.CreateInstance(Sharpen.Runtime.GetType(className));
    }
    catch (Exception e)
    {
        log.Info("Couldn't instantiate: " + className);
        // System.Exception has no constructor taking only an exception, so the cause is
        // passed as the inner exception to keep its stack trace.
        throw new InvalidOperationException("Couldn't instantiate: " + className, e);
    }
}
/// <summary>
/// Trains a <c>ChineseLexiconAndWordSegmenter</c> on the annotated, binarized trees of a treebank.
/// </summary>
/// <remarks>
/// Progress is written to standard output. Depending on the flags in <c>op.trainOptions</c>,
/// this also overwrites <c>op.trainOptions.splitters</c> and <c>op.trainOptions.postSplitters</c>.
/// </remarks>
/// <param name="trainTreebank">Treebank to train on.</param>
/// <param name="op">Parser options; its train options are read and partly overwritten.</param>
/// <param name="wordIndex">Word index passed to <c>op.tlpParams.Lex</c>.</param>
/// <param name="tagIndex">Tag index passed to <c>op.tlpParams.Lex</c>.</param>
/// <returns>The trained lexicon and word segmenter.</returns>
private static Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter GetSegmenterDataFromTreebank(Treebank trainTreebank, Options op, IIndex<string> wordIndex, IIndex<string> tagIndex)
{
    // DateTime.Now, not new DateTime(): the parameterless constructor yields 0001-01-01.
    System.Console.Out.WriteLine("Currently " + DateTime.Now);
    Timing.StartTime();

    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    if (op.testOptions.verbose)
    {
        System.Console.Out.Write("Training ");
        System.Console.Out.WriteLine(trainTreebank.TextualSummary());
    }

    System.Console.Out.Write("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }
    else
    {
        binarizer = new TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
    }

    // Stays null unless collinsPunc is set; every use below is guarded by the same flag.
    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc)
    {
        collinsPuncTransformer = new CollinsPuncTransformer(tlpParams.TreebankLanguagePack());
    }

    IList<Tree> binaryTrainTrees = new List<Tree>();

    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, true, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent split categories: " + op.trainOptions.splitters);
        }
    }

    if (op.trainOptions.selectivePostSplit)
    {
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        Treebank annotatedTB = trainTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlpParams.TreebankLanguagePack());
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }

    if (op.trainOptions.hSelSplit)
    {
        // Preliminary pass with selective splitting switched off; the transformed trees are
        // discarded, so only the binarizer's internal state is affected.
        binarizer.SetDoSelectiveSplit(false);
        foreach (Tree tree in trainTreebank)
        {
            // foreach iteration variables are read-only in C#, hence the separate local.
            Tree current = tree;
            if (op.trainOptions.collinsPunc)
            {
                current = collinsPuncTransformer.TransformTree(current);
            }
            binarizer.TransformTree(current);
        }
        binarizer.SetDoSelectiveSplit(true);
    }

    foreach (Tree tree in trainTreebank)
    {
        Tree current = tree;
        if (op.trainOptions.collinsPunc)
        {
            current = collinsPuncTransformer.TransformTree(current);
        }
        binaryTrainTrees.Add(binarizer.TransformTree(current));
    }
    Timing.Tick("done.");

    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }

    System.Console.Out.Write("Extracting Lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter clex = (Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter)op.tlpParams.Lex(op, wordIndex, tagIndex);
    clex.InitializeTraining(binaryTrainTrees.Count);
    clex.Train(binaryTrainTrees);
    clex.FinishTraining();
    Timing.Tick("done.");
    return clex;
}
/// <summary>
/// Convenience constructor that forwards to the seven-argument constructor, passing
/// <c>tlpParams.HeadFinder()</c> for both of its first two arguments.
/// </summary>
/// <param name="tlpParams">Parser params; also the source of the head finder.</param>
/// <param name="forceCNF">Forwarded unchanged.</param>
/// <param name="insideFactor">Forwarded unchanged.</param>
/// <param name="doSubcategorization">Forwarded unchanged.</param>
/// <param name="op">Forwarded unchanged.</param>
public TreeAnnotatorAndBinarizer(ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
    : this(
        tlpParams.HeadFinder(),
        tlpParams.HeadFinder(),
        tlpParams,
        forceCNF,
        insideFactor,
        doSubcategorization,
        op)
{
}
/// <summary>
/// Annotates and binarizes the given treebanks, first learning whichever selective split
/// categories are requested in <c>op.trainOptions</c>.
/// </summary>
/// <remarks>
/// Depending on the flags in <c>op.trainOptions</c>, this overwrites
/// <c>op.trainOptions.splitters</c> and <c>op.trainOptions.postSplitters</c>.
/// </remarks>
/// <param name="trainTreebank">Primary training treebank; dereferenced unconditionally.</param>
/// <param name="secondaryTreebank">Optional additional training treebank; may be null.</param>
/// <param name="tuneTreebank">Optional tuning treebank; may be null.</param>
/// <param name="op">Parser options; read throughout and partly overwritten.</param>
/// <returns>
/// A Triple of binaryTrainTreebank, binarySecondaryTreebank, binaryTuneTreebank. The second and
/// third entries are null when the corresponding argument was null.
/// </returns>
public static Triple<Treebank, Treebank, Treebank> GetAnnotatedBinaryTreebankFromTreebank(Treebank trainTreebank, Treebank secondaryTreebank, Treebank tuneTreebank, Options op)
{
    // setup tree transforms
    ITreebankLangParserParams tlpParams = op.tlpParams;
    ITreebankLanguagePack tlp = tlpParams.TreebankLanguagePack();

    if (op.testOptions.verbose)
    {
        PrintWriter pwErr = tlpParams.Pw(System.Console.Error);
        pwErr.Print("Training ");
        pwErr.Println(trainTreebank.TextualSummary(tlp));
        if (secondaryTreebank != null)
        {
            pwErr.Print("Secondary training ");
            pwErr.Println(secondaryTreebank.TextualSummary(tlp));
        }
    }

    // Transformers are applied in the order they are added here.
    CompositeTreeTransformer trainTransformer = new CompositeTreeTransformer();
    if (op.trainOptions.preTransformer != null)
    {
        trainTransformer.AddTransformer(op.trainOptions.preTransformer);
    }
    if (op.trainOptions.collinsPunc)
    {
        CollinsPuncTransformer collinsPuncTransformer = new CollinsPuncTransformer(tlp);
        trainTransformer.AddTransformer(collinsPuncTransformer);
    }

    log.Info("Binarizing trees...");
    Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight)
    {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    else
    {
        binarizer = new Edu.Stanford.Nlp.Parser.Lexparser.TreeAnnotatorAndBinarizer(tlpParams.HeadFinder(), new LeftHeadFinder(), tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), !op.trainOptions.predictSplits, op);
    }
    trainTransformer.AddTransformer(binarizer);

    if (op.wordFunction != null)
    {
        ITreeTransformer wordFunctionTransformer = new TreeLeafLabelTransformer(op.wordFunction);
        trainTransformer.AddTransformer(wordFunctionTransformer);
    }

    Treebank wholeTreebank;
    if (secondaryTreebank == null)
    {
        wholeTreebank = trainTreebank;
    }
    else
    {
        wholeTreebank = new CompositeTreebank(trainTreebank, secondaryTreebank);
    }

    if (op.trainOptions.selectiveSplit)
    {
        op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, tlp);
        RemoveDeleteSplittersFromSplitters(tlp, op);
        if (op.testOptions.verbose)
        {
            // Declared as List<string> because IList<string> has no Sort(). Ordinal ordering
            // keeps the logged order independent of the current culture.
            List<string> list = new List<string>(op.trainOptions.splitters);
            list.Sort(System.StringComparer.Ordinal);
            // Concatenating the list itself would log its type name, so join the elements.
            log.Info("Parent split categories: " + "[" + string.Join(", ", list) + "]");
        }
    }

    if (op.trainOptions.selectivePostSplit)
    {
        // Do all the transformations once just to learn selective splits on annotated categories
        ITreeTransformer myTransformer = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);
        wholeTreebank = wholeTreebank.Transform(myTransformer);
        op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(wholeTreebank, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, tlp);
        if (op.testOptions.verbose)
        {
            log.Info("Parent post annotation split categories: " + op.trainOptions.postSplitters);
        }
    }

    if (op.trainOptions.hSelSplit)
    {
        // We run through all the trees once just to gather counts for hSelSplit!
        // Tree-transformation printing is silenced for this pass and restored afterwards,
        // even if a transform throws.
        int ptt = op.trainOptions.printTreeTransformations;
        op.trainOptions.printTreeTransformations = 0;
        binarizer.SetDoSelectiveSplit(false);
        try
        {
            foreach (Tree tree in wholeTreebank)
            {
                trainTransformer.TransformTree(tree);
            }
        }
        finally
        {
            binarizer.SetDoSelectiveSplit(true);
            op.trainOptions.printTreeTransformations = ptt;
        }
    }

    // we've done all the setup now. here's where the train treebank is transformed.
    trainTreebank = trainTreebank.Transform(trainTransformer);
    if (secondaryTreebank != null)
    {
        secondaryTreebank = secondaryTreebank.Transform(trainTransformer);
    }
    if (op.trainOptions.printAnnotatedStateCounts)
    {
        binarizer.PrintStateCounts();
    }
    if (op.trainOptions.printAnnotatedRuleCounts)
    {
        binarizer.PrintRuleCounts();
    }

    if (tuneTreebank != null)
    {
        tuneTreebank = tuneTreebank.Transform(trainTransformer);
    }

    if (op.testOptions.verbose)
    {
        binarizer.DumpStats();
    }

    return new Triple<Treebank, Treebank, Treebank>(trainTreebank, secondaryTreebank, tuneTreebank);
}
// initial value is -0xDEADBEEF (actually positive because of 2s complement)
// Don't change this; set with -v
/// <summary>Creates the object used to print trees on output.</summary>
/// <remarks>
/// The printer is built from this object's <c>outputFormat</c> and <c>outputFormatOptions</c>
/// together with the language pack, head finder and typed-dependency head finder obtained
/// from <paramref name="tlpParams"/>.
/// </remarks>
/// <param name="tlpParams">The treebank parser params</param>
/// <returns>A suitable tree printing object</returns>
public virtual Edu.Stanford.Nlp.Trees.TreePrint TreePrint(ITreebankLangParserParams tlpParams)
{
    return new Edu.Stanford.Nlp.Trees.TreePrint(
        outputFormat,
        outputFormatOptions,
        tlpParams.TreebankLanguagePack(),
        tlpParams.HeadFinder(),
        tlpParams.TypedDependencyHeadFinder());
}
/// <summary>
/// Command-line entry point: evaluates a file of guess trees against a file of gold trees
/// with <c>CollinsDepEval</c> and prints the result.
/// </summary>
/// <remarks>
/// Options read from the parsed properties: <c>v</c> (verbose per-tree output), <c>l</c>
/// (language, default English), <c>g</c> (maximum gold yield) and <c>y</c> (maximum guess
/// yield). The process exits with code -1 after logging the usage text when the argument
/// count is wrong.
/// </remarks>
/// <param name="args">
/// Options as above, plus the positional arguments: the gold file path, then the guess file path.
/// </param>
public static void Main(string[] args)
{
    if (args.Length < MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }

    Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs());
    bool verbose = PropertiesUtils.GetBool(options, "v", false);
    Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
    int maxGoldYield = PropertiesUtils.GetInt(options, "g", int.MaxValue);
    int maxGuessYield = PropertiesUtils.GetInt(options, "y", int.MaxValue);

    // string.Split treats its argument literally, so a whitespace pattern needs Regex.Split.
    string[] parsedArgs = System.Text.RegularExpressions.Regex.Split(options.GetProperty(string.Empty, string.Empty), "\\s+");
    if (parsedArgs.Length != MinArgs)
    {
        log.Info(Usage());
        System.Environment.Exit(-1);
    }

    File goldFile = new File(parsedArgs[0]);
    File guessFile = new File(parsedArgs[1]);

    ITreebankLangParserParams tlpp = language.@params;
    PrintWriter pwOut = tlpp.Pw();

    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());

    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());

    Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval depEval = new Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval("CollinsDep", true, tlpp.HeadFinder(), tlpp.TreebankLanguagePack().StartSymbol());
    ITreeTransformer tc = tlpp.Collinizer();

    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    using (IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator())
    {
        foreach (Tree guess in guessTreebank)
        {
            Tree evalGuess = tc.TransformTree(guess);
            if (guess.Yield().Count > maxGuessYield)
            {
                skippedGuessTrees++;
                continue;
            }

            bool doneEval = false;
            // doneEval must be tested first: MoveNext() advances the enumerator, so calling it
            // after a finished evaluation would silently drop the next gold tree.
            while (!doneEval && goldItr.MoveNext())
            {
                Tree gold = goldItr.Current;
                Tree evalGold = tc.TransformTree(gold);
                goldLineId++;

                // Over-long gold trees are passed over; keep looking for a usable one.
                if (gold.Yield().Count > maxGoldYield)
                {
                    continue;
                }
                if (evalGold.Yield().Count != evalGuess.Yield().Count)
                {
                    pwOut.Println("Yield mismatch at gold line " + goldLineId);
                    skippedGuessTrees++;
                    //Default evalb behavior -- skip this guess tree
                    break;
                }

                depEval.Evaluate(evalGuess, evalGold, verbose ? pwOut : null);
                //Move to the next guess parse
                doneEval = true;
            }
        }
    }

    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees\n", ((maxGuessYield < int.MaxValue) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    }
    depEval.Display(true, pwOut);
    pwOut.Close();
}
/// <summary>
/// Creates a post-splitter that keeps the given parser params, the head finder obtained
/// from them, and the train options taken from <paramref name="op"/>.
/// </summary>
/// <param name="tlpParams">Parser params to store; also the source of the head finder.</param>
/// <param name="op">Options whose <c>trainOptions</c> are stored.</param>
public PostSplitter(ITreebankLangParserParams tlpParams, Options op)
{
    // Only tlpParams is shadowed by a parameter, so only it needs the explicit "this.".
    this.tlpParams = tlpParams;
    hf = tlpParams.HeadFinder();
    trainOptions = op.trainOptions;
}