public virtual void Run(bool pathsAreFiles, bool displayWords, bool displayOOV) { if (useSplit) { IList <TreebankStats.ObservedCorpusStats> allSplitStats = new List <TreebankStats.ObservedCorpusStats>(); makeVocab = true; foreach (KeyValuePair <TreebankStats.Split, ICollection <string> > split in splitFileLists) { DiskTreebank tb = tlpp.DiskTreebank(); IFileFilter splitFilter = new TreebankStats.SplitFilter(split.Value); foreach (string path in pathNames) { tb.LoadPath(path, splitFilter); } TreebankStats.ObservedCorpusStats splitStats = GatherStats(tb, languageName.ToString() + "." + split.Key.ToString()); allSplitStats.Add(splitStats); makeVocab = false; } Display(AggregateStats(allSplitStats), displayWords, displayOOV); foreach (TreebankStats.ObservedCorpusStats ocs in allSplitStats) { Display(ocs, displayWords, displayOOV); } } else { if (pathsAreFiles) { makeVocab = true; foreach (string path in pathNames) { DiskTreebank tb = tlpp.DiskTreebank(); tb.LoadPath(path, null); TreebankStats.ObservedCorpusStats stats = GatherStats(tb, languageName.ToString() + " " + path); Display(stats, displayWords, displayOOV); makeVocab = false; } } else { trainVocab = Generics.NewHashSet(); DiskTreebank tb = tlpp.DiskTreebank(); foreach (string path in pathNames) { tb.LoadPath(path, null); } TreebankStats.ObservedCorpusStats allStats = GatherStats(tb, languageName.ToString()); Display(allStats, displayWords, displayOOV); } } }
public static void Main(string[] args) { if (args.Length < minArgs) { System.Console.Out.WriteLine(Usage()); System.Environment.Exit(-1); } Properties options = StringUtils.ArgsToProperties(args, ArgDefs()); Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language)); ITreebankLangParserParams tlpp = language.@params; DiskTreebank tb = null; string encoding = options.GetProperty("l", "UTF-8"); bool removeBracket = PropertiesUtils.GetBool(options, "b", false); tlpp.SetInputEncoding(encoding); tlpp.SetOutputEncoding(encoding); tb = tlpp.DiskTreebank(); string[] files = options.GetProperty(string.Empty, string.Empty).Split("\\s+"); if (files.Length != 0) { foreach (string filename in files) { tb.LoadPath(filename); } } else { log.Info(Usage()); System.Environment.Exit(-1); } PrintWriter pwo = tlpp.Pw(); string startSymbol = tlpp.TreebankLanguagePack().StartSymbol(); ITreeFactory tf = new LabeledScoredTreeFactory(); int nTrees = 0; foreach (Tree t in tb) { if (removeBracket) { if (t.Value().Equals(startSymbol)) { t = t.FirstChild(); } } else { if (!t.Value().Equals(startSymbol)) { //Add a bracket if it isn't already there t = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(t)); } } pwo.Println(t.ToString()); nTrees++; } pwo.Close(); System.Console.Error.Printf("Processed %d trees.%n", nTrees); }
// private static String stripTag(String tag) { // if (tag.startsWith("DT")) { // String newTag = tag.substring(2, tag.length()); // return newTag.length() > 0 ? newTag : tag; // } // return tag; // } /// <param name="args"/> public static void Main(string[] args) { if (args.Length != 3) { System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName); System.Environment.Exit(-1); } Language language = Language.ValueOf(args[0]); ITreebankLangParserParams tlpp = language.@params; if (language.Equals(Language.Arabic)) { string[] options = new string[] { "-arabicFactored" }; tlpp.SetOptionFlag(options, 0); } else { string[] options = new string[] { "-frenchFactored" }; tlpp.SetOptionFlag(options, 0); } Treebank tb = tlpp.DiskTreebank(); tb.LoadPath(args[1]); MorphoFeatureSpecification morphoSpec = language.Equals(Language.Arabic) ? new ArabicMorphoFeatureSpecification() : new FrenchMorphoFeatureSpecification(); string[] features = args[2].Trim().Split(","); foreach (string feature in features) { morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature)); } // Counters ICounter <string> wordTagCounter = new ClassicCounter <string>(30000); ICounter <string> morphTagCounter = new ClassicCounter <string>(500); // Counter<String> signatureTagCounter = new ClassicCounter<String>(); ICounter <string> morphCounter = new ClassicCounter <string>(500); ICounter <string> wordCounter = new ClassicCounter <string>(30000); ICounter <string> tagCounter = new ClassicCounter <string>(300); ICounter <string> lemmaCounter = new ClassicCounter <string>(25000); ICounter <string> lemmaTagCounter = new ClassicCounter <string>(25000); ICounter <string> richTagCounter = new ClassicCounter <string>(1000); ICounter <string> reducedTagCounter = new ClassicCounter <string>(500); ICounter <string> reducedTagLemmaCounter = new ClassicCounter <string>(500); IDictionary <string, ICollection <string> > wordLemmaMap = Generics.NewHashMap(); TwoDimensionalIntCounter <string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter <string, string>(30000); TwoDimensionalIntCounter <string, string> reducedTagTagCounter = new TwoDimensionalIntCounter <string, string>(500); TwoDimensionalIntCounter <string, string> tagReducedTagCounter = new TwoDimensionalIntCounter <string, string>(300); int numTrees = 0; foreach (Tree tree in tb) { foreach (Tree subTree in tree) { if (!subTree.IsLeaf()) { tlpp.TransformTree(subTree, tree); } } IList <ILabel> pretermList = tree.PreTerminalYield(); IList <ILabel> yield = tree.Yield(); System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count); int yieldLen = yield.Count; for (int i = 0; i < yieldLen; ++i) { string tag = pretermList[i].Value(); string word = yield[i].Value(); string morph = ((CoreLabel)yield[i]).OriginalText(); // Note: if there is no lemma, then we use the surface form. Pair <string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph); string lemma = lemmaTag.First(); string richTag = lemmaTag.Second(); // WSGDEBUG if (tag.Contains("MW")) { lemma += "-MWE"; } lemmaCounter.IncrementCount(lemma); lemmaTagCounter.IncrementCount(lemma + tag); richTagCounter.IncrementCount(richTag); string reducedTag = morphoSpec.StrToFeatures(richTag).ToString(); reducedTagCounter.IncrementCount(reducedTag); reducedTagLemmaCounter.IncrementCount(reducedTag + lemma); wordTagCounter.IncrementCount(word + tag); morphTagCounter.IncrementCount(morph + tag); morphCounter.IncrementCount(morph); wordCounter.IncrementCount(word); tagCounter.IncrementCount(tag); reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag; if (wordLemmaMap.Contains(word)) { wordLemmaMap[word].Add(lemma); } else { ICollection <string> lemmas = Generics.NewHashSet(1); wordLemmaMap[word] = lemmas; } lemmaReducedTagCounter.IncrementCount(lemma, reducedTag); reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag); tagReducedTagCounter.IncrementCount(tag, reducedTag); } ++numTrees; } // Barf... System.Console.Out.WriteLine("Language: " + language.ToString()); System.Console.Out.Printf("#trees:\t%d%n", numTrees); System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount()); System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count); System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count); System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count); System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count); System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count); System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count); System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count); System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count); System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count); System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count); // Extra System.Console.Out.WriteLine("=================="); StringBuilder sbNoLemma = new StringBuilder(); StringBuilder sbMultLemmas = new StringBuilder(); foreach (KeyValuePair <string, ICollection <string> > wordLemmas in wordLemmaMap) { string word = wordLemmas.Key; ICollection <string> lemmas = wordLemmas.Value; if (lemmas.Count == 0) { sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n"); continue; } if (lemmas.Count > 1) { sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n"); continue; } string lemma = lemmas.GetEnumerator().Current; ICollection <string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet(); if (reducedTags.Count > 1) { System.Console.Out.Printf("%s --> %s%n", word, lemma); foreach (string reducedTag in reducedTags) { int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag); string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet()); System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags); } System.Console.Out.WriteLine(); } } System.Console.Out.WriteLine("=================="); System.Console.Out.WriteLine(sbNoLemma.ToString()); System.Console.Out.WriteLine(sbMultLemmas.ToString()); System.Console.Out.WriteLine("=================="); IList <string> tags = new List <string>(tagReducedTagCounter.FirstKeySet()); tags.Sort(); foreach (string tag_1 in tags) { System.Console.Out.WriteLine(tag_1); ICollection <string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet(); foreach (string reducedTag in reducedTags) { int count = tagReducedTagCounter.GetCount(tag_1, reducedTag); // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag; System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count); } System.Console.Out.WriteLine(); } System.Console.Out.WriteLine("=================="); }
/// <param name="args"/> public static void Main(string[] args) { if (args.Length != 4) { System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName); System.Environment.Exit(-1); } // Command line options Language language = Language.ValueOf(args[0]); ITreebankLangParserParams tlpp = language.@params; Treebank trainTreebank = tlpp.DiskTreebank(); trainTreebank.LoadPath(args[2]); Treebank devTreebank = tlpp.DiskTreebank(); devTreebank.LoadPath(args[3]); MorphoFeatureSpecification morphoSpec; Options options = GetOptions(language); if (language.Equals(Language.Arabic)) { morphoSpec = new ArabicMorphoFeatureSpecification(); string[] languageOptions = new string[] { "-arabicFactored" }; tlpp.SetOptionFlag(languageOptions, 0); } else { if (language.Equals(Language.French)) { morphoSpec = new FrenchMorphoFeatureSpecification(); string[] languageOptions = new string[] { "-frenchFactored" }; tlpp.SetOptionFlag(languageOptions, 0); } else { throw new NotSupportedException(); } } string featureList = args[1]; string[] features = featureList.Trim().Split(","); foreach (string feature in features) { morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature)); } System.Console.Out.WriteLine("Language: " + language.ToString()); System.Console.Out.WriteLine("Features: " + args[1]); // Create word and tag indices // Save trees in a collection since the interface requires that.... System.Console.Out.Write("Loading training trees..."); IList <Tree> trainTrees = new List <Tree>(19000); IIndex <string> wordIndex = new HashIndex <string>(); IIndex <string> tagIndex = new HashIndex <string>(); foreach (Tree tree in trainTreebank) { foreach (Tree subTree in tree) { if (!subTree.IsLeaf()) { tlpp.TransformTree(subTree, tree); } } trainTrees.Add(tree); } System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count); // Setup and train the lexicon. System.Console.Out.Write("Collecting sufficient statistics for lexicon..."); Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex); lexicon.InitializeTraining(trainTrees.Count); lexicon.Train(trainTrees, null); lexicon.FinishTraining(); System.Console.Out.WriteLine("Done!"); trainTrees = null; // Load the tuning set System.Console.Out.Write("Loading tuning set..."); IList <FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp); System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count); // Print the probabilities that we obtain // TODO(spenceg): Implement tagging accuracy with FactLex int nCorrect = 0; ICounter <string> errors = new ClassicCounter <string>(); foreach (FactoredLexiconEvent @event in tuningSet) { IEnumerator <IntTaggedWord> itr = lexicon.RuleIteratorByWord(@event.Word(), @event.GetLoc(), @event.FeatureStr()); ICounter <int> logScores = new ClassicCounter <int>(); bool noRules = true; int goldTagId = -1; while (itr.MoveNext()) { noRules = false; IntTaggedWord iTW = itr.Current; if (iTW.Tag() == @event.TagId()) { log.Info("GOLD-"); goldTagId = iTW.Tag(); } float tagScore = lexicon.Score(iTW, @event.GetLoc(), @event.Word(), @event.FeatureStr()); logScores.IncrementCount(iTW.Tag(), tagScore); } if (noRules) { System.Console.Error.Printf("NO TAGGINGS: %s %s%n", @event.Word(), @event.FeatureStr()); } else { // Score the tagging int hypTagId = Counters.Argmax(logScores); if (hypTagId == goldTagId) { ++nCorrect; } else { string goldTag = goldTagId < 0 ? "UNSEEN" : lexicon.tagIndex.Get(goldTagId); errors.IncrementCount(goldTag); } } log.Info(); } // Output accuracy double acc = (double)nCorrect / (double)tuningSet.Count; System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0); log.Info("% of errors by type:"); IList <string> biggestKeys = new List <string>(errors.KeySet()); biggestKeys.Sort(Counters.ToComparator(errors, false, true)); Counters.Normalize(errors); foreach (string key in biggestKeys) { System.Console.Error.Printf("%s\t%.2f%n", key, errors.GetCount(key) * 100.0); } }
/// <summary>Run the Evalb scoring metric on guess/gold input.</summary> /// <remarks>Run the Evalb scoring metric on guess/gold input. The default language is English.</remarks> /// <param name="args"/> public static void Main(string[] args) { if (args.Length < minArgs) { log.Info(Usage()); System.Environment.Exit(-1); } Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs()); Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language)); ITreebankLangParserParams tlpp = language.@params; int maxGoldYield = PropertiesUtils.GetInt(options, "y", int.MaxValue); bool Verbose = PropertiesUtils.GetBool(options, "v", false); bool sortByF1 = PropertiesUtils.HasProperty(options, "s"); int worstKTreesToEmit = PropertiesUtils.GetInt(options, "s", 0); PriorityQueue <Triple <double, Tree, Tree> > queue = sortByF1 ? new PriorityQueue <Triple <double, Tree, Tree> >(2000, new Evalb.F1Comparator()) : null; bool doCatLevel = PropertiesUtils.GetBool(options, "c", false); string labelRegex = options.GetProperty("f", null); string encoding = options.GetProperty("e", "UTF-8"); string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+"); if (parsedArgs.Length != minArgs) { log.Info(Usage()); System.Environment.Exit(-1); } string goldFile = parsedArgs[0]; string guessFile = parsedArgs[1]; // Command-line has been parsed. Configure the metric for evaluation. tlpp.SetInputEncoding(encoding); PrintWriter pwOut = tlpp.Pw(); Treebank guessTreebank = tlpp.DiskTreebank(); guessTreebank.LoadPath(guessFile); pwOut.Println("GUESS TREEBANK:"); pwOut.Println(guessTreebank.TextualSummary()); Treebank goldTreebank = tlpp.DiskTreebank(); goldTreebank.LoadPath(goldFile); pwOut.Println("GOLD TREEBANK:"); pwOut.Println(goldTreebank.TextualSummary()); Evalb metric = new Evalb("Evalb LP/LR", true); EvalbByCat evalbCat = (doCatLevel) ? new EvalbByCat("EvalbByCat LP/LR", true, labelRegex) : null; ITreeTransformer tc = tlpp.Collinizer(); //The evalb ref implementation assigns status for each tree pair as follows: // // 0 - Ok (yields match) // 1 - length mismatch // 2 - null parse e.g. (()). // //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation. IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator(); IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator(); int goldLineId = 0; int guessLineId = 0; int skippedGuessTrees = 0; while (guessItr.MoveNext() && goldItr.MoveNext()) { Tree guessTree = guessItr.Current; IList <ILabel> guessYield = guessTree.Yield(); guessLineId++; Tree goldTree = goldItr.Current; IList <ILabel> goldYield = goldTree.Yield(); goldLineId++; // Check that we should evaluate this tree if (goldYield.Count > maxGoldYield) { skippedGuessTrees++; continue; } // Only trees with equal yields can be evaluated if (goldYield.Count != guessYield.Count) { pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId); skippedGuessTrees++; continue; } Tree evalGuess = tc.TransformTree(guessTree); Tree evalGold = tc.TransformTree(goldTree); metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null)); if (doCatLevel) { evalbCat.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null)); } if (sortByF1) { StoreTrees(queue, guessTree, goldTree, metric.GetLastF1()); } } if (guessItr.MoveNext() || goldItr.MoveNext()) { System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId); } pwOut.Println("================================================================================"); if (skippedGuessTrees != 0) { pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees); } metric.Display(true, pwOut); pwOut.Println(); if (doCatLevel) { evalbCat.Display(true, pwOut); pwOut.Println(); } if (sortByF1) { EmitSortedTrees(queue, worstKTreesToEmit, guessFile); } pwOut.Close(); }
public static void Main(string[] args) { if (args.Length < minArgs) { System.Console.Out.WriteLine(usage); System.Environment.Exit(-1); } // Process command-line options Properties options = StringUtils.ArgsToProperties(args, optionArgDefinitions); string fileName = options.GetProperty(string.Empty); if (fileName == null || fileName.Equals(string.Empty)) { System.Console.Out.WriteLine(usage); System.Environment.Exit(-1); } Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language)); ITreebankLangParserParams tlpp = language.@params; string encoding = options.GetProperty("e", "UTF-8"); tlpp.SetInputEncoding(encoding); tlpp.SetOutputEncoding(encoding); DiskTreebank tb = tlpp.DiskTreebank(); tb.LoadPath(fileName); // Statistics ICounter <string> binaryRuleTypes = new ClassicCounter <string>(20000); IList <int> branchingFactors = new List <int>(20000); int nTrees = 0; int nUnaryRules = 0; int nBinaryRules = 0; int binaryBranchingFactors = 0; // Read the treebank PrintWriter pw = tlpp.Pw(); foreach (Tree tree in tb) { if (tree.Value().Equals("ROOT")) { tree = tree.FirstChild(); } ++nTrees; foreach (Tree subTree in tree) { if (subTree.IsPhrasal()) { if (subTree.NumChildren() > 1) { ++nBinaryRules; branchingFactors.Add(subTree.NumChildren()); binaryBranchingFactors += subTree.NumChildren(); binaryRuleTypes.IncrementCount(TreeToRuleString(subTree)); } else { ++nUnaryRules; } } } } double mean = (double)binaryBranchingFactors / (double)nBinaryRules; System.Console.Out.Printf("#trees:\t%d%n", nTrees); System.Console.Out.Printf("#binary:\t%d%n", nBinaryRules); System.Console.Out.Printf("#binary types:\t%d%n", binaryRuleTypes.KeySet().Count); System.Console.Out.Printf("mean branching:\t%.4f%n", mean); System.Console.Out.Printf("stddev branching:\t%.4f%n", StandardDeviation(branchingFactors, mean)); System.Console.Out.Printf("rule entropy:\t%.5f%n", Counters.Entropy(binaryRuleTypes)); System.Console.Out.Printf("#unaries:\t%d%n", nUnaryRules); }
/// <summary>Execute with no arguments for usage.</summary> public static void Main(string[] args) { if (!ValidateCommandLine(args)) { log.Info(Usage); System.Environment.Exit(-1); } ITreebankLangParserParams tlpp = Language.@params; PrintWriter pwOut = tlpp.Pw(); Treebank guessTreebank = tlpp.DiskTreebank(); guessTreebank.LoadPath(guessFile); pwOut.Println("GUESS TREEBANK:"); pwOut.Println(guessTreebank.TextualSummary()); Treebank goldTreebank = tlpp.DiskTreebank(); goldTreebank.LoadPath(goldFile); pwOut.Println("GOLD TREEBANK:"); pwOut.Println(goldTreebank.TextualSummary()); Edu.Stanford.Nlp.Parser.Metrics.LeafAncestorEval metric = new Edu.Stanford.Nlp.Parser.Metrics.LeafAncestorEval("LeafAncestor"); ITreeTransformer tc = tlpp.Collinizer(); //The evalb ref implementation assigns status for each tree pair as follows: // // 0 - Ok (yields match) // 1 - length mismatch // 2 - null parse e.g. (()). // //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation. IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator(); IEnumerator <Tree> guessItr = guessTreebank.GetEnumerator(); int goldLineId = 0; int guessLineId = 0; int skippedGuessTrees = 0; while (guessItr.MoveNext() && goldItr.MoveNext()) { Tree guessTree = guessItr.Current; IList <ILabel> guessYield = guessTree.Yield(); guessLineId++; Tree goldTree = goldItr.Current; IList <ILabel> goldYield = goldTree.Yield(); goldLineId++; // Check that we should evaluate this tree if (goldYield.Count > MaxGoldYield) { skippedGuessTrees++; continue; } // Only trees with equal yields can be evaluated if (goldYield.Count != guessYield.Count) { pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId); skippedGuessTrees++; continue; } Tree evalGuess = tc.TransformTree(guessTree); Tree evalGold = tc.TransformTree(goldTree); metric.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null)); } if (guessItr.MoveNext() || goldItr.MoveNext()) { System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId); } pwOut.Println("================================================================================"); if (skippedGuessTrees != 0) { pwOut.Printf("%s %d guess trees%n", "Unable to evaluate", skippedGuessTrees); } metric.Display(true, pwOut); pwOut.Close(); }
/// <param name="args"/> public static void Main(string[] args) { if (args.Length < MinArgs) { log.Info(Usage()); System.Environment.Exit(-1); } Properties options = StringUtils.ArgsToProperties(args, OptionArgDefs()); bool Verbose = PropertiesUtils.GetBool(options, "v", false); Language Language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language)); int MaxGoldYield = PropertiesUtils.GetInt(options, "g", int.MaxValue); int MaxGuessYield = PropertiesUtils.GetInt(options, "y", int.MaxValue); string[] parsedArgs = options.GetProperty(string.Empty, string.Empty).Split("\\s+"); if (parsedArgs.Length != MinArgs) { log.Info(Usage()); System.Environment.Exit(-1); } File goldFile = new File(parsedArgs[0]); File guessFile = new File(parsedArgs[1]); ITreebankLangParserParams tlpp = Language.@params; PrintWriter pwOut = tlpp.Pw(); Treebank guessTreebank = tlpp.DiskTreebank(); guessTreebank.LoadPath(guessFile); pwOut.Println("GUESS TREEBANK:"); pwOut.Println(guessTreebank.TextualSummary()); Treebank goldTreebank = tlpp.DiskTreebank(); goldTreebank.LoadPath(goldFile); pwOut.Println("GOLD TREEBANK:"); pwOut.Println(goldTreebank.TextualSummary()); Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval depEval = new Edu.Stanford.Nlp.Parser.Metrics.CollinsDepEval("CollinsDep", true, tlpp.HeadFinder(), tlpp.TreebankLanguagePack().StartSymbol()); ITreeTransformer tc = tlpp.Collinizer(); //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees //don't match, we need to keep looking for the next gold tree that matches. //The evalb ref implementation differs slightly as it expects one tree per line. It assigns //status as follows: // // 0 - Ok (yields match) // 1 - length mismatch // 2 - null parse e.g. (()). // //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation. IEnumerator <Tree> goldItr = goldTreebank.GetEnumerator(); int goldLineId = 0; int skippedGuessTrees = 0; foreach (Tree guess in guessTreebank) { Tree evalGuess = tc.TransformTree(guess); if (guess.Yield().Count > MaxGuessYield) { skippedGuessTrees++; continue; } bool doneEval = false; while (goldItr.MoveNext() && !doneEval) { Tree gold = goldItr.Current; Tree evalGold = tc.TransformTree(gold); goldLineId++; if (gold.Yield().Count > MaxGoldYield) { continue; } else { if (evalGold.Yield().Count != evalGuess.Yield().Count) { pwOut.Println("Yield mismatch at gold line " + goldLineId); skippedGuessTrees++; break; } } //Default evalb behavior -- skip this guess tree depEval.Evaluate(evalGuess, evalGold, ((Verbose) ? pwOut : null)); doneEval = true; } } //Move to the next guess parse pwOut.Println("================================================================================"); if (skippedGuessTrees != 0) { pwOut.Printf("%s %d guess trees\n", ((MaxGuessYield < int.MaxValue) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees); } depEval.Display(true, pwOut); pwOut.Close(); }
public static void Main(string[] args) { if (args.Length < minArgs) { System.Console.Out.WriteLine(usage); System.Environment.Exit(-1); } // Process command-line options Properties options = StringUtils.ArgsToProperties(args, optionArgDefinitions); string fileName = options.GetProperty(string.Empty); if (fileName == null || fileName.Equals(string.Empty)) { System.Console.Out.WriteLine(usage); System.Environment.Exit(-1); } int maxLen = PropertiesUtils.GetInt(options, "y", int.MaxValue); bool printTrees = PropertiesUtils.GetBool(options, "p", false); bool flattenTrees = PropertiesUtils.GetBool(options, "f", false); bool printPOS = PropertiesUtils.GetBool(options, "a", false); bool printTnT = PropertiesUtils.GetBool(options, "t", false); Language language = PropertiesUtils.Get(options, "l", Language.English, typeof(Language)); ITreebankLangParserParams tlpp = language.@params; string encoding = options.GetProperty("e", "UTF-8"); tlpp.SetInputEncoding(encoding); tlpp.SetOutputEncoding(encoding); DiskTreebank tb = tlpp.DiskTreebank(); tb.LoadPath(fileName); // Read the treebank PrintWriter pw = tlpp.Pw(); int numTrees = 0; foreach (Tree tree in tb) { if (tree.Yield().Count > maxLen) { continue; } ++numTrees; if (printTrees) { pw.Println(tree.ToString()); } else { if (flattenTrees) { pw.Println(SentenceUtils.ListToString(tree.Yield())); } else { if (printPOS) { pw.Println(SentenceUtils.ListToString(tree.PreTerminalYield())); } else { if (printTnT) { IList <CoreLabel> yield = tree.TaggedLabeledYield(); foreach (CoreLabel label in yield) { pw.Printf("%s\t%s%n", label.Word(), label.Tag()); } pw.Println(); } } } } } System.Console.Error.Printf("Read %d trees.%n", numTrees); }