// just static main
/// <summary>
/// Small demonstration: builds a tree from a bracketed string, derives its
/// uncollapsed dependency graph, logs the graph, and then logs every match of
/// the Semgrex pattern <c>{}=A &lt;&lt;nsubj {}=B</c> found in that graph.
/// </summary>
/// <param name="args">Command-line arguments; this method does not read them.</param>
public static void Main(string[] args)
{
    // Typically the tree is constructed by parsing or reading a
    // treebank. A literal bracketed string is used here purely as an example.
    Tree parse = Tree.ValueOf("(ROOT (S (NP (PRP$ My) (NN dog)) (ADVP (RB also)) (VP (VBZ likes) (S (VP (VBG eating) (NP (NN sausage))))) (. .)))");

    // This creates English uncollapsed dependencies as a SemanticGraph.
    // If you are creating many SemanticGraphs, you should use a
    // GrammaticalStructureFactory and use it to generate the intermediate
    // GrammaticalStructure instead.
    SemanticGraph dependencies = SemanticGraphFactory.GenerateUncollapsedDependencies(parse);

    // Alternatively, this could have been the Chinese params or any
    // other language supported. As of 2014, only English and Chinese.
    ITreebankLangParserParams langParams = new EnglishTreebankParserParams();
    var punctuationFilter = langParams.TreebankLanguagePack().PunctuationWordRejectFilter();
    var headFinder = langParams.TypedDependencyHeadFinder();
    IGrammaticalStructureFactory structureFactory =
        langParams.TreebankLanguagePack().GrammaticalStructureFactory(punctuationFilter, headFinder);
    // The structure is built to show the factory route; it is not used below.
    GrammaticalStructure gs = structureFactory.NewGrammaticalStructure(parse);

    log.Info(dependencies);

    SemgrexPattern pattern = SemgrexPattern.Compile("{}=A <<nsubj {}=B");
    SemgrexMatcher hits = pattern.Matcher(dependencies);

    // This will produce two results on the given tree: "likes" is an
    // ancestor of both "dog" and "my" via the nsubj relation.
    while (hits.Find())
    {
        var ancestor = hits.GetNode("A");
        var descendant = hits.GetNode("B");
        log.Info(ancestor + " <<nsubj " + descendant);
    }
}
/// <summary>
/// Computes split categories for English from the treebank found under
/// <paramref name="treebankRoot"/>.
/// </summary>
/// <remarks>
/// The settings are hardwired: English parser parameters, files accepted by a
/// number-range filter of 200 through 2199, and a cutoff of 300. According to
/// the original note this corresponds to Penn Treebank sections 2-21 with the
/// default cutoff used in ACL03PCFG, and the method was added to help upgrade
/// code that previously relied on a pre-stored list because no treebank was
/// available.
/// </remarks>
/// <param name="treebankRoot">Path from which the treebank files are loaded.</param>
/// <returns>The collection produced by <c>GetSplitCategories</c> for the loaded trees.</returns>
public static ICollection<string> GetEnglishSplitCategories(string treebankRoot)
{
    const double Cutoff = 300.0;

    ITreebankLangParserParams englishParams = new EnglishTreebankParserParams();
    Treebank loadedTrees = englishParams.MemoryTreebank();

    // Only files whose number falls in the 200-2199 range are read.
    NumberRangeFileFilter sectionFilter = new NumberRangeFileFilter(200, 2199, true);
    loadedTrees.LoadPath(treebankRoot, sectionFilter);

    var languagePack = englishParams.TreebankLanguagePack();
    return GetSplitCategories(loadedTrees, Cutoff, languagePack);
}
/// <summary>
/// Prints a frequency table of unknown-word signatures for a treebank.
/// </summary>
/// <remarks>
/// Options: <c>-l language</c> selects the language parameters and
/// <c>-e encoding</c> sets the input/output encoding. Every other argument is
/// loaded as a treebank path. The encoding is applied when the treebank is
/// created, i.e. at the first path argument, so <c>-e</c> must precede it to
/// take effect.
/// After more than half of the trees have been read, each word whose running
/// count is still below two contributes its unknown-word signature to the
/// table. The table is printed in descending count order.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    string encoding = "UTF-8";
    Language lang = Language.English;
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].StartsWith("-"))
        {
            // Both recognised options consume a value; refuse to read past the end.
            bool takesValue = args[i] == "-l" || args[i] == "-e";
            if (takesValue && i + 1 >= args.Length)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
                return;
            }
            switch (args[i])
            {
                case "-l":
                {
                    lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }

                case "-e":
                {
                    encoding = args[++i];
                    break;
                }

                default:
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        }
        else
        {
            if (tb == null)
            {
                if (tlpp == null)
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    return;
                }
                tlpp.SetInputEncoding(encoding);
                tlpp.SetOutputEncoding(encoding);
                tb = tlpp.DiskTreebank();
            }
            tb.LoadPath(args[i]);
        }
    }

    // Without at least one treebank path there is nothing to count.
    if (tb == null)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
        return;
    }

    PrintWriter pw = tlpp.Pw();
    Options op = new Options();
    Options.LexOptions lexOptions = op.lexOptions;
    if (lang == Language.French)
    {
        lexOptions.useUnknownWordSignatures = 1;
        lexOptions.smartMutation = false;
        lexOptions.unknownSuffixSize = 2;
        lexOptions.unknownPrefixSize = 1;
    }
    else if (lang == Language.Arabic)
    {
        lexOptions.smartMutation = false;
        lexOptions.useUnknownWordSignatures = 9;
        lexOptions.unknownPrefixSize = 1;
        lexOptions.unknownSuffixSize = 1;
    }

    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    ILexicon lex = tlpp.Lex(op, wordIndex, tagIndex);

    // Signatures are only collected once more than half of the trees have been seen.
    int computeAfter = (int)(0.50 * tb.Count);
    ICounter<string> vocab = new ClassicCounter<string>();
    ICounter<string> unkCounter = new ClassicCounter<string>();
    int treeId = 0;
    foreach (Tree t in tb)
    {
        IList<ILabel> yield = t.Yield();
        int posId = 0;
        foreach (ILabel word in yield)
        {
            vocab.IncrementCount(word.Value());
            if (treeId > computeAfter && vocab.GetCount(word.Value()) < 2.0)
            {
                unkCounter.IncrementCount(lex.GetUnknownWordModel().GetSignature(word.Value(), posId));
            }
            // Advance for every token so posId is the word's index in the yield.
            // Previously it only advanced for counted words.
            posId++;
        }
        treeId++;
    }

    IList<string> biggestKeys = new List<string>(unkCounter.KeySet());
    biggestKeys.Sort(Counters.ToComparatorDescending(unkCounter));
    foreach (string wordType in biggestKeys)
    {
        pw.Printf("%s\t%d%n", wordType, (int)unkCounter.GetCount(wordType));
    }
    // Close exactly once (the writer used to be closed twice).
    pw.Close();
}
/// <summary>Run the tagging scoring metric on guess/gold input.</summary>
/// <remarks>
/// This method performs "Collinization": both trees are passed through the
/// language's collinizer before evaluation. The default language is English.
/// Options: <c>-l language</c>, <c>-y maxGoldYield</c>, <c>-v</c> (verbose),
/// <c>-c</c> (category-level evaluation) and <c>-e encoding</c>. The two
/// non-option arguments are the gold file followed by the guess file.
/// Tree pairs whose gold yield exceeds the limit, or whose yields differ in
/// length, are skipped and counted as not evaluated.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = int.MaxValue;
    bool verbose = false;
    string encoding = "UTF-8";

    IDictionary<string, string[]> argsMap = StringUtils.ArgsToMap(args, optionArgDefs);
    foreach (KeyValuePair<string, string[]> opt in argsMap)
    {
        if (opt.Key == null)
        {
            // Non-option arguments are handled after the loop.
            continue;
        }
        if (opt.Key.Equals("-l"))
        {
            Language lang = Language.ValueOf(opt.Value[0].Trim());
            tlpp = lang.@params;
        }
        else if (opt.Key.Equals("-y"))
        {
            maxGoldYield = System.Convert.ToInt32(opt.Value[0].Trim());
        }
        else if (opt.Key.Equals("-v"))
        {
            verbose = true;
        }
        else if (opt.Key.Equals("-c"))
        {
            Edu.Stanford.Nlp.Parser.Metrics.TaggingEval.doCatLevelEval = true;
        }
        else if (opt.Key.Equals("-e"))
        {
            encoding = opt.Value[0];
        }
        else
        {
            log.Info(usage.ToString());
            System.Environment.Exit(-1);
            return;
        }
    }

    // Non-option arguments located at key null. This lookup used to sit inside
    // the loop above, behind the "continue" for the null key, so it never ran
    // when no options were supplied and both file names stayed null.
    string[] rest = argsMap[null];
    if (rest == null || rest.Length < minArgs || rest.Length < 2)
    {
        log.Info(usage.ToString());
        System.Environment.Exit(-1);
        return;
    }
    string goldFile = rest[0];
    string guessFile = rest[1];

    tlpp.SetInputEncoding(encoding);
    PrintWriter pwOut = tlpp.Pw();

    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());

    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());

    Edu.Stanford.Nlp.Parser.Metrics.TaggingEval metric = new Edu.Stanford.Nlp.Parser.Metrics.TaggingEval("Tagging LP/LR");
    ITreeTransformer tc = tlpp.Collinizer();

    //The evalb ref implementation assigns status for each tree pair as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    IEnumerator<Tree> guessItr = guessTreebank.GetEnumerator();
    int goldLineId = 0;
    int guessLineId = 0;
    int skippedGuessTrees = 0;

    // Advance both enumerators together and remember the outcome. MoveNext()
    // consumes an element, so calling it again after the loop (as the previous
    // code did) cannot tell whether one side still had a tree left.
    bool guessHasTree = false;
    bool goldHasTree = false;
    while (true)
    {
        guessHasTree = guessItr.MoveNext();
        goldHasTree = goldItr.MoveNext();
        if (!guessHasTree || !goldHasTree)
        {
            break;
        }

        Tree guessTree = guessItr.Current;
        IList<ILabel> guessYield = guessTree.Yield();
        guessLineId++;
        Tree goldTree = goldItr.Current;
        IList<ILabel> goldYield = goldTree.Yield();
        goldLineId++;

        // Check that we should evaluate this tree
        if (goldYield.Count > maxGoldYield)
        {
            skippedGuessTrees++;
            continue;
        }

        // Only trees with equal yields can be evaluated
        if (goldYield.Count != guessYield.Count)
        {
            pwOut.Printf("Yield mismatch gold: %d tokens vs. guess: %d tokens (lines: gold %d guess %d)%n", goldYield.Count, guessYield.Count, goldLineId, guessLineId);
            skippedGuessTrees++;
            continue;
        }

        Tree evalGuess = tc.TransformTree(guessTree);
        Tree evalGold = tc.TransformTree(goldTree);
        metric.Evaluate(evalGuess, evalGold, ((verbose) ? pwOut : null));
    }

    // Exactly one side still having a tree means the files differ in length.
    if (guessHasTree || goldHasTree)
    {
        System.Console.Error.Printf("Guess/gold files do not have equal lengths (guess: %d gold: %d)%n.", guessLineId, goldLineId);
    }

    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees\n", "Unable to evaluate", skippedGuessTrees);
    }
    metric.Display(true, pwOut);
    pwOut.Println();
    pwOut.Close();
}
/// <summary>
/// Prints a frequency table of the child-label sequences (right-hand sides)
/// found under every node matching a given root label in a treebank.
/// </summary>
/// <remarks>
/// Options: <c>-l language</c> and <c>-e encoding</c>. The positional
/// arguments come as a pair: the root label (compiled into the Tregex pattern
/// <c>@label</c>) followed by the treebank path. The encoding is applied when
/// the treebank is created, so <c>-e</c> must precede the positional pair to
/// take effect. Output is sorted by descending count.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    DiskTreebank tb = null;
    string encoding = "UTF-8";
    TregexPattern rootMatch = null;
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].StartsWith("-"))
        {
            // Both recognised options consume a value; refuse to read past the end.
            bool takesValue = args[i] == "-l" || args[i] == "-e";
            if (takesValue && i + 1 >= args.Length)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
                return;
            }
            switch (args[i])
            {
                case "-l":
                {
                    Language lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }

                case "-e":
                {
                    encoding = args[++i];
                    break;
                }

                default:
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        }
        else
        {
            // A root label must be followed by a treebank path.
            if (i + 1 >= args.Length)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
                return;
            }
            rootMatch = TregexPattern.Compile("@" + args[i++]);
            if (tb == null)
            {
                if (tlpp == null)
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    return;
                }
                tlpp.SetInputEncoding(encoding);
                tlpp.SetOutputEncoding(encoding);
                tb = tlpp.DiskTreebank();
            }
            // Do not post-increment here: the for loop already advances past
            // the path, and a second increment skipped the following argument.
            tb.LoadPath(args[i]);
        }
    }

    // Both the pattern and the treebank come from the positional pair.
    if (tb == null || rootMatch == null)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
        return;
    }

    ICounter<string> rhsCounter = new ClassicCounter<string>();
    foreach (Tree t in tb)
    {
        TregexMatcher m = rootMatch.Matcher(t);
        while (m.FindNextMatchingNode())
        {
            Tree match = m.GetMatch();
            StringBuilder sb = new StringBuilder();
            foreach (Tree kid in match.Children())
            {
                sb.Append(kid.Value()).Append(" ");
            }
            // Trim removes the trailing separator left by the loop above.
            rhsCounter.IncrementCount(sb.ToString().Trim());
        }
    }

    IList<string> biggestKeys = new List<string>(rhsCounter.KeySet());
    biggestKeys.Sort(Counters.ToComparatorDescending(rhsCounter));
    PrintWriter pw = tlpp.Pw();
    foreach (string rhs in biggestKeys)
    {
        pw.Printf("%s\t%d%n", rhs, (int)rhsCounter.GetCount(rhs));
    }
    pw.Close();
}
/// <summary>
/// Prints, in descending count order, the word forms that carry a given tag
/// in a treebank.
/// </summary>
/// <remarks>
/// Options: <c>-l language</c> and <c>-e encoding</c>. A non-option argument
/// is read as the tag to look for, and the argument after it is loaded as a
/// treebank path.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }

    ITreebankLangParserParams langParams = new EnglishTreebankParserParams();
    DiskTreebank treebank = null;
    string charset = "UTF-8";
    string targetTag = null;

    for (int i = 0; i < args.Length; i++)
    {
        string current = args[i];

        if (!current.StartsWith("-"))
        {
            // Positional pair: the tag, then the treebank path.
            targetTag = current;
            i++;
            if (treebank == null)
            {
                if (langParams == null)
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                }
                else
                {
                    langParams.SetInputEncoding(charset);
                    langParams.SetOutputEncoding(charset);
                    treebank = langParams.DiskTreebank();
                }
            }
            treebank.LoadPath(args[i]);
            continue;
        }

        if (current == "-l")
        {
            Language chosen = Language.ValueOf(args[++i].Trim());
            langParams = chosen.@params;
        }
        else if (current == "-e")
        {
            charset = args[++i];
        }
        else
        {
            System.Console.Out.WriteLine(usage.ToString());
            System.Environment.Exit(-1);
        }
    }

    // Tally every word form whose tag equals the requested one.
    ICounter<string> formCounts = new ClassicCounter<string>();
    foreach (Tree tree in treebank)
    {
        IList<CoreLabel> tokens = tree.TaggedLabeledYield();
        foreach (CoreLabel token in tokens)
        {
            if (!token.Tag().Equals(targetTag))
            {
                continue;
            }
            formCounts.IncrementCount(token.Word());
        }
    }

    IList<string> ranked = new List<string>(formCounts.KeySet());
    ranked.Sort(Counters.ToComparatorDescending(formCounts));

    PrintWriter output = langParams.Pw();
    foreach (string form in ranked)
    {
        int count = (int)formCounts.GetCount(form);
        output.Printf("%s\t%d%n", form, count);
    }
    output.Close();
}
/// <summary>Run the Tsarfaty scoring metric on guess/gold input.</summary>
/// <remarks>
/// This method performs "Collinization": both trees are passed through the
/// language's collinizer before evaluation. The default language is English.
/// Options: <c>-l language</c>, <c>-y maxGoldYield</c>, <c>-g maxGuessYield</c>,
/// <c>-t</c> (tag mode) and <c>-v</c> (verbose). The first non-option argument
/// is the gold file and the one after it is the guess file; argument parsing
/// stops there.
/// </remarks>
/// <param name="args">Command-line arguments as described above.</param>
public static void Main(string[] args)
{
    if (args.Length < minArgs)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
    }
    ITreebankLangParserParams tlpp = new EnglishTreebankParserParams();
    int maxGoldYield = int.MaxValue;
    int maxGuessYield = int.MaxValue;
    bool verbose = false;
    bool skipGuess = false;
    bool tagMode = false;
    string guessFile = null;
    string goldFile = null;
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].StartsWith("-"))
        {
            // These options consume a value; refuse to read past the end.
            bool takesValue = args[i] == "-l" || args[i] == "-y" || args[i] == "-g";
            if (takesValue && i + 1 >= args.Length)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
                return;
            }
            switch (args[i])
            {
                case "-l":
                {
                    Language lang = Language.ValueOf(args[++i].Trim());
                    tlpp = lang.@params;
                    break;
                }

                case "-y":
                {
                    maxGoldYield = System.Convert.ToInt32(args[++i].Trim());
                    break;
                }

                case "-t":
                {
                    tagMode = true;
                    break;
                }

                case "-v":
                {
                    verbose = true;
                    break;
                }

                case "-g":
                {
                    maxGuessYield = System.Convert.ToInt32(args[++i].Trim());
                    skipGuess = true;
                    break;
                }

                default:
                {
                    System.Console.Out.WriteLine(usage.ToString());
                    System.Environment.Exit(-1);
                    break;
                }
            }
        }
        else
        {
            //Required parameters: gold file, then guess file
            if (i + 1 >= args.Length)
            {
                System.Console.Out.WriteLine(usage.ToString());
                System.Environment.Exit(-1);
                return;
            }
            goldFile = args[i++];
            guessFile = args[i];
            break;
        }
    }

    // Both files are mandatory; loading a null path is never meaningful.
    if (goldFile == null || guessFile == null)
    {
        System.Console.Out.WriteLine(usage.ToString());
        System.Environment.Exit(-1);
        return;
    }

    PrintWriter pwOut = tlpp.Pw();

    Treebank guessTreebank = tlpp.DiskTreebank();
    guessTreebank.LoadPath(guessFile);
    pwOut.Println("GUESS TREEBANK:");
    pwOut.Println(guessTreebank.TextualSummary());

    Treebank goldTreebank = tlpp.DiskTreebank();
    goldTreebank.LoadPath(goldFile);
    pwOut.Println("GOLD TREEBANK:");
    pwOut.Println(goldTreebank.TextualSummary());

    string evalName = (tagMode) ? "TsarfatyTAG" : "TsarfatySEG";
    Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval eval = new Edu.Stanford.Nlp.Parser.Metrics.TsarfatyEval(evalName, tagMode);
    ITreeTransformer tc = tlpp.Collinizer();

    //PennTreeReader skips over null/malformed parses. So when the yields of the gold/guess trees
    //don't match, we need to keep looking for the next gold tree that matches.
    //The evalb ref implementation differs slightly as it expects one tree per line. It assigns
    //status as follows:
    //
    //   0 - Ok (yields match)
    //   1 - length mismatch
    //   2 - null parse e.g. (()).
    //
    //In the cases of 1,2, evalb does not include the tree pair in the LP/LR computation.
    IEnumerator<Tree> goldItr = goldTreebank.GetEnumerator();
    int goldLineId = 0;
    int skippedGuessTrees = 0;
    foreach (Tree guess in guessTreebank)
    {
        Tree evalGuess = tc.TransformTree(guess);
        List<ILabel> guessSent = guess.Yield();
        string guessChars = SentenceUtils.ListToString(guessSent).ReplaceAll("\\s+", string.Empty);
        if (guessSent.Count > maxGuessYield)
        {
            skippedGuessTrees++;
            continue;
        }

        bool doneEval = false;
        // Test doneEval BEFORE MoveNext(): MoveNext() consumes a gold tree, so
        // with the operands the other way round one gold tree was silently
        // discarded after every successful evaluation.
        while (!doneEval && goldItr.MoveNext())
        {
            Tree gold = goldItr.Current;
            Tree evalGold = tc.TransformTree(gold);
            goldLineId++;
            List<ILabel> goldSent = gold.Yield();
            string goldChars = SentenceUtils.ListToString(goldSent).ReplaceAll("\\s+", string.Empty);

            if (goldSent.Count > maxGoldYield)
            {
                // Gold tree too long: try the next gold tree for this guess.
                continue;
            }

            if (goldChars.Length != guessChars.Length)
            {
                pwOut.Printf("Char level yield mismatch at line %d (guess: %d gold: %d)\n", goldLineId, guessChars.Length, goldChars.Length);
                skippedGuessTrees++;
                //Default evalb behavior -- skip this guess tree
                break;
            }

            eval.Evaluate(evalGuess, evalGold, ((verbose) ? pwOut : null));
            doneEval = true;
        }
        //Move to the next guess parse
    }

    pwOut.Println("================================================================================");
    if (skippedGuessTrees != 0)
    {
        pwOut.Printf("%s %d guess trees\n", ((skipGuess) ? "Skipped" : "Unable to evaluate"), skippedGuessTrees);
    }
    eval.Display(true, pwOut);
    pwOut.Println();
    pwOut.Close();
}