/// <summary>Iterates over a treebank and accumulates per-tree statistics.</summary>
/// <remarks>
/// If <c>makeVocab</c> is set, <c>trainVocab</c> is replaced with a fresh set before reading and
/// <c>DissectTree</c> is asked to add leaf values to it. Progress is written to standard output:
/// a dot whenever the running tree count is a multiple of 100, otherwise a line break whenever
/// it is a multiple of 8001.
/// </remarks>
/// <param name="tb">Treebank whose trees are read.</param>
/// <param name="name">Name given to the returned statistics object.</param>
/// <returns>The statistics object after <c>ComputeFinalValues()</c> has been called on it.</returns>
private TreebankStats.ObservedCorpusStats GatherStats(DiskTreebank tb, string name)
{
    TreebankStats.ObservedCorpusStats corpusStats = new TreebankStats.ObservedCorpusStats(name);

    if (makeVocab)
    {
        trainVocab = Generics.NewHashSet();
    }

    System.Console.Out.WriteLine("Reading treebank:");

    foreach (Tree tree in tb)
    {
        Pair<int, int> depthAndBreadth = DissectTree(tree, corpusStats, makeVocab);
        int yieldLength = tree.Yield().Count;
        corpusStats.AddStatsForTree(yieldLength, depthAndBreadth.First(), depthAndBreadth.Second());

        // Progress indicator: the dot takes precedence over the line break.
        bool printDot = corpusStats.numTrees % 100 == 0;
        if (printDot)
        {
            System.Console.Out.Write(".");
        }
        else if (corpusStats.numTrees % 8001 == 0)
        {
            System.Console.Out.WriteLine();
        }
    }

    corpusStats.ComputeFinalValues();
    System.Console.Out.WriteLine("done!");
    return corpusStats;
}
/// <summary>Prints a banner containing the corpus name, then the statistics themselves.</summary>
/// <param name="corpStats">Statistics to print; its <c>GetName()</c> supplies the banner title.</param>
/// <param name="displayWords">Forwarded unchanged to <c>corpStats.Display</c>.</param>
/// <param name="displayOOV">Forwarded unchanged to <c>corpStats.Display</c>.</param>
private static void Display(TreebankStats.ObservedCorpusStats corpStats, bool displayWords, bool displayOOV)
{
    const string bannerRule = "####################################################################";

    System.Console.Out.WriteLine(bannerRule);
    System.Console.Out.WriteLine("## " + corpStats.GetName());
    System.Console.Out.WriteLine(bannerRule);
    System.Console.Out.WriteLine();

    corpStats.Display(displayWords, displayOOV);
}
/// <summary>Returns pair of (depth,breadth) of tree.</summary>
/// <remarks>
/// Does a breadth-first search using a FIFO queue of (depth, node) pairs. Because nodes are
/// dequeued level by level, the depth only ever increases, so the last depth seen is the
/// maximum depth. When the first node of a new level is dequeued, the queue holds exactly
/// the remaining nodes of that level, so <c>Count + 1</c> is the width of the level.
/// While walking, every node is also recorded in <paramref name="ocs"/>: phrasal nodes via
/// <c>AddPhrasalBranch</c>, pre-terminals in <c>posTags</c>, and leaves in <c>words</c>.
/// </remarks>
/// <param name="t">Root of the tree to traverse; must not be null.</param>
/// <param name="ocs">Statistics object that receives the per-node counts.</param>
/// <param name="addToVocab">When true, each leaf value is also added to <c>trainVocab</c>.</param>
/// <returns>A pair whose first element is the depth and whose second element is the breadth.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="t"/> is null.</exception>
private static Pair<int, int> DissectTree(Tree t, TreebankStats.ObservedCorpusStats ocs, bool addToVocab)
{
    // Validate before anything is queued.
    if (t == null)
    {
        throw new ArgumentNullException("t", "Null tree passed to dissectTree()");
    }

    // A queue (not a stack) is required: the depth/breadth bookkeeping below is only
    // valid when nodes are visited in level order.
    Queue<Pair<int, Tree>> pending = new Queue<Pair<int, Tree>>();
    pending.Enqueue(new Pair<int, Tree>(0, t));

    int maxBreadth = 0;
    int maxDepth = -1;

    while (pending.Count > 0)
    {
        Pair<int, Tree> depthNode = pending.Dequeue();
        int nodeDepth = depthNode.First();
        Tree node = depthNode.Second();

        if (nodeDepth != maxDepth)
        {
            // First node of a new level.
            maxDepth = nodeDepth;

            // Breadth is only sampled when the first node of the level is phrasal.
            int levelWidth = pending.Count + 1;
            if (node.IsPhrasal() && levelWidth > maxBreadth)
            {
                maxBreadth = levelWidth;
            }
        }

        if (node.IsPhrasal())
        {
            ocs.AddPhrasalBranch(node.Value(), node.Children().Length);
        }
        else if (node.IsPreTerminal())
        {
            ocs.posTags.IncrementCount(node.Value());
        }
        else if (node.IsLeaf())
        {
            ocs.words.IncrementCount(node.Value());
            if (addToVocab)
            {
                trainVocab.Add(node.Value());
            }
        }

        foreach (Tree kid in node.Children())
        {
            pending.Enqueue(new Pair<int, Tree>(nodeDepth + 1, kid));
        }
    }

    return new Pair<int, int>(maxDepth, maxBreadth);
}
/// <summary>Merges several per-corpus statistics objects into one.</summary>
/// <remarks>
/// An empty list yields null and a single-element list yields that element itself.
/// Otherwise a new statistics object named "CORPUS" is created; tree counts and the
/// squared sums are added up, the breadth/depth/length collections and the counters are
/// merged with their respective add-all operations, the min/max fields are widened, and
/// <c>ComputeFinalValues()</c> is called on the result.
/// </remarks>
/// <param name="allStats">Statistics objects to combine.</param>
/// <returns>The combined statistics, the sole input element, or null for an empty list.</returns>
private static TreebankStats.ObservedCorpusStats AggregateStats(IList<TreebankStats.ObservedCorpusStats> allStats)
{
    int inputCount = allStats.Count;
    if (inputCount == 0)
    {
        return null;
    }
    if (inputCount == 1)
    {
        return allStats[0];
    }

    TreebankStats.ObservedCorpusStats merged = new TreebankStats.ObservedCorpusStats("CORPUS");

    for (int i = 0; i < inputCount; i++)
    {
        TreebankStats.ObservedCorpusStats part = allStats[i];

        merged.numTrees += part.numTrees;

        merged.breadth2 += part.breadth2;
        Sharpen.Collections.AddAll(merged.breadths, part.breadths);

        merged.depth2 += part.depth2;
        Sharpen.Collections.AddAll(merged.depths, part.depths);

        merged.length2 += part.length2;
        Sharpen.Collections.AddAll(merged.lengths, part.lengths);

        // Widen the running extremes.
        if (merged.minLength > part.minLength)
        {
            merged.minLength = part.minLength;
        }
        if (merged.maxLength < part.maxLength)
        {
            merged.maxLength = part.maxLength;
        }
        if (merged.minBreadth > part.minBreadth)
        {
            merged.minBreadth = part.minBreadth;
        }
        if (merged.maxBreadth < part.maxBreadth)
        {
            merged.maxBreadth = part.maxBreadth;
        }
        if (merged.minDepth > part.minDepth)
        {
            merged.minDepth = part.minDepth;
        }
        if (merged.maxDepth < part.maxDepth)
        {
            merged.maxDepth = part.maxDepth;
        }

        merged.words.AddAll(part.words);
        merged.posTags.AddAll(part.posTags);
        merged.phrasalBranching2.AddAll(part.phrasalBranching2);
        merged.phrasalBranchingNum2.AddAll(part.phrasalBranchingNum2);
    }

    merged.ComputeFinalValues();
    return merged;
}
/// <summary>Loads the configured treebank paths, gathers statistics and prints them.</summary>
/// <remarks>
/// Three modes are supported:
/// <list type="bullet">
/// <item><description>
/// <c>useSplit</c> set: one treebank per entry of <c>splitFileLists</c>, filtered by that
/// split's file list. Only the first split processed has <c>makeVocab</c> set. The
/// aggregate over all splits is displayed first, followed by each split.
/// </description></item>
/// <item><description>
/// <paramref name="pathsAreFiles"/> true: each path is loaded and displayed on its own;
/// only the first path has <c>makeVocab</c> set.
/// </description></item>
/// <item><description>
/// Otherwise: <c>trainVocab</c> is reset, all paths are loaded into one treebank and a
/// single set of statistics is displayed. <c>makeVocab</c> is not modified in this mode.
/// </description></item>
/// </list>
/// </remarks>
/// <param name="pathsAreFiles">Treat every path as a separate corpus (ignored when splits are used).</param>
/// <param name="displayWords">Forwarded to <c>Display</c>.</param>
/// <param name="displayOOV">Forwarded to <c>Display</c>.</param>
public virtual void Run(bool pathsAreFiles, bool displayWords, bool displayOOV)
{
    if (useSplit)
    {
        IList<TreebankStats.ObservedCorpusStats> allSplitStats = new List<TreebankStats.ObservedCorpusStats>();

        // Only the first split processed contributes to the vocabulary.
        makeVocab = true;
        foreach (KeyValuePair<TreebankStats.Split, ICollection<string>> split in splitFileLists)
        {
            DiskTreebank tb = tlpp.DiskTreebank();
            IFileFilter splitFilter = new TreebankStats.SplitFilter(split.Value);
            foreach (string path in pathNames)
            {
                tb.LoadPath(path, splitFilter);
            }
            TreebankStats.ObservedCorpusStats splitStats = GatherStats(tb, languageName.ToString() + "." + split.Key.ToString());
            allSplitStats.Add(splitStats);
            makeVocab = false;
        }

        // AggregateStats returns null when there were no splits; Display would
        // dereference it, so skip the aggregate banner in that case.
        TreebankStats.ObservedCorpusStats aggregate = AggregateStats(allSplitStats);
        if (aggregate != null)
        {
            Display(aggregate, displayWords, displayOOV);
        }

        foreach (TreebankStats.ObservedCorpusStats ocs in allSplitStats)
        {
            Display(ocs, displayWords, displayOOV);
        }
    }
    else if (pathsAreFiles)
    {
        // Only the first path processed contributes to the vocabulary.
        makeVocab = true;
        foreach (string path in pathNames)
        {
            DiskTreebank tb = tlpp.DiskTreebank();
            tb.LoadPath(path, null);
            TreebankStats.ObservedCorpusStats stats = GatherStats(tb, languageName.ToString() + " " + path);
            Display(stats, displayWords, displayOOV);
            makeVocab = false;
        }
    }
    else
    {
        trainVocab = Generics.NewHashSet();
        DiskTreebank tb = tlpp.DiskTreebank();
        foreach (string path in pathNames)
        {
            tb.LoadPath(path, null);
        }
        TreebankStats.ObservedCorpusStats allStats = GatherStats(tb, languageName.ToString());
        Display(allStats, displayWords, displayOOV);
    }
}