/// <summary>Reads every tree in a treebank and accumulates corpus statistics for it.</summary>
/// <remarks>
/// When <c>makeVocab</c> is set, <c>trainVocab</c> is replaced with a fresh set before reading,
/// and the flag is forwarded to <c>DissectTree</c> for every tree. Progress is echoed to
/// standard output while reading.
/// </remarks>
/// <param name="tb">Treebank whose trees are enumerated.</param>
/// <param name="name">Name given to the returned statistics object.</param>
/// <returns>The populated statistics, after <c>ComputeFinalValues</c> has been called on them.</returns>
private TreebankStats.ObservedCorpusStats GatherStats(DiskTreebank tb, string name)
 {
     TreebankStats.ObservedCorpusStats corpus = new TreebankStats.ObservedCorpusStats(name);
     if (makeVocab)
     {
         trainVocab = Generics.NewHashSet();
     }

     System.Console.Out.WriteLine("Reading treebank:");
     foreach (Tree tree in tb)
     {
         Pair <int, int> shape = DissectTree(tree, corpus, makeVocab);
         corpus.AddStatsForTree(tree.Yield().Count, shape.First(), shape.Second());

         // Progress output keyed on corpus.numTrees (presumably advanced by AddStatsForTree):
         // a dot on every multiple of 100, otherwise a line break on multiples of 8001.
         if (corpus.numTrees % 100 != 0)
         {
             if (corpus.numTrees % 8001 == 0)
             {
                 System.Console.Out.WriteLine();
             }
             continue;
         }
         System.Console.Out.Write(".");
     }

     corpus.ComputeFinalValues();
     System.Console.Out.WriteLine("done!");
     return corpus;
 }
 /// <summary>Prints a banner carrying the corpus name, then the statistics themselves.</summary>
 /// <param name="corpStats">Statistics to print; its name appears in the banner.</param>
 /// <param name="displayWords">Forwarded unchanged to <c>corpStats.Display</c>.</param>
 /// <param name="displayOOV">Forwarded unchanged to <c>corpStats.Display</c>.</param>
 private static void Display(TreebankStats.ObservedCorpusStats corpStats, bool displayWords, bool displayOOV)
 {
     const string bannerRule = "####################################################################";

     System.Console.Out.WriteLine(bannerRule);
     System.Console.Out.WriteLine("## " + corpStats.GetName());
     System.Console.Out.WriteLine(bannerRule);
     System.Console.Out.WriteLine();

     corpStats.Display(displayWords, displayOOV);
 }
        /// <summary>Returns pair of (depth,breadth) of tree.</summary>
        /// <remarks>
        /// Does a breadth-first search using a FIFO queue of (depth, node) pairs. Because nodes are
        /// visited level by level, the depth of the last level reached is the maximum depth, and at
        /// the moment the first node of a level is dequeued the queue holds exactly the remaining
        /// nodes of that level, so <c>queue.Count + 1</c> is the size of the level. A level only
        /// contributes to the breadth when its first node is phrasal.
        /// While walking, per-node counts are recorded in <paramref name="ocs"/>.
        /// </remarks>
        /// <param name="t">Root of the tree to measure; must not be null.</param>
        /// <param name="ocs">Accumulator receiving phrasal branching, POS tag and word counts.</param>
        /// <param name="addToVocab">When true, the value of every leaf is also added to <c>trainVocab</c>.</param>
        /// <returns>A pair whose first element is the depth of the deepest level and whose second element is the breadth.</returns>
        /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="t"/> is null.</exception>
        private static Pair <int, int> DissectTree(Tree t, TreebankStats.ObservedCorpusStats ocs, bool addToVocab)
        {
            // Reject a null root before doing any work.
            if (t == null)
            {
                throw new System.ArgumentNullException("t", "Null tree passed to dissectTree()");
            }

            // FIFO order is essential: the depth/breadth bookkeeping below assumes that all nodes
            // of one level are visited before any node of the next level.
            System.Collections.Generic.Queue <Pair <int, Tree> > queue = new System.Collections.Generic.Queue <Pair <int, Tree> >();

            queue.Enqueue(new Pair <int, Tree>(0, t));
            int maxBreadth = 0;
            int maxDepth   = -1;

            while (queue.Count > 0)
            {
                Pair <int, Tree> depthNode = queue.Dequeue();
                int  nodeDepth             = depthNode.First();
                Tree node = depthNode.Second();
                if (nodeDepth != maxDepth)
                {
                    // First node of a new level: depths only grow in breadth-first order, and the
                    // queue currently contains just the rest of this level.
                    maxDepth = nodeDepth;
                    if (node.IsPhrasal() && queue.Count + 1 > maxBreadth)
                    {
                        maxBreadth = queue.Count + 1;
                    }
                }
                if (node.IsPhrasal())
                {
                    ocs.AddPhrasalBranch(node.Value(), node.Children().Length);
                }
                else
                {
                    if (node.IsPreTerminal())
                    {
                        ocs.posTags.IncrementCount(node.Value());
                    }
                    else
                    {
                        if (node.IsLeaf())
                        {
                            ocs.words.IncrementCount(node.Value());
                            if (addToVocab)
                            {
                                trainVocab.Add(node.Value());
                            }
                        }
                    }
                }
                // Children belong to the next level and are visited after the current one.
                foreach (Tree kid in node.Children())
                {
                    queue.Enqueue(new Pair <int, Tree>(nodeDepth + 1, kid));
                }
            }
            return(new Pair <int, int>(maxDepth, maxBreadth));
        }
 /// <summary>Merges several per-corpus statistics objects into one named "CORPUS".</summary>
 /// <remarks>
 /// An empty list yields null and a single-element list yields that element itself (no copy).
 /// Otherwise tree counts and squared sums are added, the per-tree sample lists and counters
 /// are concatenated/merged, minima and maxima are combined, and <c>ComputeFinalValues</c> is
 /// called on the merged result.
 /// </remarks>
 /// <param name="allStats">Statistics to merge.</param>
 /// <returns>The merged statistics, the sole element, or null when the list is empty.</returns>
 private static TreebankStats.ObservedCorpusStats AggregateStats(IList <TreebankStats.ObservedCorpusStats> allStats)
 {
     switch (allStats.Count)
     {
         case 0:
             return null;

         case 1:
             return allStats[0];
     }

     TreebankStats.ObservedCorpusStats total = new TreebankStats.ObservedCorpusStats("CORPUS");
     foreach (TreebankStats.ObservedCorpusStats part in allStats)
     {
         // Running sums.
         total.numTrees += part.numTrees;
         total.breadth2 += part.breadth2;
         total.depth2   += part.depth2;
         total.length2  += part.length2;

         // Per-tree samples.
         Sharpen.Collections.AddAll(total.breadths, part.breadths);
         Sharpen.Collections.AddAll(total.depths, part.depths);
         Sharpen.Collections.AddAll(total.lengths, part.lengths);

         // Extremes: keep the smallest minimum and the largest maximum seen so far.
         if (total.minLength > part.minLength)
         {
             total.minLength = part.minLength;
         }
         if (total.maxLength < part.maxLength)
         {
             total.maxLength = part.maxLength;
         }
         if (total.minBreadth > part.minBreadth)
         {
             total.minBreadth = part.minBreadth;
         }
         if (total.maxBreadth < part.maxBreadth)
         {
             total.maxBreadth = part.maxBreadth;
         }
         if (total.minDepth > part.minDepth)
         {
             total.minDepth = part.minDepth;
         }
         if (total.maxDepth < part.maxDepth)
         {
             total.maxDepth = part.maxDepth;
         }

         // Counters.
         total.words.AddAll(part.words);
         total.posTags.AddAll(part.posTags);
         total.phrasalBranching2.AddAll(part.phrasalBranching2);
         total.phrasalBranchingNum2.AddAll(part.phrasalBranchingNum2);
     }

     total.ComputeFinalValues();
     return total;
 }
 /// <summary>Gathers treebank statistics for the configured paths and prints them.</summary>
 /// <remarks>
 /// Three modes are handled:
 /// with <c>useSplit</c> set, one treebank is built per entry of <c>splitFileLists</c> (all paths
 /// loaded through that split's filter), the aggregate is displayed first and then each split;
 /// otherwise, with <paramref name="pathsAreFiles"/> set, every path gets its own treebank and
 /// its own display; otherwise all paths are loaded into one treebank and displayed together.
 /// In the first two modes <c>makeVocab</c> is true only for the first treebank gathered.
 /// </remarks>
 /// <param name="pathsAreFiles">Treat each path as a separate corpus (ignored when splits are in use).</param>
 /// <param name="displayWords">Forwarded to <c>Display</c>.</param>
 /// <param name="displayOOV">Forwarded to <c>Display</c>.</param>
 public virtual void Run(bool pathsAreFiles, bool displayWords, bool displayOOV)
 {
     if (useSplit)
     {
         IList <TreebankStats.ObservedCorpusStats> perSplit = new List <TreebankStats.ObservedCorpusStats>();
         makeVocab = true;
         foreach (KeyValuePair <TreebankStats.Split, ICollection <string> > entry in splitFileLists)
         {
             DiskTreebank splitBank     = tlpp.DiskTreebank();
             IFileFilter  onlyThisSplit = new TreebankStats.SplitFilter(entry.Value);
             foreach (string treebankPath in pathNames)
             {
                 splitBank.LoadPath(treebankPath, onlyThisSplit);
             }

             string splitLabel = languageName.ToString() + "." + entry.Key.ToString();
             perSplit.Add(GatherStats(splitBank, splitLabel));

             // Only the first split visited is gathered with makeVocab set.
             makeVocab = false;
         }

         Display(AggregateStats(perSplit), displayWords, displayOOV);
         foreach (TreebankStats.ObservedCorpusStats single in perSplit)
         {
             Display(single, displayWords, displayOOV);
         }
         return;
     }

     if (pathsAreFiles)
     {
         makeVocab = true;
         foreach (string filePath in pathNames)
         {
             DiskTreebank fileBank = tlpp.DiskTreebank();
             fileBank.LoadPath(filePath, null);

             string fileLabel = languageName.ToString() + "  " + filePath;
             Display(GatherStats(fileBank, fileLabel), displayWords, displayOOV);

             // Only the first file is gathered with makeVocab set.
             makeVocab = false;
         }
         return;
     }

     // Single combined corpus: start from a fresh vocabulary set and load every path into one treebank.
     trainVocab = Generics.NewHashSet();
     DiskTreebank mergedBank = tlpp.DiskTreebank();
     foreach (string corpusPath in pathNames)
     {
         mergedBank.LoadPath(corpusPath, null);
     }
     Display(GatherStats(mergedBank, languageName.ToString()), displayWords, displayOOV);
 }