/// <summary>
/// Return various statistics about the treebank (number of sentences,
/// words, tag set, etc.).
/// </summary>
/// <remarks>
/// As a side effect, any tree node whose label is null, or whose label value
/// is null, is given an empty-string label/value so that later calls on the
/// node's value do not see null.
/// </remarks>
/// <param name="tlp">
/// The TreebankLanguagePack used to recognise punctuation tags. May be null,
/// in which case no punctuation statistics are collected.
/// </param>
/// <returns>A big string for human consumption describing the treebank</returns>
/// <exception cref="InvalidOperationException">
/// If a node is neither a leaf, a preterminal, nor phrasal.
/// </exception>
public virtual string TextualSummary(ITreebankLanguagePack tlp)
{
    // ---- Tree-level counters ----
    int numTrees = 0;
    int numTreesLE40 = 0;
    int numNonUnaryRoots = 0;
    Tree nonUnaryEg = null;
    ClassicCounter<Tree> nonUnaries = new ClassicCounter<Tree>();
    ClassicCounter<string> roots = new ClassicCounter<string>();
    ClassicCounter<string> starts = new ClassicCounter<string>();
    ClassicCounter<string> puncts = new ClassicCounter<string>();
    int numUnenclosedLeaves = 0;
    int numLeaves = 0;
    int numNonPhrasal = 0;
    int numPreTerminalWithMultipleChildren = 0;

    // ---- Node-level counters ----
    int numWords = 0;
    int numTags = 0;
    int shortestSentence = int.MaxValue;
    int longestSentence = 0;
    int numNullLabel = 0;
    ICollection<string> words = Generics.NewHashSet();
    ClassicCounter<string> tags = new ClassicCounter<string>();
    ClassicCounter<string> cats = new ClassicCounter<string>();

    // ---- Example trees kept for the warning messages ----
    Tree leafEg = null;
    Tree preTerminalMultipleChildrenEg = null;
    Tree nullLabelEg = null;
    Tree rootRewritesAsTaggedWordEg = null;

    foreach (Tree t in this)
    {
        roots.IncrementCount(t.Value());
        numTrees++;

        int leng = t.Yield().Count;
        if (leng <= 40)
        {
            numTreesLE40++;
        }
        if (leng < shortestSentence)
        {
            shortestSentence = leng;
        }
        if (leng > longestSentence)
        {
            longestSentence = leng;
        }

        // Classify the shape of the root rewrite.
        if (t.NumChildren() > 1)
        {
            if (numNonUnaryRoots == 0)
            {
                nonUnaryEg = t;
            }
            // Only the first 100 non-unary root rewrites are recorded.
            if (numNonUnaryRoots < 100)
            {
                nonUnaries.IncrementCount(t.LocalTree());
            }
            numNonUnaryRoots++;
        }
        else if (t.IsLeaf())
        {
            numUnenclosedLeaves++;
        }
        else
        {
            Tree t2 = t.FirstChild();
            if (t2.IsLeaf())
            {
                numLeaves++;
                // Note: overwritten on every occurrence, so the reported
                // example is the last such tree, not the first.
                leafEg = t;
            }
            else if (t2.IsPreTerminal())
            {
                if (numNonPhrasal == 0)
                {
                    rootRewritesAsTaggedWordEg = t;
                }
                numNonPhrasal++;
            }
            starts.IncrementCount(t2.Value());
        }

        // Walk every node of the tree.
        foreach (Tree subtree in t)
        {
            ILabel lab = subtree.Label();
            if (lab == null || lab.Value() == null || lab.Value().IsEmpty())
            {
                if (numNullLabel == 0)
                {
                    nullLabelEg = subtree;
                }
                numNullLabel++;

                // Replace null labels/values with the empty string so the
                // Value() calls below never return null for this node.
                if (lab == null)
                {
                    subtree.SetLabel(new StringLabel(string.Empty));
                }
                else if (lab.Value() == null)
                {
                    subtree.Label().SetValue(string.Empty);
                }
            }

            if (subtree.IsLeaf())
            {
                numWords++;
                words.Add(subtree.Value());
            }
            else if (subtree.IsPreTerminal())
            {
                numTags++;
                tags.IncrementCount(subtree.Value());
                if (tlp != null && tlp.IsPunctuationTag(subtree.Value()))
                {
                    puncts.IncrementCount(subtree.FirstChild().Value());
                }
            }
            else if (subtree.IsPhrasal())
            {
                // A phrasal node that directly dominates a leaf is reported
                // as a "preterminal with multiple children".
                bool hasLeafChild = false;
                foreach (Tree kt in subtree.Children())
                {
                    if (kt.IsLeaf())
                    {
                        hasLeafChild = true;
                        break;
                    }
                }
                if (hasLeafChild)
                {
                    numPreTerminalWithMultipleChildren++;
                    if (preTerminalMultipleChildrenEg == null)
                    {
                        preTerminalMultipleChildrenEg = subtree;
                    }
                }
                cats.IncrementCount(subtree.Value());
            }
            else
            {
                throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
            }
        }
    }

    // ---- Render the report ----
    StringWriter sw = new StringWriter(2000);
    PrintWriter pw = new PrintWriter(sw);
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(0);

    pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
    if (numTrees > 0)
    {
        if (numTags != numWords)
        {
            pw.Println(" Warning! numTags differs and is " + numTags);
        }

        if (roots.Size() == 1)
        {
            string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
            pw.Println(" The root category is: " + root);
        }
        else
        {
            pw.Println(" Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
        }

        if (numNonUnaryRoots > 0)
        {
            pw.Print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
            if (numNonUnaryRoots > 100)
            {
                pw.Print("First 100 ");
            }
            pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
            pw.Println(" Example: " + nonUnaryEg);
        }

        if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0)
        {
            pw.Println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
            if (numLeaves > 0)
            {
                pw.Println(" Example bad root rewrites as leaf: " + leafEg);
            }
            if (numNonPhrasal > 0)
            {
                pw.Println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
            }
        }

        if (numNullLabel > 0)
        {
            pw.Println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
            pw.Println(" " + nullLabelEg);
        }

        if (numPreTerminalWithMultipleChildren > 0)
        {
            pw.Println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
            pw.Println(" Example: " + preTerminalMultipleChildrenEg);
        }

        // Average truncated to two decimals. The multiplication is done in
        // 64-bit arithmetic: numWords * 100 as an int wraps around once the
        // treebank holds more than roughly 21.4 million tokens.
        double averageLength = ((numWords * 100L) / numTrees) / 100.0;
        pw.Println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + averageLength + " words.");
        pw.Println(" " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");

        string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
        // What a dopey choice using 0 as an empty element name!!
        // The problem with the below is that words aren't turned into a basic
        // category, but empties commonly are indexed.... Would need to look
        // for them with a suffix of -[0-9]+
        ICollection<string> knownEmpties = Generics.NewHashSet(Arrays.AsList(empties));
        ICollection<string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
        if (!emptiesIntersection.IsEmpty())
        {
            pw.Println(" Caution! " + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
        }

        ICollection<string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
        if (!joint.IsEmpty())
        {
            pw.Println(" Warning! " + joint.Count + " items are tags and categories: " + joint);
        }

        // Report only the first offending category / tag containing '@'.
        foreach (string cat in cats.KeySet())
        {
            if (cat != null && cat.Contains("@"))
            {
                pw.Println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
                break;
            }
        }
        foreach (string cat_1 in tags.KeySet())
        {
            if (cat_1 != null && cat_1.Contains("@"))
            {
                pw.Println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat_1);
                break;
            }
        }

        pw.Println(" Cats: " + Counters.ToString(cats, nf));
        pw.Println(" Tags: " + Counters.ToString(tags, nf));
        pw.Println(" " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
        if (!puncts.IsEmpty())
        {
            pw.Println(" Puncts: " + Counters.ToString(puncts, nf));
        }
    }

    return sw.ToString();
}