Ejemplo n.º 1
0
 /// <summary>
 /// Finalizes training and hands back the trained unknown-word model.
 /// </summary>
 /// <remarks>
 /// When no unseen-word events were counted, a warning is written to standard
 /// error and the unseen counter is seeded with one count per non-boundary tag
 /// (a uniform distribution over tags). The <c>NullItw</c> entry is incremented
 /// once for each seeded tag as well.
 /// </remarks>
 /// <returns>The <c>model</c> held by this trainer.</returns>
 public override IUnknownWordModel FinishTraining()
 {
     // Fast path: the counter already holds data, so nothing needs seeding.
     if (!unSeenCounter.IsEmpty())
     {
         return model;
     }

     System.Console.Error.Printf("%s: WARNING: Unseen word counter is empty!", this.GetType().FullName);

     int tagCount = tagIndex.Size();
     for (int tagId = 0; tagId < tagCount; tagId++)
     {
         // The boundary tag never receives unseen-word mass.
         if (BoundaryTag.Equals(tagIndex.Get(tagId)))
         {
             continue;
         }

         IntTaggedWord perTagEntry = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, tagId);
         IntTaggedWord nullEntry   = UnknownWordModelTrainerConstants.NullItw;
         unSeenCounter.IncrementCount(perTagEntry);
         unSeenCounter.IncrementCount(nullEntry);
     }

     return model;
 }
 // else {
 /// <summary>
 /// Finalizes training and returns the trained unknown-word model.
 /// </summary>
 /// <remarks>
 /// If the unseen counter is empty it is filled with a uniform distribution:
 /// every tag other than the boundary tag gets one count paired with the null
 /// word, and the <c>NullItw</c> entry is incremented alongside each of them.
 /// </remarks>
 /// <returns>The <c>model</c> held by this trainer.</returns>
 public override IUnknownWordModel FinishTraining()
 {
     bool needsUniformSeed = unSeenCounter.IsEmpty();
     if (needsUniformSeed)
     {
         int tagCount = tagIndex.Size();
         int tagId    = 0;
         while (tagId < tagCount)
         {
             // Skip the boundary tag; all other tags get exactly one unseen count.
             bool isBoundary = BoundaryTag.Equals(tagIndex.Get(tagId));
             if (!isBoundary)
             {
                 IntTaggedWord perTagEntry = new IntTaggedWord(UnknownWordModelTrainerConstants.nullWord, tagId);
                 IntTaggedWord nullEntry   = UnknownWordModelTrainerConstants.NullItw;
                 unSeenCounter.IncrementCount(perTagEntry);
                 unSeenCounter.IncrementCount(nullEntry);
             }
             tagId++;
         }
     }

     // Legacy steps that are intentionally left disabled:
     //   index the possible tags for each word
     //   numWords = wordIndex.size();
     //   unknownWordIndex = wordIndex.indexOf(Lexicon.UNKNOWN_WORD, true);
     //   initRulesWithWord();
     return model;
 }
        /// <summary>
        /// Return various statistics about the treebank (number of sentences,
        /// words, tag set, etc.) together with warnings about suspicious trees.
        /// </summary>
        /// <remarks>
        /// This method is not purely read-only: while scanning, any node whose
        /// label is <c>null</c> receives a new empty <c>StringLabel</c>, and any
        /// label whose value is <c>null</c> has that value set to the empty string.
        /// </remarks>
        /// <param name="tlp">
        /// The TreebankLanguagePack used to determine punctuation tags. May be
        /// <c>null</c>, in which case no punctuation statistics are collected.
        /// </param>
        /// <returns>A big string for human consumption describing the treebank</returns>
        /// <exception cref="InvalidOperationException">
        /// A node is neither a leaf, a preterminal, nor a phrasal node.
        /// </exception>
        public virtual string TextualSummary(ITreebankLanguagePack tlp)
        {
            int  numTrees         = 0;
            int  numTreesLE40     = 0;
            int  numNonUnaryRoots = 0;
            Tree nonUnaryEg       = null;
            ClassicCounter <Tree>   nonUnaries = new ClassicCounter <Tree>();
            ClassicCounter <string> roots      = new ClassicCounter <string>();
            ClassicCounter <string> starts     = new ClassicCounter <string>();
            ClassicCounter <string> puncts     = new ClassicCounter <string>();
            int numUnenclosedLeaves            = 0;
            int numLeaves     = 0;
            int numNonPhrasal = 0;
            int numPreTerminalWithMultipleChildren = 0;
            int numWords                       = 0;
            int numTags                        = 0;
            int shortestSentence               = int.MaxValue;
            int longestSentence                = 0;
            int numNullLabel                   = 0;
            ICollection <string>    words      = Generics.NewHashSet();
            ClassicCounter <string> tags       = new ClassicCounter <string>();
            ClassicCounter <string> cats       = new ClassicCounter <string>();
            Tree leafEg                        = null;
            Tree preTerminalMultipleChildrenEg = null;
            Tree nullLabelEg                   = null;
            Tree rootRewritesAsTaggedWordEg    = null;

            // ---- Pass over every tree, collecting counts and example trees ----
            foreach (Tree t in this)
            {
                roots.IncrementCount(t.Value());
                numTrees++;

                // Sentence-length statistics are based on the tree's yield.
                int leng = t.Yield().Count;
                if (leng <= 40)
                {
                    numTreesLE40++;
                }
                if (leng < shortestSentence)
                {
                    shortestSentence = leng;
                }
                if (leng > longestSentence)
                {
                    longestSentence = leng;
                }

                // Classify the shape of the root rewrite.
                if (t.NumChildren() > 1)
                {
                    if (numNonUnaryRoots == 0)
                    {
                        nonUnaryEg = t;
                    }
                    // Only the first 100 non-unary root rewrites are recorded.
                    if (numNonUnaryRoots < 100)
                    {
                        nonUnaries.IncrementCount(t.LocalTree());
                    }
                    numNonUnaryRoots++;
                }
                else if (t.IsLeaf())
                {
                    numUnenclosedLeaves++;
                }
                else
                {
                    Tree t2 = t.FirstChild();
                    if (t2.IsLeaf())
                    {
                        numLeaves++;
                        leafEg = t;
                    }
                    else if (t2.IsPreTerminal())
                    {
                        if (numNonPhrasal == 0)
                        {
                            rootRewritesAsTaggedWordEg = t;
                        }
                        numNonPhrasal++;
                    }
                    starts.IncrementCount(t2.Value());
                }

                // Visit every node of the tree (the tree itself is enumerable).
                foreach (Tree subtree in t)
                {
                    ILabel lab = subtree.Label();
                    if (lab == null || lab.Value() == null || lab.Value().IsEmpty())
                    {
                        if (numNullLabel == 0)
                        {
                            nullLabelEg = subtree;
                        }
                        numNullLabel++;
                        // Normalise null labels/values to the empty string so that the
                        // Value() calls below never hand null to the counters.
                        if (lab == null)
                        {
                            subtree.SetLabel(new StringLabel(string.Empty));
                        }
                        else if (lab.Value() == null)
                        {
                            subtree.Label().SetValue(string.Empty);
                        }
                    }

                    if (subtree.IsLeaf())
                    {
                        numWords++;
                        words.Add(subtree.Value());
                    }
                    else if (subtree.IsPreTerminal())
                    {
                        numTags++;
                        tags.IncrementCount(subtree.Value());
                        if (tlp != null && tlp.IsPunctuationTag(subtree.Value()))
                        {
                            puncts.IncrementCount(subtree.FirstChild().Value());
                        }
                    }
                    else if (subtree.IsPhrasal())
                    {
                        // A phrasal node that directly dominates a leaf is reported as a
                        // "preterminal with multiple children".
                        bool hasLeafChild = false;
                        foreach (Tree kt in subtree.Children())
                        {
                            if (kt.IsLeaf())
                            {
                                hasLeafChild = true;
                                break;
                            }
                        }
                        if (hasLeafChild)
                        {
                            numPreTerminalWithMultipleChildren++;
                            if (preTerminalMultipleChildrenEg == null)
                            {
                                preTerminalMultipleChildrenEg = subtree;
                            }
                        }
                        cats.IncrementCount(subtree.Value());
                    }
                    else
                    {
                        throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
                    }
                }
            }

            // ---- Render the report ----
            StringWriter sw = new StringWriter(2000);
            PrintWriter  pw = new PrintWriter(sw);
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(0);
            pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");
            if (numTrees > 0)
            {
                if (numTags != numWords)
                {
                    pw.Println("  Warning! numTags differs and is " + numTags);
                }
                if (roots.Size() == 1)
                {
                    string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
                    pw.Println("  The root category is: " + root);
                }
                else
                {
                    pw.Println("  Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
                }
                if (numNonUnaryRoots > 0)
                {
                    pw.Print("  Warning! " + numNonUnaryRoots + " trees without unary initial rewrite.  ");
                    if (numNonUnaryRoots > 100)
                    {
                        pw.Print("First 100 ");
                    }
                    pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
                    pw.Println("    Example: " + nonUnaryEg);
                }
                if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0)
                {
                    pw.Println("  Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
                    if (numLeaves > 0)
                    {
                        pw.Println("  Example bad root rewrites as leaf: " + leafEg);
                    }
                    if (numNonPhrasal > 0)
                    {
                        pw.Println("  Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
                    }
                }
                if (numNullLabel > 0)
                {
                    pw.Println("  Warning!  " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
                    pw.Println("    " + nullLabelEg);
                }
                if (numPreTerminalWithMultipleChildren > 0)
                {
                    pw.Println("  Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
                    pw.Println("    Example: " + preTerminalMultipleChildrenEg);
                }

                // Average truncated to two decimals. The scaling is done in 64-bit
                // arithmetic: numWords * 100 in int would silently wrap for treebanks
                // with more than ~21.4 million tokens and report a bogus average.
                double averageLength = ((numWords * 100L) / numTrees) / 100.0;
                pw.Println("  Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + averageLength + " words.");
                pw.Println("  " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");

                string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
                // Note that "0" is (unfortunately) used as an empty element name.
                // Limitation: words are not reduced to a basic category, yet empties
                // are commonly indexed, so an indexed empty is not matched here.
                // Catching those would require looking for a suffix of -[0-9]+.
                ICollection <string> knownEmpties        = Generics.NewHashSet(Arrays.AsList(empties));
                ICollection <string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
                if (!emptiesIntersection.IsEmpty())
                {
                    pw.Println("  Caution! " + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
                }
                ICollection <string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
                if (!joint.IsEmpty())
                {
                    pw.Println("  Warning! " + joint.Count + " items are tags and categories: " + joint);
                }

                // Only the first offending category / tag containing '@' is reported.
                foreach (string cat in cats.KeySet())
                {
                    if (cat != null && cat.Contains("@"))
                    {
                        pw.Println("  Warning!!  Stanford Parser does not work with categories containing '@' like: " + cat);
                        break;
                    }
                }
                foreach (string cat_1 in tags.KeySet())
                {
                    if (cat_1 != null && cat_1.Contains("@"))
                    {
                        pw.Println("  Warning!!  Stanford Parser does not work with tags containing '@' like: " + cat_1);
                        break;
                    }
                }
                pw.Println("    Cats: " + Counters.ToString(cats, nf));
                pw.Println("    Tags: " + Counters.ToString(tags, nf));
                pw.Println("    " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
                if (!puncts.IsEmpty())
                {
                    pw.Println("    Puncts: " + Counters.ToString(puncts, nf));
                }
            }
            return(sw.ToString());
        }