예제 #1
0
        /// <summary>
        /// Provides some testing and opportunities for exploration of the
        /// probabilities of a BaseLexicon.
        /// </summary>
        /// <remarks>
        /// Trains a BaseLexicon from the given treebank files and then, for each
        /// test word, prints either the log probabilities of its taggings (known
        /// word) or its unknown-word signature followed by the score of every tag
        /// in the tag index (unknown word).  Tags whose score is negative
        /// infinity are collected and listed separately as impossible tags.
        /// What's here currently probably only works for the English Penn
        /// Treebank, as it uses default constructors.  Of the words given to
        /// test on, the first is treated as sentence initial, and the rest as
        /// not sentence initial.
        /// </remarks>
        /// <param name="args">
        /// The command line arguments:
        /// java BaseLexicon treebankPath fileRange unknownWordModel words
        /// At least the first three must be present; otherwise a usage message
        /// is logged and the method returns without training.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length < 3)
            {
                log.Info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
                return;
            }

            System.Console.Out.Write("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
            Treebank tb = new DiskTreebank();
            tb.LoadPath(args[0], new NumberRangesFileFilter(args[1], true));

            // TODO: change this interface so the lexicon creates its own indices?
            IIndex<string> wordIndex = new HashIndex<string>();
            IIndex<string> tagIndex = new HashIndex<string>();
            Options op = new Options();
            op.lexOptions.useUnknownWordSignatures = System.Convert.ToInt32(args[2]);

            Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon(op, wordIndex, tagIndex);
            lex.InitializeTraining(tb.Count);
            lex.Train(tb);
            lex.FinishTraining();
            System.Console.Out.WriteLine("done.");
            System.Console.Out.WriteLine();

            NumberFormat nf = NumberFormat.GetNumberInstance();
            nf.SetMaximumFractionDigits(4);

            // Test words start at args[3]; their position in the "sentence" is i - 3.
            for (int i = 3; i < args.Length; i++)
            {
                string word = args[i];
                int loc = i - 3;

                if (lex.IsKnown(word))
                {
                    System.Console.Out.WriteLine(word + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
                    // IEnumerator<T> is IDisposable, so release it once iteration is finished.
                    using (IEnumerator<IntTaggedWord> it = lex.RuleIteratorByWord(wordIndex.AddToIndex(word), loc, null))
                    {
                        while (it.MoveNext())
                        {
                            IntTaggedWord iTW = it.Current;
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(lex.Score(iTW, loc, wordIndex.Get(iTW.word), null)));
                        }
                    }
                }
                else
                {
                    string sig = lex.GetUnknownWordModel().GetSignature(word, loc);
                    System.Console.Out.WriteLine(word + " is an unknown word.  Signature with uwm " + lex.GetUnknownWordModel().GetUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);

                    // Ordinal ordering keeps the tag listing independent of the current culture.
                    List<string> sortedTags = new List<string>(tagIndex.ObjectsList());
                    sortedTags.Sort(System.StringComparer.Ordinal);

                    List<string> impossibleTags = new List<string>();
                    foreach (string tStr in sortedTags)
                    {
                        IntTaggedWord iTW = new IntTaggedWord(word, tStr, wordIndex, tagIndex);
                        // Score at the word's actual position so the result agrees with the
                        // signature printed above (sentence-initial only for the first word).
                        double score = lex.Score(iTW, loc, word, null);
                        if (double.IsNegativeInfinity(score))
                        {
                            impossibleTags.Add(tStr);
                        }
                        else
                        {
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(score));
                        }
                    }

                    if (impossibleTags.Count > 0)
                    {
                        // Concatenating the list itself would print its type name, so join the elements.
                        System.Console.Out.WriteLine(word + " impossible tags: " + "[" + string.Join(", ", impossibleTags) + "]");
                    }
                }
                System.Console.Out.WriteLine();
            }
        }