/// <summary>
/// Trains a BaseLexicon from a treebank and prints, for each word given on
/// the command line, the log probabilities of its possible taggings.
/// </summary>
/// <remarks>
/// Provides some testing and opportunities for exploration of the
/// probabilities of a BaseLexicon. What's here currently probably
/// only works for the English Penn Treebank, as it uses default
/// constructors. Of the words given to test on,
/// the first is treated as sentence initial, and the rest as not
/// sentence initial.
/// </remarks>
/// <param name="args">
/// The command line arguments:
/// BaseLexicon treebankPath fileRange unknownWordModel words*
/// (args[0] = treebank path, args[1] = file number ranges,
/// args[2] = integer unknown-word signature level, args[3..] = words to look up).
/// </param>
public static void Main(string[] args)
{
    // The first three arguments are mandatory; the words to test are optional.
    if (args.Length < 3)
    {
        log.Info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
        return;
    }

    System.Console.Out.Write("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.LoadPath(args[0], new NumberRangesFileFilter(args[1], true));

    // TODO: change this interface so the lexicon creates its own indices?
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = System.Convert.ToInt32(args[2]);
    Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon(op, wordIndex, tagIndex);
    lex.InitializeTraining(tb.Count);
    lex.Train(tb);
    lex.FinishTraining();
    System.Console.Out.WriteLine("done.");
    System.Console.Out.WriteLine();

    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(4);

    for (int i = 3; i < args.Length; i++)
    {
        string word = args[i];
        // Position of the word in the pretend sentence: 0 for the first test word
        // (sentence initial), > 0 for every later one.
        int loc = i - 3;

        if (lex.IsKnown(word))
        {
            System.Console.Out.WriteLine(word + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
            // IEnumerator<T> is IDisposable, so release it once the taggings are listed.
            using (IEnumerator<IntTaggedWord> it = lex.RuleIteratorByWord(wordIndex.AddToIndex(word), loc, null))
            {
                while (it.MoveNext())
                {
                    IntTaggedWord iTW = it.Current;
                    System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(lex.Score(iTW, loc, wordIndex.Get(iTW.word), null)));
                }
            }
        }
        else
        {
            var unknownWordModel = lex.GetUnknownWordModel();
            string sig = unknownWordModel.GetSignature(word, loc);
            System.Console.Out.WriteLine(word + " is an unknown word. Signature with uwm " + unknownWordModel.GetUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);

            // Tags whose score is negative infinity are collected and reported together.
            List<string> impossibleTags = new List<string>();

            // Ordinal ordering keeps the tag listing independent of the current culture.
            List<string> sortedTags = new List<string>(tagIndex.ObjectsList());
            sortedTags.Sort(System.StringComparer.Ordinal);

            foreach (string tStr in sortedTags)
            {
                IntTaggedWord iTW = new IntTaggedWord(word, tStr, wordIndex, tagIndex);
                // Score at the same position the signature above was computed for, so the
                // first test word is really treated as sentence initial.
                // NOTE(review): this was a hard-coded 1 before - confirm that was not deliberate.
                double score = lex.Score(iTW, loc, word, null);
                if (double.IsNegativeInfinity(score))
                {
                    impossibleTags.Add(tStr);
                }
                else
                {
                    System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(score));
                }
            }

            if (impossibleTags.Count > 0)
            {
                // Concatenating a List<string> would print its type name, so join the elements.
                System.Console.Out.WriteLine(word + " impossible tags: [" + string.Join(", ", impossibleTags) + "]");
            }
        }

        System.Console.Out.WriteLine();
    }
}