Ejemplo n.º 1
0
        /// <summary>
        /// Fills <c>tagBin</c> with one bin id per entry of <c>tagIndex</c> and records
        /// the number of distinct bins in <c>numTagBins</c>.
        /// </summary>
        /// <remarks>
        /// The bin key for a tag is the tag string itself when <c>tagProjection</c> is
        /// null, otherwise the string returned by <c>tagProjection.Project</c>.  The bin
        /// id is whatever <c>AddToIndex</c> returns for that key on a fresh index.
        /// </remarks>
        protected internal virtual void InitTagBins()
        {
            IIndex<string> binIndex = new HashIndex<string>();

            tagBin = new int[tagIndex.Size()];
            for (int tagId = 0; tagId < tagBin.Length; tagId++)
            {
                string tagName = tagIndex.Get(tagId);
                // Without a projection every tag keys its own bin.
                string binKey = (tagProjection == null) ? tagName : tagProjection.Project(tagName);
                tagBin[tagId] = binIndex.AddToIndex(binKey);
            }
            numTagBins = binIndex.Size();
        }
        /// <summary>
        /// Restricts the dataset to the features contained in <paramref name="featureSet"/>.
        /// </summary>
        /// <remarks>
        /// Only members of <paramref name="featureSet"/> that are already present in
        /// <c>featureIndex</c> are kept; they are re-added to a fresh index, which then
        /// replaces <c>featureIndex</c>.  Each of the first <c>size</c> rows of
        /// <c>data</c> / <c>values</c> is rebuilt so that it contains only the kept
        /// features, renumbered to their ids in the new index.
        /// </remarks>
        /// <param name="featureSet">The features to keep.</param>
        public virtual void SelectFeaturesFromSet(ICollection<F> featureSet)
        {
            HashIndex<F> keptIndex = new HashIndex<F>();

            // oldToNew[oldId] is the id in the new index, or -1 when the feature is dropped.
            int[] oldToNew = new int[featureIndex.Size()];
            Arrays.Fill(oldToNew, -1);
            foreach (F candidate in featureSet)
            {
                int previousId = featureIndex.IndexOf(candidate);
                if (previousId < 0)
                {
                    // not a feature known to the current index; nothing to carry over
                    continue;
                }
                oldToNew[previousId] = keptIndex.AddToIndex(candidate);
            }
            featureIndex = keptIndex;

            for (int row = 0; row < size; row++)
            {
                int[]    rowFeatures = data[row];
                double[] rowValues   = values[row];

                List<int>    keptFeatures = new List<int>(rowFeatures.Length);
                List<double> keptValues   = new List<double>(rowValues.Length);
                for (int col = 0; col < rowFeatures.Length; col++)
                {
                    int remapped = oldToNew[rowFeatures[col]];
                    if (remapped >= 0)
                    {
                        keptFeatures.Add(remapped);
                        keptValues.Add(rowValues[col]);
                    }
                }

                data[row]   = keptFeatures.ToArray();
                values[row] = keptValues.ToArray();
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Provides some testing and opportunities for exploration of the
        /// probabilities of a BaseLexicon.
        /// </summary>
        /// <remarks>
        /// Trains a BaseLexicon from the given treebank files and then, for every word
        /// argument, prints either the scores of its taggings (known word) or
        /// its unknown-word signature followed by the score of every tag in the tag
        /// index (unknown word).  What's here currently probably only works for the
        /// English Penn Treebank, as it uses default constructors.  Of the words given
        /// to test on, the first is treated as sentence initial, and the rest as not
        /// sentence initial.
        /// </remarks>
        /// <param name="args">
        /// The command line arguments:
        /// java BaseLexicon treebankPath fileRange unknownWordModel words
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length < 3)
            {
                log.Info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
                return;
            }
            System.Console.Out.Write("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
            Treebank tb = new DiskTreebank();

            tb.LoadPath(args[0], new NumberRangesFileFilter(args[1], true));
            // TODO: change this interface so the lexicon creates its own indices?
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();
            Options         op        = new Options();

            op.lexOptions.useUnknownWordSignatures = System.Convert.ToInt32(args[2]);
            Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon(op, wordIndex, tagIndex);
            lex.InitializeTraining(tb.Count);
            lex.Train(tb);
            lex.FinishTraining();
            System.Console.Out.WriteLine("done.");
            System.Console.Out.WriteLine();
            NumberFormat nf = NumberFormat.GetNumberInstance();

            nf.SetMaximumFractionDigits(4);
            IList <string> impos = new List <string>();

            // Word arguments start at args[3]; loc is the word's position among them.
            for (int i = 3; i < args.Length; i++)
            {
                int loc = i - 3;
                if (lex.IsKnown(args[i]))
                {
                    System.Console.Out.WriteLine(args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
                    // IEnumerator<T> is IDisposable, so release it once the taggings are printed.
                    using (IEnumerator <IntTaggedWord> it = lex.RuleIteratorByWord(wordIndex.AddToIndex(args[i]), loc, null))
                    {
                        while (it.MoveNext())
                        {
                            IntTaggedWord iTW = it.Current;
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(lex.Score(iTW, loc, wordIndex.Get(iTW.word), null)));
                        }
                    }
                }
                else
                {
                    string sig = lex.GetUnknownWordModel().GetSignature(args[i], loc);
                    // Both alternatives carry a leading space so the text does not fuse with the unknown level.
                    System.Console.Out.WriteLine(args[i] + " is an unknown word.  Signature with uwm " + lex.GetUnknownWordModel().GetUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
                    impos.Clear();
                    IList <string> lis = new List <string>(tagIndex.ObjectsList());
                    // NOTE(review): IList<string> has no Sort() in the base class library;
                    // this presumably binds to a project extension method - confirm.
                    lis.Sort();
                    foreach (string tStr in lis)
                    {
                        IntTaggedWord iTW   = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                        // NOTE(review): the position passed here is the constant 1, not loc - confirm intended.
                        double        score = lex.Score(iTW, 1, args[i], null);
                        if (double.IsNegativeInfinity(score))
                        {
                            impos.Add(tStr);
                        }
                        else
                        {
                            System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(score));
                        }
                    }
                    if (impos.Count > 0)
                    {
                        // Concatenating the list itself would print its type name, so join the tags explicitly.
                        System.Console.Out.WriteLine(args[i] + " impossible tags: [" + string.Join(", ", impos) + "]");
                    }
                }
                System.Console.Out.WriteLine();
            }
        }