// Example 1
        /// <summary>Draws a new label for a single position of the sequence.</summary>
        /// <remarks>
        /// Scores the candidate labels at the given position, reshapes the
        /// distribution with the annealing temperature when it is not 1.0, and
        /// samples a label from the resulting probabilities.
        /// The sequence passed in is not modified.
        /// </remarks>
        /// <param name="model">the model used to score candidate labels</param>
        /// <param name="sequence">the sequence to start with</param>
        /// <param name="pos">the position to sample.</param>
        /// <param name="temperature">the temperature to control annealing; 0.0 forces the argmax label</param>
        /// <returns>the sampled label and its probability under the (annealed) distribution</returns>
        private Pair<int, double> SamplePositionHelper(ISequenceModel model, int[] sequence, int pos, double temperature)
        {
            double[] dist = model.ScoresOf(sequence, pos);
            if (temperature == 0.0)
            {
                // Zero temperature: collapse the distribution onto the best label
                // (log-score 0.0 for the argmax, -inf everywhere else).
                int best = ArrayMath.Argmax(dist);
                Arrays.Fill(dist, double.NegativeInfinity);
                dist[best] = 0.0;
            }
            else if (temperature != 1.0)
            {
                // Scale the log-scores by 1/T to sharpen (T < 1) or flatten (T > 1)
                // the entropy of the sampling distribution.
                ArrayMath.MultiplyInPlace(dist, 1.0 / temperature);
            }
            // Turn the log-scores into proper probabilities before sampling.
            ArrayMath.LogNormalize(dist);
            ArrayMath.ExpInPlace(dist);
            int sampledTag = ArrayMath.SampleFromDistribution(dist, random);
            return new Pair<int, double>(sampledTag, dist[sampledTag]);
        }
        /// <summary>Do max language model markov segmentation.</summary>
        /// <remarks>
        /// Do max language model markov segmentation.
        /// Note that this algorithm inherently tags words as it goes, but that
        /// we throw away the tags in the final result so that the segmented words
        /// are untagged.  (Note: for a couple of years till Aug 2007, a tagged
        /// result was returned, but this messed up the parser, because it could
        /// use no tagging but the given tagging, which often wasn't very good.
        /// Or in particular it was a subcategorized tagging which never worked
        /// with the current forceTags option which assumes that gold taggings are
        /// inherently basic taggings.)
        /// </remarks>
        /// <param name="s">A String to segment</param>
        /// <returns>The list of segmented words.</returns>
        private List<IHasWord> SegmentWordsWithMarkov(string s)
        {
            // We don't want to accidentally register words that we don't know
            // about in the wordIndex, so we wrap it with a DeltaIndex
            DeltaIndex<string> deltaWordIndex = new DeltaIndex<string>(wordIndex);
            int length = s.Length;
            //    Set<String> POSes = (Set<String>) POSDistribution.keySet();  // 1.5
            int numTags = POSes.Count;

            // score of span with initial word of this tag
            double[][][] scores = new double[length][][];
            // best (length of) first word for this span with this tag
            int[][][] splitBacktrace = new int[length][][];
            // best tag for second word over this span, if first is this tag
            int[][][] POSbacktrace = new int[length][][];
            // BUGFIX: the inner arrays were never allocated.  The Java original's
            // "new double[length][length + 1][numTags]" allocates every dimension,
            // but a C# jagged array does not, so Arrays.Fill(scores[i][j], ...) and
            // every scores[i][j][k] access below threw NullReferenceException.
            // NOTE(review): sizing the innermost dimension by numTags assumes
            // tagIndex assigns the POSes indices < numTags — confirm against tagIndex.
            for (int i = 0; i < length; i++)
            {
                scores[i] = new double[length + 1][];
                splitBacktrace[i] = new int[length + 1][];
                POSbacktrace[i] = new int[length + 1][];
                for (int j = 0; j < length + 1; j++)
                {
                    scores[i][j] = new double[numTags];
                    splitBacktrace[i][j] = new int[numTags];
                    POSbacktrace[i][j] = new int[numTags];
                    // -inf so any real lexicon score beats an unfilled cell
                    Arrays.Fill(scores[i][j], double.NegativeInfinity);
                }
            }
            // first fill in word probabilities (single words of up to 10 characters)
            for (int diff = 1; diff <= 10; diff++)
            {
                for (int start = 0; start + diff <= length; start++)
                {
                    int           end     = start + diff;
                    StringBuilder wordBuf = new StringBuilder();
                    for (int pos = start; pos < end; pos++)
                    {
                        wordBuf.Append(s[pos]);
                    }
                    string word = wordBuf.ToString();
                    foreach (string tag in POSes)
                    {
                        IntTaggedWord itw   = new IntTaggedWord(word, tag, deltaWordIndex, tagIndex);
                        double        score = lex.Score(itw, 0, word, null);
                        if (start == 0)
                        {
                            // span starts the sentence: weight by the initial-tag distribution
                            score += Math.Log(initialPOSDist.ProbabilityOf(tag));
                        }
                        scores[start][end][itw.Tag()]         = score;
                        splitBacktrace[start][end][itw.Tag()] = end;
                    }
                }
            }
            // now fill in word combination probabilities: for each span, try every
            // split point (first word at most 10 chars) and every tag bigram
            for (int diff_1 = 2; diff_1 <= length; diff_1++)
            {
                for (int start = 0; start + diff_1 <= length; start++)
                {
                    int end = start + diff_1;
                    for (int split = start + 1; split < end && split - start <= 10; split++)
                    {
                        foreach (string tag in POSes)
                        {
                            int tagNum = tagIndex.AddToIndex(tag);
                            // only extend spans whose first word really ends at 'split'
                            if (splitBacktrace[start][split][tagNum] != split)
                            {
                                continue;
                            }
                            Distribution<string> rTagDist = markovPOSDists[tag];
                            if (rTagDist == null)
                            {
                                continue;
                            }
                            // this happens with "*" POS
                            foreach (string rTag in POSes)
                            {
                                int    rTagNum  = tagIndex.AddToIndex(rTag);
                                // score = first word + rest of span + tag-transition probability
                                double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.Log(rTagDist.ProbabilityOf(rTag));
                                if (newScore > scores[start][end][tagNum])
                                {
                                    scores[start][end][tagNum]         = newScore;
                                    splitBacktrace[start][end][tagNum] = split;
                                    POSbacktrace[start][end][tagNum]   = rTagNum;
                                }
                            }
                        }
                    }
                }
            }
            // follow the backtrace from the best whole-string analysis, emitting
            // one (untagged) word per split
            int             nextPOS = ArrayMath.Argmax(scores[0][length]);
            List<IHasWord>  words   = new List<IHasWord>();
            int             start_1 = 0;

            while (start_1 < length)
            {
                int           split   = splitBacktrace[start_1][length][nextPOS];
                StringBuilder wordBuf = new StringBuilder();
                for (int i_1 = start_1; i_1 < split; i_1++)
                {
                    wordBuf.Append(s[i_1]);
                }
                string word = wordBuf.ToString();
                // String tag = tagIndex.get(nextPOS);
                // words.add(new TaggedWord(word, tag));
                words.Add(new Word(word));
                if (split < length)
                {
                    nextPOS = POSbacktrace[start_1][length][nextPOS];
                }
                start_1 = split;
            }
            return(words);
        }
 /// <summary>Checks that Argmax points at the maximum element of each test array.</summary>
 public virtual void TestArgmax()
 {
     // BUGFIX: NUnit's Assert.AreEqual(double, double, double) is
     // (expected, actual, delta) — the tolerance comes LAST.  The previous code
     // passed 1e-5 as the actual value and Max(dN) as the delta, so each
     // assertion compared Max(dN) to 1e-5 within a tolerance of Max(dN) and
     // could never meaningfully fail.  The element at the Argmax index must
     // equal the array's maximum.
     NUnit.Framework.Assert.AreEqual(ArrayMath.Max(d1), d1[ArrayMath.Argmax(d1)], 1e-5);
     NUnit.Framework.Assert.AreEqual(ArrayMath.Max(d2), d2[ArrayMath.Argmax(d2)], 1e-5);
     NUnit.Framework.Assert.AreEqual(ArrayMath.Max(d3), d3[ArrayMath.Argmax(d3)], 1e-5);
 }