/// <summary>Samples a new label for a single position in the sequence.</summary>
/// <remarks>
/// Draws one label from the model's (possibly temperature-annealed) distribution
/// over labels at the given position. The input sequence is not modified.
/// </remarks>
/// <param name="model">the sequence model supplying log-scores for each label at the position</param>
/// <param name="sequence">the sequence to start with (read-only here)</param>
/// <param name="pos">the position to sample</param>
/// <param name="temperature">annealing temperature: 1.0 leaves the distribution unchanged,
/// 0.0 collapses it onto the argmax, other values rescale the log-scores by 1/temperature</param>
/// <returns>the sampled label and its probability under the (annealed) distribution</returns>
private Pair<int, double> SamplePositionHelper(ISequenceModel model, int[] sequence, int pos, double temperature)
{
    double[] logScores = model.ScoresOf(sequence, pos);
    if (temperature == 0.0)
    {
        // Degenerate annealing: put all mass on the argmax.
        // log(1.0) == 0.0, everything else gets log(0) == -infinity.
        int best = ArrayMath.Argmax(logScores);
        Arrays.Fill(logScores, double.NegativeInfinity);
        logScores[best] = 0.0;
    }
    else if (temperature != 1.0)
    {
        // Scale log-scores by 1/T to raise (T > 1) or lower (T < 1)
        // the entropy of the sampling distribution.
        ArrayMath.MultiplyInPlace(logScores, 1.0 / temperature);
    }
    // Convert log-scores to a proper probability distribution, then sample.
    ArrayMath.LogNormalize(logScores);
    ArrayMath.ExpInPlace(logScores);
    int sampledTag = ArrayMath.SampleFromDistribution(logScores, random);
    return new Pair<int, double>(sampledTag, logScores[sampledTag]);
}
/// <summary>Do max language model markov segmentation.</summary>
/// <remarks>
/// Do max language model markov segmentation.
/// Note that this algorithm inherently tags words as it goes, but that
/// we throw away the tags in the final result so that the segmented words
/// are untagged. (Note: for a couple of years till Aug 2007, a tagged
/// result was returned, but this messed up the parser, because it could
/// use no tagging but the given tagging, which often wasn't very good.
/// Or in particular it was a subcategorized tagging which never worked
/// with the current forceTags option which assumes that gold taggings are
/// inherently basic taggings.)
/// </remarks>
/// <param name="s">A String to segment</param>
/// <returns>The list of segmented words.</returns>
private List<IHasWord> SegmentWordsWithMarkov(string s)
{
    // We don't want to accidentally register words that we don't know
    // about in the wordIndex, so we wrap it with a DeltaIndex
    DeltaIndex<string> deltaWordIndex = new DeltaIndex<string>(wordIndex);
    int length = s.Length;
    // Set<String> POSes = (Set<String>) POSDistribution.keySet();  // 1.5
    int numTags = POSes.Count;
    // DP tables over [start][end][tag] (end ranges up to length, hence length + 1).
    // Unlike Java's "new double[a][b][c]", a C# jagged array only allocates the
    // outer layer, so the inner layers must be allocated explicitly here —
    // otherwise scores[i][j] is null and Arrays.Fill throws NullReferenceException.
    // score of span with initial word of this tag
    double[][][] scores = new double[length][][];
    // best (length of) first word for this span with this tag
    int[][][] splitBacktrace = new int[length][][];
    // best tag for second word over this span, if first is this tag
    int[][][] POSbacktrace = new int[length][][];
    for (int i = 0; i < length; i++)
    {
        scores[i] = new double[length + 1][];
        splitBacktrace[i] = new int[length + 1][];
        POSbacktrace[i] = new int[length + 1][];
        for (int j = 0; j < length + 1; j++)
        {
            scores[i][j] = new double[numTags];
            splitBacktrace[i][j] = new int[numTags];
            POSbacktrace[i][j] = new int[numTags];
            Arrays.Fill(scores[i][j], double.NegativeInfinity);
        }
    }
    // first fill in word probabilities (candidate words up to 10 chars long)
    for (int diff = 1; diff <= 10; diff++)
    {
        for (int start = 0; start + diff <= length; start++)
        {
            int end = start + diff;
            StringBuilder wordBuf = new StringBuilder();
            for (int pos = start; pos < end; pos++)
            {
                wordBuf.Append(s[pos]);
            }
            string word = wordBuf.ToString();
            foreach (string tag in POSes)
            {
                IntTaggedWord itw = new IntTaggedWord(word, tag, deltaWordIndex, tagIndex);
                double score = lex.Score(itw, 0, word, null);
                if (start == 0)
                {
                    // sentence-initial word also pays the initial-POS cost
                    score += Math.Log(initialPOSDist.ProbabilityOf(tag));
                }
                scores[start][end][itw.Tag()] = score;
                splitBacktrace[start][end][itw.Tag()] = end;
            }
        }
    }
    // now fill in word combination probabilities
    for (int diff_1 = 2; diff_1 <= length; diff_1++)
    {
        for (int start = 0; start + diff_1 <= length; start++)
        {
            int end = start + diff_1;
            for (int split = start + 1; split < end && split - start <= 10; split++)
            {
                foreach (string tag in POSes)
                {
                    int tagNum = tagIndex.AddToIndex(tag);
                    // only extend spans whose first piece is a single word ending at split
                    if (splitBacktrace[start][split][tagNum] != split)
                    {
                        continue;
                    }
                    Distribution<string> rTagDist = markovPOSDists[tag];
                    if (rTagDist == null)
                    {
                        continue;
                    }
                    // this happens with "*" POS
                    foreach (string rTag in POSes)
                    {
                        int rTagNum = tagIndex.AddToIndex(rTag);
                        double newScore = scores[start][split][tagNum] + scores[split][end][rTagNum] + Math.Log(rTagDist.ProbabilityOf(rTag));
                        if (newScore > scores[start][end][tagNum])
                        {
                            scores[start][end][tagNum] = newScore;
                            splitBacktrace[start][end][tagNum] = split;
                            POSbacktrace[start][end][tagNum] = rTagNum;
                        }
                    }
                }
            }
        }
    }
    // recover the best segmentation by walking the backtrace from [0, length]
    int nextPOS = ArrayMath.Argmax(scores[0][length]);
    List<IHasWord> words = new List<IHasWord>();
    int start_1 = 0;
    while (start_1 < length)
    {
        int split = splitBacktrace[start_1][length][nextPOS];
        StringBuilder wordBuf = new StringBuilder();
        for (int i_1 = start_1; i_1 < split; i_1++)
        {
            wordBuf.Append(s[i_1]);
        }
        string word = wordBuf.ToString();
        // String tag = tagIndex.get(nextPOS);
        // words.add(new TaggedWord(word, tag));
        words.Add(new Word(word));
        if (split < length)
        {
            nextPOS = POSbacktrace[start_1][length][nextPOS];
        }
        start_1 = split;
    }
    return words;
}
/// <summary>
/// Verifies that ArrayMath.Argmax returns the index of the maximum element:
/// the value at the argmax index must equal ArrayMath.Max within 1e-5.
/// </summary>
public virtual void TestArgmax()
{
    // NUnit's overload is AreEqual(double expected, double actual, double delta).
    // The original (Java-converted) code passed (expected, delta, actual),
    // which asserted d[argmax] == 1e-5 with delta Max(d) — a vacuous test.
    NUnit.Framework.Assert.AreEqual(d1[ArrayMath.Argmax(d1)], ArrayMath.Max(d1), 1e-5);
    NUnit.Framework.Assert.AreEqual(d2[ArrayMath.Argmax(d2)], ArrayMath.Max(d2), 1e-5);
    NUnit.Framework.Assert.AreEqual(d3[ArrayMath.Argmax(d3)], ArrayMath.Max(d3), 1e-5);
}