/// <summary>
/// Builds the observation (emission) model from a list of tagged words: for every
/// (word content, part of speech) pair that occurs in <paramref name="words"/>, the value is
/// count(content, part of speech) / count(part of speech).
/// </summary>
/// <param name="words">Tagged words; each entry contributes its Content and PartOfSpeech.</param>
/// <returns>
/// A map from each observed (content, part of speech) pair to its relative frequency among all
/// words carrying that part of speech. Pairs that never occur are absent from the map.
/// </returns>
static Dictionary<ObservationFromState, double> createObservationModel(List<Word> words)
{
    // How many times each (content, part of speech) pair occurs.
    Dictionary<ObservationFromState, int> wordCount = new Dictionary<ObservationFromState, int>();
    // How many times each part of speech occurs.
    Dictionary<WordType, int> typeCount = new Dictionary<WordType, int>();

    // Every word is counted, including the last one: the earlier bound of Count - 1
    // silently dropped the final word even though nothing here looks at a successor.
    foreach (Word word in words)
    {
        // TryGetValue leaves 'seen' at 0 when the key is missing, so "seen + 1" covers both cases.
        int seen;
        typeCount.TryGetValue(word.PartOfSpeech, out seen);
        typeCount[word.PartOfSpeech] = seen + 1;

        ObservationFromState observation = new ObservationFromState(word.Content, word.PartOfSpeech);
        wordCount.TryGetValue(observation, out seen);
        wordCount[observation] = seen + 1;
    }

    // The actual model: each pair count normalised by the count of its part of speech.
    Dictionary<ObservationFromState, double> model = new Dictionary<ObservationFromState, double>();
    foreach (KeyValuePair<ObservationFromState, int> pair in wordCount)
    {
        model.Add(pair.Key, (double)pair.Value / (double)typeCount[pair.Key.state]);
    }

    return model;
}
/// <summary>
/// Writes the observation model to the file "observer.csv" as a comma-separated matrix.
/// The header row starts with the literal cell "..." followed by every entry of
/// <paramref name="myDictionary"/> (passed through <c>escape()</c>). Each following row starts with
/// a <c>WordType</c> name and holds one cell per dictionary entry: the model's probability for
/// that (word, type) pair, or an empty cell when the model has no entry for it.
/// </summary>
/// <param name="model">Observation probabilities keyed by (word, part of speech).</param>
/// <param name="myDictionary">The words that form the columns, in output order.</param>
public static void printObservationMatrix(Dictionary<ObservationFromState, double> model, List<string> myDictionary)
{
    WordType[] types = (WordType[])Enum.GetValues(typeof(WordType));
    // Single lookup key, re-pointed at each (type, word) cell below.
    ObservationFromState obsv = new ObservationFromState("", WordType.Undefined);
    StringBuilder output = new StringBuilder();

    // Header
    output.Append("...,");
    foreach (string word in myDictionary)
    {
        output.Append(word.escape()).Append(',');
    }
    output.AppendLine();

    // One row per part of speech.
    foreach (WordType rowType in types)
    {
        obsv.state = rowType;
        output.Append(rowType.ToString()).Append(',');

        foreach (string word in myDictionary)
        {
            obsv.observation = word;

            double probability;
            if (model.TryGetValue(obsv, out probability))
            {
                // Invariant culture: StringBuilder.Append(double) would use the current culture,
                // and a decimal comma (e.g. "0,25") would split the value across two CSV cells.
                output.Append(probability.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }
            output.Append(',');
        }
        output.AppendLine();
    }

    System.IO.File.WriteAllText("observer.csv", output.ToString());
}
// Φ[t](i) = argmax[j](δ[t-1](j) * a[ji])
// P(X at time t) = max[for each i = prevstate] (P(i at time t - 1) * P(X|i) * P(observation at time t|X))
// A = transition matrix
// B = emission matrix, B[ij] = prob of observing o[j] from state s[i]
/* N=length(O); # number of observation categories
 * K=length(S); # number of hidden states
 * T=length(Y); # length of observation series
 */
/// <summary>
/// Does the Viterbi algorithm, giving the most likely parts of speech for a sentence.
/// Any probability missing from one of the three tables is replaced by <c>EPSILON</c>.
/// </summary>
/// <param name="sentence">The sentence.</param>
/// <param name="transitionProbs">The probability of transitioning from one part of speech to another (probability: 0-1)</param>
/// <param name="observationProbs">The probability of observing a word given a part of speech: (probability: 0-1) (i.e. if the part of speech is "Noun", what is the probability that word is "Dog" or "Cat" ? )</param>
/// <param name="initialProbs">The probability that a part of speech appears at the beginning of a sentence (probability: 0-1)</param>
/// <returns>
/// Returns the 1-based (NOT 0-Based) array of the parts of speech: the array has
/// <c>sentence.Count + 1</c> entries, entry <c>t</c> is the tag of word <c>t</c> (1..Count) and
/// entry 0 is left at its default value. For an empty sentence the array has only that unused entry.
/// </returns>
public static WordType[] DoViterbi(List<string> sentence, Dictionary<StateTransition, double> transitionProbs, Dictionary<ObservationFromState, double> observationProbs, Dictionary<WordType, double> initialProbs)
{
    const int NUM_POSSIBLE_STATES_K = (int)WordType.COUNT;
    int WORKING_LEN_T = sentence.Count;

    // X[t] = most likely state at time t (1-based).
    WordType[] X = new WordType[WORKING_LEN_T + 1];
    if (WORKING_LEN_T == 0)
    {
        // Nothing to tag; without this guard sentence[0] below would be out of range.
        return X;
    }

    // 1-based tables: row = state index (1..K), column = time step (1..T).
    // T1[s, t] = probability of the best path that ends in state s at time t.
    // T2[s, t] = predecessor state of s on that best path (back pointer).
    PrettyTable<double> T1 = new PrettyTable<double>(NUM_POSSIBLE_STATES_K + 1, WORKING_LEN_T + 1);
    PrettyTable<int> T2 = new PrettyTable<int>(NUM_POSSIBLE_STATES_K + 1, WORKING_LEN_T + 1);
    // Z[t] = index of the most likely state at time t.
    int[] Z = new int[WORKING_LEN_T + 1];

    // Initialisation: initial probability of each state times the emission of the first word.
    for (int k = 1; k <= NUM_POSSIBLE_STATES_K; k++)
    {
        WordType state = (WordType)k;
        ObservationFromState firstObservation = new ObservationFromState(sentence[0], state);

        double initial;
        if (!initialProbs.TryGetValue(state, out initial))
        {
            initial = EPSILON;
        }

        double firstEmission;
        if (!observationProbs.TryGetValue(firstObservation, out firstEmission))
        {
            firstEmission = EPSILON;
        }

        T1[k, 1] = initial * firstEmission;
        T2[k, 1] = 0;
    }

    // Recursion over the remaining words.
    for (int t = 2; t <= WORKING_LEN_T; t++)
    {
        // Iterates every WordType value; this method only ever reads back rows 1..K of the tables.
        foreach (WordType sj in Enum.GetValues(typeof(WordType)))
        {
            int j = (int)sj;

            // The emission of word t from state sj does not depend on the predecessor,
            // so it is looked up once instead of once per candidate predecessor.
            ObservationFromState observation = new ObservationFromState(sentence[t - 1], sj);
            double emission;
            if (!observationProbs.TryGetValue(observation, out emission))
            {
                emission = EPSILON;
            }

            // Transition key "k -> sj". The destination must be the candidate state sj;
            // it was previously built from the time index (i - 1), which looked up
            // unrelated transitions. The 'from' part is filled in per predecessor below.
            StateTransition stateTrans = new StateTransition(WordType.Undefined, sj);

            int bestPrev = int.MinValue;
            double bestPrevScore = double.MinValue;
            for (int k = 1; k <= NUM_POSSIBLE_STATES_K; k++)
            {
                stateTrans.from = (WordType)k;

                double transition;
                if (!transitionProbs.TryGetValue(stateTrans, out transition))
                {
                    transition = EPSILON;
                }

                double score = T1[k, t - 1] * transition;
                if (score > bestPrevScore)
                {
                    bestPrevScore = score;
                    bestPrev = k;
                }
            }

            // Scaling by the (k-independent) emission does not change which predecessor wins,
            // so one pass yields both the path probability and the back pointer.
            T1[j, t] = bestPrevScore * emission;
            T2[j, t] = bestPrev;
        }
    }

    // Termination: pick the most probable final state.
    double maxKTval = double.MinValue;
    for (int k = 1; k <= NUM_POSSIBLE_STATES_K; k++)
    {
        if (T1[k, WORKING_LEN_T] > maxKTval)
        {
            maxKTval = T1[k, WORKING_LEN_T];
            Z[WORKING_LEN_T] = k;
        }
    }
    X[WORKING_LEN_T] = (WordType)Z[WORKING_LEN_T];

    // Backtrace: follow the back pointers from the last word to the first.
    for (int t = WORKING_LEN_T; t >= 2; t--)
    {
        Z[t - 1] = T2[Z[t], t];
        X[t - 1] = (WordType)Z[t - 1];
    }

    return X;
}