/// <summary>
/// Calculates P(tag|word): the count stored for <paramref name="tag"/> in
/// AssociatedTagDict divided by WordCount.
/// </summary>
/// <param name="tag">Tag to look up in AssociatedTagDict.</param>
/// <returns>The ratio described above, or 0 when the tag has no entry in AssociatedTagDict.</returns>
public double GetProbabilityTagGivenWord(Tag tag)
{
    // A tag without an entry contributes no probability.
    if (!AssociatedTagDict.ContainsKey(tag))
    {
        return 0;
    }

    // Convert to double before dividing so the ratio is not truncated.
    var tagOccurrences = (double)AssociatedTagDict[tag];
    return tagOccurrences / WordCount;
}
/// <summary>
/// Records one more occurrence of <paramref name="tag"/>: its counter in
/// AssociatedTagDict is created at 1 or incremented, and WordCount is incremented.
/// </summary>
/// <param name="tag">Tag whose counter is updated.</param>
public void AddTag(Tag tag)
{
    if (!AssociatedTagDict.ContainsKey(tag))
    {
        // First time this tag is seen: start its counter at one.
        AssociatedTagDict.Add(tag, 1);
    }
    else
    {
        // Tag already known: bump its counter.
        AssociatedTagDict[tag] += 1;
    }

    // WordCount grows on every call, whether or not the tag was new.
    WordCount += 1;
}
/// <summary>
/// Creates a tagger with empty dictionaries and training lists, equal
/// interpolation weights (1/3 each) and the "Dump" placeholder tag.
/// </summary>
public Tagger()
{
    // Every lamda weight starts at the same value.
    const double equalWeight = 1.0 / 3;

    // Lookup tables keyed by string.
    WordDict = new Dictionary<string, Word>();
    TagDict = new Dictionary<string, Tag>();
    Seq2TagDict = new Dictionary<string, Sequence2Tag>();
    Seq3TagDict = new Dictionary<string, Sequence3Tag>();
    TagDescDict = new Dictionary<string, string>();

    // Training data containers.
    TrainingSentenceList = new List<string>();
    TrainingTaggedSentenceList = new List<string>();

    Lamda1 = equalWeight;
    Lamda2 = equalWeight;
    Lamda3 = equalWeight;

    // Placeholder tag; GetTagList uses it for the positions before the first word.
    DumpTag = new Tag { Value = "Dump" };

    // Runs last, after TagDescDict exists (presumably it fills TagDescDict - verify in MakeTagDescDictionary).
    MakeTagDescDictionary();
}
/// <summary>
/// Calculates P(word|tag) by delegating to <c>tag.GetProbabilityWordGivenTag(word)</c>.
/// </summary>
/// <param name="word">Word whose probability is requested.</param>
/// <param name="tag">Tag the probability is conditioned on; the computation is performed by this object.</param>
/// <returns>The value returned by the tag.</returns>
private double GetProbabilityWordGivenTag(Word word, Tag tag)
{
    var probability = tag.GetProbabilityWordGivenTag(word);
    return probability;
}
/// <summary>
/// Calculates P(tag|word) by delegating to <c>word.GetProbabilityTagGivenWord(tag)</c>.
/// </summary>
/// <param name="word">Word the probability is conditioned on; the computation is performed by this object.</param>
/// <param name="tag">Tag whose probability is requested.</param>
/// <returns>The value returned by the word.</returns>
private double GetProbabilityTagGivenWord(Word word, Tag tag)
{
    var probability = word.GetProbabilityTagGivenWord(tag);
    return probability;
}
/// <summary>
/// Counts one occurrence of the tag triple (tag1, tag2, tag3) in Seq3TagDict.
/// The dictionary key is the three tag values joined with SeparteStr.
/// </summary>
/// <param name="tag1">First tag of the triple.</param>
/// <param name="tag2">Second tag of the triple.</param>
/// <param name="tag3">Third tag of the triple.</param>
private void AddSequence3Tag(Tag tag1, Tag tag2, Tag tag3)
{
    var key = tag1.Value + SeparteStr + tag2.Value + SeparteStr + tag3.Value;

    Sequence3Tag existing;
    if (Seq3TagDict.TryGetValue(key, out existing))
    {
        // Triple already recorded: only its counter changes.
        existing.Count++;
        return;
    }

    // First occurrence of this triple: register it with a count of one.
    Seq3TagDict.Add(key, new Sequence3Tag
    {
        Tag1 = tag1,
        Tag2 = tag2,
        Tag3 = tag3,
        Count = 1
    });
}
/// <summary>
/// Finds the highest-scoring tag sequence for a sentence by filling a Viterbi table
/// over tag triples and backtracking from the best final entry.
/// Scores are sums of logarithms of probabilities.
/// </summary>
/// <param name="inputWordList">
/// Words of the sentence in order. The candidate tags for each position are the keys
/// of that word's AssociatedTagDict.
/// </param>
/// <param name="isSuccessful">
/// True when a tag sequence was produced; false when the input is null or empty, or
/// when some position has no candidate tag.
/// </param>
/// <param name="message">Empty on success; otherwise the reason tagging failed.</param>
/// <returns>One tag per input word on success; an empty list on failure.</returns>
public List<Tag> GetTagList(List<Word> inputWordList, out bool isSuccessful, out string message)
{
    // The table is seeded from position 0, so an empty sentence cannot be processed.
    if (inputWordList == null || inputWordList.Count == 0)
    {
        isSuccessful = false;
        message = "The input word list is empty.";
        return new List<Tag>();
    }

    var length = inputWordList.Count;

    // Viterbi table: one list of candidate entries per word position.
    var viterbiArr = new List<ViterbiObject>[length];

    #region initialization

    // First position: both preceding tags are the placeholder tag.
    var firstWord = inputWordList[0];
    viterbiArr[0] = firstWord.AssociatedTagDict.Keys
        .Select(tag3 => new ViterbiObject()
        {
            Tag1 = DumpTag,
            Tag2 = DumpTag,
            Tag3 = tag3,
            // Logs are stored so that scores can be added instead of multiplied.
            Pi = Math.Log(GetProbabilitySequence3Tag(DumpTag, DumpTag, tag3))
                 + Math.Log(GetProbabilityWordGivenTag(firstWord, tag3))
        })
        .ToList();

    // Second position: one entry per (first-word tag, second-word tag) pair.
    if (length >= 2)
    {
        viterbiArr[1] = (from tag2 in inputWordList[0].AssociatedTagDict.Keys
                         from tag3 in inputWordList[1].AssociatedTagDict.Keys
                         let prevViterbi = viterbiArr[0].FirstOrDefault(m => m.Tag3 == tag2)
                         where prevViterbi != null
                         select new ViterbiObject()
                         {
                             Tag1 = DumpTag,
                             Tag2 = tag2,
                             Tag3 = tag3,
                             Pi = prevViterbi.Pi
                                  + Math.Log(GetProbabilitySequence2Tag(tag2, tag3))
                                  + Math.Log(GetProbabilityWordGivenTag(inputWordList[1], tag3))
                         }).ToList();
    }

    #endregion

    #region calculate Pi and generate viterbi table

    // Positions 3..n: for every (tag2, tag3) pair keep only the best-scoring tag1.
    for (var i = 2; i < length; i++)
    {
        var viterbiList = new List<ViterbiObject>();

        foreach (var tag3 in inputWordList[i].AssociatedTagDict.Keys)
        {
            // Depends only on the current word and tag3, so compute it once per tag3.
            var emissionLog = Math.Log(GetProbabilityWordGivenTag(inputWordList[i], tag3));

            foreach (var tag2 in inputWordList[i - 1].AssociatedTagDict.Keys)
            {
                ViterbiObject maxValue = null;

                foreach (var tag1 in inputWordList[i - 2].AssociatedTagDict.Keys)
                {
                    var prevViterbi = viterbiArr[i - 1]
                        .FirstOrDefault(m => m.Tag3 == tag2 && m.Tag2 == tag1);
                    if (prevViterbi == null)
                    {
                        continue;
                    }

                    var currentPi = prevViterbi.Pi
                                    + Math.Log(GetProbabilitySequence3Tag(tag1, tag2, tag3))
                                    + emissionLog;

                    if (maxValue == null || maxValue.Pi < currentPi)
                    {
                        maxValue = new ViterbiObject()
                        {
                            Tag1 = tag1,
                            Tag2 = tag2,
                            Tag3 = tag3,
                            Pi = currentPi
                        };
                    }
                }

                // No predecessor was found (e.g. the word two positions back has no
                // candidate tags). Storing null here would crash the lookups below.
                if (maxValue != null)
                {
                    viterbiList.Add(maxValue);
                }
            }
        }

        viterbiArr[i] = viterbiList;
    }

    #endregion

    // A position without any entry means no complete path exists; report it instead
    // of failing while selecting the best final entry.
    for (var i = 0; i < length; i++)
    {
        if (viterbiArr[i].Count == 0)
        {
            isSuccessful = false;
            message = "No candidate tag sequence could be built up to the word at position " + i + ".";
            return new List<Tag>();
        }
    }

    #region backtrack to find sequence tag

    var tagArr = new Tag[length];

    // The best final entry fixes the last three tags at once.
    var best = viterbiArr[length - 1].MaxBy(m => m.Pi);
    tagArr[length - 1] = best.Tag3;
    if (length > 1)
    {
        tagArr[length - 2] = best.Tag2;
    }
    if (length > 2)
    {
        tagArr[length - 3] = best.Tag1;
    }

    // Remaining tags: the entry at position i + 2 that matches the two tags already
    // chosen holds the tag for position i in Tag1.
    for (var i = length - 4; i >= 0; i--)
    {
        var viterbiObj = viterbiArr[i + 2]
            .FirstOrDefault(m => m.Tag3 == tagArr[i + 2] && m.Tag2 == tagArr[i + 1]);
        if (viterbiObj != null)
        {
            tagArr[i] = viterbiObj.Tag1;
        }
    }

    #endregion

    isSuccessful = true;
    message = "";
    return tagArr.ToList();
}
/// <summary>
/// Calculates the interpolated probability P(tag3|tag1,tag2).
/// Equation: P(tag3|tag1,tag2)
/// = lamda1 * (Count(tag1,tag2,tag3)/Count(tag1,tag2))
/// + lamda2 * (Count(tag2,tag3)/Count(tag2))
/// + lamda3 * (Count(tag3)/WordCount)
/// divided by (lamda1 + lamda2 + lamda3).
/// When tag2 is the placeholder DumpTag, Count(tag2) is replaced by the number of
/// training sentences. A term whose counts are missing or whose denominator is zero
/// contributes 0 instead of producing NaN/Infinity or throwing.
/// </summary>
/// <param name="tag1">First tag of the context; when null the three-tag term is skipped.</param>
/// <param name="tag2">Second tag of the context; when null both context terms are skipped.</param>
/// <param name="tag3">Tag whose probability is requested; when null the result is 0.</param>
/// <returns>The interpolated probability.</returns>
public double GetProbabilitySequence3Tag(Tag tag1, Tag tag2, Tag tag3)
{
    if (tag3 == null)
    {
        return 0;
    }

    double firstParam = 0;
    double secondParam = 0;

    if (tag2 != null)
    {
        // First term: three-tag count over the count of its two-tag prefix.
        if (tag1 != null)
        {
            var key12 = tag1.Value + SeparteStr + tag2.Value;
            Sequence2Tag seq12;
            if (Seq2TagDict.TryGetValue(key12, out seq12) && seq12.Count > 0)
            {
                var key123 = key12 + SeparteStr + tag3.Value;
                Sequence3Tag seq123;
                if (Seq3TagDict.TryGetValue(key123, out seq123))
                {
                    firstParam = Lamda1 * seq123.Count / seq12.Count;
                }
            }
        }

        // Second term: two-tag count over the count of tag2.
        var key23 = tag2.Value + SeparteStr + tag3.Value;
        Sequence2Tag seq23;
        if (Seq2TagDict.TryGetValue(key23, out seq23))
        {
            double denominator = 0;
            if (tag2 == DumpTag)
            {
                // The placeholder tag is normalised by the number of training sentences.
                denominator = TrainingSentenceList.Count;
            }
            else
            {
                Tag knownTag2;
                if (TagDict.TryGetValue(tag2.Value, out knownTag2))
                {
                    denominator = knownTag2.TagCount;
                }
            }

            if (denominator > 0)
            {
                secondParam = Lamda2 * seq23.Count / denominator;
            }
        }
    }

    // Third term: count of tag3 over WordCount.
    var thirdParam = WordCount > 0 ? Lamda3 * tag3.TagCount / WordCount : 0.0;

    return (firstParam + secondParam + thirdParam) / (Lamda1 + Lamda2 + Lamda3);
}
/// <summary>
/// Calculates the interpolated probability P(tag2|tag1).
/// Equation: P(tag2|tag1)
/// = lamda2 * (Count(tag1,tag2)/Count(tag1))
/// + lamda3 * (Count(tag2)/WordCount)
/// divided by (lamda2 + lamda3).
/// When tag1 is the placeholder DumpTag, Count(tag1) is replaced by the number of
/// training sentences, matching GetProbabilitySequence3Tag. A term whose counts are
/// missing or whose denominator is zero contributes 0 instead of producing
/// NaN/Infinity or throwing.
/// </summary>
/// <param name="tag1">Context tag; when null the two-tag term is skipped.</param>
/// <param name="tag2">Tag whose probability is requested; when null the result is 0.</param>
/// <returns>The interpolated probability.</returns>
public double GetProbabilitySequence2Tag(Tag tag1, Tag tag2)
{
    if (tag2 == null)
    {
        return 0;
    }

    double firstParam = 0;

    // First term: two-tag count over the count of tag1.
    if (tag1 != null)
    {
        var key = tag1.Value + SeparteStr + tag2.Value;
        Sequence2Tag seq;
        if (Seq2TagDict.TryGetValue(key, out seq))
        {
            double denominator = 0;
            if (tag1 == DumpTag)
            {
                // The placeholder tag is normalised by the number of training sentences.
                denominator = TrainingSentenceList.Count;
            }
            else
            {
                Tag knownTag1;
                if (TagDict.TryGetValue(tag1.Value, out knownTag1))
                {
                    denominator = knownTag1.TagCount;
                }
            }

            if (denominator > 0)
            {
                firstParam = Lamda2 * seq.Count / denominator;
            }
        }
    }

    // Second term: count of tag2 over WordCount.
    var secondParam = WordCount > 0 ? Lamda3 * tag2.TagCount / WordCount : 0.0;

    return (firstParam + secondParam) / (Lamda2 + Lamda3);
}
/// <summary>
/// Returns the Tag registered in TagDict under <paramref name="tag"/>, creating and
/// registering a new one when no entry exists yet.
/// </summary>
/// <param name="tag">Tag text; used both as the TagDict key and as the new tag's Value.</param>
/// <returns>The existing or newly created Tag.</returns>
public Tag AddTag(string tag)
{
    Tag registered;
    if (!TagDict.TryGetValue(tag, out registered))
    {
        // Unknown tag text: create the Tag and remember it for later calls.
        registered = new Tag { Value = tag };
        TagDict.Add(tag, registered);
    }

    return registered;
}