/// <summary>
/// Convenience overload: tags the sentence with the rare-word splitting
/// option switched off.
/// </summary>
/// <param name="str">Words of the sentence to tag, in order.</param>
/// <param name="q">Trigram model queried for tag-sequence scores.</param>
/// <param name="e">Bigram model queried for word/tag scores.</param>
/// <param name="freqList">Unigram used to decide whether a word is known.</param>
/// <param name="tags">Candidate tags.</param>
/// <returns>Whatever the six-argument overload returns for <c>splitRare = false</c>.</returns>
public static List<string> Viterbi(List<string> str, Trigram q, Bigram e, Unigram freqList, HashSet<string> tags)
{
    const bool splitRare = false;
    List<string> tagged = Viterbi(str, q, e, freqList, tags, splitRare);
    return tagged;
}
/// <summary>
/// Records one observation of <paramref name="w1"/> under the key
/// <paramref name="word"/>: the shared <c>unigram</c> counter and the
/// per-key counter stored in <c>freqList</c> both receive <paramref name="w1"/>,
/// and the total <c>count</c> is incremented.
/// </summary>
/// <param name="word">Key into <c>freqList</c>; a counter is created on first use.</param>
/// <param name="w1">Item added to the shared counter and to the per-key counter.</param>
public void AddWord(string word, string w1)
{
    unigram.AddWord(w1);

    // Look the per-key counter up explicitly instead of relying on a
    // catch-all exception handler, so unrelated failures are not masked.
    Unigram perWord;
    if (!freqList.TryGetValue(word, out perWord))
    {
        perWord = new Unigram();
        freqList.Add(word, perWord);
    }
    perWord.AddWord(w1);

    count++;
}
/// <summary>
/// Creates an empty instance: zero total count, a fresh shared unigram
/// counter and an empty per-key counter table.
/// </summary>
public Bigram()
{
    count = 0;
    unigram = new Unigram();
    freqList = new Dictionary<string, Unigram>();
}
/// <summary>
/// Tags a sentence position by position using trigram scores from
/// <paramref name="q"/> and word/tag scores from <paramref name="e"/>.
/// For every position a score table keyed "previousTag:tag" is built from the
/// table of the preceding position, and the tag of the best-scoring entry
/// seen at that position is emitted.
/// </summary>
/// <remarks>
/// NOTE(review): the emitted tag for a position is chosen when that position
/// is processed and is never revised afterwards (no back-pointers are kept).
/// </remarks>
/// <param name="str">Words of the sentence, in order. May be empty.</param>
/// <param name="q">Trigram model; <c>Qml(v, w, u)</c> is multiplied into the score.</param>
/// <param name="e">Bigram model; <c>Qml(word, v)</c> is multiplied into the score for known words.</param>
/// <param name="freqList">Words for which <c>Contains</c> is true are treated as known.</param>
/// <param name="tags">Candidate tags iterated at every position.</param>
/// <param name="splitRare">
/// When true, unknown words are labelled with <c>ProperRare(word)</c>;
/// otherwise with the literal "_RARE_".
/// </param>
/// <returns>
/// One entry per word. A known word for which no candidate produced a
/// usable score keeps the placeholder "STOP".
/// </returns>
public static List<string> Viterbi(List<string> str, Trigram q, Bigram e, Unigram freqList, HashSet<string> tags, bool splitRare)
{
    List<string> y = new List<string>();

    // Nothing to tag: return the empty result instead of failing on str[0].
    if (str.Count == 0)
    {
        return y;
    }

    Dictionary<string, double> current = new Dictionary<string, double>();
    Dictionary<string, double> previous;
    string key;
    double prob = 0;
    double max = 0;

    // ---- Position 0: both history slots are the start symbol "*". ----
    if (freqList.Contains(str[0]))
    {
        // Placeholder that is replaced by the best tag found below.
        y.Add("STOP");
        foreach (string v in tags)
        {
            prob = q.Qml(v, "*", "*") * e.Qml(str[0], v);
            current.Add("*:" + v, prob);
            if (prob >= max)
            {
                max = prob;
                y.RemoveAt(y.Count - 1);
                y.Add(v);
            }
        }
    }
    else
    {
        string v = (splitRare) ? ProperRare(str[0]) : "_RARE_";
        y.Add(v);
        // The table entry is always stored under "_RARE_", even when the
        // emitted label comes from ProperRare.
        prob = Math.Max(q.Qml("_RARE_", "*", "*"), 0);
        current.Add("*:" + "_RARE_", prob);
    }

    // Single-word sentence: the original indexed str[1] unconditionally and threw.
    if (str.Count == 1)
    {
        return y;
    }

    // ---- Position 1: one real tag of history, preceded by "*". ----
    previous = current;
    current = new Dictionary<string, double>();
    max = 0;
    if (freqList.Contains(str[1]))
    {
        y.Add("STOP");
        foreach (string u in tags)
        {
            foreach (string v in tags)
            {
                try
                {
                    key = "*:" + u;
                    prob = previous[key] * q.Qml(v, "*", u) * e.Qml(str[1], v);
                    current.Add(string.Format("{0}:{1}", u, v), prob);
                    if (prob >= max)
                    {
                        max = prob;
                        y.RemoveAt(y.Count - 1);
                        y.Add(v);
                    }
                }
                catch (Exception)
                {
                    // Best effort: a missing table entry or a failing model
                    // lookup simply skips this (u, v) combination.
                    continue;
                }
            }
        }
    }
    else
    {
        string v = (splitRare) ? ProperRare(str[1]) : "_RARE_";
        y.Add(v);
        foreach (string u in tags)
        {
            try
            {
                key = "*:" + u;
                prob = previous[key] * q.Qml(v, "*", u) * e.Qml(str[1], v);
                current.Add(string.Format("{0}:{1}", u, v), prob);
                if (prob >= max)
                {
                    max = prob;
                    y.RemoveAt(y.Count - 1);
                    y.Add(v);
                }
            }
            catch (Exception)
            {
                // Best effort: skip history tags that cannot be scored.
                continue;
            }
        }
    }

    // ---- Positions 2..n-1: full two-tag history (w, u). ----
    for (int i = 2; i < str.Count; i++)
    {
        previous = current;
        current = new Dictionary<string, double>();
        max = 0;
        if (freqList.Contains(str[i]))
        {
            y.Add("STOP");
            foreach (string v in tags)
            {
                foreach (string u in tags)
                {
                    foreach (string w in tags)
                    {
                        key = w + ":" + u;
                        try
                        {
                            prob = previous[key] * q.Qml(v, w, u) * e.Qml(str[i], v);
                        }
                        catch (Exception)
                        {
                            // Unscorable combination counts as probability zero.
                            prob = 0;
                        }

                        StoreBestScore(current, string.Format("{0}:{1}", u, v), prob);

                        if (prob >= max)
                        {
                            max = prob;
                            y.RemoveAt(y.Count - 1);
                            y.Add(v);
                        }
                    }
                }
            }
        }
        else
        {
            string v = (splitRare) ? ProperRare(str[i]) : "_RARE_";
            y.Add(v);
            foreach (string u in tags)
            {
                foreach (string w in tags)
                {
                    key = w + ":" + u;
                    try
                    {
                        // Unknown words are scored from the trigram model only.
                        prob = previous[key] * q.Qml(v, w, u);
                    }
                    catch (Exception)
                    {
                        prob = 0;
                    }

                    StoreBestScore(current, string.Format("{0}:{1}", u, v), prob);

                    if (prob >= max)
                    {
                        max = prob;
                        y.RemoveAt(y.Count - 1);
                        y.Add(v);
                    }
                }
            }
        }
    }

    return y;
}

/// <summary>
/// Keeps the larger of the stored and the offered score for a table entry.
/// Scores that are not strictly positive are never stored.
/// </summary>
private static void StoreBestScore(Dictionary<string, double> table, string state, double score)
{
    if (!(score > 0))
    {
        return;
    }

    double existing;
    if (!table.TryGetValue(state, out existing) || score > existing)
    {
        table[state] = score;
    }
}
/// <summary>
/// Makes the model available in the static fields <c>transition</c>,
/// <c>emission</c>, <c>freqList</c> and <c>tags</c>.
/// If all four serialized files exist they are loaded; otherwise the
/// training data is parsed and the four files are written.
/// </summary>
public static void Train()
{
    // The unigram file is deserialized below, so it must take part in the
    // cache check as well; otherwise a missing unigram file makes loading fail
    // instead of triggering a rebuild.
    bool cached = File.Exists(trigramPath)
        && File.Exists(bigramPath)
        && File.Exists(unigramPath)
        && File.Exists(tagsPath);

    if (cached)
    {
        transition = (Trigram)DeserializeModel(trigramPath);
        emission = (Bigram)DeserializeModel(bigramPath);
        freqList = (Unigram)DeserializeModel(unigramPath);
        tags = DeserializeTags(tagsPath);
        return;
    }

    // Reuse an already rewritten training file when one is lying around;
    // otherwise rewrite the raw training file first (with rare-word splitting).
    // NOTE(review): ReplaceTrainingFile writes "tmpSP.train", not "tmp.train" -
    // confirm which file is meant to be reused here.
    if (File.Exists("tmp.train"))
    {
        ParseTrainingData("tmp.train");
    }
    else
    {
        ParseTrainingData(ReplaceTrainingFile(trainingPath, BuildFreqList(trainingPath), true));
    }

    // Presumably ParseTrainingData fills the static model fields that are
    // persisted here - verify against its definition.
    SerializeModel(trigramPath, transition);
    SerializeModel(bigramPath, emission);
    SerializeModel(unigramPath, freqList);
    SerializeTags(tagsPath);
}
/// <summary>
/// Copies the training file at <paramref name="path"/> to "tmpSP.train",
/// rewriting every non-empty line whose first space-separated field has a
/// count below 5 in <paramref name="freqList"/>.
/// </summary>
/// <remarks>
/// A rewritten line consists of the first field followed by either
/// <c>SplitRareTag(field)</c> or the literal " _RARE_"; the rest of the
/// original line is discarded. All other lines, including empty ones, are
/// copied unchanged.
/// </remarks>
/// <param name="path">Training file to read.</param>
/// <param name="freqList">Counts indexed by the first field of each line.</param>
/// <param name="splitRares">Selects <c>SplitRareTag</c> (true) or " _RARE_" (false) as the suffix.</param>
/// <returns>The path of the file that was written ("tmpSP.train").</returns>
static string ReplaceTrainingFile(string path, Unigram freqList, bool splitRares)
{
    const int rareThreshold = 5;
    string outpath = "tmpSP.train";

    // using blocks guarantee both files are closed even if reading, the
    // frequency lookup or writing throws part-way through.
    using (StreamReader sr = new StreamReader(path))
    using (StreamWriter sw = new StreamWriter(outpath))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (line == "")
            {
                sw.WriteLine(line);
                continue;
            }

            string[] str = line.Split(' ');
            if (freqList[str[0]] < rareThreshold)
            {
                sw.WriteLine(str[0] + ((splitRares) ? SplitRareTag(str[0]) : " _RARE_"));
            }
            else
            {
                sw.WriteLine(line);
            }
        }
    }

    return outpath;
}
/// <summary>
/// Convenience overload: rewrites the training file with the rare-splitting
/// option switched off.
/// </summary>
/// <param name="path">Training file to read.</param>
/// <param name="freqList">Counts passed through to the three-argument overload.</param>
/// <returns>The path returned by the three-argument overload.</returns>
static string ReplaceTrainingFile(string path, Unigram freqList)
{
    const bool splitRares = false;
    string rewrittenPath = ReplaceTrainingFile(path, freqList, splitRares);
    return rewrittenPath;
}