/// <summary>
/// Static initializer: loads the person-name dictionary and its transition matrix,
/// then builds the pattern trie keyed by the textual name of every
/// <c>NRPattern</c> value from 0 up to and including <c>NRPattern.XD</c>.
/// </summary>
/// <remarks>
/// When the dictionary fails to load, initialization stops early and both the
/// transition matrix and the trie are left unassigned.
/// </remarks>
static ChsPersonNameDict()
{
    dictionary = new NRDictionary();
    bool loaded = dictionary.Load(Config.Person_Dict_Path);
    if (!loaded)
    {
        // log: loading error
        return;
    }

    transformMatrixDictionary = new TransformMatrixDictionary<NR>(typeof(NR));
    transformMatrixDictionary.Load(Config.Person_TR_Dict_Path);

    _trie = new ACDoubleArrayTrie<NRPattern>();

    // Map each pattern name to its enum value, e.g. "XD" -> NRPattern.XD.
    var patternsByName = new SortedDictionary<string, NRPattern>(StrComparer.Default);
    for (int ordinal = 0; ordinal <= (int)NRPattern.XD; ordinal++)
    {
        var pattern = (NRPattern)ordinal;
        patternsByName.Add(pattern.ToString(), pattern);
    }

    _trie.Build(patternsByName);
}
/// <summary>
/// Specialised HMM solver: walks the vertex sequence once and confirms a nature
/// for each vertex, keeping only the costs of the two most recent nodes.
/// </summary>
/// <remarks>
/// All scores are costs (lower is better): a transition entry from
/// <c>Trans_Prob</c> minus the logarithm of a smoothed relative frequency.
/// The original comments state that the matrix entries are already negated logs.
/// <para>
/// The nature confirmed for a vertex is the predecessor state of the cheapest
/// transition into the following vertex. As the original author noted, this does
/// not guarantee that the confirmed natures all lie on one single path.
/// </para>
/// <para>
/// The last vertex is only used for scoring; <c>ConfirmNature</c> is never called on it.
/// </para>
/// </remarks>
/// <param name="vertices">Vertex sequence; element 0 is treated as the auxiliary start node.</param>
/// <param name="transMaxtrix">Transition matrix and per-nature total frequencies.</param>
public static void Compute(List<Vertex> vertices, TransformMatrixDictionary<Nature> transMaxtrix)
{
    // With fewer than two vertices there is no node after the start node to score.
    if (vertices.Count < 2)
    {
        return;
    }

    // Rolling buffer: cost[0] / cost[1] alternate between "previous node" and "current node".
    double[][] cost = new double[2][];

    // The first nature of the start node seeds the previous state
    // (per the original note it is expected to be Nature.begin).
    Nature pre = vertices[0].attr.natures[0];

    // Second node: cost of moving from the start state into each of its candidate natures.
    Vertex item = vertices[1];
    cost[0] = new double[item.attr.natures.Length];
    for (int i = 0; i < item.attr.natures.Length; i++)
    {
        var cur = item.attr.natures[i];
        cost[0][i] = transMaxtrix.Trans_Prob[(int)pre][(int)cur] -
                     Math.Log((item.attr.freqs[i] + 1e-8) / transMaxtrix.GetFreq(cur));
    }

    Vertex preItem = item;
    Nature[] preTagSet = item.attr.natures;
    for (int i = 2; i < vertices.Count; i++)
    {
        int index_i_1 = i % 2;       // slot holding the previous node's costs (0 when i is even)
        int index_i = 1 - index_i_1; // slot receiving the current node's costs

        item = vertices[i];
        var curTagSet = item.attr.natures;
        cost[index_i] = new double[curTagSet.Length];

        // Cheapest cost seen so far over all (previous state, current state) pairs of this step.
        double perfect_cost_line = double.MaxValue;

        for (int k = 0; k < curTagSet.Length; k++)
        {
            var cur = curTagSet[k];

            // The emission term depends only on the current state, so compute it once per k.
            double emission = Math.Log((item.attr.freqs[k] + 1e-8) / transMaxtrix.GetFreq(cur));

            // Fix: a fresh array is zero-filled, so without this seed the "now < cost" test
            // below compares against 0.0 and never records a non-negative cost.
            cost[index_i][k] = double.MaxValue;

            for (int n = 0; n < preTagSet.Length; n++)
            {
                var p = preTagSet[n];
                double now = cost[index_i_1][n] + transMaxtrix.Trans_Prob[(int)p][(int)cur] - emission;
                if (now < cost[index_i][k])
                {
                    cost[index_i][k] = now;
                    if (now < perfect_cost_line)
                    {
                        perfect_cost_line = now;
                        pre = p; // remember which previous state produced the overall cheapest move
                    }
                }
            }
        }

        // Commit the previous vertex to the state that led to the cheapest cost at this step.
        preItem.ConfirmNature(pre);
        preTagSet = curTagSet;
        preItem = item;
    }
}
/// <summary>
/// Static initializer: loads the place dictionary and its transition matrix,
/// then builds the pattern trie from the fixed role patterns
/// "CH", "CDH", "CDEH" and "GH" (each stored with a null value).
/// </summary>
static PlaceDictionary()
{
    dict = new NSDictionary();
    dict.Load(Config.Place_Dict_Path);

    trans_tr_dict = new TransformMatrixDictionary<NS>(typeof(NS));
    trans_tr_dict.Load(Config.Place_TR_Dict_Path);

    trie = new ACDoubleArrayTrie<string>();

    // Only the keys matter here; every pattern is registered with a null payload.
    var patterns = new SortedDictionary<string, string>(StrComparer.Default);
    foreach (var pattern in new[] { "CH", "CDH", "CDEH", "GH" })
    {
        patterns.Add(pattern, null);
    }

    trie.Build(patterns);
}
/// <summary>
/// Static initializer: loads the organization dictionary and its transition matrix,
/// then builds the pattern trie in which every <c>NTPattern</c> name from value 0
/// up to and including <c>NTPattern.WWIWWCWD</c> maps to itself.
/// </summary>
static OrgDictionary()
{
    dictionary = new NTDictionary();
    dictionary.Load(Config.Org_Dict_Path);

    transformMatrixDictionary = new TransformMatrixDictionary<NT>(typeof(NT));
    transformMatrixDictionary.Load(Config.Org_TR_Dict_Path);

    _trie = new ACDoubleArrayTrie<string>();

    var patterns = new SortedDictionary<string, string>(StrComparer.Default);
    int lastOrdinal = (int)NTPattern.WWIWWCWD;
    for (int ordinal = 0; ordinal <= lastOrdinal; ordinal++)
    {
        // Key and value are both the enum member's name.
        string name = ((NTPattern)ordinal).ToString();
        patterns.Add(name, name);
    }

    _trie.Build(patterns);
}
/// <summary>
/// Greedy "Viterbi" variant that only looks one step back: for each item it picks
/// the label with the lowest cost given the label chosen for the previous item.
/// </summary>
/// <remarks>
/// The cost of a candidate label is the <c>Trans_Prob</c> entry for
/// (previous label, candidate) minus the logarithm of the candidate's smoothed
/// relative frequency. If an item offers no labels, the previous choice is repeated.
/// </remarks>
/// <typeparam name="E">Label (state) type.</typeparam>
/// <param name="roleTagList">Observation sequence; the first label of element 0 is taken as given.</param>
/// <param name="transMatrix">Transition matrix and per-label total frequencies.</param>
/// <returns>One label per element of <paramref name="roleTagList"/>.</returns>
public static List<E> ComputeSimply<E>(List<TagFreqItem<E>> roleTagList, TransformMatrixDictionary<E> transMatrix) where E : IConvertible
{
    // The first node's label is fixed: take the first key of its label map.
    E previous = roleTagList[0].labelMap.First().Key;
    E best = previous;

    var tags = new List<E>();
    tags.Add(previous);

    for (int pos = 1; pos < roleTagList.Count; pos++)
    {
        var node = roleTagList[pos];
        double lowest = double.MaxValue;

        foreach (var candidate in node.labelMap.Keys)
        {
            double score = transMatrix.Trans_Prob[Convert.ToInt32(previous)][Convert.ToInt32(candidate)]
                           - Math.Log((node.GetFreqOrDefault(candidate) + 1e-8) / transMatrix.GetFreq(candidate));
            if (score < lowest)
            {
                lowest = score;
                best = candidate;
            }
        }

        previous = best;
        tags.Add(previous);
    }

    return tags;
}
/// <summary>
/// Standard variant of the Viterbi-style decoder: more accurate than the greedy
/// version but somewhat slower (per the original description).
/// </summary>
/// <remarks>
/// Scores are costs (lower is better); only the costs of the two most recent nodes
/// are kept. The label emitted for a node is the predecessor label of the cheapest
/// transition into the following node. The original author questioned whether labels
/// chosen this way are guaranteed to belong to one single path.
/// </remarks>
/// <typeparam name="E">Label (state) type.</typeparam>
/// <param name="roleTagList">Observation sequence, including the auxiliary first and last nodes.</param>
/// <param name="transMatrix">Transition matrix and per-label total frequencies.</param>
/// <returns>One label per element of <paramref name="roleTagList"/>; the last label repeats the first.</returns>
public static List<E> Compute<E>(List<TagFreqItem<E>> roleTagList, TransformMatrixDictionary<E> transMatrix) where E : IConvertible
{
    var tags = new List<E>(roleTagList.Count);

    // Two-row rolling buffer of per-label costs.
    var scores = new double[2][];

    // The first node's label is fixed: take the first key of its label map.
    E previous = roleTagList[0].labelMap.First().Key;
    tags.Add(previous);

    // Second node: cost of moving from the fixed first label into each candidate label.
    var node = roleTagList[1];
    scores[0] = new double[node.labelMap.Count];
    int slot = 0;
    foreach (var entry in node.labelMap)
    {
        scores[0][slot] = transMatrix.Trans_Prob[Convert.ToInt32(previous)][Convert.ToInt32(entry.Key)]
                          - Math.Log((node.GetFreqOrDefault(entry.Key) + 1e-8) / transMatrix.GetFreq(entry.Key));
        slot++;
    }

    var previousTags = node.labelMap.Keys;
    for (int pos = 2; pos < roleTagList.Count; pos++)
    {
        int prevRow = pos % 2;     // row holding the previous node's costs
        int curRow = 1 - prevRow;  // row receiving the current node's costs

        node = roleTagList[pos];
        scores[curRow] = new double[node.labelMap.Count];

        // Cheapest cost over all (previous label, current label) pairs of this step.
        double bestOverall = double.MaxValue;

        var currentTags = node.labelMap.Keys;
        int curIdx = 0;
        foreach (var candidate in currentTags)
        {
            // Cheapest cost of reaching this candidate label from any previous label.
            double bestForCandidate = double.MaxValue;

            int prevIdx = 0;
            foreach (var from in previousTags)
            {
                double total = scores[prevRow][prevIdx]
                               + transMatrix.Trans_Prob[Convert.ToInt32(from)][Convert.ToInt32(candidate)]
                               - Math.Log((node.GetFreqOrDefault(candidate) + 1e-8) / transMatrix.GetFreq(candidate));
                prevIdx++;

                if (total < bestForCandidate)
                {
                    bestForCandidate = total;
                    if (total < bestOverall)
                    {
                        bestOverall = total;
                        previous = from; // previous-node label on the cheapest move seen so far
                    }
                }
            }

            scores[curRow][curIdx] = bestForCandidate;
            curIdx++;
        }

        // Emit the label decided for the previous node.
        tags.Add(previous);
        previousTags = currentTags;
    }

    // The trailing auxiliary node receives the same label as the first node.
    tags.Add(tags[0]);
    return tags;
}
/// <summary>
/// Placeholder for the "Viterbi" variant that uses only the transition matrix.
/// Not implemented: the arguments are ignored and the result is always <c>null</c>.
/// </summary>
/// <remarks>
/// The former body was a commented-out, unported Java-style draft. Its intent was a
/// greedy pass: keep the first label of the first item, then for every following
/// item pick the label minimising the transition entry for (previous label, label)
/// minus the logarithm of the label's smoothed relative frequency.
/// </remarks>
/// <typeparam name="E">Concrete label type of the <c>EnumItem</c> elements.</typeparam>
/// <param name="roleTagList">Observation sequence (currently unused).</param>
/// <param name="transformMatrixDictionary">Transition matrix (currently unused).</param>
/// <returns>Always <c>null</c>.</returns>
public static List<E> computeEnumSimply<E>(LinkedList<EnumItem<E>> roleTagList, TransformMatrixDictionary<E> transformMatrixDictionary)
{
    // Stub: no prediction is produced.
    return null;
}
/// <summary>
/// Placeholder for the standard Viterbi-style decoder over <c>EnumItem</c> sequences.
/// Not implemented: the arguments are ignored and the result is always <c>null</c>.
/// </summary>
/// <remarks>
/// The former body was a commented-out, unported Java-style draft. Its intent was a
/// two-row rolling cost table: the first label is fixed, the second item is scored
/// directly from it, and for each later item the label emitted for the previous item
/// is the predecessor of the cheapest transition; the final entry repeats the first label.
/// </remarks>
/// <typeparam name="E">Concrete label type of the <c>EnumItem</c> elements.</typeparam>
/// <param name="roleTagList">Observation sequence (currently unused).</param>
/// <param name="transformMatrixDictionary">Transition matrix (currently unused).</param>
/// <returns>Always <c>null</c>.</returns>
public static LinkedList<E> computeEnum<E>(LinkedList<EnumItem<E>> roleTagList, TransformMatrixDictionary<E> transformMatrixDictionary)
{
    // Stub: no prediction is produced.
    return null;
}
/// <summary>
/// Specialised HMM solver: walks the vertex path once and confirms a nature for
/// each vertex, keeping only the costs of the two most recent nodes.
/// </summary>
/// <remarks>
/// Scores are costs (lower is better): a <c>transititon_probability</c> entry minus
/// the logarithm of a smoothed relative frequency. The nature confirmed for a vertex
/// is the predecessor state of the cheapest transition into the following vertex.
/// The first and last vertices never have <c>confirmNature</c> called on them.
/// </remarks>
/// <param name="vertexList">Path of vertices; element 0 is treated as the start node.</param>
/// <param name="transformMatrixDictionary">Transition matrix belonging to the dictionary.</param>
public static void compute(List<Vertex> vertexList, TransformMatrixDictionary<Nature> transformMatrixDictionary)
{
    // Previously a hand-driven enumerator was advanced without checking MoveNext(),
    // so a list with fewer than two vertices dereferenced a null Current.
    if (vertexList.Count < 2)
    {
        return;
    }

    // Rolling buffer: the two rows alternate between "previous node" and "current node".
    double[][] cost = new double[2][];

    // The first nature of the start vertex seeds the previous state.
    Nature pre = vertexList[0].attribute.nature[0];

    // Second vertex: cost of moving from the start state into each of its candidate natures.
    Vertex preItem = vertexList[1];
    Nature[] preTagSet = preItem.attribute.nature;
    cost[0] = new double[preTagSet.Length];
    for (int j = 0; j < preTagSet.Length; j++)
    {
        Nature cur = preTagSet[j];
        cost[0][j] = transformMatrixDictionary.transititon_probability[(int)pre][(int)cur]
                     - Math.Log((preItem.attribute.frequency[j] + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));
    }

    // Third vertex onwards: combine the previous row with transition and emission costs.
    for (int i = 2; i < vertexList.Count; i++)
    {
        int index_i_1 = i % 2;       // row holding the previous vertex's costs
        int index_i = 1 - index_i_1; // row receiving the current vertex's costs

        Vertex item = vertexList[i];
        Nature[] curTagSet = item.attribute.nature;
        cost[index_i] = new double[curTagSet.Length];

        // Cheapest cost over all (previous state, current state) pairs of this step.
        double perfect_cost_line = double.MaxValue;

        for (int k = 0; k < curTagSet.Length; k++)
        {
            Nature cur = curTagSet[k];

            // The emission term depends only on the current state, so compute it once per k.
            double emission = Math.Log((item.attribute.frequency[k] + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));

            cost[index_i][k] = double.MaxValue;
            for (int j = 0; j < preTagSet.Length; j++)
            {
                Nature p = preTagSet[j];
                double now = cost[index_i_1][j]
                             + transformMatrixDictionary.transititon_probability[(int)p][(int)cur]
                             - emission;
                if (now < cost[index_i][k])
                {
                    cost[index_i][k] = now;
                    if (now < perfect_cost_line)
                    {
                        perfect_cost_line = now;
                        pre = p; // previous state on the cheapest move seen so far
                    }
                }
            }
        }

        // Commit the previous vertex to the state that led to the cheapest cost at this step.
        preItem.confirmNature(pre);
        preTagSet = curTagSet;
        preItem = item;
    }
}