static public void Intgerate(Label[] labels, int groupIndex, ref Sij sij) { Pj pj = null; Pdata pdata = null; Mcj mcj = null; IDictionary <Tuple <Labelset, Labelset>, double> labelsetPairFrequencyForPj = null;//-1(开头)不能用来索引数组,故只能使用哈希表 IDictionary <Tuple <Character, Character>, IDictionary <Tuple <Labelset, Labelset>, double> > labelsetPairFrequencyForMcj = null; IDictionary <Tuple <Sentence, Sentence>, IDictionary <Tuple <Labelset, Labelset>, double> > labelsetPairFrequencyForSij = null; IDictionary <Label, IDictionary <Tuple <Labelset, Labelset>, double> > independentLabelsetPairFrequencyForPj = null; IDictionary <Tuple <Character, Character>, IDictionary <Label, IDictionary <Tuple <Labelset, Labelset>, double> > > independentLabelsetPairFrequencyForMcj = null; IDictionary <Tuple <Sentence, Sentence>, IDictionary <Label, IDictionary <Tuple <Labelset, Labelset>, double> > > independentLabelsetPairFrequencyForSij = null; if (Variable.PriorP.Contains(PriorP.ConditionalPj))//计算在一篇(一次)标注中的转移频率 { ChoiceFunction.InitializationOfLabelsetPairFrequencyForPj(labels, groupIndex, ref labelsetPairFrequencyForPj, ref independentLabelsetPairFrequencyForPj); } if (Variable.PriorP.Contains(PriorP.ConditionalMcj)) { ChoiceFunction.InitializationOfLabelsetPairFrequencyForMcj(labels, groupIndex, ref labelsetPairFrequencyForMcj, ref independentLabelsetPairFrequencyForMcj); } if (Variable.PriorP.Contains(PriorP.ConditionalSij)) { ChoiceFunction.InitializationOfLabelsetPairFrequencyForSij(labels, groupIndex, ref labelsetPairFrequencyForSij, ref independentLabelsetPairFrequencyForSij); } IList <double> Pdatas = new List <double>(); for (int time = 1; time <= Variable.ConvergeTimeThreshold; ++time) { //计算Pj,mcj(Consistency:角色c有j标签的概率) ChoiceFunction.PriorPj(ref pj, ref mcj, sij, time); //计算π PAkjl pakjl = CalculatePAkjl(labels, sij, time, groupIndex); //计算Sij if (CalculatePdataAndSij(labels, ref sij, pj, pakjl, mcj, ref pdata, groupIndex, Pdatas, labelsetPairFrequencyForPj, labelsetPairFrequencyForMcj, labelsetPairFrequencyForSij)) { break; } if (Variable.PriorP.Contains(PriorP.ConditionalPj)) { ChoiceFunction.UpdateLabelsetPairFrequencyForPj(sij, ref labelsetPairFrequencyForPj, labels, ref independentLabelsetPairFrequencyForPj); } if (Variable.PriorP.Contains(PriorP.ConditionalMcj)) { ChoiceFunction.UpdateLabelsetPairFrequencyForMcj(sij, ref labelsetPairFrequencyForMcj, labels, ref independentLabelsetPairFrequencyForMcj); } } }
//计算Sij和Pdata static public bool CalculatePdataAndSij(Label[] labels, ref Sij sij, Pj pj, PAkjl pakjl, Mcj mcj, ref Pdata pdata, int groupIndex, IList <double> pdatas, IDictionary <Tuple <Labelset, Labelset>, double> labelsetPairFrequencyForPj, IDictionary <Tuple <Character, Character>, IDictionary <Tuple <Labelset, Labelset>, double> > labelsetPairFrequencyForMcj, IDictionary <Tuple <Sentence, Sentence>, IDictionary <Tuple <Labelset, Labelset>, double> > labelsetPairFrequencyForSij) { bool isFinished = false; IDictionary <Tuple <Labelset, Labelset>, double> conditionalPj = null;//转移概率 IDictionary <Tuple <Character, Character>, IDictionary <Tuple <Labelset, Labelset>, double> > conditionalMcj = null; if (Variable.PriorP.Contains(PriorP.ConditionalPj)) { conditionalPj = RelationFunction.CalculateConditionalPj(pj, labelsetPairFrequencyForPj); } if (Variable.PriorP.Contains(PriorP.ConditionalMcj)) { conditionalMcj = RelationFunction.CalculateConditionalMcj(mcj, labelsetPairFrequencyForMcj); } //sij的分子 IDictionary <Sentence, IDictionary <Labelset, double> > numerator = new Dictionary <Sentence, IDictionary <Labelset, double> >(); //sij的分母(P(data on i)) IDictionary <Sentence, double> denominator = new Dictionary <Sentence, double>(); //计算分子 foreach (Sentence sentence in Variable.Sentences) { numerator.Add(sentence, new Dictionary <Labelset, double>()); #region 寻找需要遍历的j //应该用pj中的j,即当前分组中出现过的所有标注情况,不能用sij现有的。因为要用pajl和pj重新计算sij,已与sij现有值无关,虽然对Boku来说结果一样 ICollection <Labelset> labelsets = null; if (Variable.PriorP.Contains(PriorP.Sij) || Variable.PriorP.Contains(PriorP.ConditionalSij)) { labelsets = sij.Value[sentence].Keys; } else if (Variable.PriorP.Contains(PriorP.Mcj) || Variable.PriorP.Contains(PriorP.ConditionalMcj)) { labelsets = mcj.Value[sentence.Character].Keys; } else if (Variable.PriorP.Contains(PriorP.Pj) || Variable.PriorP.Contains(PriorP.ConditionalPj)) { labelsets = pj.Value.Keys;//pj里只包含所有句出现过的所有标注情况,所以最多遍历pj即可,不需要for(int j = 0; j < Math.Pow(2, labels.Length); ++j) } else//全部的 { labelsets = new List <Labelset>(); for (int j = 0; j < Math.Pow(2, labels.Length); ++j) { labelsets.Add(new Labelset(labels, j)); } } #endregion //开始计算 foreach (Labelset labelsetj in labelsets)//此时结果会好一些(masa: group3, group5) { double valueOfNumerator = 1; #region(公式 (5)) foreach (Annotator annotator in sentence.AnnotaitonGroups[groupIndex].AnnotatorAnnotationDic.Keys) { Labelset labelsetl = sentence.AnnotaitonGroups[groupIndex].AnnotatorAnnotationDic[annotator].ToLabelset(labels); if (pakjl.Value[annotator].ContainsKey(labelsetj)) { if (pakjl.Value[annotator][labelsetj].ContainsKey(labelsetl)) { valueOfNumerator *= pakjl.Value[annotator][labelsetj][labelsetl]; } else { valueOfNumerator = 0; break; } } //else valueOfNumerator *= 1 / Math.Pow(2, labels.Length);//如果annotator没标过正确为j的句子,则认为此annotator对此j来说,所有可能标的标签l概率相等(对masa有用,没有的话(相当于乘以1)结果很差;boku进不来) else { valueOfNumerator = 0; break; } //相当于valueOfNumerator*0;boku进不来;此时结果会好一些(masa: group3) } if (valueOfNumerator == 0) { continue; } #endregion #region 公式(5)*(6) foreach (PriorP p in Variable.PriorP) { switch (p) { case PriorP.Pj: valueOfNumerator *= pj.Value[labelsetj]; break; case PriorP.Mcj: valueOfNumerator *= mcj.Value[sentence.Character][labelsetj]; //Consistency的关键 break; case PriorP.Sij: valueOfNumerator *= sij.Value[sentence][labelsetj]; break; case PriorP.ConditionalPj: { if (sentence.ID != 0) { bool finded = false; double optimalPreLabelsetValue = 0; double conditonalPj = 0; int n = 1; foreach (KeyValuePair <Labelset, double> labelsetPre in sij.SortLabelsets(Variable.Sentences[sentence.ID - 1])) //找出前一句最优标注和其概率 { Tuple <Labelset, Labelset> labelsetPair = Tuple.Create(labelsetPre.Key, labelsetj); if (!finded) { if (conditionalPj.ContainsKey(labelsetPair)) { finded = true; conditonalPj = conditionalPj[labelsetPair];//找出前一句最优标注到这一句的转移概率 optimalPreLabelsetValue = labelsetPre.Value; } continue; } if (finded) { if (labelsetPre.Value == optimalPreLabelsetValue && conditionalPj.ContainsKey(labelsetPair)) //最优标注可能不只一个(概率相同,同为最大),随意要继续遍历 { conditonalPj += conditionalPj[labelsetPair]; ++n; } else { break; } } } valueOfNumerator *= conditonalPj / n; //n:最优标注的个数 } else { Tuple <Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj); if (conditionalPj.ContainsKey(labelsetPair)) { valueOfNumerator *= conditionalPj[labelsetPair]; } else { valueOfNumerator = 0; } } } break; #region Mcj case PriorP.ConditionalMcj: { if (sentence.ID != 0) { bool finded = false; double optimalPreLabelsetValue = 0; double maxConditonalMcj = 0; int n = 1; Sentence sentencePre = Variable.Sentences[sentence.ID - 1]; Tuple <Character, Character> characterPair = Tuple.Create(sentencePre.Character, sentence.Character); foreach (KeyValuePair <Labelset, double> labelsetPre in sij.SortLabelsets(sentencePre)) { Tuple <Labelset, Labelset> labelsetPair = Tuple.Create(labelsetPre.Key, labelsetj); if (!finded) { if (conditionalMcj[characterPair].ContainsKey(labelsetPair)) { finded = true; maxConditonalMcj = conditionalMcj[characterPair][labelsetPair]; optimalPreLabelsetValue = labelsetPre.Value; } continue; } if (finded) { if (labelsetPre.Value == optimalPreLabelsetValue && conditionalMcj[characterPair].ContainsKey(labelsetPair)) { maxConditonalMcj += conditionalMcj[characterPair][labelsetPair]; ++n; } else { break; } } } valueOfNumerator *= maxConditonalMcj / n; } else { Tuple <Character, Character> characterPair = Tuple.Create(new Character("##"), sentence.Character); Tuple <Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj); if (conditionalMcj[characterPair].ContainsKey(labelsetPair)) { valueOfNumerator *= conditionalMcj[characterPair][labelsetPair]; } else { valueOfNumerator = 0; } } } break; #endregion case PriorP.ConditionalSij: { if (sentence.ID != 0) { bool finded = false; double optimalPreLabelsetValue = 0; double conditonalPj = 0; int n = 1; Sentence sentencePre = Variable.Sentences[sentence.ID - 1]; Tuple <Sentence, Sentence> sentencePair = Tuple.Create(sentencePre, sentence); foreach (KeyValuePair <Labelset, double> labelsetPre in sij.SortLabelsets(sentencePre)) { Tuple <Labelset, Labelset> labelsetPair = Tuple.Create(labelsetPre.Key, labelsetj); if (!finded) { if (labelsetPairFrequencyForSij[sentencePair].ContainsKey(labelsetPair)) { finded = true; conditonalPj = labelsetPairFrequencyForSij[sentencePair][labelsetPair] / labelsetPre.Value; optimalPreLabelsetValue = labelsetPre.Value; } continue; } if (finded) { if (labelsetPre.Value == optimalPreLabelsetValue && labelsetPairFrequencyForSij[sentencePair].ContainsKey(labelsetPair)) { conditonalPj += labelsetPairFrequencyForSij[sentencePair][labelsetPair] / labelsetPre.Value; ++n; } else { break; } } } valueOfNumerator *= conditonalPj / n; } else { Tuple <Sentence, Sentence> sentencePair = Tuple.Create(new Sentence(-1, "##"), sentence); Tuple <Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj); if (labelsetPairFrequencyForSij[sentencePair].ContainsKey(labelsetPair)) { valueOfNumerator *= labelsetPairFrequencyForSij[sentencePair][labelsetPair]; } else { valueOfNumerator = 0; } } } break; } } #endregion if (valueOfNumerator != 0) { numerator[sentence].Add(labelsetj, valueOfNumerator); } } #region 计算分母 (公式(7)) double valueOfDenominator = 0; foreach (Labelset Labelsetq in numerator[sentence].Keys)//因为是加,故只需遍历numerator里有的标注,不需遍历所有标注 { valueOfDenominator += numerator[sentence][Labelsetq]; } denominator.Add(sentence, valueOfDenominator); #endregion } //计算Pdata和Sij pdata = pdata != null ? new Pdata(++pdata.Time, pdata.Value) : new Pdata(1, 0); sij = new Sij(++sij.Time); foreach (Sentence sentence in Variable.Sentences) { sij.Value.Add(sentence, new Dictionary <Labelset, double>()); foreach (Labelset labelset in numerator[sentence].Keys) { if (Variable.SijDividPDataOnI) //常规方法 { sij.Value[sentence][labelset] = numerator[sentence][labelset] / denominator[sentence]; //Dic赋值时没有的元素会自动加 } else { sij.Value[sentence][labelset] = numerator[sentence][labelset]; } } pdata.Value += -Math.Log(denominator[sentence]); } if (pdatas.Contains(pdata.Value) || (Math.Abs(pdata.MondifiedValue) <= Variable.ConvergeValueThreshold)) { isFinished = true; } else { pdatas.Add(pdata.Value); } if (Variable.OutputPdata) { Variable.OutputFile.WriteLine(pdata.ToString()); } return(isFinished); }