/// <summary>
/// Initializes the labelset-pair frequency table used for Mcj. Covers both the
/// non-independent and the independent relation schemes; which table is replaced
/// depends on <c>Variable.Relation</c>.
/// </summary>
/// <param name="labels">Labels forwarded unchanged to the <c>RelationFunction</c> initializer.</param>
/// <param name="groupIndex">Group index forwarded unchanged to the <c>RelationFunction</c> initializer.</param>
/// <param name="LabelsetPairFrequencyForCharacter">Overwritten for the non-independent schemes; left untouched otherwise.</param>
/// <param name="independentLabelsetPairFrequencyForCharacter">Overwritten for the independent schemes; left untouched otherwise.</param>
public static void InitializationOfLabelsetPairFrequencyForMcj(Label[] labels, int groupIndex, ref IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> LabelsetPairFrequencyForCharacter, ref IDictionary<Tuple<Character, Character>, IDictionary<Label, IDictionary<Tuple<Labelset, Labelset>, double>>> independentLabelsetPairFrequencyForCharacter)
{
    switch (Variable.Relation)
    {
        // Independent schemes: only the per-label (independent) table is rebuilt.
        case RelationScheme.IndependentRenewOne:
        case RelationScheme.IndependentRenewLower:
            independentLabelsetPairFrequencyForCharacter = RelationFunction.InitializeIndependentLabelsetPairFrequencyForCharacter(labels, groupIndex);
            return;

        // Non-independent schemes: only the shared table is rebuilt.
        case RelationScheme.AlwaysInitialization:
        case RelationScheme.AllLower:
        case RelationScheme.UpdateOne:
        case RelationScheme.RenewOne:
        case RelationScheme.UpdateLower:
        case RelationScheme.RenewLower:
            LabelsetPairFrequencyForCharacter = RelationFunction.InitializeLabelsetPairFrequencyForMcj(labels, groupIndex);
            return;

        // Any other scheme leaves both tables exactly as the caller passed them in.
        default:
            return;
    }
}
/// <summary>
/// Refreshes the labelset-pair frequency table used for Mcj from the current
/// <paramref name="sij"/>. Covers both the non-independent and the independent
/// relation schemes; the scheme in <c>Variable.Relation</c> selects which
/// <c>RelationFunction</c> routine runs and which table is touched.
/// </summary>
/// <param name="sij">Current Sij, forwarded to the selected <c>RelationFunction</c> routine.</param>
/// <param name="LabelsetPairFrequencyForCharacter">Replaced (Renew*/AllLower) or passed by reference for in-place update (Update*).</param>
/// <param name="labels">Labels; only forwarded by the independent schemes.</param>
/// <param name="independentLabelsetPairFrequencyForCharacter">Replaced by the independent schemes; untouched otherwise.</param>
public static void UpdateLabelsetPairFrequencyForMcj(Sij sij, ref IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> LabelsetPairFrequencyForCharacter, Label[] labels, ref IDictionary<Tuple<Character, Character>, IDictionary<Label, IDictionary<Tuple<Labelset, Labelset>, double>>> independentLabelsetPairFrequencyForCharacter)
{
    switch (Variable.Relation)
    {
        case RelationScheme.IndependentRenewOne:
            independentLabelsetPairFrequencyForCharacter = RelationFunction.RenewIndependentLabelPairFreuquencyForCharacter(true, labels, sij);
            return;

        case RelationScheme.IndependentRenewLower:
            independentLabelsetPairFrequencyForCharacter = RelationFunction.RenewIndependentLabelPairFreuquencyForCharacter(false, labels, sij);
            return;

        case RelationScheme.AllLower:
            LabelsetPairFrequencyForCharacter = RelationFunction.AllLabelsetPairFrequencyForCharacter(sij);
            return;

        case RelationScheme.UpdateOne:
            // Original author's note: add the labelPair frequencies from the new result
            // to the existing frequencies, without recomputing from scratch.
            RelationFunction.UpdateLabelsetPairFrequencyForCharacter(true, sij, ref LabelsetPairFrequencyForCharacter);
            return;

        case RelationScheme.RenewOne:
            // Original author's note: recompute the labelPair frequencies from the new result.
            LabelsetPairFrequencyForCharacter = RelationFunction.RenewLabelsetPairFrequencyForCharacter(true, sij);
            return;

        case RelationScheme.UpdateLower:
            // Original author's note: take the minimum of the two adjacent values as the
            // joint probability, without recomputing from scratch.
            RelationFunction.UpdateLabelsetPairFrequencyForCharacter(false, sij, ref LabelsetPairFrequencyForCharacter);
            return;

        case RelationScheme.RenewLower:
            // Original author's note: take the minimum of the two adjacent values as the
            // joint probability.
            LabelsetPairFrequencyForCharacter = RelationFunction.RenewLabelsetPairFrequencyForCharacter(false, sij);
            return;

        // Schemes not listed above (e.g. AlwaysInitialization) perform no update here.
        default:
            return;
    }
}
/// <summary>
/// Builds the conditional Mcj table for the relation scheme selected in
/// <c>Variable.Relation</c>, delegating to the matching <c>RelationFunction</c> routine.
/// </summary>
/// <param name="mcj">Mcj forwarded to the selected routine.</param>
/// <param name="LabelsetPairFrequencyForCharacter">Frequency table used by the non-independent schemes.</param>
/// <param name="independentLabelsetPairFrequencyForCharacter">Frequency table used by the independent schemes.</param>
/// <returns>
/// The table produced by the selected routine, or <c>null</c> when the current scheme
/// is not one of those listed in the switch.
/// </returns>
public static IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> ConditionalMj(Mcj mcj, IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> LabelsetPairFrequencyForCharacter, IDictionary<Tuple<Character, Character>, IDictionary<Label, IDictionary<Tuple<Labelset, Labelset>, double>>> independentLabelsetPairFrequencyForCharacter)
{
    switch (Variable.Relation)
    {
        case RelationScheme.IndependentRenewOne:
        case RelationScheme.IndependentRenewLower:
            return RelationFunction.CalculateIndependentConditionalMcj(mcj, independentLabelsetPairFrequencyForCharacter);

        case RelationScheme.AllLower:
        case RelationScheme.UpdateOne:
        case RelationScheme.RenewOne:
        case RelationScheme.UpdateLower:
        case RelationScheme.RenewLower:
            return RelationFunction.CalculateConditionalMcj(mcj, LabelsetPairFrequencyForCharacter);

        // NOTE(review): RelationScheme.AlwaysInitialization is not handled here and
        // therefore yields null - confirm with callers whether that is intended.
        default:
            return null;
    }
}
/// <summary>
/// Builds the conditional Pj table for the relation scheme selected in
/// <c>Variable.Relation</c>, delegating to the matching <c>RelationFunction</c> routine.
/// </summary>
/// <param name="pj">Pj forwarded to the selected routine.</param>
/// <param name="LabelsetPairFrequencyForSentence">Frequency table used by the non-independent schemes.</param>
/// <param name="independentLabelsetPairFrequencyForSentence">Frequency table used by the independent schemes.</param>
/// <returns>
/// The table produced by the selected routine, or <c>null</c> when the current scheme
/// is not one of those listed in the switch.
/// </returns>
public static IDictionary<Tuple<Labelset, Labelset>, double> ConditionalPj(Pj pj, IDictionary<Tuple<Labelset, Labelset>, double> LabelsetPairFrequencyForSentence, IDictionary<Label, IDictionary<Tuple<Labelset, Labelset>, double>> independentLabelsetPairFrequencyForSentence)
{
    switch (Variable.Relation)
    {
        case RelationScheme.IndependentRenewOne:
        case RelationScheme.IndependentRenewLower:
            return RelationFunction.CalculateIndependentConditionalPj(pj, independentLabelsetPairFrequencyForSentence);

        case RelationScheme.AllLower:
        case RelationScheme.UpdateOne:
        case RelationScheme.RenewOne:
        case RelationScheme.UpdateLower:
        case RelationScheme.RenewLower:
            return RelationFunction.CalculateConditionalPj(pj, LabelsetPairFrequencyForSentence);

        // NOTE(review): RelationScheme.AlwaysInitialization is not handled here and
        // therefore yields null - confirm with callers whether that is intended.
        default:
            return null;
    }
}
/// <summary>
/// Computes Sij and Pdata for one iteration.
/// For every sentence and every candidate labelset j it builds a numerator
/// (product of the annotators' pakjl entries, multiplied by each prior enabled in
/// <c>Variable.PriorP</c>), sums the numerators of a sentence into its denominator,
/// then replaces <paramref name="sij"/> and <paramref name="pdata"/> with new objects
/// built from those values.
/// </summary>
/// <param name="labels">Labels used to convert annotations to labelsets and to enumerate all labelsets.</param>
/// <param name="sij">Current Sij; read while computing, then replaced by a new <c>Sij</c> whose Time is incremented.</param>
/// <param name="pj">Pj prior; also the source of the conditional Pj table.</param>
/// <param name="pakjl">Per-annotator table indexed as [annotator][labelset j][labelset l].</param>
/// <param name="mcj">Mcj prior; also the source of the conditional Mcj table.</param>
/// <param name="pdata">Previous Pdata (may be null); replaced by a new <c>Pdata</c>.</param>
/// <param name="groupIndex">Index into each sentence's annotation groups.</param>
/// <param name="pdatas">Pdata values seen so far; the new value is appended unless the iteration is finished.</param>
/// <param name="labelsetPairFrequencyForPj">Input to <c>RelationFunction.CalculateConditionalPj</c>.</param>
/// <param name="labelsetPairFrequencyForMcj">Input to <c>RelationFunction.CalculateConditionalMcj</c>.</param>
/// <param name="labelsetPairFrequencyForSij">Pair frequencies used directly by the ConditionalSij prior.</param>
/// <returns>
/// <c>true</c> when the new Pdata value was already present in <paramref name="pdatas"/> or
/// <c>|pdata.MondifiedValue|</c> is at most <c>Variable.ConvergeValueThreshold</c>; otherwise <c>false</c>.
/// </returns>
public static bool CalculatePdataAndSij(Label[] labels, ref Sij sij, Pj pj, PAkjl pakjl, Mcj mcj, ref Pdata pdata, int groupIndex, IList<double> pdatas, IDictionary<Tuple<Labelset, Labelset>, double> labelsetPairFrequencyForPj, IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> labelsetPairFrequencyForMcj, IDictionary<Tuple<Sentence, Sentence>, IDictionary<Tuple<Labelset, Labelset>, double>> labelsetPairFrequencyForSij)
{
    // Transition (conditional) tables are only built when the matching prior is enabled.
    IDictionary<Tuple<Labelset, Labelset>, double> conditionalPj = null;
    IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> conditionalMcj = null;
    if (Variable.PriorP.Contains(PriorP.ConditionalPj))
    {
        conditionalPj = RelationFunction.CalculateConditionalPj(pj, labelsetPairFrequencyForPj);
    }
    if (Variable.PriorP.Contains(PriorP.ConditionalMcj))
    {
        conditionalMcj = RelationFunction.CalculateConditionalMcj(mcj, labelsetPairFrequencyForMcj);
    }

    // Numerator of sij, per sentence and labelset.
    IDictionary<Sentence, IDictionary<Labelset, double>> numerator = new Dictionary<Sentence, IDictionary<Labelset, double>>();
    // Denominator of sij (P(data on i)), per sentence.
    IDictionary<Sentence, double> denominator = new Dictionary<Sentence, double>();

    // ---- Numerators ----
    foreach (Sentence sentence in Variable.Sentences)
    {
        IDictionary<Labelset, double> sentenceNumerator = new Dictionary<Labelset, double>();
        numerator.Add(sentence, sentenceNumerator);

        // Original author's note: results are somewhat better with this candidate set (masa: group3, group5).
        foreach (Labelset labelsetj in SelectCandidateLabelsets(labels, sij, pj, mcj, sentence))
        {
            double valueOfNumerator = 1;

            // Formula (5): product over annotators of pakjl[annotator][j][l], where l is
            // the labelset the annotator actually gave this sentence.
            var annotations = sentence.AnnotaitonGroups[groupIndex].AnnotatorAnnotationDic;
            foreach (Annotator annotator in annotations.Keys)
            {
                Labelset labelsetl = annotations[annotator].ToLabelset(labels);
                var annotatorTable = pakjl.Value[annotator];

                // A missing j for this annotator zeroes the numerator. Original author's note:
                // the alternative of multiplying by 1 / 2^labels.Length (treating every l as
                // equally likely) was tried; zeroing gave somewhat better results (masa: group3).
                if (!annotatorTable.ContainsKey(labelsetj))
                {
                    valueOfNumerator = 0;
                    break;
                }

                var givenJ = annotatorTable[labelsetj];
                if (!givenJ.ContainsKey(labelsetl))
                {
                    valueOfNumerator = 0;
                    break;
                }

                valueOfNumerator *= givenJ[labelsetl];
            }
            if (valueOfNumerator == 0)
            {
                continue;
            }

            // Formula (5) * (6): multiply in every enabled prior. All priors are applied
            // even after the value has dropped to zero, exactly as before.
            foreach (PriorP p in Variable.PriorP)
            {
                switch (p)
                {
                    case PriorP.Pj:
                        valueOfNumerator *= pj.Value[labelsetj];
                        break;

                    case PriorP.Mcj:
                        // Original author's note: this is the key to consistency.
                        valueOfNumerator *= mcj.Value[sentence.Character][labelsetj];
                        break;

                    case PriorP.Sij:
                        valueOfNumerator *= sij.Value[sentence][labelsetj];
                        break;

                    case PriorP.ConditionalPj:
                        {
                            if (sentence.ID != 0)
                            {
                                // Walk the previous sentence's labelsets in the order given by
                                // SortLabelsets. The first one that has a transition to j is the
                                // "best"; directly following ones with the same value (ties) that
                                // also have a transition are averaged in.
                                bool found = false;
                                double bestPreviousValue = 0;
                                double transitionSum = 0;
                                int tiedCount = 1; // number of best previous labelsets
                                foreach (KeyValuePair<Labelset, double> previous in sij.SortLabelsets(Variable.Sentences[sentence.ID - 1]))
                                {
                                    Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(previous.Key, labelsetj);
                                    double transition;
                                    if (!found)
                                    {
                                        if (conditionalPj.TryGetValue(labelsetPair, out transition))
                                        {
                                            found = true;
                                            transitionSum = transition;
                                            bestPreviousValue = previous.Value;
                                        }
                                        continue;
                                    }

                                    if (previous.Value == bestPreviousValue && conditionalPj.TryGetValue(labelsetPair, out transition))
                                    {
                                        transitionSum += transition;
                                        ++tiedCount;
                                    }
                                    else
                                    {
                                        break;
                                    }
                                }
                                valueOfNumerator *= transitionSum / tiedCount;
                            }
                            else
                            {
                                // First sentence: transition from the start marker labelset.
                                Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj);
                                double transition;
                                if (conditionalPj.TryGetValue(labelsetPair, out transition))
                                {
                                    valueOfNumerator *= transition;
                                }
                                else
                                {
                                    valueOfNumerator = 0;
                                }
                            }
                        }
                        break;

                    case PriorP.ConditionalMcj:
                        {
                            if (sentence.ID != 0)
                            {
                                bool found = false;
                                double bestPreviousValue = 0;
                                double transitionSum = 0;
                                int tiedCount = 1;
                                Sentence sentencePre = Variable.Sentences[sentence.ID - 1];
                                Tuple<Character, Character> characterPair = Tuple.Create(sentencePre.Character, sentence.Character);
                                foreach (KeyValuePair<Labelset, double> previous in sij.SortLabelsets(sentencePre))
                                {
                                    Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(previous.Key, labelsetj);
                                    double transition;
                                    if (!found)
                                    {
                                        // The per-character-pair table is looked up inside the loop on
                                        // purpose, so it is only touched when there is a previous labelset.
                                        if (conditionalMcj[characterPair].TryGetValue(labelsetPair, out transition))
                                        {
                                            found = true;
                                            transitionSum = transition;
                                            bestPreviousValue = previous.Value;
                                        }
                                        continue;
                                    }

                                    if (previous.Value == bestPreviousValue && conditionalMcj[characterPair].TryGetValue(labelsetPair, out transition))
                                    {
                                        transitionSum += transition;
                                        ++tiedCount;
                                    }
                                    else
                                    {
                                        break;
                                    }
                                }
                                valueOfNumerator *= transitionSum / tiedCount;
                            }
                            else
                            {
                                Tuple<Character, Character> characterPair = Tuple.Create(new Character("##"), sentence.Character);
                                Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj);
                                double transition;
                                if (conditionalMcj[characterPair].TryGetValue(labelsetPair, out transition))
                                {
                                    valueOfNumerator *= transition;
                                }
                                else
                                {
                                    valueOfNumerator = 0;
                                }
                            }
                        }
                        break;

                    case PriorP.ConditionalSij:
                        {
                            if (sentence.ID != 0)
                            {
                                bool found = false;
                                double bestPreviousValue = 0;
                                double transitionSum = 0;
                                int tiedCount = 1;
                                Sentence sentencePre = Variable.Sentences[sentence.ID - 1];
                                Tuple<Sentence, Sentence> sentencePair = Tuple.Create(sentencePre, sentence);
                                foreach (KeyValuePair<Labelset, double> previous in sij.SortLabelsets(sentencePre))
                                {
                                    Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(previous.Key, labelsetj);
                                    double frequency;
                                    if (!found)
                                    {
                                        if (labelsetPairFrequencyForSij[sentencePair].TryGetValue(labelsetPair, out frequency))
                                        {
                                            found = true;
                                            // Pair frequency divided by the previous labelset's value.
                                            transitionSum = frequency / previous.Value;
                                            bestPreviousValue = previous.Value;
                                        }
                                        continue;
                                    }

                                    if (previous.Value == bestPreviousValue && labelsetPairFrequencyForSij[sentencePair].TryGetValue(labelsetPair, out frequency))
                                    {
                                        transitionSum += frequency / previous.Value;
                                        ++tiedCount;
                                    }
                                    else
                                    {
                                        break;
                                    }
                                }
                                valueOfNumerator *= transitionSum / tiedCount;
                            }
                            else
                            {
                                Tuple<Sentence, Sentence> sentencePair = Tuple.Create(new Sentence(-1, "##"), sentence);
                                Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj);
                                double frequency;
                                if (labelsetPairFrequencyForSij[sentencePair].TryGetValue(labelsetPair, out frequency))
                                {
                                    valueOfNumerator *= frequency;
                                }
                                else
                                {
                                    valueOfNumerator = 0;
                                }
                            }
                        }
                        break;
                }
            }

            if (valueOfNumerator != 0)
            {
                sentenceNumerator.Add(labelsetj, valueOfNumerator);
            }
        }

        // Formula (7): the denominator is a plain sum, so only the labelsets that made it
        // into the numerator need to be visited.
        double valueOfDenominator = 0;
        foreach (double value in sentenceNumerator.Values)
        {
            valueOfDenominator += value;
        }
        denominator.Add(sentence, valueOfDenominator);
    }

    // ---- Pdata and Sij ----
    pdata = pdata != null ? new Pdata(++pdata.Time, pdata.Value) : new Pdata(1, 0);
    sij = new Sij(++sij.Time);
    foreach (Sentence sentence in Variable.Sentences)
    {
        IDictionary<Labelset, double> sentenceNumerator = numerator[sentence];
        double sentenceDenominator = denominator[sentence];

        sij.Value.Add(sentence, new Dictionary<Labelset, double>());
        foreach (KeyValuePair<Labelset, double> entry in sentenceNumerator)
        {
            // Indexer assignment adds the key when it is missing.
            // Dividing by the denominator is the conventional method (original author's note).
            sij.Value[sentence][entry.Key] = Variable.SijDividPDataOnI
                ? entry.Value / sentenceDenominator
                : entry.Value;
        }
        pdata.Value -= Math.Log(sentenceDenominator);
    }

    bool isFinished = pdatas.Contains(pdata.Value) || (Math.Abs(pdata.MondifiedValue) <= Variable.ConvergeValueThreshold);
    if (!isFinished)
    {
        pdatas.Add(pdata.Value);
    }

    if (Variable.OutputPdata)
    {
        Variable.OutputFile.WriteLine(pdata.ToString());
    }
    return isFinished;
}

/// <summary>
/// Chooses which labelsets j are scored for one sentence, based on the priors enabled
/// in <c>Variable.PriorP</c>: the sentence's keys in sij, else the keys of mcj for the
/// sentence's character, else the keys of pj, else every labelset index from 0 up to
/// 2^labels.Length.
/// </summary>
/// <remarks>
/// Original author's note: the j in pj (all labelings seen in the current group) should
/// be preferred over what sij currently holds, because sij is recomputed from pakjl and
/// pj; pj only contains labelings that occurred in some sentence, so iterating pj is
/// sufficient and enumerating all 2^labels.Length labelsets is unnecessary in that case.
/// </remarks>
private static ICollection<Labelset> SelectCandidateLabelsets(Label[] labels, Sij sij, Pj pj, Mcj mcj, Sentence sentence)
{
    if (Variable.PriorP.Contains(PriorP.Sij) || Variable.PriorP.Contains(PriorP.ConditionalSij))
    {
        return sij.Value[sentence].Keys;
    }
    if (Variable.PriorP.Contains(PriorP.Mcj) || Variable.PriorP.Contains(PriorP.ConditionalMcj))
    {
        return mcj.Value[sentence.Character].Keys;
    }
    if (Variable.PriorP.Contains(PriorP.Pj) || Variable.PriorP.Contains(PriorP.ConditionalPj))
    {
        return pj.Value.Keys;
    }

    // No prior restricts the candidates: enumerate all of them.
    ICollection<Labelset> allLabelsets = new List<Labelset>();
    double labelsetCount = Math.Pow(2, labels.Length); // loop bound computed once
    for (int j = 0; j < labelsetCount; ++j)
    {
        allLabelsets.Add(new Labelset(labels, j));
    }
    return allLabelsets;
}