Exemplo n.º 1
0
        /// <summary>
        /// Recomputes <paramref name="sij"/> and <paramref name="pdata"/> for one iteration.
        /// For every sentence and every candidate labelset j it builds a numerator
        /// (product of the annotators' <paramref name="pakjl"/> entries, multiplied by each prior enabled in
        /// <c>Variable.PriorP</c>), sums the numerators into a per-sentence denominator, and then
        /// stores the (optionally normalised) numerators as the new Sij while accumulating
        /// <c>-log(denominator)</c> into Pdata.
        /// </summary>
        /// <param name="labels">Label universe used to build labelsets.</param>
        /// <param name="sij">Current Sij; read while computing priors, then replaced by a new instance whose Time is incremented.</param>
        /// <param name="pj">Pj prior; also supplies candidate labelsets when only a Pj-based prior is enabled.</param>
        /// <param name="pakjl">Per-annotator table indexed as [annotator][labelset j][labelset l].</param>
        /// <param name="mcj">Per-character prior indexed as [character][labelset].</param>
        /// <param name="pdata">Previous Pdata (may be null); replaced by a new instance for this iteration.</param>
        /// <param name="groupIndex">Index into each sentence's annotation groups.</param>
        /// <param name="pdatas">History of Pdata values; the new value is appended when the iteration is not finished.</param>
        /// <param name="labelsetPairFrequencyForPj">Labelset-pair frequencies passed to <c>RelationFunction.CalculateConditionalPj</c>.</param>
        /// <param name="labelsetPairFrequencyForMcj">Labelset-pair frequencies per character pair passed to <c>RelationFunction.CalculateConditionalMcj</c>.</param>
        /// <param name="labelsetPairFrequencyForSij">Labelset-pair frequencies per sentence pair, used directly by the ConditionalSij prior.</param>
        /// <returns>
        /// <c>true</c> when the new Pdata value already occurs in <paramref name="pdatas"/> or when
        /// <c>|pdata.MondifiedValue|</c> is at most <c>Variable.ConvergeValueThreshold</c>; otherwise <c>false</c>.
        /// </returns>
        public static bool CalculatePdataAndSij(Label[] labels, ref Sij sij, Pj pj, PAkjl pakjl, Mcj mcj, ref Pdata pdata, int groupIndex, IList<double> pdatas,
                                                IDictionary<Tuple<Labelset, Labelset>, double> labelsetPairFrequencyForPj, IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> labelsetPairFrequencyForMcj,
                                                IDictionary<Tuple<Sentence, Sentence>, IDictionary<Tuple<Labelset, Labelset>, double>> labelsetPairFrequencyForSij)
        {
            // Transition tables; only built when the corresponding prior is enabled.
            IDictionary<Tuple<Labelset, Labelset>, double> conditionalPj = null;
            IDictionary<Tuple<Character, Character>, IDictionary<Tuple<Labelset, Labelset>, double>> conditionalMcj = null;

            if (Variable.PriorP.Contains(PriorP.ConditionalPj))
            {
                conditionalPj = RelationFunction.CalculateConditionalPj(pj, labelsetPairFrequencyForPj);
            }
            if (Variable.PriorP.Contains(PriorP.ConditionalMcj))
            {
                conditionalMcj = RelationFunction.CalculateConditionalMcj(mcj, labelsetPairFrequencyForMcj);
            }

            // Numerator of sij: one value per (sentence, labelset).
            IDictionary<Sentence, IDictionary<Labelset, double>> numerator = new Dictionary<Sentence, IDictionary<Labelset, double>>();
            // Denominator of sij, i.e. P(data on i): one value per sentence.
            IDictionary<Sentence, double> denominator = new Dictionary<Sentence, double>();

            // ---- Pass 1: numerators and denominators ----
            foreach (Sentence sentence in Variable.Sentences)
            {
                IDictionary<Labelset, double> sentenceNumerator = new Dictionary<Labelset, double>();
                numerator.Add(sentence, sentenceNumerator);

                // Choose which labelsets j to iterate for this sentence.
                // NOTE(review): the original author's note here says the candidates should come from pj
                // (every labelset seen in the current group) rather than from sij's current keys, because
                // sij is recomputed from scratch; yet the code prefers sij's keys whenever a Sij-based
                // prior is enabled. Confirm which is intended.
                ICollection<Labelset> labelsets = null;
                if (Variable.PriorP.Contains(PriorP.Sij) || Variable.PriorP.Contains(PriorP.ConditionalSij))
                {
                    labelsets = sij.Value[sentence].Keys;
                }
                else if (Variable.PriorP.Contains(PriorP.Mcj) || Variable.PriorP.Contains(PriorP.ConditionalMcj))
                {
                    labelsets = mcj.Value[sentence.Character].Keys;
                }
                else if (Variable.PriorP.Contains(PriorP.Pj) || Variable.PriorP.Contains(PriorP.ConditionalPj))
                {
                    // Per the original note, pj only holds labelsets that actually occurred,
                    // so iterating its keys suffices instead of enumerating all 2^|labels| combinations.
                    labelsets = pj.Value.Keys;
                }
                else
                {
                    // No prior restricts the candidates: enumerate every labelset index.
                    // The bound is loop-invariant, so it is evaluated once rather than on every condition check.
                    double labelsetCount = Math.Pow(2, labels.Length);
                    labelsets = new List<Labelset>();
                    for (int j = 0; j < labelsetCount; ++j)
                    {
                        labelsets.Add(new Labelset(labels, j));
                    }
                }

                foreach (Labelset labelsetj in labelsets)
                {
                    double valueOfNumerator = 1;

                    // Formula (5): product over annotators of pakjl[annotator][j][l], where l is the
                    // labelset the annotator actually gave. A missing entry (for j or for l) makes the
                    // whole product zero. The original author notes this gave better results on the
                    // "masa" data than treating a missing j as a uniform 1 / 2^|labels| factor.
                    foreach (Annotator annotator in sentence.AnnotaitonGroups[groupIndex].AnnotatorAnnotationDic.Keys)
                    {
                        Labelset labelsetl = sentence.AnnotaitonGroups[groupIndex].AnnotatorAnnotationDic[annotator].ToLabelset(labels);
                        if (pakjl.Value[annotator].ContainsKey(labelsetj) && pakjl.Value[annotator][labelsetj].ContainsKey(labelsetl))
                        {
                            valueOfNumerator *= pakjl.Value[annotator][labelsetj][labelsetl];
                        }
                        else
                        {
                            valueOfNumerator = 0;
                            break;
                        }
                    }
                    if (valueOfNumerator == 0)
                    {
                        continue;
                    }

                    // Formula (5) * (6): multiply by every enabled prior.
                    foreach (PriorP p in Variable.PriorP)
                    {
                        switch (p)
                        {
                        case PriorP.Pj:
                            valueOfNumerator *= pj.Value[labelsetj];
                            break;

                        case PriorP.Mcj:
                            valueOfNumerator *= mcj.Value[sentence.Character][labelsetj];    // original note: key to "Consistency"
                            break;

                        case PriorP.Sij:
                            valueOfNumerator *= sij.Value[sentence][labelsetj];
                            break;

                        case PriorP.ConditionalPj:
                        {
                            if (sentence.ID != 0)
                            {
                                // Walk the previous sentence's labelsets (presumably best-first, as returned by
                                // SortLabelsets) until one has a transition to labelsetj; then average the
                                // transitions of the directly following labelsets that tie with it.
                                bool found = false;
                                double optimalPreLabelsetValue = 0;
                                double transitionSum = 0;
                                int n = 1;    // number of tied previous labelsets contributing to the sum
                                foreach (KeyValuePair<Labelset, double> labelsetPre in sij.SortLabelsets(Variable.Sentences[sentence.ID - 1]))
                                {
                                    Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(labelsetPre.Key, labelsetj);
                                    double transition;
                                    if (!found)
                                    {
                                        if (conditionalPj.TryGetValue(labelsetPair, out transition))
                                        {
                                            found = true;
                                            transitionSum = transition;
                                            optimalPreLabelsetValue = labelsetPre.Value;
                                        }
                                        continue;
                                    }
                                    // Exact equality is intentional: only labelsets with the very same score count as ties.
                                    if (labelsetPre.Value == optimalPreLabelsetValue && conditionalPj.TryGetValue(labelsetPair, out transition))
                                    {
                                        transitionSum += transition;
                                        ++n;
                                    }
                                    else
                                    {
                                        break;
                                    }
                                }
                                valueOfNumerator *= transitionSum / n;
                            }
                            else
                            {
                                // First sentence: transition from the Labelset(true) start marker.
                                Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj);
                                double initialTransition;
                                if (conditionalPj.TryGetValue(labelsetPair, out initialTransition))
                                {
                                    valueOfNumerator *= initialTransition;
                                }
                                else
                                {
                                    valueOfNumerator = 0;
                                }
                            }
                        }
                        break;

                        case PriorP.ConditionalMcj:
                        {
                            if (sentence.ID != 0)
                            {
                                // Same tie-averaging scheme as ConditionalPj, using the table for this character pair.
                                bool found = false;
                                double optimalPreLabelsetValue = 0;
                                double transitionSum = 0;
                                int n = 1;
                                Sentence sentencePre = Variable.Sentences[sentence.ID - 1];
                                Tuple<Character, Character> characterPair = Tuple.Create(sentencePre.Character, sentence.Character);
                                foreach (KeyValuePair<Labelset, double> labelsetPre in sij.SortLabelsets(sentencePre))
                                {
                                    // Looked up inside the loop so that, as before, a missing character pair
                                    // only throws when there is at least one previous labelset to examine.
                                    IDictionary<Tuple<Labelset, Labelset>, double> transitions = conditionalMcj[characterPair];
                                    Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(labelsetPre.Key, labelsetj);
                                    double transition;
                                    if (!found)
                                    {
                                        if (transitions.TryGetValue(labelsetPair, out transition))
                                        {
                                            found = true;
                                            transitionSum = transition;
                                            optimalPreLabelsetValue = labelsetPre.Value;
                                        }
                                        continue;
                                    }
                                    if (labelsetPre.Value == optimalPreLabelsetValue && transitions.TryGetValue(labelsetPair, out transition))
                                    {
                                        transitionSum += transition;
                                        ++n;
                                    }
                                    else
                                    {
                                        break;
                                    }
                                }
                                valueOfNumerator *= transitionSum / n;
                            }
                            else
                            {
                                // First sentence: transition from the "##" start character and Labelset(true) start marker.
                                Tuple<Character, Character> characterPair = Tuple.Create(new Character("##"), sentence.Character);
                                Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj);
                                double initialTransition;
                                if (conditionalMcj[characterPair].TryGetValue(labelsetPair, out initialTransition))
                                {
                                    valueOfNumerator *= initialTransition;
                                }
                                else
                                {
                                    valueOfNumerator = 0;
                                }
                            }
                        }
                        break;

                        case PriorP.ConditionalSij:
                        {
                            if (sentence.ID != 0)
                            {
                                // Same tie-averaging scheme, but each pair frequency is divided by the
                                // previous labelset's own score before being summed.
                                bool found = false;
                                double optimalPreLabelsetValue = 0;
                                double transitionSum = 0;
                                int n = 1;
                                Sentence sentencePre = Variable.Sentences[sentence.ID - 1];
                                Tuple<Sentence, Sentence> sentencePair = Tuple.Create(sentencePre, sentence);
                                foreach (KeyValuePair<Labelset, double> labelsetPre in sij.SortLabelsets(sentencePre))
                                {
                                    // Looked up inside the loop for the same reason as in ConditionalMcj.
                                    IDictionary<Tuple<Labelset, Labelset>, double> pairFrequencies = labelsetPairFrequencyForSij[sentencePair];
                                    Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(labelsetPre.Key, labelsetj);
                                    double pairFrequency;
                                    if (!found)
                                    {
                                        if (pairFrequencies.TryGetValue(labelsetPair, out pairFrequency))
                                        {
                                            found = true;
                                            transitionSum = pairFrequency / labelsetPre.Value;
                                            optimalPreLabelsetValue = labelsetPre.Value;
                                        }
                                        continue;
                                    }
                                    if (labelsetPre.Value == optimalPreLabelsetValue && pairFrequencies.TryGetValue(labelsetPair, out pairFrequency))
                                    {
                                        transitionSum += pairFrequency / labelsetPre.Value;
                                        ++n;
                                    }
                                    else
                                    {
                                        break;
                                    }
                                }
                                valueOfNumerator *= transitionSum / n;
                            }
                            else
                            {
                                // First sentence: pair with the (-1, "##") start sentence and Labelset(true) start marker.
                                Tuple<Sentence, Sentence> sentencePair = Tuple.Create(new Sentence(-1, "##"), sentence);
                                Tuple<Labelset, Labelset> labelsetPair = Tuple.Create(new Labelset(true), labelsetj);
                                double initialFrequency;
                                if (labelsetPairFrequencyForSij[sentencePair].TryGetValue(labelsetPair, out initialFrequency))
                                {
                                    valueOfNumerator *= initialFrequency;
                                }
                                else
                                {
                                    valueOfNumerator = 0;
                                }
                            }
                        }
                        break;
                        }
                    }

                    // Zero numerators are not stored, so later passes only see labelsets with mass.
                    if (valueOfNumerator != 0)
                    {
                        sentenceNumerator.Add(labelsetj, valueOfNumerator);
                    }
                }

                // Formula (7): the denominator is the sum of the stored numerators; labelsets that were
                // not stored contribute nothing, so iterating the stored values is enough.
                double valueOfDenominator = 0;
                foreach (double storedNumerator in sentenceNumerator.Values)
                {
                    valueOfDenominator += storedNumerator;
                }
                denominator.Add(sentence, valueOfDenominator);
            }

            // ---- Pass 2: new Pdata and Sij ----
            pdata = pdata != null ? new Pdata(++pdata.Time, pdata.Value) : new Pdata(1, 0);
            sij   = new Sij(++sij.Time);
            foreach (Sentence sentence in Variable.Sentences)
            {
                IDictionary<Labelset, double> sentenceSij = new Dictionary<Labelset, double>();
                sij.Value.Add(sentence, sentenceSij);
                double sentenceDenominator = denominator[sentence];
                foreach (KeyValuePair<Labelset, double> entry in numerator[sentence])
                {
                    if (Variable.SijDividPDataOnI)
                    {
                        // Conventional form: normalise by P(data on i).
                        sentenceSij[entry.Key] = entry.Value / sentenceDenominator;
                    }
                    else
                    {
                        sentenceSij[entry.Key] = entry.Value;
                    }
                }
                pdata.Value += -Math.Log(sentenceDenominator);
            }

            // Finished when this exact Pdata value was seen before, or when the change is within the threshold.
            bool isFinished = pdatas.Contains(pdata.Value) || (Math.Abs(pdata.MondifiedValue) <= Variable.ConvergeValueThreshold);
            if (!isFinished)
            {
                pdatas.Add(pdata.Value);
            }
            if (Variable.OutputPdata)
            {
                Variable.OutputFile.WriteLine(pdata.ToString());
            }
            return isFinished;
        }