Esempio n. 1
0
        /// <summary>
        /// Compute the log probability of the document given the model with the parameters x.
        /// </summary>
        /// <remarks>
        /// The result is the start-position log probability reported by
        /// <paramref name="cliqueTree"/> plus, for every position of the document, the
        /// conditional log probability of that position's label given the previous
        /// <c>window - 1</c> labels. The label history starts out filled with the index of
        /// the background symbol.
        /// </remarks>
        /// <param name="docData">Data of the document; only its length (the number of positions) is read here.</param>
        /// <param name="docIndex">Index of the document's label sequence in <c>labels</c>.</param>
        /// <param name="cliqueTree">Clique tree queried for the start-position and conditional log probabilities.</param>
        /// <returns>The summed log probability of the document's label sequence.</returns>
        private double DocumentLogProbability(int[][][] docData, int docIndex, CRFCliqueTree <string> cliqueTree)
        {
            int[] docLabels = labels[docIndex];
            // Sliding history of the previous (window - 1) labels, initialised to background.
            int[] given = new int[window - 1];
            Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol));
            if (docLabels.Length > docData.Length)
            {
                // only true for self-training
                // fill the given array with the extra docLabels
                System.Array.Copy(docLabels, 0, given, 0, given.Length);
                // shift the docLabels array left: keep only the trailing docData.Length labels
                int[] newDocLabels = new int[docData.Length];
                System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length);
                docLabels = newDocLabels;
            }
            double startPosLogProb = cliqueTree.LogProbStartPos();

            if (Verbose)
            {
                System.Console.Error.Printf("P_-1(Background) = % 5.3f%n", startPosLogProb);
            }
            double prob = startPosLogProb;

            // iterate over the positions in this document
            for (int i = 0; i < docData.Length; i++)
            {
                int    label = docLabels[i];
                double p     = cliqueTree.CondLogProbGivenPrevious(i, label, given);
                if (Verbose)
                {
                    log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + p);
                }
                prob += p;
                // Slide the history one step to the left and append the current label.
                // With window == 1 the history is empty: the unguarded shift would ask
                // Array.Copy for a length of -1 and then index given[-1], so skip it.
                if (given.Length > 0)
                {
                    System.Array.Copy(given, 1, given, 0, given.Length - 1);
                    given[given.Length - 1] = label;
                }
            }
            return prob;
        }
        /// <summary>
        /// Computes, for one document, the value (label log probability, minus the dropout
        /// prior when that prior is active) and the expected feature counts.
        /// </summary>
        /// <remarks>
        /// A clique tree is built for the document on every call. When
        /// <c>prior == DropoutPrior</c> the result of <c>GetDropoutPrior</c> is subtracted
        /// from the value and its gradient is returned as the fourth element.
        /// </remarks>
        /// <param name="docIndex">Index of the document in <c>totalData</c> and in the per-document feature tables.</param>
        /// <param name="skipExpectedCountCalc">When true, the expected counts are returned exactly as produced by <c>SparseE</c>.</param>
        /// <param name="skipValCalc">When true, the label log probability is not computed and the value starts from 0.</param>
        /// <returns>
        /// A quadruple of (<paramref name="docIndex"/>, value, expected counts keyed by feature
        /// index, dropout prior gradient). The gradient is null unless the dropout prior is active.
        /// </returns>
        private Quadruple <int, double, IDictionary <int, double[]>, IDictionary <int, double[]> > ExpectedCountsAndValueForADoc(int docIndex, bool skipExpectedCountCalc, bool skipValCalc)
        {
            int[] activeFeatures = dataFeatureHashByDoc[docIndex];
            IList<ICollection<int>> docDataHash = dataFeatureHash[docIndex];
            IDictionary<int, IList<int>> condensedFeaturesMap = condensedMap[docIndex];
            double prob = 0;

            int[][][] docData = totalData[docIndex];
            // Documents whose index lies beyond the labels array have no label sequence.
            int[] docLabels = null;
            if (docIndex < labels.Length)
            {
                docLabels = labels[docIndex];
            }
            // NOTE(review): timer is never read in this method.
            Timing timer = new Timing();

            double[][][] featureVal3DArr = null;
            if (featureVal != null)
            {
                featureVal3DArr = featureVal[docIndex];
            }
            // make a clique tree for this document
            CRFCliqueTree<string> cliqueTree = CRFCliqueTree.GetCalibratedCliqueTree(docData, labelIndices, numClasses, classIndex, backgroundSymbol, cliquePotentialFunc, featureVal3DArr);

            if (!skipValCalc)
            {
                // compute the log probability of the document given the model with the parameters x
                // NOTE(review): docLabels is still null here when docIndex >= labels.Length, so
                // this branch relies on callers skipping the value calculation for such documents.
                int[] given = new int[window - 1];
                Arrays.Fill(given, classIndex.IndexOf(backgroundSymbol));
                if (docLabels.Length > docData.Length)
                {
                    // only true for self-training
                    // fill the given array with the extra docLabels
                    System.Array.Copy(docLabels, 0, given, 0, given.Length);
                    // shift the docLabels array left: keep only the trailing docData.Length labels
                    int[] newDocLabels = new int[docData.Length];
                    System.Array.Copy(docLabels, docLabels.Length - newDocLabels.Length, newDocLabels, 0, newDocLabels.Length);
                    docLabels = newDocLabels;
                }
                double startPosLogProb = cliqueTree.LogProbStartPos();
                if (Verbose)
                {
                    System.Console.Error.Printf("P_-1(Background) = % 5.3f\n", startPosLogProb);
                }
                prob += startPosLogProb;
                // iterate over the positions in this document
                for (int i = 0; i < docData.Length; i++)
                {
                    int    label = docLabels[i];
                    double p     = cliqueTree.CondLogProbGivenPrevious(i, label, given);
                    if (Verbose)
                    {
                        log.Info("P(" + label + "|" + ArrayMath.ToString(given) + ")=" + System.Math.Exp(p));
                    }
                    prob += p;
                    // Slide the label history one step. With window == 1 the history is empty
                    // and the unguarded shift would request a copy of length -1, so skip it.
                    if (given.Length > 0)
                    {
                        System.Array.Copy(given, 1, given, 0, given.Length - 1);
                        given[given.Length - 1] = label;
                    }
                }
            }

            IDictionary<int, double[]> EForADoc = SparseE(activeFeatures);
            // Per-position expected counts; only allocated for the dropout approximation.
            IList<IDictionary<int, double[]>> EForADocPos = null;
            if (dropoutApprox)
            {
                EForADocPos = new List<IDictionary<int, double[]>>(docData.Length);
            }

            if (!skipExpectedCountCalc)
            {
                // compute the expected counts for this document, which we will need to compute the derivative
                // Every probability is weighted by fVal, which is fixed at 1.0 in this method.
                double fVal = 1.0;
                // iterate over the positions in this document
                for (int i = 0; i < docData.Length; i++)
                {
                    ICollection<int> docDataHashI = docDataHash[i];
                    IDictionary<int, double[]> EForADocPosAtI = null;
                    if (dropoutApprox)
                    {
                        EForADocPosAtI = SparseE(docDataHashI);
                    }
                    foreach (int fIndex in docDataHashI)
                    {
                        int j = map[fIndex];
                        IIndex<CRFLabel> labelIndex = labelIndices[j];
                        // for each possible labeling for that clique
                        for (int k = 0; k < labelIndex.Size(); k++)
                        {
                            int[]  label = labelIndex.Get(k).GetLabel();
                            // probability of these labels occurring in this clique with these features
                            double p = cliqueTree.Prob(i, label);
                            if (dropoutApprox)
                            {
                                IncreScore(EForADocPosAtI, fIndex, k, fVal * p);
                            }
                            IncreScore(EForADoc, fIndex, k, fVal * p);
                        }
                    }
                    if (dropoutApprox)
                    {
                        // Replicate this position's counts onto the features condensed into each source feature.
                        foreach (int condensedSource in docDataHashI)
                        {
                            IList<int> condensedTargets;
                            if (condensedFeaturesMap.TryGetValue(condensedSource, out condensedTargets))
                            {
                                CopyCountsToCondensedFeatures(EForADocPosAtI, condensedSource, condensedTargets);
                            }
                        }
                        EForADocPos.Add(EForADocPosAtI);
                    }
                }
                // copy for condensedFeaturesMap
                foreach (KeyValuePair<int, IList<int>> entry in condensedFeaturesMap)
                {
                    CopyCountsToCondensedFeatures(EForADoc, entry.Key, entry.Value);
                }
            }

            IDictionary<int, double[]> dropoutPriorGrad = null;
            if (prior == DropoutPrior)
            {
                // we can optimize this, this is too large, don't need this big
                dropoutPriorGrad = SparseE(activeFeatures);
                prob -= GetDropoutPrior(cliqueTree, docData, EForADoc, docDataHash, activeFeatures, dropoutPriorGrad, condensedFeaturesMap, EForADocPos);
            }
            return new Quadruple<int, double, IDictionary<int, double[]>, IDictionary<int, double[]>>(docIndex, prob, EForADoc, dropoutPriorGrad);
        }

        /// <summary>
        /// Stores an independent copy of the array found under <paramref name="sourceKey"/>
        /// under every key in <paramref name="targetKeys"/>.
        /// </summary>
        /// <param name="counts">Table that is both read from and written to.</param>
        /// <param name="sourceKey">Key whose array is copied.</param>
        /// <param name="targetKeys">Keys that each receive their own copy of the array.</param>
        private static void CopyCountsToCondensedFeatures(IDictionary<int, double[]> counts, int sourceKey, IList<int> targetKeys)
        {
            foreach (int targetKey in targetKeys)
            {
                // Looked up once per target, exactly like the inline loops this replaces,
                // so an empty target list never touches sourceKey.
                double[] source = counts[sourceKey];
                counts[targetKey] = (double[])source.Clone();
            }
        }