Code Example #1
File: FindStepLib.cs Project: zbxzc35/BoostTree
        public double BestCombinedMeanNDCG(QueryCollection qc1, QueryCollection qc2)
        {
            double bestNDCGGain;
            double alpha = FindStep(qc1, qc2, null, out bestNDCGGain);
            QueryCollection qc = QueryCollection.LinearlyCombine(1.0, qc1, alpha, qc2);
            qc.ComputeNDCGs();

            if (verbose)
            {
                // Print all three NDCGs: previous model, current model, combination
                qc1.ComputeNDCGs();
                qc2.ComputeNDCGs();
                Console.WriteLine("Previous model: NDCG = {0:F6}-{1:F6}-{2:F6}, NDCG@{3} = {4:F6}-{5:F6}-{6:F6}",
                                  qc1.NonTruncNDCG_pes, qc1.NonTruncNDCG_mean, qc1.NonTruncNDCG_opt, DCGScorer.truncLevel,
                                  qc1.TruncNDCG_pes, qc1.TruncNDCG_mean, qc1.TruncNDCG_opt);
                Console.WriteLine("Current model: NDCG = {0:F6}-{1:F6}-{2:F6}, NDCG@{3} = {4:F6}-{5:F6}-{6:F6}",
                                  qc2.NonTruncNDCG_pes, qc2.NonTruncNDCG_mean, qc2.NonTruncNDCG_opt, DCGScorer.truncLevel,
                                  qc2.TruncNDCG_pes, qc2.TruncNDCG_mean, qc2.TruncNDCG_opt);
                Console.WriteLine("Combined model: NDCG = {0:F6}-{1:F6}-{2:F6}, NDCG@{3} = {4:F6}-{5:F6}-{6:F6}",
                                  qc.NonTruncNDCG_pes, qc.NonTruncNDCG_mean, qc.NonTruncNDCG_opt, DCGScorer.truncLevel,
                                  qc.TruncNDCG_pes, qc.TruncNDCG_mean, qc.TruncNDCG_opt);
                Console.WriteLine("alpha = {0}", alpha);
            }

            return qc.NonTruncNDCG_mean;
        }
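For context, a minimal usage sketch of the method above. It assumes a FindStepLib instance and two already-populated QueryCollection objects (their construction is not shown in these listings, so the names below are illustrative only).

        // Hedged usage sketch: `fs`, `prevScores`, and `treeScores` are assumed to already exist.
        static void ReportCombinedMeanNDCG(FindStepLib fs, QueryCollection prevScores, QueryCollection treeScores)
        {
            // BestCombinedMeanNDCG finds alpha via FindStep, forms 1.0*prevScores + alpha*treeScores,
            // and returns the non-truncated mean NDCG of that combination.
            double combinedMeanNDCG = fs.BestCombinedMeanNDCG(prevScores, treeScores);
            Console.WriteLine("Combined mean NDCG = {0:F6}", combinedMeanNDCG);
        }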
Code Example #2
File: FindStepLib.cs Project: zbxzc35/BoostTree
        /// <summary>
        /// Finds the step size (alpha) that maximizes the mean NDCG gain when the scores from the
        /// newly added tree are linearly combined with the scores from the existing system.
        /// </summary>
        /// <param name="qc1">the scores for all the queries computed from the existing system</param>
        /// <param name="qc2">the scores for all the queries computed from the newly added tree</param>
        /// <param name="queryIdxActive">the set of "active queries" that we use to find the optimal combination; null means use all queries</param>
        /// <param name="bestMeanNDCGGain">the best mean NDCG gain achieved at the returned step size</param>
        /// <returns>the optimal combination weight (alpha) applied to qc2</returns>
        public double FindStep(QueryCollection qc1, QueryCollection qc2, int[] queryIdxActive, out double bestMeanNDCGGain)
        {
            PairRankedItems.Reset();

            // (queryIdxActive == null) <=> using all the queries in qc1/qc2
            int cActiveQueries = (queryIdxActive == null) ? qc1.NQueries : queryIdxActive.Length;

            if(qc1.Count != qc2.Count)
                throw new Exception("Input files must have same number of rows.");
            if(qc1.NQueries != qc2.NQueries)
                throw new Exception("Input files must have same number of queries.");

            if (qc1.NQueries < cActiveQueries)
                throw new Exception("Active queries must be less than all the queries.");
            DCGScorer dcg = new DCGScorer();

            // The relabeling must be done before FillRankedItems is called // REVIEW: ??
            long nPairs; // Only used for debug
            int nDocs;   // ditto
            CountDocsNAllPairs(qc1, qc2, queryIdxActive, out nPairs, out nDocs); // ditto
            int rankedItemCtr = 0;
            //PairRankedItems pri = null;
            int nQueries = 0;
            int nSkippedQueries = 0;
            for (int i = 0; i < cActiveQueries; ++i)
            {
                int qIdx = (queryIdxActive == null) ? i : queryIdxActive[i];
                Query query1 = qc1.queries[qIdx];
                Query query2 = qc2.queries[qIdx];
                // We discard the array itself each time, but the object pointers persist.
                // Also: discard any queries that have maxDCG = 0.
                RankedItem[] thisRankedItems = FillRankedItems(query1, query2, dcg, random);
                if(thisRankedItems != null)
                {
                    FillRanks(thisRankedItems);
                    //pri = FillPairRankedItems(thisRankedItems, convex, maxStep, ref rankedItemCtr);
                    FillPairRankedItems(thisRankedItems, convex, alphaPos, maxStep, ref rankedItemCtr); // This forms a linked list.
                    ++nQueries;
                }
                else
                {
                    ++nSkippedQueries;
                }
            }

            PairRankedItems[] pairRankedItems = PRI_ListToArray();
            if (alphaPos)
            {
                Array.Sort(pairRankedItems, new SortPairRankedItemsIncreasing()); // First value closest to zero, next more positive
            }
            else
            {
                Array.Sort(pairRankedItems, new SortPairRankedItemsDecreasing()); // First value still closest to zero, next more negative
            }
            // Now that we have the sorted values of alpha: compute which global alpha gives best NDCG gain.
            double bestAlpha;
            FindBestAlpha(pairRankedItems, dcg, nQueries, out bestAlpha, out bestMeanNDCGGain);

            if (verbose)
            {
                Console.WriteLine("{0} queries total, {1} skipped queries, {2} docs", nQueries + nSkippedQueries, nSkippedQueries, nDocs);
                Console.WriteLine("Tot. # pairs = {0}, num. pairs in computation = {1}", nPairs, pairRankedItems.Length);
                // For the convex combination, it's tempting to rescale alpha so that the first weight is one.  But this is not always possible:
                // it may need to be -1.
                Console.WriteLine("Best mean NDCG Gain = {0}, best alpha = {1}", bestMeanNDCGGain, bestAlpha);

                // Check that the gain is correct.
                qc1.ComputeNDCGs();
                qc2.ComputeNDCGs();
                double firstFactor = convex ? 1.0 - bestAlpha : 1.0;
                QueryCollection qc = QueryCollection.LinearlyCombine(firstFactor, qc1, bestAlpha, qc2);
                qc.ComputeNDCGs();
                Console.WriteLine("NON-TRUNC: First NDCG = {0:F4}/{1:F4}, second = {2:F4}/{3:F4}, combined = {4:F4}/{5:F4}",
                                  qc1.NonTruncNDCG_pes, qc1.NonTruncNDCG_opt, qc2.NonTruncNDCG_pes, qc2.NonTruncNDCG_opt,
                                  qc.NonTruncNDCG_pes, qc.NonTruncNDCG_opt);
                Console.WriteLine("    TRUNC: First NDCG = {0:F4}/{1:F4}, second = {2:F4}/{3:F4}, combined = {4:F4}/{5:F4}",
                                  qc1.TruncNDCG_pes, qc1.TruncNDCG_opt, qc2.TruncNDCG_pes, qc2.TruncNDCG_opt,
                                  qc.TruncNDCG_pes, qc.TruncNDCG_opt);
            }

            return bestAlpha;
        }
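The queryIdxActive parameter documented above restricts the line search to a subset of queries. A minimal, hypothetical sketch of that call pattern follows; the helper name and the index array are illustrative only, and qc1/qc2 are assumed to be already populated and row-aligned.

        // Hedged sketch: run FindStep on a subset of "active" queries.
        static double StepOnActiveSubset(FindStepLib fs, QueryCollection qc1, QueryCollection qc2)
        {
            // Use only the first three queries for the line search; pass null to use them all.
            int[] queryIdxActive = new int[] { 0, 1, 2 };

            double bestMeanNDCGGain;
            double alpha = fs.FindStep(qc1, qc2, queryIdxActive, out bestMeanNDCGGain);
            Console.WriteLine("alpha = {0}, best mean NDCG gain = {1:F6}", alpha, bestMeanNDCGGain);
            return alpha;
        }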
Code Example #3
File: FindStep.cs Project: zbxzc35/BoostTree
        /// <summary>
        /// Generate two QueryCollections containing randomly generated scores and labels (although they share
        /// all the same labels, as though one dataset tested on two models).  The scores are loosely
        /// correlated with labels.  Then, compute the best linear combination.  Finally compare the claimed NDCG gain
        /// with the NDCG gain computed directly.  The relative frequencies of the labels are taken from the May 2005
        /// training set: 
        /// 
        /// Perfect:	0.0204
        /// Excellent: 	0.0523
        /// Good:		0.2714
        /// Fair:		0.2855
        /// Bad:		0.3704
        ///
        /// Note that we use randomly generated scores to make any degeneracy very unlikely, so the claimed delta NDCG
        /// should match what is actually measured by taking the linear combination that FindStep proposes.
        /// </summary>
        /// <param name="qc1"></param>
        /// <param name="qc2"></param>
        /// <param name="nDocsPerQuery"></param>
        static void SelfTest(int nQueries, int nDocsPerQuery, FindStepLib fs)
        {
            Random rangen = new Random(0);
            float[] priors = new float[5];
            priors[0] = 0.3704F; // bads first
            priors[1] = 0.2855F;
            priors[2] = 0.2714F;
            priors[3] = 0.0523F;
            priors[4] = 0.0204F;
            double scale1 = 10.0;
            double scale2 = 20.0;
            int nScores = 1;
            QueryCollection qc1 = new QueryCollection(nQueries, priors, scale1, nScores, nDocsPerQuery, rangen);
            // Must share labels
            QueryCollection qc2 = qc1.CopyEmptyQueryCollection();
            for(int i = 0; i < qc2.queries.Length; ++i)
            {
                Query q1 = qc1.queries[i];
                Query q2 = qc2.queries[i];
                for(int j = 0; j < q1.Length; ++j)
                {
                    double label = (double) q1.Labels[j];
                    if (q2.Labels[j] != label)
                        throw new Exception("Labels mismatch.");
                    q1.scores[j] = (float)(label + scale1 * (2.0 * rangen.NextDouble() - 1.0));
                    q2.scores[j] = (float)(label + scale2 * (2.0 * rangen.NextDouble() - 1.0));
                }

            }

            double bestMeanNDCGGain;
            // We will only check for positive alphas.
            double alpha = fs.FindStep(qc1, qc2, null, out bestMeanNDCGGain); // prints out the best NDCG gain
            Console.WriteLine("Optimal alpha = {0}", alpha);

            double firstFactor = fs.convex ? (1.0 - alpha) : 1.0;

            qc1.ComputeNDCGs();
            double initialNDCG_pes = qc1.NonTruncNDCG_pes;
            double initialNDCG_opt = qc1.NonTruncNDCG_opt;
            Console.WriteLine("Initial nonTruncNDCG = {0}-{1}", initialNDCG_pes, initialNDCG_opt);
            QueryCollection qc = QueryCollection.LinearlyCombine(firstFactor, qc1, alpha, qc2);
            qc.ComputeNDCGs();
            double finalNDCG_pes = qc.NonTruncNDCG_pes;
            double finalNDCG_opt = qc.NonTruncNDCG_opt;
            Console.WriteLine("Final nonTruncNDCG = {0}-{1}", finalNDCG_pes, finalNDCG_opt);

            Console.WriteLine("Type RETURN for exhaustive search");
            Console.ReadLine();
            double bestFound = 0.0;
            double maxAlpha = fs.convex ? 1.0 : fs.MaxStep;
            double alphaFactor = fs.alphaPos ? 1.0 : -1.0;
            for(int i = 0; i < 10001; ++i)
            {
                alpha = alphaFactor * (double)(i * maxAlpha) / 10000.0;
                qc = QueryCollection.LinearlyCombine(firstFactor, qc1, alpha, qc2);
                qc.ComputeNDCGs();
                if (qc.NonTruncNDCG_opt != qc.NonTruncNDCG_pes)
                    throw new Exception("Self test requires no degeneracy");
                double finalNDCG_mean = qc.NonTruncNDCG_mean;
                if(finalNDCG_mean > bestFound)
                {
                    Console.WriteLine("Best NDCG found so far with search: alpha = {0}, NDCG = {1}", alpha, finalNDCG_mean);
                    bestFound = finalNDCG_mean;
                }
            }
        }
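The claimed-vs-measured comparison that SelfTest performs can also be expressed directly. The following is a hedged sketch, assuming (as the verbose check inside FindStep suggests) that the reported gain is relative to the mean non-truncated NDCG of qc1 alone; the helper name is illustrative only.

        // Hedged sketch: compare the gain claimed by FindStep against the directly measured difference.
        static double MeasuredGain(FindStepLib fs, QueryCollection qc1, QueryCollection qc2)
        {
            double claimedGain;
            double alpha = fs.FindStep(qc1, qc2, null, out claimedGain);

            qc1.ComputeNDCGs();
            double firstFactor = fs.convex ? 1.0 - alpha : 1.0;
            QueryCollection combined = QueryCollection.LinearlyCombine(firstFactor, qc1, alpha, qc2);
            combined.ComputeNDCGs();

            double measuredGain = combined.NonTruncNDCG_mean - qc1.NonTruncNDCG_mean;
            Console.WriteLine("Claimed gain = {0:F6}, measured gain = {1:F6}", claimedGain, measuredGain);
            return measuredGain;
        }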