Example #1
        static void Main(string[] args)
        {
            try
            {
                CommandLineArguments cmd = new CommandLineArguments();
                if(CommandLine.Parser.ParseArgumentsWithUsage(args, cmd))
                {
                    Random random = new Random(cmd.seed);
                    FindStepLib fs = new FindStepLib(cmd.convex, random, cmd.verbose);
                    DCGScorer.truncLevel = cmd.truncLevel;

                    if (cmd.selfTest)
                    {
                        //fs.alphaPos = false; // Set to false to search for the optimal negative alpha.  Default is true.
                        SelfTest(10, 100, fs);
                    }
                    else
                    {
                        QueryCollection qc1 = new QueryCollection(cmd.firstScoresFile, cmd.labelForUnlabeled,
                                                                  cmd.skipDegenerateQueries, cmd.scoreForDegenerateQuery);
                        QueryCollection qc2 = new QueryCollection(cmd.secondScoresFile, cmd.labelForUnlabeled,
                                                                  cmd.skipDegenerateQueries, cmd.scoreForDegenerateQuery);

                        // Assume that the first 'feature' is in fact the scores
                        qc1.AssignScoresFromFeature(0);
                        qc2.AssignScoresFromFeature(0);

                        double bestGain;

                        fs.FindStep(qc1, qc2, null, out bestGain);
                    }

            #if DEBUG // Force console to stick around
                    Console.WriteLine("...Press Enter to terminate program...");
                    Console.ReadLine();
            #endif
                }
            }
            catch(Exception exc)
            {
                Console.WriteLine(exc.Message);
            }
        }
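The step that FindStep searches for is just a per-document linear mix of the two score sets. Below is a minimal, self-contained sketch of that mix using plain arrays; Combine is a hypothetical helper for illustration only, not part of the library, and the firstFactor logic mirrors the convex handling shown in Example #2.

        // Hedged sketch: with convex = true the result is (1 - alpha) * s1 + alpha * s2,
        // otherwise s1 + alpha * s2.
        static double[] Combine(double[] s1, double[] s2, double alpha, bool convex)
        {
            double firstFactor = convex ? 1.0 - alpha : 1.0;
            double[] combined = new double[s1.Length];
            for (int i = 0; i < s1.Length; ++i)
            {
                combined[i] = firstFactor * s1[i] + alpha * s2[i];
            }
            return combined;
        }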
Example #2
        /// <summary>
        /// Find the step size (alpha) for linearly combining the two sets of scores so that the mean NDCG gain is maximized.
        /// </summary>
        /// <param name="qc1">the scores for all the queries computed from the existing system</param>
        /// <param name="qc2">the scores for all the queries computed from the newly added tree</param>
        /// <param name="queryIdxActive">the set of "active queries" that we use to find the optimal combination; null means use all queries</param>
        /// <param name="bestMeanNDCGGain">the mean NDCG gain achieved at the best alpha</param>
        /// <returns>the best alpha found</returns>
        public double FindStep(QueryCollection qc1, QueryCollection qc2, int[] queryIdxActive, out double bestMeanNDCGGain)
        {
            PairRankedItems.Reset();

            // (queryIdxActive == null) <=> using all the queries in qc1/qc2
            int cActiveQueries = (queryIdxActive == null) ? qc1.NQueries : queryIdxActive.Length;

            if(qc1.Count != qc2.Count)
                throw new Exception("Input files must have same number of rows.");
            if(qc1.NQueries != qc2.NQueries)
                throw new Exception("Input files must have same number of queries.");

            if (qc1.NQueries < cActiveQueries)
                throw new Exception("The number of active queries cannot exceed the total number of queries.");
            DCGScorer dcg = new DCGScorer();

            // The relabeling must be done before FillRankedItems is called // REVIEW: ??
            long nPairs; // Only used for debug
            int nDocs;   // ditto
            CountDocsNAllPairs(qc1, qc2, queryIdxActive, out nPairs, out nDocs); // ditto
            int rankedItemCtr = 0;
            //PairRankedItems pri = null;
            int nQueries = 0;
            int nSkippedQueries = 0;
            for (int i = 0; i < cActiveQueries; ++i)
            {
                int qIdx = (queryIdxActive == null) ? i : queryIdxActive[i];
                Query query1 = qc1.queries[qIdx];
                Query query2 = qc2.queries[qIdx];
                // We discard the array itself each time, but the object pointers persist.
                // Also: discard any queries that have maxDCG = 0.
                RankedItem[] thisRankedItems = FillRankedItems(query1, query2, dcg, random);
                if(thisRankedItems != null)
                {
                    FillRanks(thisRankedItems);
                    //pri = FillPairRankedItems(thisRankedItems, convex, maxStep, ref rankedItemCtr);
                    FillPairRankedItems(thisRankedItems, convex, alphaPos, maxStep, ref rankedItemCtr); // This forms a linked list.
                    ++nQueries;
                }
                else
                {
                    ++nSkippedQueries;
                }
            }

            PairRankedItems[] pairRankedItems = PRI_ListToArray();
            if (alphaPos)
            {
                Array.Sort(pairRankedItems, new SortPairRankedItemsIncreasing()); // First value closest to zero, next more positive
            }
            else
            {
                Array.Sort(pairRankedItems, new SortPairRankedItemsDecreasing()); // First value still closest to zero, next more negative
            }
            // Now that we have the sorted values of alpha: compute which global alpha gives best NDCG gain.
            double bestAlpha;
            FindBestAlpha(pairRankedItems, dcg, nQueries, out bestAlpha, out bestMeanNDCGGain);

            if (verbose)
            {
                Console.WriteLine("{0} queries total, {1} skipped queries, {2} docs", nQueries + nSkippedQueries, nSkippedQueries, nDocs);
                Console.WriteLine("Tot. # pairs = {0}, num. pairs in computation = {1}", nPairs, pairRankedItems.Length);
                // For the convex combination, it's tempting to rescale alpha so that the first weight is one.  But this is not always possible:
                // it may need to be -1.
                Console.WriteLine("Best mean NDCG Gain = {0}, best alpha = {1}", bestMeanNDCGGain, bestAlpha);

                // Check that the gain is correct.
                qc1.ComputeNDCGs();
                qc2.ComputeNDCGs();
                double firstFactor = convex ? 1.0 - bestAlpha : 1.0;
                QueryCollection qc = QueryCollection.LinearlyCombine(firstFactor, qc1, bestAlpha, qc2);
                qc.ComputeNDCGs();
                Console.WriteLine("NON-TRUNC: First NDCG = {0:F4}/{1:F4}, second = {2:F4}/{3:F4}, combined = {4:F4}/{5:F4}",
                                  qc1.NonTruncNDCG_pes, qc1.NonTruncNDCG_opt, qc2.NonTruncNDCG_pes, qc2.NonTruncNDCG_opt,
                                  qc.NonTruncNDCG_pes, qc.NonTruncNDCG_opt);
                Console.WriteLine("    TRUNC: First NDCG = {0:F4}/{1:F4}, second = {2:F4}/{3:F4}, combined = {4:F4}/{5:F4}",
                                  qc1.TruncNDCG_pes, qc1.TruncNDCG_opt, qc2.TruncNDCG_pes, qc2.TruncNDCG_opt,
                                  qc.TruncNDCG_pes, qc.TruncNDCG_opt);
            }

            return bestAlpha;
        }
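A hedged usage sketch of the queryIdxActive parameter: restrict the search to a subset of queries (here, the first half). The variables qc1, qc2, and fs are assumed to be built as in Example #1.

        // Indices refer to positions in qc1.queries / qc2.queries.
        int[] active = new int[qc1.NQueries / 2];
        for (int i = 0; i < active.Length; ++i)
        {
            active[i] = i;
        }
        double bestGain;
        double bestAlpha = fs.FindStep(qc1, qc2, active, out bestGain);
        Console.WriteLine("Best alpha = {0}, mean NDCG gain = {1}", bestAlpha, bestGain);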
Example #3
        /// <summary>
        /// The only reason we pass both data1 and data2 is to change the labels in the second one, too.
        /// </summary>
        /// <param name="data1"></param>
        /// <param name="data2"></param>
        /// <param name="labelForUnlabeled"></param>
        /// <returns></returns>
        //static public void Relabel(IRankDataCollection data1, IRankDataCollection data2, float labelForUnlabeled)
        //{
        //    IEnumerator ienumData1 = data1.GetEnumerator();
        //    IEnumerator ienumData2 = data2.GetEnumerator();
        //    while(ienumData1.MoveNext())
        //    {
        //        ienumData2.MoveNext();
        //        RankData query1 = (RankData)ienumData1.Current;
        //        RankData query2 = (RankData)ienumData2.Current;
        //        for(int i = 0; i < query1.Labels.Length; ++i)
        //        {
        //            if(query1.Labels[i] == -1)
        //                query1.Labels[i] = labelForUnlabeled;
        //            if(query2.Labels[i] == -1)
        //                query2.Labels[i] = labelForUnlabeled;
        //        }
        //    }
        //}
        /// <summary>
        /// Debug only: how does the total number of pairs compare with the number of pairs used in the computation?
        /// </summary>
        /// <param name="data1"></param>
        /// <param name="data2"></param>
        /// <param name="labelForUnlabeled"></param>
        /// <returns></returns>
        public void CountDocsNAllPairs(QueryCollection qc1, QueryCollection qc2, int[] queryIdxActive, out long nPairs, out int nDocs)
        {
            nPairs = 0;
            nDocs = 0;

            int cActiveQuery = (queryIdxActive == null) ? qc1.queries.Length : queryIdxActive.Length;

            for (int i = 0; i < cActiveQuery; ++i)
            {
                int qIdx = (queryIdxActive == null) ? i : queryIdxActive[i];
                int tot = qc1.queries[qIdx].Labels.Length;
                Debug.Assert(tot == qc2.queries[qIdx].Labels.Length, "query collection size mismatch");
                nDocs += tot;
                nPairs += ((long)tot * (tot - 1)) / 2; // cast to long to avoid int overflow for very large queries
            }
        }
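For reference, each query with tot documents contributes tot * (tot - 1) / 2 unordered pairs, so a query with 100 documents adds 4950 pairs to nPairs.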
Example #4
        public double BestCombinedMeanNDCG(QueryCollection qc1, QueryCollection qc2)
        {
            double bestNDCGGain;
            double alpha = FindStep(qc1, qc2, null, out bestNDCGGain);
            QueryCollection qc = QueryCollection.LinearlyCombine(1.0, qc1, alpha, qc2);
            qc.ComputeNDCGs();

            if (verbose)
            {
                // Print all three NDCGs: previous model, current model, combination
                qc1.ComputeNDCGs();
                qc2.ComputeNDCGs();
                Console.WriteLine("Previous model: NDCG = {0:F6}-{1:F6}-{2:F6}, NDCG@{3} = {4:F6}-{5:F6}-{6:F6}",
                                  qc1.NonTruncNDCG_pes, qc1.NonTruncNDCG_mean, qc1.NonTruncNDCG_opt, DCGScorer.truncLevel,
                                  qc1.TruncNDCG_pes, qc1.TruncNDCG_mean, qc1.TruncNDCG_opt);
                Console.WriteLine("Current model: NDCG = {0:F6}-{1:F6}-{2:F6}, NDCG@{3} = {4:F6}-{5:F6}-{6:F6}",
                                  qc2.NonTruncNDCG_pes, qc2.NonTruncNDCG_mean, qc2.NonTruncNDCG_opt, DCGScorer.truncLevel,
                                  qc2.TruncNDCG_pes, qc2.TruncNDCG_mean, qc2.TruncNDCG_opt);
                Console.WriteLine("Combined model: NDCG = {0:F6}-{1:F6}-{2:F6}, NDCG@{3} = {4:F6}-{5:F6}-{6:F6}",
                                  qc.NonTruncNDCG_pes, qc.NonTruncNDCG_mean, qc.NonTruncNDCG_opt, DCGScorer.truncLevel,
                                  qc.TruncNDCG_pes, qc.TruncNDCG_mean, qc.TruncNDCG_opt);
                Console.WriteLine("alpha = {0}", alpha);
            }

            return qc.NonTruncNDCG_mean;
        }
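A hedged sketch of calling BestCombinedMeanNDCG on two score files, reusing the loading pattern from Example #1 (cmd and fs are assumed to be set up exactly as there):

        QueryCollection qcPrev = new QueryCollection(cmd.firstScoresFile, cmd.labelForUnlabeled,
                                                     cmd.skipDegenerateQueries, cmd.scoreForDegenerateQuery);
        QueryCollection qcNew = new QueryCollection(cmd.secondScoresFile, cmd.labelForUnlabeled,
                                                    cmd.skipDegenerateQueries, cmd.scoreForDegenerateQuery);
        qcPrev.AssignScoresFromFeature(0); // the first 'feature' column holds the scores
        qcNew.AssignScoresFromFeature(0);
        double combinedMeanNDCG = fs.BestCombinedMeanNDCG(qcPrev, qcNew);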
Example #5
 public void AssignScoresFrom(QueryCollection qc)
 {
     if (qc.NQueries != NQueries)
         throw new Exception("QueryCollection.AssignScores: query array length mismatch");
     for (int iQuery = 0; iQuery < NQueries; ++iQuery)
     {
         Query qIn = qc.queries[iQuery];
         Query q = queries[iQuery];
         if (qIn.scores.Length != q.scores.Length)
             throw new Exception("QueryCollection.AssignScores: query scores length mismatch");
         for (int iScore = 0; iScore < q.scores.Length; ++iScore)
         {
             q.scores[iScore] = qIn.scores[iScore];
         }
     }
 }
Example #6
 /// <summary>
 /// Same, but write the results into qcTarget.
 /// </summary>
 /// <param name="alpha"></param>
 /// <param name="qc1"></param>
 /// <param name="beta"></param>
 /// <param name="qc2"></param>
 /// <param name="qcTarget">receives alpha * qc1 + beta * qc2</param>
 public static void LinearlyCombineNUpdate(double alpha, QueryCollection qc1, double beta, QueryCollection qc2, QueryCollection qcTarget)
 {
     #if DEBUG
     if(qc1.nQueries != qc2.nQueries || qc1.NQueries != qcTarget.NQueries)
         throw new Exception("LinearlyCombineNUpdate: queries size mismatch");
     #endif
     for (int iQuery = 0; iQuery < qc1.nQueries; ++iQuery)
     {
         double[] scores1 = qc1.queries[iQuery].scores;
         double[] scores2 = qc2.queries[iQuery].scores;
         double[] scores3 = qcTarget.queries[iQuery].scores;
     #if DEBUG
         if (scores1.Length != scores2.Length || scores1.Length != scores3.Length)
             throw new Exception("LinearlyCombineNUpdate: scores size mismatch");
     #endif
         for (int iScore = 0; iScore < scores1.Length; ++iScore)
         {
             scores3[iScore] = alpha * scores1[iScore] + beta * scores2[iScore];
         }
     }
 }
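Because each output element depends only on the corresponding input elements, qcTarget can safely alias one of the sources. A hedged sketch of the accumulation pattern this enables during boosting (qcAccum, qcCurrent, and learnRate are illustrative names echoing Example #10, not the project's actual call site):

 // Add the current tree's scores, scaled by the learning rate, into the running total.
 QueryCollection.LinearlyCombineNUpdate(1.0, qcAccum, learnRate, qcCurrent, qcAccum);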
Example #7
        /// <summary>
        /// Same, but linearly combine several vectors into one, and increment qcTarget accordingly.
        /// It is up to the calling code to zero out qcTarget if desired.
        /// </summary>
        /// <param name="weights"></param>
        /// <param name="qcArray"></param>
        /// <param name="start">Index of first query to use.</param>
        /// <param name="end">Index of last query to use.</param>
        /// <param name="qc2"></param>
        /// <returns></returns>
        public static void LinearlyCombineNIncrement(double[] weights, QueryCollection[] qcArray, int start, int end, QueryCollection qcTarget)
        {
            if(start > end || start < 0 || end >= qcArray.Length)
                throw new Exception("Illegal indices passed to LinearlyCombine");
            if(weights.Length != qcArray.Length)
                throw new Exception("weights must have same length as qcArray");

            for(int j = start; j <= end; ++j)
            {
                QueryCollection qcSource = qcArray[j];
                double wt = weights[j];
                for(int i = 0; i < qcTarget.nQueries; ++i)
                {
                    double[] scoresSource = qcSource.queries[i].scores;
                    double[] scoresTarget = qcTarget.queries[i].scores;
                    if(scoresSource.Length != scoresTarget.Length)
                        throw new Exception("LinearlyCombine: scores size mismatch");
                    for(int k = 0; k < scoresSource.Length; ++k)
                    {
                        scoresTarget[k] += wt * scoresSource[k];
                    }
                }
            }
        }
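A self-contained illustration of the increment semantics using plain arrays (outside the QueryCollection API): the target is not cleared, and each selected source is scaled by its weight and added on top.

        double[] target = { 1.0, 1.0 };                                  // pre-existing contents are kept
        double[][] sources = { new[] { 1.0, 2.0 }, new[] { 3.0, 4.0 } };
        double[] weights = { 0.5, 0.25 };
        for (int j = 0; j < sources.Length; ++j)                         // j plays the role of start..end
        {
            for (int k = 0; k < target.Length; ++k)
            {
                target[k] += weights[j] * sources[j][k];
            }
        }
        // target is now { 1 + 0.5*1 + 0.25*3, 1 + 0.5*2 + 0.25*4 } = { 2.25, 3.0 }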
Example #8
        /// <summary>
        /// Use one function rather than operators, since it's much more efficient.
        /// </summary>
        /// <param name="alpha"></param>
        /// <param name="qc1"></param>
        /// <param name="beta"></param>
        /// <param name="qc2"></param>
        /// <returns>a new QueryCollection whose scores are alpha * qc1 + beta * qc2</returns>
        public static QueryCollection LinearlyCombine(double alpha, QueryCollection qc1, double beta, QueryCollection qc2)
        {
            QueryCollection qc = qc1.CopyEmptyQueryCollection();
            for (int i = 0; i < qc.nQueries; ++i)
            {
                double[] scores1 = qc1.queries[i].scores;
                double[] scores2 = qc2.queries[i].scores;
                double[] scoresCombined = qc.queries[i].scores;
                if (scores1.Length != scores2.Length)
                    throw new Exception("LinearlyCombine: size mismatch");
                for (int j = 0; j < scores1.Length; ++j)
                {
                    scoresCombined[j] = alpha * scores1[j] + beta * scores2[j];
                }
            }

            return qc;
        }
Example #9
        /// <summary>
        /// Generate two QueryCollections containing randomly generated scores and labels (although they share
        /// all the same labels, as though one dataset tested on two models).  The scores are loosely
        /// correlated with labels.  Then, compute the best linear combination.  Finally compare the claimed NDCG gain
        /// with the NDCG gain computed directly.  The relative frequencies of the labels are taken from the May 2005
        /// training set: 
        /// 
        /// Perfect:	0.0204
        /// Excellent: 	0.0523
        /// Good:		0.2714
        /// Fair:		0.2855
        /// Bad:		0.3704
        ///
        /// Note we use random features to make it very unlikely that there will be any degeneracy: so the claimed delta NDCG
        /// should be what's actually measured by taking the linear combination that FindStep proposes.
        /// </summary>
        /// <param name="qc1"></param>
        /// <param name="qc2"></param>
        /// <param name="nDocsPerQuery"></param>
        static void SelfTest(int nQueries, int nDocsPerQuery, FindStepLib fs)
        {
            Random rangen = new Random(0);
            float[] priors = new float[5];
            priors[0] = 0.3704F; // bads first
            priors[1] = 0.2855F;
            priors[2] = 0.2714F;
            priors[3] = 0.0523F;
            priors[4] = 0.0204F;
            double scale1 = 10.0;
            double scale2 = 20.0;
            int nScores = 1;
            QueryCollection qc1 = new QueryCollection(nQueries, priors, scale1, nScores, nDocsPerQuery, rangen);
            // Must share labels
            QueryCollection qc2 = qc1.CopyEmptyQueryCollection();
            for(int i = 0; i < qc2.queries.Length; ++i)
            {
                Query q1 = qc1.queries[i];
                Query q2 = qc2.queries[i];
                for(int j = 0; j < q1.Length; ++j)
                {
                    double label = (double) q1.Labels[j];
                    if (q2.Labels[j] != label)
                        throw new Exception("Labels mismatch.");
                    q1.scores[j] = (float)(label + scale1 * (2.0 * rangen.NextDouble() - 1.0));
                    q2.scores[j] = (float)(label + scale2 * (2.0 * rangen.NextDouble() - 1.0));
                }

            }

            double bestMeanNDCGGain;
            // We will only check for positive alphas.
            double alpha = fs.FindStep(qc1, qc2, null, out bestMeanNDCGGain); // prints out the best NDCG gain
            Console.WriteLine("Optimal alpha = {0}", alpha);

            double firstFactor = fs.convex ? (1.0 - alpha) : 1.0;

            qc1.ComputeNDCGs();
            double initialNDCG_pes = qc1.NonTruncNDCG_pes;
            double initialNDCG_opt = qc1.NonTruncNDCG_opt;
            Console.WriteLine("Initial nonTruncNDCG = {0}-{1}", initialNDCG_pes, initialNDCG_opt);
            QueryCollection qc = QueryCollection.LinearlyCombine(firstFactor, qc1, alpha, qc2);
            qc.ComputeNDCGs();
            double finalNDCG_pes = qc.NonTruncNDCG_pes;
            double finalNDCG_opt = qc.NonTruncNDCG_opt;
            Console.WriteLine("Final nonTruncNDCG = {0}-{1}", finalNDCG_pes, finalNDCG_opt);

            Console.WriteLine("Type RETURN for exhaustive search");
            Console.ReadLine();
            double bestFound = 0.0;
            double maxAlpha = fs.convex ? 1.0 : fs.MaxStep;
            double alphaFactor = fs.alphaPos ? 1.0 : -1.0;
            for(int i = 0; i < 10001; ++i)
            {
                alpha = alphaFactor * (double)(i * maxAlpha) / 10000.0;
                qc = QueryCollection.LinearlyCombine(firstFactor, qc1, alpha, qc2);
                qc.ComputeNDCGs();
                if (qc.NonTruncNDCG_opt != qc.NonTruncNDCG_pes)
                    throw new Exception("Self test requires no degeneracy");
                double finalNDCG_mean = qc.NonTruncNDCG_mean;
                if(finalNDCG_mean > bestFound)
                {
                    Console.WriteLine("Best NDCG found so far with search: alpha = {0}, NDCG = {1}", alpha, finalNDCG_mean);
                    bestFound = finalNDCG_mean;
                }
            }
        }
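A hedged sketch of invoking the self test directly, mirroring the wiring in Example #1; the convex and verbose constructor arguments are illustrative literals.

        // 10 queries of 100 documents each, convex combination, verbose output.
        FindStepLib fs = new FindStepLib(true, new Random(0), true);
        SelfTest(10, 100, fs);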
Example #10
        /// <summary>
        /// Constructor of BoostTreeNoGainLambdaLoss
        /// </summary>
        /// <param name="labelFeatureDataCoded">the input data used to train boosted regression trees</param>
        /// <param name="learnRate">the input learning rate specified in training</param>
        public BoostTreeNoGainLambdaLoss(LabelFeatureDataCoded labelFeatureDataCoded, LabelConverter labelConvert,
                                   float learnRate, float labelWeight, StepSizeType stepSizeType, FindStepLib fs,
                                   float labelForUnlabeled, double scoreForDegenerateQuery, int truncLevel)
        {
            this.learnRate = learnRate;
            this.labelWeight = labelWeight;

            this.labelForUnlabeled = labelForUnlabeled;
            this.scoreForDegenerateQuery = scoreForDegenerateQuery;
            this.truncLevel = truncLevel;

            this.labels = new float[labelFeatureDataCoded.NumDataPoint];

            for (int i = 0; i < labelFeatureDataCoded.NumDataPoint; i++)
            {
                this.labels[i] = labelConvert.convert(labelFeatureDataCoded.GetLabel(i));
            }

            this.numSamples = labels.Length;

            this.score = new float[this.numSamples];
            this.funValue = new float[this.numSamples];
            this.pseudoResponses = new float[this.numSamples];
            this.weights = new float[this.numSamples];

            this.labelFeatureDataCoded = labelFeatureDataCoded;

            //data member to compute the optimal adjustment factor (step size) for leaf nodes response
            this.qcAccum = this.CreateQueryCollection();
            this.qcCurrent = this.CreateQueryCollection();
            this.fs = fs;
            this.stepSizeType = stepSizeType;
        }
Example #11
        private QueryCollection CreateQueryCollection()
        {
            int cQueries = this.labelFeatureDataCoded.DataGroups.GroupCounts;
            Query[] queries = new Query[cQueries];
            for (int qIdx = 0; qIdx < cQueries; qIdx++)
            {
                DataGroup queryGroup = this.labelFeatureDataCoded.DataGroups[qIdx];
                Query query = CreateQuery(queryGroup, this.labels, this.score, this.labelForUnlabeled, this.scoreForDegenerateQuery);
                queries[qIdx] = query;
            }

            bool skipDegenerateQueries = true;

            QueryCollection qc = new QueryCollection(queries, skipDegenerateQueries, this.scoreForDegenerateQuery);

            return qc;
        }