/// <summary>
/// Searches for the weight (alpha) to put on the scores of <paramref name="qc2"/> when they are
/// linearly combined with the scores of <paramref name="qc1"/>. For every active query the ranked
/// items and pair items are built, the pair items are sorted by alpha, and the search itself is
/// delegated to FindBestAlpha.
/// </summary>
/// <param name="qc1">the scores for all the queries computed from the existing system</param>
/// <param name="qc2">the score for all the queries computed from the newly added tree</param>
/// <param name="queryIdxActive">the set of "active queries" that we use to find the optimal combination; null means every query in qc1/qc2</param>
/// <param name="bestMeanNDCGGain">receives the mean NDCG gain reported by FindBestAlpha for the returned alpha</param>
/// <returns>the alpha chosen by FindBestAlpha</returns>
public double FindStep(QueryCollection qc1, QueryCollection qc2, int[] queryIdxActive, out double bestMeanNDCGGain)
{
    PairRankedItems.Reset();

    // A null index array is shorthand for "use all the queries in qc1/qc2".
    bool useAllQueries = (queryIdxActive == null);
    int activeCount = useAllQueries ? qc1.NQueries : queryIdxActive.Length;

    if (qc1.Count != qc2.Count)
    {
        throw new Exception("Input files must have same number of rows.");
    }
    if (qc1.NQueries != qc2.NQueries)
    {
        throw new Exception("Input files must have same number of queries.");
    }
    if (qc1.NQueries < activeCount)
    {
        throw new Exception("Active queries must be less than all the queries.");
    }

    DCGScorer dcg = new DCGScorer();

    // The relabeling must be done before FillRankedItems is called
    // REVIEW: ??
    // Both counts below are only reported in the verbose output.
    long totalPairs;
    int totalDocs;
    CountDocsNAllPairs(qc1, qc2, queryIdxActive, out totalPairs, out totalDocs);

    int rankedItemCtr = 0;
    int usedQueries = 0;
    int skippedQueries = 0;
    for (int k = 0; k < activeCount; ++k)
    {
        int qIdx;
        if (useAllQueries)
        {
            qIdx = k;
        }
        else
        {
            qIdx = queryIdxActive[k];
        }

        Query first = qc1.queries[qIdx];
        Query second = qc2.queries[qIdx];

        // The array is thrown away after each query, but the objects it points to persist.
        // FillRankedItems returns null for queries that have maxDCG = 0; those are skipped.
        RankedItem[] items = FillRankedItems(first, second, dcg, random);
        if (items == null)
        {
            ++skippedQueries;
            continue;
        }

        FillRanks(items);
        // This forms a linked list.
        FillPairRankedItems(items, convex, alphaPos, maxStep, ref rankedItemCtr);
        ++usedQueries;
    }

    PairRankedItems[] sortedPairs = PRI_ListToArray();
    if (!alphaPos)
    {
        // First value still closest to zero, next more negative
        Array.Sort(sortedPairs, new SortPairRankedItemsDecreasing());
    }
    else
    {
        // First value closest to zero, next more positive
        Array.Sort(sortedPairs, new SortPairRankedItemsIncreasing());
    }

    // With the alphas sorted, find the global alpha that gives the best NDCG gain.
    double bestAlpha;
    FindBestAlpha(sortedPairs, dcg, usedQueries, out bestAlpha, out bestMeanNDCGGain);

    if (verbose)
    {
        Console.WriteLine("{0} queries total, {1} skipped queries, {2} docs", usedQueries + skippedQueries, skippedQueries, totalDocs);
        Console.WriteLine("Tot. # pairs = {0}, num. pairs in computation = {1}", totalPairs, sortedPairs.Length);
        // For the convex combination, it's tempting to rescale alpha so that the first weight is one.
        // But this is not always possible: it may need to be -1.
        Console.WriteLine("Best mean NDCG Gain = {0}, best alpha = {1}", bestMeanNDCGGain, bestAlpha);

        // Check that the gain is correct by actually combining the two collections.
        qc1.ComputeNDCGs();
        qc2.ComputeNDCGs();
        double firstFactor = 1.0;
        if (convex)
        {
            firstFactor = 1.0 - bestAlpha;
        }
        QueryCollection combined = QueryCollection.LinearlyCombine(firstFactor, qc1, bestAlpha, qc2);
        combined.ComputeNDCGs();
        Console.WriteLine("NON-TRUNC: First NDCG = {0:F4}/{1:F4}, second = {2:F4}/{3:F4}, combined = {4:F4}/{5:F4}",
                          qc1.NonTruncNDCG_pes, qc1.NonTruncNDCG_opt,
                          qc2.NonTruncNDCG_pes, qc2.NonTruncNDCG_opt,
                          combined.NonTruncNDCG_pes, combined.NonTruncNDCG_opt);
        Console.WriteLine(" TRUNC: First NDCG = {0:F4}/{1:F4}, second = {2:F4}/{3:F4}, combined = {4:F4}/{5:F4}",
                          qc1.TruncNDCG_pes, qc1.TruncNDCG_opt,
                          qc2.TruncNDCG_pes, qc2.TruncNDCG_opt,
                          combined.TruncNDCG_pes, combined.TruncNDCG_opt);
    }

    return bestAlpha;
}
/// <summary>
/// Loop through every alpha. If both labels are the same, just swap ranks (and no change to NDCG).
/// If not, still swap ranks, and compute cumulative delta NDCG. Keep track of that alpha that gave the
/// best NDCG. alpha = 0 (gain 0) is the baseline: it may give the best result and need not be one of
/// the listed alphas, since those always correspond to swapping points. When no listed alpha gives a
/// strictly positive cumulative gain, alpha = 0 is returned.
/// </summary>
/// <param name="pairRankedItems">Assumed sorted by alpha, with the value closest to zero first. WARNING: SIDE EFFECTS on RankedItems (their ranks are swapped).</param>
/// <param name="scorer">Not used by this method; kept so the signature stays unchanged.</param>
/// <param name="nQueries">Divisor used to turn the best cumulative gain into a mean.</param>
/// <param name="bestAlpha">Receives the chosen alpha: 0 when nothing beats the baseline, otherwise half way between the best crossing and the next one (or the best crossing itself if it is the last).</param>
/// <param name="bestMeanNDCGGain">Receives the best cumulative gain divided by <paramref name="nQueries"/>, or 0 when <paramref name="nQueries"/> is not positive.</param>
/// <exception cref="Exception">A pair's two items are not at adjacent ranks (degenerate scores).</exception>
void FindBestAlpha(PairRankedItems[] pairRankedItems, DCGScorer scorer, int nQueries, out double bestAlpha, out double bestMeanNDCGGain)
{
    bestAlpha = 0.0;
    double bestNDCGGain = 0.0;
    // -1 means "no crossing has beaten the alpha = 0 baseline". Starting at 0 would make that case
    // indistinguishable from "crossing 0 is best" and yield a non-zero alpha with a reported gain of 0.
    int bestIndex = -1;
    double NDCGGain = 0.0; // This really is a gain in a gain (again)
    double[] markups = DCGScorer.discounts; // Position dependent part of NDCG

    // Rely on jittering to take care of degeneracy.
    for (int i = 0; i < pairRankedItems.Length; ++i)
    {
        PairRankedItems pairRankedItem = pairRankedItems[i];
        RankedItem x = pairRankedItem.item1;
        RankedItem y = pairRankedItem.item2;
        int rankx = x.rank;
        int ranky = y.rank;

        if (rankx != ranky + 1 && rankx != ranky - 1)
        {
            throw new Exception("FindBestAlpha: degenerate scores encountered.");
        }

        if (x.label != y.label)
        {
            double markupx = markups[rankx];
            double markupy = markups[ranky];
            NDCGGain += (x.ndcgWt - y.ndcgWt) * (markupy - markupx);
            if (NDCGGain > bestNDCGGain)
            {
                bestNDCGGain = NDCGGain;
                bestIndex = i;
            }
        }

        // Positions swap only if in the open interval (not at the edges), otherwise could get a spurious gain
        double alpha = pairRankedItem.alpha;
        bool insideOpenInterval = convex
            ? (alpha != 0.0 && alpha != 1.0)
            : (alpha != 0.0 && alpha < maxStep && alpha > -maxStep);
        if (insideOpenInterval)
        {
            x.rank = ranky;
            y.rank = rankx;
        }
    }

    if (bestIndex < 0)
    {
        // Empty input, or no crossing improved on the baseline: stay at alpha = 0.
        bestAlpha = 0.0;
    }
    else if (bestIndex < pairRankedItems.Length - 1)
    {
        // Put the best alpha half way between that found (which is on the border) and the next.
        bestAlpha = 0.5 * (pairRankedItems[bestIndex].alpha + pairRankedItems[bestIndex + 1].alpha);
    }
    else
    {
        // The best crossing is the last one, so there is no "next" to average with.
        bestAlpha = pairRankedItems[bestIndex].alpha;
    }

    // Avoid 0/0 = NaN when no query contributed.
    bestMeanNDCGGain = (nQueries > 0) ? bestNDCGGain / (double)nQueries : 0.0;
}
/// <summary>
/// Builds a query from the rows currently chained off <c>QueryRow.mostRecent</c>. Labels and feature
/// vectors are copied out of the rows in their original order; scores are allocated but not filled
/// here. The row list is reset afterwards, this query is linked in as the most recent one, and the
/// max DCGs are computed.
/// </summary>
/// <param name="QID">Identifier stored in QueryID.</param>
/// <param name="scoreForDegenerateQuery">Stored on the query; ComputeNDCGs uses it when the max DCG is zero.</param>
public Query(string QID, double scoreForDegenerateQuery)
{
    this.QueryID = QID;
    this.scoreForDegenerateQuery = scoreForDegenerateQuery;

    // First pass: count the rows in the list.
    int rowCount = 0;
    for (QueryRow row = QueryRow.mostRecent; row != null; row = row.previous)
    {
        ++rowCount;
    }
    this.length = rowCount;

    ftrVectors = new float[rowCount][];
    scores = new double[rowCount];
    scoresCp = new double[rowCount];
    ranks = new int[rowCount];
    labels = new float[rowCount];

    // Second pass: the list runs newest-to-oldest, so fill from the back to keep the original order.
    int slot = rowCount;
    for (QueryRow row = QueryRow.mostRecent; row != null; row = row.previous)
    {
        --slot;
        labels[slot] = row.Label;
        ftrVectors[slot] = row.Features;
    }

    // Reset QueryRow so the next query starts from an empty list.
    QueryRow.Reset();

    // Link this query in front of the previously created one.
    previous = mostRecent;
    mostRecent = this;

    // Fill DCGs
    DCGScorer dcg = new DCGScorer();
    FillMaxDCGs(dcg);
}
/// <summary>
/// query1 and query2 must be the same query (but with different scores). The urls must be in the same order. However they need not be sorted.
/// Builds one RankedItem per row, carrying the row's NDCG weight (DCGScorer.scoresMap[label] / maxDCG),
/// both scores and the label, then hands the array to SortNJitter.
/// </summary>
/// <param name="query1">Query with the first set of scores; its MaxNonTruncDCG is used for normalisation.</param>
/// <param name="query2">The same query with the second set of scores.</param>
/// <param name="scorer">Not used by this method; kept so the signature stays unchanged.</param>
/// <param name="ran">Random source passed through to SortNJitter.</param>
/// <returns>Null if this query has zero maxDCG. Else, a rankedItem array, sorted by the scores in query1.</returns>
/// <exception cref="Exception">The two queries differ in length, ID, or in the label of some row.</exception>
public static RankedItem[] FillRankedItems(Query query1, Query query2, DCGScorer scorer, Random ran)
{
    if (query1.Length != query2.Length)
    {
        throw new Exception("Query length mismatch.");
    }
    if (query1.QueryID != query2.QueryID)
    {
        throw new Exception("Queries have differnt IDs.");
    }

    int length = query1.Length;
    double maxDCG = query1.MaxNonTruncDCG;
    if (maxDCG == 0.0)
    {
        // Degenerate query: the caller skips it.
        return null;
    }

    RankedItem[] rankedItems = new RankedItem[length];
    double[] scores1 = query1.scores;
    double[] scores2 = query2.scores;
    for (int i = 0; i < length; ++i)
    {
        float label = query1.Labels[i];
        if (label != query2.Labels[i])
        {
            throw new Exception("FillRankedItems: label mismatch.");
        }
        rankedItems[i] = new RankedItem((double)DCGScorer.scoresMap[(int)label] / maxDCG, scores1[i], scores2[i], label);
    }

    // rankedItems was just allocated above, so it can never be null here.
    SortNJitter(rankedItems, ran);
    return rankedItems;
}
/// <summary>
/// Builds a histogram of this query's labels (each label is cast to int and used as the bin index)
/// and stores the results of <c>dcg.ComputeMaxDCG</c> and <c>dcg.ComputeMaxTruncDCG</c> on that
/// histogram in maxNonTruncDCG and maxTruncDCG.
/// </summary>
/// <param name="dcg">Scorer that supplies the number of label bins and computes both max DCG values.</param>
public void FillMaxDCGs(DCGScorer dcg)
{
    // Faster to compute the histogram just once and reuse it for both values.
    int[] labelCounts = new int[dcg.NLabels];
    foreach (float label in labels)
    {
        labelCounts[(int)label]++;
    }

    maxNonTruncDCG = dcg.ComputeMaxDCG(labelCounts);
    maxTruncDCG = dcg.ComputeMaxTruncDCG(labelCounts);
}
/// <summary>
/// Compute both truncated and non-truncated NDCG for this query, in their pessimistic, optimistic
/// and mean variants, and store them in the corresponding fields. A query whose max non-truncated
/// DCG is zero is degenerate: all six values are set to scoreForDegenerateQuery instead.
/// </summary>
public void ComputeNDCGs()
{
    if (maxNonTruncDCG == 0.0)
    {
        // Degenerate query; maxTruncDCG is expected to be zero as well.
        // Note: these may be skipped in the NDCG computation for a bunch of queries - but that is handled in QueryCollection.
        Debug.Assert(maxTruncDCG == 0.0, "maxNonTruncDCG = 0 should imply maxTruncDCG = 0");

        double fallback = scoreForDegenerateQuery;
        nonTruncNDCG_opt = fallback;
        nonTruncNDCG_pes = fallback;
        nonTruncNDCG_mean = fallback;
        truncNDCG_opt = fallback;
        truncNDCG_pes = fallback;
        truncNDCG_mean = fallback;
        return;
    }

    DCGScorer dcg = new DCGScorer();

    double truncDCG_pes, nonTruncDCG_pes;
    double truncDCG_opt, nonTruncDCG_opt;
    double truncDCG_mean, nonTruncDCG_mean;
    dcg.ComputeDCGs(true, scores, labels, out truncDCG_pes, out nonTruncDCG_pes);
    dcg.ComputeDCGs(false, scores, labels, out truncDCG_opt, out nonTruncDCG_opt);
    dcg.ComputeMeanDCGs(scores, labels, out truncDCG_mean, out nonTruncDCG_mean);

    // Normalise each DCG by the matching max DCG.
    truncNDCG_pes = truncDCG_pes / maxTruncDCG;
    truncNDCG_opt = truncDCG_opt / maxTruncDCG;
    truncNDCG_mean = truncDCG_mean / maxTruncDCG;

    nonTruncNDCG_pes = nonTruncDCG_pes / maxNonTruncDCG;
    nonTruncNDCG_opt = nonTruncDCG_opt / maxNonTruncDCG;
    nonTruncNDCG_mean = nonTruncDCG_mean / maxNonTruncDCG;
}
/// <summary>
/// Generate a random query. No need to pass labelForUnlabeled here, since the random queries are all given labels.
/// Each row is produced by the random <c>QueryRow</c> constructor and contributes its label and
/// feature vector. Scores are not set here (the array is allocated but left at zero).
/// NOTE(review): scoreForDegenerateQuery is not a parameter of this constructor and is not assigned
/// here, so it is left at whatever the field's declared default is - confirm that is intended.
/// </summary>
/// <param name="priors">Passed through to the random QueryRow constructor.</param>
/// <param name="scale">Passed through to the random QueryRow constructor.</param>
/// <param name="nScores">Passed through to the random QueryRow constructor.</param>
/// <param name="nRows">Number of rows (documents) to generate for this query.</param>
/// <param name="queryID">Identifier stored in QueryID.</param>
/// <param name="rangen">Random source passed through to the random QueryRow constructor.</param>
public Query(float[] priors, double scale, int nScores, int nRows, string queryID, Random rangen)
{
    length = nRows;
    QueryID = queryID;

    ftrVectors = new float[length][];
    scores = new double[length];
    labels = new float[length];
    // Allocate the working arrays too, as the other Query constructors do, so that a randomly
    // generated query is initialised the same way as one built from data.
    scoresCp = new double[length];
    ranks = new int[length];

    for (int i = 0; i < nRows; ++i)
    {
        QueryRow row = new QueryRow(priors, scale, nScores, rangen);
        labels[i] = row.Label;
        ftrVectors[i] = row.Features;
    }

    DCGScorer dcg = new DCGScorer();
    FillMaxDCGs(dcg);
}
/// <summary>
/// Builds a query around caller-supplied arrays. The label, feature and score arrays are stored by
/// reference (not copied); the working arrays scoresCp and ranks are allocated to the same length
/// as <paramref name="labels"/>.
/// </summary>
/// <param name="QID">Identifier stored in QueryID.</param>
/// <param name="labels">One label per row; its length defines the query length.</param>
/// <param name="ftrVectors">One feature vector per row.</param>
/// <param name="scores">One score per row.</param>
/// <param name="labelForUnlabeled">Passed to FixUnlabeledRows.</param>
/// <param name="scoreForDegenerateQuery">Stored on the query; ComputeNDCGs uses it when the max DCG is zero.</param>
/// <param name="dcgFlag">When true, the max DCGs are computed via FillMaxDCGs.</param>
public Query(string QID, float[] labels, float[][] ftrVectors, double[] scores, float labelForUnlabeled, double scoreForDegenerateQuery, bool dcgFlag)
{
    this.QueryID = QID;
    this.scoreForDegenerateQuery = scoreForDegenerateQuery;

    this.labels = labels;
    this.ftrVectors = ftrVectors;
    this.scores = scores;

    int rowCount = labels.Length;
    length = rowCount;
    scoresCp = new double[rowCount];
    ranks = new int[rowCount];

    // Runs before the max DCGs are computed from the labels.
    // NOTE(review): presumably rewrites the labels of unlabeled rows - confirm in FixUnlabeledRows.
    FixUnlabeledRows(labelForUnlabeled);

    // Compute max DCGs. The scorer is created whether or not it ends up being used.
    DCGScorer dcg = new DCGScorer();
    if (dcgFlag)
    {
        FillMaxDCGs(dcg);
    }
}
/// <summary>
/// Stores the empty-query handling options, creates the DCG scorer, and sets the truncation level.
/// </summary>
/// <param name="dropEmptyQueries">Stored in the field of the same name.</param>
/// <param name="scoreForEmptyQuery">Stored in the field of the same name.</param>
/// <param name="ndcgAt">Truncation level; written to the static <c>DCGScorer.truncLevel</c>.</param>
public NDCG(bool dropEmptyQueries, float scoreForEmptyQuery, int ndcgAt)
{
    this.scoreForEmptyQuery = scoreForEmptyQuery;
    this.dropEmptyQueries = dropEmptyQueries;

    dcgScorer = new DCGScorer();

    // truncLevel is a static member of DCGScorer, so this write is shared by all code that reads it,
    // not just by the scorer created above.
    DCGScorer.truncLevel = ndcgAt;
}