/// <summary>
/// Draws a random subset of this collection's queries, without replacement.
/// </summary>
/// <remarks>
/// Every query index is paired with a random key and the pairs are sorted by key;
/// the first <paramref name="nSamples"/> indices of the resulting order are used.
/// The sampled Query objects are shared with this collection, not copied.
/// </remarks>
/// <param name="nSamples">Number of queries to draw. Must not exceed the number of queries held.</param>
/// <param name="rangen">Random number generator used to produce the sort keys.</param>
/// <returns>
/// A new QueryCollection over the sampled queries, constructed with
/// skipDegenerateQueries = true and scoreForDegenerateQueries = 0.0.
/// </returns>
public QueryCollection Sample(int nSamples, Random rangen)
{
    if (nQueries < nSamples)
    {
        throw new Exception("nSamples cannot exceed total number of queries.");
    }

    // Presumably fills with 0, 1, 2, ... (start 0, step 1) - confirm against ArrayUtils.Range.
    int[] order = new int[nQueries];
    ArrayUtils.Range(order, 0, 1);

    // Presumably fills with random values between 0 and 1 - confirm against ArrayUtils.Random.
    float[] sortKeys = new float[nQueries];
    ArrayUtils.Random(sortKeys, 0, 1, rangen);

    // Sorting by the random keys carries the index array along with them.
    Array.Sort(sortKeys, order);

    Query[] picked = new Query[nSamples];
    for (int k = 0; k < picked.Length; k++)
    {
        int sourceIdx = order[k];
        picked[k] = queries[sourceIdx];
    }

    return new QueryCollection(picked, true, 0.0);
}
/// <summary>
/// Builds one RankedItem per row from two scorings of the same query.
/// query1 and query2 must be the same query (but with different scores). The urls must be
/// in the same order in both; they need not be sorted.
/// </summary>
/// <param name="query1">First scoring of the query. Supplies the labels, the max DCG and the first score column.</param>
/// <param name="query2">Second scoring of the same query. Supplies the second score column.</param>
/// <param name="scorer">
/// Not read by this method; gains are taken from the static DCGScorer.scoresMap.
/// Kept in the signature so existing callers are unaffected.
/// </param>
/// <param name="ran">Random number generator forwarded to SortNJitter.</param>
/// <returns>
/// Null if query1.MaxNonTruncDCG is zero. Otherwise an array with one RankedItem per row, after it
/// has been passed through SortNJitter (per the original documentation: sorted by the scores in query1).
/// </returns>
/// <exception cref="Exception">
/// The two queries differ in length, in query ID, or in the label of any row.
/// </exception>
public static RankedItem[] FillRankedItems(Query query1, Query query2, DCGScorer scorer, Random ran)
{
    if (query1.Length != query2.Length)
    {
        throw new Exception("Query length mismatch.");
    }
    if (query1.QueryID != query2.QueryID)
    {
        throw new Exception("Queries have differnt IDs.");
    }

    // With a zero max DCG the normalized gain below would divide by zero.
    double maxDCG = query1.MaxNonTruncDCG;
    if (maxDCG == 0.0)
    {
        return null;
    }

    int length = query1.Length;
    double[] scores1 = query1.scores;
    double[] scores2 = query2.scores;
    RankedItem[] rankedItems = new RankedItem[length];

    for (int i = 0; i < length; ++i)
    {
        float label = query1.Labels[i];
        if (label != query2.Labels[i])
        {
            throw new Exception("FillRankedItems: label mismatch.");
        }

        // The label indexes the gain table; the gain is normalized by the query's max DCG.
        double normalizedGain = (double)DCGScorer.scoresMap[(int)label] / maxDCG;
        rankedItems[i] = new RankedItem(normalizedGain, scores1[i], scores2[i], label);
    }

    SortNJitter(rankedItems, ran);
    return rankedItems;
}
/// <summary>
/// Creates a score-only shell of this collection: for every query the labels are copied,
/// a fresh score array with one entry per row is allocated, and the feature vectors are
/// replaced by the result of ArrayUtils.FloatMatrix(0, 0) (presumably an empty matrix).
/// </summary>
/// <returns>
/// A new QueryCollection carrying this collection's skipDegenerateQueries and
/// scoreForDegenerateQueries settings.
/// </returns>
public QueryCollection CopyEmptyQueryCollection()
{
    Query[] shells = new Query[nQueries];

    int idx = 0;
    while (idx < nQueries)
    {
        Query source = queries[idx];

        float[][] noFeatures = ArrayUtils.FloatMatrix(0, 0);
        double[] blankScores = new double[source.Length];
        float[] labelCopy = ArrayUtils.Copy(source.Labels);

        shells[idx] = new Query(source, labelCopy, noFeatures, blankScores);
        idx++;
    }

    return new QueryCollection(shells, this.skipDegenerateQueries, scoreForDegenerateQueries);
}
/// <summary>
/// Hard copy: every query is rebuilt from copies of its feature vectors, scores and labels
/// (as produced by ArrayUtils.Copy).
/// </summary>
/// <param name="skipDegenerateQueries">Value of the skip-degenerate flag for the new collection.</param>
/// <param name="scoreForDegenerateQueries">
/// Value of the degenerate-query score for the new collection. Note that each copied Query
/// still takes its own per-query setting from its source query via the Query copy constructor.
/// </param>
/// <returns>A new QueryCollection that shares no label, score or feature arrays with this one.</returns>
public QueryCollection CopyQueryCollection(bool skipDegenerateQueries, double scoreForDegenerateQueries)
{
    Query[] newQueries = new Query[nQueries];
    for (int i = 0; i < nQueries; ++i)
    {
        Query q = queries[i];
        float[][] ftrVectors = ArrayUtils.Copy(q.FtrVectors);
        double[] scores = ArrayUtils.Copy(q.scores);
        float[] newLabels = ArrayUtils.Copy(q.Labels);
        newQueries[i] = new Query(q, newLabels, ftrVectors, scores);
    }

    // Use the caller-supplied value: the previous code passed this.scoreForDegenerateQueries,
    // which silently ignored the parameter of the same name.
    return new QueryCollection(newQueries, skipDegenerateQueries, scoreForDegenerateQueries);
}
/// <summary>
/// Builds a synthetic query collection for testing. Each query is produced by the
/// Query(priors, scale, nScores, nRowsPerQuery, id, rangen) constructor; per the original
/// documentation the scores are correlated with the labels, and the labels are distributed
/// according to priors (priors[0] = P(Bad), etc.).
/// </summary>
/// <param name="nQueries">Number of queries to generate.</param>
/// <param name="priors">Label prior probabilities, forwarded to each Query.</param>
/// <param name="scale">Forwarded to each Query.</param>
/// <param name="nScores">Forwarded to each Query.</param>
/// <param name="nRowsPerQuery">Number of rows in every generated query.</param>
/// <param name="rangen">Random number generator shared by all generated queries.</param>
public QueryCollection(int nQueries, float[] priors, double scale, int nScores, int nRowsPerQuery, Random rangen)
{
    this.nQueries = nQueries;
    this.nRows = nRowsPerQuery * nQueries;
    this.queries = new Query[nQueries];

    int q = 0;
    while (q < nQueries)
    {
        // The query's position doubles as its ID.
        string id = q.ToString();
        this.queries[q] = new Query(priors, scale, nScores, nRowsPerQuery, id, rangen);
        q++;
    }
}
/// <summary>
/// Builds a query from the rows currently chained off QueryRow.mostRecent.
/// Labels and feature vectors are taken from those rows; the score arrays are allocated
/// but left at their default values. Once the rows are consumed, the QueryRow chain is
/// reset and this query is linked in as the most recent Query.
/// </summary>
/// <param name="QID">Identifier stored as this query's QueryID.</param>
/// <param name="scoreForDegenerateQuery">Stored on the query unchanged.</param>
public Query(string QID, double scoreForDegenerateQuery)
{
    this.QueryID = QID;
    this.scoreForDegenerateQuery = scoreForDegenerateQuery;

    // First pass over the chain: count the rows.
    int rowCount = 0;
    for (QueryRow row = QueryRow.mostRecent; row != null; row = row.previous)
    {
        rowCount++;
    }
    this.length = rowCount;

    ftrVectors = new float[length][];
    scores = new double[length];
    scoresCp = new double[length];
    ranks = new int[length];
    labels = new float[length];

    // Second pass: the chain runs newest-to-oldest, so fill from the back
    // to keep the rows in their original order.
    int slot = length - 1;
    for (QueryRow row = QueryRow.mostRecent; row != null; row = row.previous)
    {
        labels[slot] = row.Label;
        ftrVectors[slot] = row.Features;
        slot--;
    }

    // Clear the row chain so the next query starts with no pending rows.
    QueryRow.Reset();

    // Link this query onto the chain of queries built so far.
    previous = mostRecent;
    mostRecent = this;

    // Fill DCGs using a freshly constructed scorer.
    FillMaxDCGs(new DCGScorer());
}
/// <summary>
/// Loads a query collection from a delimited text file. The first line is a header row;
/// every following line is one row of one query, and consecutive rows sharing a query ID
/// form a single query. A degenerate query is one with all the labels the same.
/// </summary>
/// <param name="fname">Path of the file to read.</param>
/// <param name="labelForUnlabeled">
/// Passed to every QueryRow and to FixUnlabeledRows; presumably the label substituted for
/// unlabeled rows - confirm against those members.
/// </param>
/// <param name="skipDegenerateQueries">If true, queries for which AllRowsSameLabel() holds are dropped after loading.</param>
/// <param name="scoreForDegenerateQuery">Stored on the collection and passed to every Query that is constructed.</param>
public QueryCollection(string fname, float labelForUnlabeled, bool skipDegenerateQueries, double scoreForDegenerateQuery)
{
    this.skipDegenerateQueries = skipDegenerateQueries;
    this.scoreForDegenerateQueries = scoreForDegenerateQuery;
    using(StreamReader sr = new StreamReader(fname))
    {
        // NOTE(review): ReadLine() returns null for an empty file, so Split would throw here.
        string[] headers = sr.ReadLine().Split(QueryRowHeader.Separator);
        QueryRowHeader queryRowHeader = new QueryRowHeader(headers);
#if USE_BM25_TO_BREAK_DEGENERACY
        degenBreak_idx = queryRowHeader.DegenBreak_idx;
#endif
        nRows = 0;
        string row = sr.ReadLine();
        string[] splitRow;
        string lastQID = null;
        string QID = null;
        // Start with an empty chain of queries; each "new Query(...)" below links itself onto it.
        Query.Reset();
        while(row != null)
        {
            splitRow = row.Split(QueryRowHeader.Separator);
            QID = splitRow[queryRowHeader.queryIDIdx];
            // The query ID changed: the rows gathered so far belong to the previous query.
            // The constructed Query is not stored here; it is reached later through Query.mostRecent.
            if(QID != lastQID && nRows != 0)
            {
                new Query(lastQID, scoreForDegenerateQuery);
                ++nQueries;
            }
            // Map the textual rating to a numeric label; anything unrecognized is parsed as a number.
            string rating = splitRow[queryRowHeader.ratingIdx];
            float label;
            switch(rating)
            {
                case "Definitive":
                    label = 4;
                    break;
                case "Perfect":
                    label = 4;
                    break;
                case "Excellent":
                    label = 3;
                    break;
                case "Good":
                    label = 2;
                    break;
                case "Fair":
                    label = 1;
                    break;
                case "Bad":
                    label = 0;
                    break;
                case "Detrimental":
                    label = 0;
                    break;
                case "Unknown":
                    label = -1;
                    break;
                case "": // Unlabeled (in RatedNRandom)
                    label = -1;
                    break;
                case "HighlyRelevant": // ImageSearch - happily the only label that overlaps with MSNSearch("Detrimental") gets the same score
                    label = 2;
                    break;
                case "Relevant":
                    label = 1;
                    break;
                case "NotRelevant":
                    label = 0;
                    break;
                case "Unjudged":
                    label = 0;
                    break;
                default:
                    // Best effort: a rating that is neither a known name nor a number is reported and treated as 0.
                    try
                    {
                        label = float.Parse(rating);
                    }
                    catch(Exception)
                    {
                        Console.WriteLine("Unable to parse rating " + rating + " into an float. Using 0");
                        label = 0;
                    }
                    break;
            }
            // Convention is: if row has shorter length than number of headers, the missing values are all zero.
            // (The array below is zero-initialized, and the loop only overwrites the fields the row actually has.)
            float[] ftrVector = new float[queryRowHeader.FeatureCount];
            // Earlier variant of the loop header, bounded by the feature count instead of the row length:
            //for(int j = 0, i = queryRowHeader.firstFtrIdx; j < queryRowHeader.FeatureCount; ++i, ++j)
            // NOTE(review): the live loop is bounded by splitRow.Length only, so a row with more fields
            // than ftrVector has slots would index past the end of ftrVector.
            for (int j = 0, i = queryRowHeader.firstFtrIdx; i < splitRow.Length; ++i, ++j)
            {
                string val = splitRow[i];
                // An empty field counts as zero.
                ftrVector[j] = ( val == String.Empty ) ? 0.0F : float.Parse(val);
            }
            // The constructed row is not stored here; presumably it links itself onto
            // QueryRow.mostRecent, which the Query constructor walks - confirm against QueryRow.
            new QueryRow(label, ftrVector, labelForUnlabeled);
            lastQID = QID;
            row = sr.ReadLine();
            ++nRows;
        }
        // Flush the rows of the final query.
        // NOTE(review): this also runs when the file has no data rows, in which case QID is still null.
        new Query(QID, scoreForDegenerateQuery);
        ++nQueries;
    }
    // Finally construct the array, in the original order
    // (the chain runs newest-to-oldest, so the array is filled from the back).
    queries = new Query[nQueries];
    Query ptr = Query.mostRecent;
    int ctr = nQueries-1;
    while(ptr != null)
    {
        queries[ctr--] = ptr;
        ptr = ptr.previous;
    }
    // No need to fix unlabeled rows, since QueryRow does that. (Any unlabeled rows must be fixed before degenerates are removed).
    // NOTE(review): the comment above says no fix-up is needed, yet the call below performs one - confirm which is intended.
    FixUnlabeledRows(labelForUnlabeled);
    // For training, for efficiency, we might want to skip queries that have no pairs.
    if (skipDegenerateQueries)
    {
        // Compact the non-degenerate queries to the front of the array, counting their rows as we go.
        int nonDegCtr = 0;
        int newNRows = 0;
        for (int i = 0; i < nQueries; ++i)
        {
            Query q = queries[i];
            if (!q.AllRowsSameLabel())
            {
                queries[nonDegCtr++] = q;
                newNRows += q.Length;
            }
        }
        // Only reallocate when something was actually dropped.
        if (nonDegCtr < nQueries)
        {
            Query[] newQueries = new Query[nonDegCtr]; // wish C# had direct way to shorten array
            Array.Copy(queries, newQueries, nonDegCtr);
            nRows = newNRows;
            nQueries = nonDegCtr;
            queries = newQueries;
        }
    }
    Query.Reset(); // Prepare for next load
}
/// <summary>
/// Wraps an existing array of queries. The array is stored by reference (not copied), and
/// the row count is the sum of the queries' lengths. The degenerate-query settings are only
/// recorded here; this constructor does not filter any queries.
/// </summary>
/// <param name="queries">Queries that make up the collection.</param>
/// <param name="skipDegenerateQueries">Value recorded as the collection's skip-degenerate flag.</param>
/// <param name="scoreForDegenerateQueries">Value recorded as the collection's degenerate-query score.</param>
public QueryCollection(Query[] queries, bool skipDegenerateQueries, double scoreForDegenerateQueries)
{
    this.queries = queries;
    this.skipDegenerateQueries = skipDegenerateQueries;
    this.scoreForDegenerateQueries = scoreForDegenerateQueries;

    nQueries = queries.Length;

    int totalRows = 0;
    foreach (Query query in queries)
    {
        totalRows += query.Length;
    }
    nRows = totalRows;
}
/// <summary>
/// Clears the static mostRecent reference, so that the chain of previously
/// constructed instances is no longer reachable through it.
/// </summary>
internal static void Reset()
{
    mostRecent = null;
}
/// <summary>
/// Builds a query around caller-supplied label, feature and score arrays, taking the ID,
/// the degenerate-query score and the precomputed max DCG values from an existing query.
/// Because the max DCGs come from <paramref name="q"/>, it is safe to assume that the labels
/// have already been remapped, hence no labelForUnlabeled is needed.
/// </summary>
/// <param name="q">Query supplying QueryID, scoreForDegenerateQuery, maxNonTruncDCG and maxTruncDCG.</param>
/// <param name="labels">One label per row. Stored by reference; its length becomes the query length.</param>
/// <param name="ftrVectors">
/// One feature vector per row, or a zero-length array for a query that carries only scores.
/// Stored by reference.
/// </param>
/// <param name="scores">Score array. Stored by reference.</param>
/// <exception cref="Exception">ftrVectors is non-empty and its length differs from that of labels.</exception>
public Query(Query q, float[] labels, float[][] ftrVectors, double[] scores)
{
    // Allow for zero-sized feature vectors (for queries that contain only scores).
    if (ftrVectors.Length != 0 && labels.Length != ftrVectors.Length)
    {
        throw new Exception("Query constructor: size mismatch");
    }

    QueryID = q.QueryID;
    this.scoreForDegenerateQuery = q.scoreForDegenerateQuery;

    // Each max DCG is copied once (the previous code assigned both of them twice).
    this.maxNonTruncDCG = q.maxNonTruncDCG;
    this.maxTruncDCG = q.maxTruncDCG;

    this.labels = labels;
    this.ftrVectors = ftrVectors;
    this.scores = scores;
    length = labels.Length;
}
/// <summary>
/// Builds a QueryCollection with one Query per data group of labelFeatureDataCoded,
/// each created by CreateQuery from this object's labels and scores.
/// </summary>
/// <returns>
/// The new collection, constructed with skipDegenerateQueries = true and this object's
/// scoreForDegenerateQuery.
/// </returns>
private QueryCollection CreateQueryCollection()
{
    int groupCount = this.labelFeatureDataCoded.DataGroups.GroupCounts;
    Query[] built = new Query[groupCount];

    int g = 0;
    while (g < groupCount)
    {
        DataGroup group = this.labelFeatureDataCoded.DataGroups[g];
        built[g] = CreateQuery(group, this.labels, this.score, this.labelForUnlabeled, this.scoreForDegenerateQuery);
        g++;
    }

    // The skip-degenerate flag is always passed as true here.
    return new QueryCollection(built, true, this.scoreForDegenerateQuery);
}
/// <summary>
/// Builds the Query for one data group by slicing the group's rows
/// (indices iStart .. iStart + cSize - 1) out of the flat label and score arrays.
/// Also sets the static DCGScorer.truncLevel to this object's truncLevel before the
/// Query is constructed.
/// </summary>
/// <param name="queryGroup">Group whose id, iStart and cSize select the query's ID and rows.</param>
/// <param name="inLabels">Flat label array, indexed by absolute row index.</param>
/// <param name="inScores">Flat score array, indexed by absolute row index; values are widened to double.</param>
/// <param name="labelForUnlabeled">Forwarded to the Query constructor.</param>
/// <param name="scoreForDegenerateQuery">Forwarded to the Query constructor.</param>
/// <returns>The newly constructed Query, created with null feature vectors.</returns>
private Query CreateQuery(DataGroup queryGroup, float[] inLabels, float[] inScores, float labelForUnlabeled, double scoreForDegenerateQuery)
{
    string id = queryGroup.id.ToString();

    int rowCount = queryGroup.cSize;
    float[] groupLabels = new float[rowCount];
    double[] groupScores = new double[rowCount];

    // Walk the group with a zero-based offset; the absolute row index is first + k.
    int first = queryGroup.iStart;
    for (int k = 0; k < rowCount; k++)
    {
        groupLabels[k] = inLabels[first + k];
        groupScores[k] = inScores[first + k];
    }

    DCGScorer.truncLevel = this.truncLevel;
    return new Query(id, groupLabels, null, groupScores, labelForUnlabeled, scoreForDegenerateQuery);
}
/// <summary>
/// Evaluates query.AbsDeltaNDCG for a rank pair, after converting the pair's indices
/// to positions within the query by subtracting the group's iStart.
/// </summary>
/// <param name="rankPair">Pair whose IdxL and IdxH are the two row indices.</param>
/// <param name="queryGroup">Group whose iStart is subtracted from both indices.</param>
/// <param name="query">Query on which AbsDeltaNDCG is evaluated.</param>
/// <returns>The value returned by query.AbsDeltaNDCG for the two positions.</returns>
private float AbsDeltaNDCG(RankPair rankPair, DataGroup queryGroup, Query query)
{
    int lowPos = rankPair.IdxL - queryGroup.iStart;
    int highPos = rankPair.IdxH - queryGroup.iStart;

    float delta = query.AbsDeltaNDCG(lowPos, highPos);
    return delta;
}