예제 #1
0
        /// <summary>
        /// Draws a random subsample of <paramref name="nSamples"/> queries from this collection, without replacement.
        /// </summary>
        /// <remarks>
        /// Every query index is paired with a random key and the pairs are sorted by key; the first
        /// <paramref name="nSamples"/> indices of the resulting order are kept.  The returned collection
        /// references the same Query objects as this one (the queries themselves are not copied).
        /// </remarks>
        /// <param name="nSamples">Number of queries to draw; must be between 0 and the number of queries in this collection.</param>
        /// <param name="rangen">Random number generator forwarded to ArrayUtils.Random to produce the sort keys.</param>
        /// <returns>A new QueryCollection built with skipDegenerateQueries = true and a degenerate-query score of 0.0.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="nSamples"/> is negative.</exception>
        /// <exception cref="Exception"><paramref name="nSamples"/> exceeds the number of queries.</exception>
        public QueryCollection Sample(int nSamples, Random rangen)
        {
            // Reject a negative count up front instead of failing later at the array allocation,
            // after the whole shuffle has already been performed.
            if (nSamples < 0)
                throw new ArgumentOutOfRangeException("nSamples", "nSamples cannot be negative.");
            if (nSamples > nQueries)
                throw new Exception("nSamples cannot exceed total number of queries.");

            // presumably fills indices with 0, 1, 2, ... (start 0, step 1) - confirm against ArrayUtils.Range
            int[] indices = new int[nQueries];
            ArrayUtils.Range(indices, 0, 1);

            // presumably fills vals with random numbers drawn from rangen in the range 0..1 - confirm against ArrayUtils.Random
            float[] vals = new float[nQueries];
            ArrayUtils.Random(vals, 0, 1, rangen);

            // Sorting the keys reorders the indices along with them.
            Array.Sort(vals, indices);

            Query[] qs = new Query[nSamples];
            for (int i = 0; i < nSamples; ++i)
            {
                qs[i] = queries[indices[i]];
            }

            return new QueryCollection(qs, true, 0.0);
        }
예제 #2
0
        /// <summary>
        /// Builds one RankedItem per row of a query, pairing the scores held by <paramref name="query1"/> and
        /// <paramref name="query2"/>.  query1 and query2 must be the same query (but with different scores).
        /// The urls must be in the same order.  However they need not be sorted.
        /// </summary>
        /// <param name="query1">First copy of the query; supplies the length, ID, labels, max DCG and first score column.</param>
        /// <param name="query2">Second copy of the query; supplies the second score column.  Its length, ID and labels must match query1.</param>
        /// <param name="scorer">Not used by this method; kept so existing callers continue to compile.</param>
        /// <param name="ran">Random number generator forwarded to SortNJitter.</param>
        /// <returns>
        /// Null if query1.MaxNonTruncDCG is zero.  Otherwise the RankedItem array after it has been passed to
        /// SortNJitter (per the original documentation: sorted by the scores in query1).
        /// </returns>
        /// <exception cref="Exception">The two queries differ in length, ID, or in the label of any row.</exception>
        public static RankedItem[] FillRankedItems(Query query1, Query query2, DCGScorer scorer, Random ran)
        {
            if(query1.Length != query2.Length)
                throw new Exception("Query length mismatch.");
            if(query1.QueryID != query2.QueryID)
                throw new Exception("Queries have differnt IDs.");

            int length = query1.Length;
            double maxDCG = query1.MaxNonTruncDCG;
            if(maxDCG == 0.0)
                return null;

            RankedItem[] rankedItems = new RankedItem[length];
            double[] scores1 = query1.scores;
            double[] scores2 = query2.scores;
            for(int i = 0; i < length; ++i)
            {
                float label = query1.Labels[i];
                if(label != query2.Labels[i])
                    throw new Exception("FillRankedItems: label mismatch.");

                // The first argument is the label's entry in DCGScorer.scoresMap divided by the query's max DCG.
                // NOTE(review): the label is used directly as an index into scoresMap, so it is assumed to be a
                // valid non-negative index - confirm that unlabeled rows were remapped earlier.
                rankedItems[i] = new RankedItem((double)DCGScorer.scoresMap[(int)label] / maxDCG, scores1[i], scores2[i], label);
            }

            // rankedItems was allocated above and can never be null here, so no null check is needed.
            SortNJitter(rankedItems, ran);
            return rankedItems;
        }
예제 #3
0
        /// <summary>
        /// Builds a new collection that mirrors this one without carrying over feature data or scores.
        /// For every query, the labels go through ArrayUtils.Copy, the feature vectors are replaced by
        /// the result of ArrayUtils.FloatMatrix(0, 0) (presumably an empty matrix), and a fresh score
        /// array with one entry per row is allocated.
        /// </summary>
        /// <returns>A new QueryCollection created with this collection's degenerate-query settings.</returns>
        public QueryCollection CopyEmptyQueryCollection()
        {
            Query[] copies = new Query[nQueries];

            int idx = 0;
            while (idx < nQueries)
            {
                Query source = queries[idx];

                // Each copy gets its own (empty) feature matrix and its own zero-initialised score column.
                float[][] noFeatures = ArrayUtils.FloatMatrix(0, 0);
                double[] blankScores = new double[source.Length];
                float[] labelCopy = ArrayUtils.Copy(source.Labels);

                copies[idx] = new Query(source, labelCopy, noFeatures, blankScores);
                ++idx;
            }

            return new QueryCollection(copies, skipDegenerateQueries, scoreForDegenerateQueries);
        }
예제 #4
0
        /// <summary>
        /// Creates a copy of this collection in which each query's feature vectors, scores and labels
        /// have been passed through ArrayUtils.Copy (described as a hard copy by the original documentation).
        /// </summary>
        /// <param name="skipDegenerateQueries">Value of the skip-degenerate flag for the new collection.</param>
        /// <param name="scoreForDegenerateQueries">
        /// Degenerate-query score for the new collection.  This value was previously ignored in favour of
        /// this collection's own setting; it is now honoured.
        /// NOTE(review): each copied Query is built from its source Query, so any per-query degenerate
        /// score is presumably still inherited from the source - confirm against the Query constructor.
        /// </param>
        /// <returns>A new QueryCollection holding the copied queries.</returns>
        public QueryCollection CopyQueryCollection(bool skipDegenerateQueries, double scoreForDegenerateQueries)
        {
            Query[] newQueries = new Query[nQueries];
            for (int i = 0; i < nQueries; ++i)
            {
                Query q = queries[i];
                float[][] ftrVectors = ArrayUtils.Copy(q.FtrVectors);
                double[] scores = ArrayUtils.Copy(q.scores);
                float[] newLabels = ArrayUtils.Copy(q.Labels);
                newQueries[i] = new Query(q, newLabels, ftrVectors, scores);
            }

            // Use the caller-supplied score rather than this.scoreForDegenerateQueries,
            // otherwise the second parameter has no effect at all.
            return new QueryCollection(newQueries, skipDegenerateQueries, scoreForDegenerateQueries);
        }
예제 #5
0
 /// <summary>
 /// Builds a synthetic query collection for testing.  Each of the <paramref name="nQueries"/> queries is
 /// created by the randomised Query constructor and is given its zero-based position as its ID.
 /// According to the original documentation, the generated scores are correlated with the labels and the
 /// labels are distributed according to priors (priors[0] = P(Bad), etc.).
 /// </summary>
 /// <param name="nQueries">Number of queries to generate.</param>
 /// <param name="priors">Label priors, forwarded to each Query.</param>
 /// <param name="scale">Forwarded to each Query.</param>
 /// <param name="nScores">Forwarded to each Query.</param>
 /// <param name="nRowsPerQuery">Rows per generated query; the total row count is nQueries times this value.</param>
 /// <param name="rangen">Random number generator shared by all generated queries.</param>
 public QueryCollection(int nQueries, float[] priors, double scale, int nScores, int nRowsPerQuery, Random rangen)
 {
     this.nQueries = nQueries;
     this.nRows = nQueries * nRowsPerQuery;

     Query[] generated = new Query[nQueries];
     int position = 0;
     while (position < nQueries)
     {
         string id = position.ToString();
         generated[position] = new Query(priors, scale, nScores, nRowsPerQuery, id, rangen);
         position++;
     }

     this.queries = generated;
 }
예제 #6
0
        /// <summary>
        /// Builds a query from the QueryRow objects currently chained from QueryRow.mostRecent.
        /// Labels and feature vectors are taken from those rows; the score arrays and the rank array are
        /// allocated but left at their default values.  Afterwards the QueryRow chain is reset, this query
        /// is pushed onto the static chain of queries, and the max DCG values are filled in.
        /// </summary>
        /// <param name="QID">Identifier stored as the QueryID of this query.</param>
        /// <param name="scoreForDegenerateQuery">Stored on the query as its degenerate-query score.</param>
        public Query(string QID, double scoreForDegenerateQuery)
        {
            this.QueryID = QID;
            this.scoreForDegenerateQuery = scoreForDegenerateQuery;

            // First walk: count the pending rows so the arrays can be sized.
            int rowCount = 0;
            for (QueryRow row = QueryRow.mostRecent; row != null; row = row.previous)
            {
                rowCount++;
            }
            this.length = rowCount;

            ftrVectors = new float[rowCount][];
            scores = new double[rowCount];
            scoresCp = new double[rowCount];
            ranks = new int[rowCount];
            labels = new float[rowCount];

            // Second walk: the chain runs newest-to-oldest, so fill from the last slot backwards
            // to end up with the rows in their original order.
            int slot = rowCount;
            for (QueryRow row = QueryRow.mostRecent; row != null; row = row.previous)
            {
                --slot;
                labels[slot] = row.Label;
                ftrVectors[slot] = row.Features;
            }

            // The rows have been consumed; clear the chain ready for the next query.
            QueryRow.Reset();

            // Link this query in as the new head of the static query chain.
            previous = mostRecent;
            mostRecent = this;

            // Fill DCGs
            FillMaxDCGs(new DCGScorer());
        }
예제 #7
0
        /// <summary>
        /// Loads a query collection from a delimited text file whose first line is a header row.
        /// Consecutive data rows that share a query ID are grouped into one query.
        /// A degenerate query is one with all the labels the same.
        /// </summary>
        /// <param name="fname">Path of the file to read.</param>
        /// <param name="labelForUnlabeled">Passed to every QueryRow and to FixUnlabeledRows.</param>
        /// <param name="skipDegenerateQueries">If true, queries for which AllRowsSameLabel() is true are dropped after loading.</param>
        /// <param name="scoreForDegenerateQuery">Stored on the collection and passed to every Query that is created.</param>
        /// <exception cref="InvalidDataException">The file has no header row (it is empty).</exception>
        public QueryCollection(string fname, float labelForUnlabeled, bool skipDegenerateQueries, double scoreForDegenerateQuery)
        {
            this.skipDegenerateQueries = skipDegenerateQueries;
            this.scoreForDegenerateQueries = scoreForDegenerateQuery;

            using(StreamReader sr = new StreamReader(fname))
            {
                // ReadLine returns null on an empty file; fail with a clear message instead of a NullReferenceException.
                string headerLine = sr.ReadLine();
                if (headerLine == null)
                    throw new InvalidDataException("Query file '" + fname + "' is empty: no header row found.");

                string[] headers = headerLine.Split(QueryRowHeader.Separator);
                QueryRowHeader queryRowHeader = new QueryRowHeader(headers);

            #if USE_BM25_TO_BREAK_DEGENERACY
                degenBreak_idx = queryRowHeader.DegenBreak_idx;
            #endif

                nRows = 0;
                string row = sr.ReadLine();
                string[] splitRow;
                string lastQID = null;
                string QID = null;
                Query.Reset();
                while(row != null)
                {
                    splitRow = row.Split(QueryRowHeader.Separator);
                    QID = splitRow[queryRowHeader.queryIDIdx];

                    // A change of query ID closes the previous query.  The Query constructor picks up the
                    // rows accumulated so far and links itself into the static chain, so the instance
                    // is deliberately not stored here.
                    if(QID != lastQID && nRows != 0)
                    {
                        new Query(lastQID, scoreForDegenerateQuery);
                        ++nQueries;
                    }

                    float label = ParseRatingLabel(splitRow[queryRowHeader.ratingIdx]);

                    // Convention is: if row has shorter length than number of headers, the missing values are all zero.
                    // The loop is bounded by the columns actually present in the row.
                    // NOTE(review): a row with more feature columns than FeatureCount would index past the
                    // end of ftrVector - confirm whether such rows can occur.
                    float[] ftrVector = new float[queryRowHeader.FeatureCount];
                    for (int j = 0, i = queryRowHeader.firstFtrIdx; i < splitRow.Length; ++i, ++j)
                    {
                        string val = splitRow[i];
                        ftrVector[j] = ( val == String.Empty ) ? 0.0F : float.Parse(val);
                    }

                    // The QueryRow constructor is used for its side effect; the next Query constructed reads the rows back.
                    new QueryRow(label, ftrVector, labelForUnlabeled);
                    lastQID = QID;
                    row = sr.ReadLine();
                    ++nRows;
                }

                // Close the final query.  Skipped when the file held no data rows, which would otherwise
                // create a zero-row query with a null ID.
                if (nRows != 0)
                {
                    new Query(QID, scoreForDegenerateQuery);
                    ++nQueries;
                }
            }

            // Finally construct the array, in the original order
            // (the static chain is newest-first, so fill from the back).
            queries = new Query[nQueries];
            Query ptr = Query.mostRecent;
            int ctr = nQueries-1;
            while(ptr != null)
            {
                queries[ctr--] = ptr;
                ptr = ptr.previous;
            }

            // NOTE(review): the original comment stated that there is no need to fix unlabeled rows because
            // QueryRow already does that, yet FixUnlabeledRows is still called; the call is kept as-is.
            // Any unlabeled rows must be fixed before degenerates are removed.
            FixUnlabeledRows(labelForUnlabeled);

            // For training, for efficiency, we might want to skip queries that have no pairs.
            if (skipDegenerateQueries)
            {
                // Compact the non-degenerate queries to the front of the array, preserving order.
                int nonDegCtr = 0;
                int newNRows = 0;
                for (int i = 0; i < nQueries; ++i)
                {
                    Query q = queries[i];
                    if (!q.AllRowsSameLabel())
                    {
                        queries[nonDegCtr++] = q;
                        newNRows += q.Length;
                    }
                }
                if (nonDegCtr < nQueries)
                {
                    // Shrink the array to the surviving queries and update the counts.
                    Query[] newQueries = new Query[nonDegCtr];
                    Array.Copy(queries, newQueries, nonDegCtr);
                    nRows = newNRows;
                    nQueries = nonDegCtr;
                    queries = newQueries;
                }
            }

            Query.Reset(); // Prepare for next load
        }

        /// <summary>
        /// Maps the text of a rating column to its numeric label.  Known rating names map to fixed values;
        /// any other text is parsed as a float, and text that cannot be parsed yields 0 after a console warning.
        /// </summary>
        /// <param name="rating">Raw rating text taken from the data row.</param>
        /// <returns>The numeric label for the rating.</returns>
        private static float ParseRatingLabel(string rating)
        {
            switch(rating)
            {
                case "Definitive":
                case "Perfect":
                    return 4;
                case "Excellent":
                    return 3;
                case "Good":
                    return 2;
                case "Fair":
                    return 1;
                case "Bad":
                case "Detrimental":
                    return 0;
                case "Unknown":
                case "": // Unlabeled (in RatedNRandom)
                    return -1;
                case "HighlyRelevant": // ImageSearch rating names, per the original comment
                    return 2;
                case "Relevant":
                    return 1;
                case "NotRelevant":
                case "Unjudged":
                    return 0;
                default:
                    break;
            }

            // Not a known rating name: treat it as a number.  TryParse avoids using an exception for an expected failure.
            float parsed;
            if (!float.TryParse(rating, out parsed))
            {
                Console.WriteLine("Unable to parse rating " + rating + " into an float. Using 0");
                parsed = 0;
            }
            return parsed;
        }
예제 #8
0
 /// <summary>
 /// Wraps an existing array of queries in a collection.  The array is stored by reference (not copied)
 /// and the total row count is the sum of the lengths of its queries.
 /// The skipDegenerateQueries flag is only recorded; this constructor does not filter any queries.
 /// </summary>
 /// <param name="queries">Queries that make up the collection.</param>
 /// <param name="skipDegenerateQueries">Value stored as the collection's skip-degenerate flag.</param>
 /// <param name="scoreForDegenerateQueries">Value stored as the collection's degenerate-query score.</param>
 public QueryCollection(Query[] queries, bool skipDegenerateQueries, double scoreForDegenerateQueries)
 {
     this.skipDegenerateQueries = skipDegenerateQueries;
     this.scoreForDegenerateQueries = scoreForDegenerateQueries;
     this.queries = queries;
     this.nQueries = queries.Length;

     int totalRows = 0;
     foreach (Query query in queries)
     {
         totalRows += query.Length;
     }
     this.nRows = totalRows;
 }
예제 #9
0
 /// <summary>
 /// Clears the static mostRecent reference by setting it to null.
 /// </summary>
 /// <remarks>
 /// NOTE(review): mostRecent looks like the head of a chain of instances linked through "previous",
 /// so this presumably discards the accumulated chain before the next load - confirm against the enclosing class.
 /// </remarks>
 internal static void Reset()
 {
     mostRecent = null;
 }
예제 #10
0
 /// <summary>
 /// Creates a query that takes its ID, degenerate-query score and max DCG values from an existing query
 /// and uses the supplied label, feature-vector and score arrays.  The arrays are stored by reference, not copied.
 /// </summary>
 /// <remarks>
 /// Per the original documentation: because the max DCG values are taken over from an existing query, it is
 /// safe to assume that the labels have already been remapped, hence labelForUnlabeled is not needed.
 /// </remarks>
 /// <param name="q">Query supplying the ID, degenerate-query score, maxNonTruncDCG and maxTruncDCG.</param>
 /// <param name="labels">Labels of the new query; their count becomes the query length.</param>
 /// <param name="ftrVectors">Feature vectors; either zero-length or the same length as <paramref name="labels"/>.</param>
 /// <param name="scores">Scores of the new query.</param>
 /// <exception cref="Exception"><paramref name="ftrVectors"/> is non-empty and its length differs from that of <paramref name="labels"/>.</exception>
 public Query(Query q, float[] labels, float[][] ftrVectors, double[] scores)
 {
     if (ftrVectors.Length != 0 && labels.Length != ftrVectors.Length) // Allow for zero-sized feature vectors (for queries that contain only scores)
         throw new Exception("Query constructor: size mismatch");

     QueryID = q.QueryID;
     this.scoreForDegenerateQuery = q.scoreForDegenerateQuery;

     // The max DCG values are taken over from q (each assigned once; the original assigned them twice).
     this.maxNonTruncDCG = q.maxNonTruncDCG;
     this.maxTruncDCG = q.maxTruncDCG;

     this.labels = labels;
     this.ftrVectors = ftrVectors;
     this.scores = scores;
     length = labels.Length;
 }
예제 #11
0
        /// <summary>
        /// Builds a QueryCollection containing one Query per data group of labelFeatureDataCoded.DataGroups,
        /// each created by CreateQuery from this object's labels, score, labelForUnlabeled and
        /// scoreForDegenerateQuery members.
        /// </summary>
        /// <returns>A new QueryCollection constructed with skipDegenerateQueries set to true.</returns>
        private QueryCollection CreateQueryCollection()
        {
            int groupCount = this.labelFeatureDataCoded.DataGroups.GroupCounts;
            Query[] built = new Query[groupCount];

            int groupIdx = 0;
            while (groupIdx < groupCount)
            {
                DataGroup group = this.labelFeatureDataCoded.DataGroups[groupIdx];
                built[groupIdx] = CreateQuery(group, this.labels, this.score, this.labelForUnlabeled, this.scoreForDegenerateQuery);
                groupIdx++;
            }

            return new QueryCollection(built, true, this.scoreForDegenerateQuery);
        }
예제 #12
0
        /// <summary>
        /// Builds the Query for one data group by copying that group's slice of the label and score arrays
        /// (cSize entries starting at iStart).  Scores are widened from float to double.
        /// </summary>
        /// <param name="queryGroup">Group supplying the id, start index (iStart) and size (cSize).</param>
        /// <param name="inLabels">Label array indexed by the group's start index and onwards.</param>
        /// <param name="inScores">Score array indexed the same way as <paramref name="inLabels"/>.</param>
        /// <param name="labelForUnlabeled">Forwarded to the Query constructor.</param>
        /// <param name="scoreForDegenerateQuery">Forwarded to the Query constructor.</param>
        /// <returns>The newly constructed Query, created with null feature vectors.</returns>
        private Query CreateQuery(DataGroup queryGroup, float[] inLabels, float[] inScores,
                                  float labelForUnlabeled, double scoreForDegenerateQuery)
        {
            string id = queryGroup.id.ToString();
            float[] groupLabels = new float[queryGroup.cSize];
            double[] groupScores = new double[queryGroup.cSize];

            int first = queryGroup.iStart;
            int stop = first + queryGroup.cSize;
            for (int src = first, dst = 0; src < stop; ++src, ++dst)
            {
                groupLabels[dst] = inLabels[src];
                groupScores[dst] = inScores[src];
            }

            // The static truncation level is set before the Query is constructed.
            DCGScorer.truncLevel = this.truncLevel;
            return new Query(id, groupLabels, null, groupScores, labelForUnlabeled, scoreForDegenerateQuery);
        }
예제 #13
0
 /// <summary>
 /// Subtracts the group's start index from both indices of the pair and passes the results to
 /// query.AbsDeltaNDCG.  (Presumably this converts data-set row indices into positions within the query.)
 /// </summary>
 /// <param name="rankPair">Pair supplying IdxL and IdxH.</param>
 /// <param name="queryGroup">Group whose iStart is subtracted from both indices.</param>
 /// <param name="query">Query that evaluates the pair.</param>
 /// <returns>The value returned by query.AbsDeltaNDCG for the two shifted indices.</returns>
 private float AbsDeltaNDCG(RankPair rankPair, DataGroup queryGroup, Query query)
 {
     int lowPos = rankPair.IdxL - queryGroup.iStart;
     int highPos = rankPair.IdxH - queryGroup.iStart;

     float delta = query.AbsDeltaNDCG(lowPos, highPos);
     return delta;
 }