Example #1
        /// <summary>
        /// Retrieve the specified data partition as actual data indices
        /// </summary>
        /// <param name="pType">the type of data partition to retrieve</param>
        /// <param name="subSamplePercent">the fraction of the specified partition's groups to return</param>
        /// <param name="r">random number generator used for subsampling; if null, the first groups are taken deterministically</param>
        /// <returns>a DataSet over the corresponding data indices</returns>
        public DataSet GetDataPartition(DataPartitionType pType, float subSamplePercent, Random r)
        {
            DataSet dataSet = null;
            if (groupPartitionIndexTbl != null && pType < DataPartitionType.cTypes)
            {                
                int[] groupIndex = this.groupPartitionIndexTbl[(int)pType];

                // Number of groups to keep after subsampling
                int cGroups = groupIndex.Length;
                cGroups = (int)((float)cGroups * subSamplePercent);

                int[] sampleGroupIndex = null;
                // Draw a random subsample of the groups if a generator is provided; otherwise take the first cGroups groups
                if (r != null)
                {
                    sampleGroupIndex = Vector.RandomSample(groupIndex.Length, cGroups, r);                    
                }
                else
                {
                    sampleGroupIndex = Vector.IndexArray(cGroups);                    
                }
                // Map the sampled positions back to the actual group indices of this partition
                for (int i = 0; i < cGroups; i++)
                {
                    sampleGroupIndex[i] = groupIndex[sampleGroupIndex[i]];
                }
                dataSet = new DataSet(this, sampleGroupIndex);                
            }

            return dataSet;
        }
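
A minimal usage sketch for the method above. The receiver object, the variable names, and the DataPartitionType.Train member are assumptions for illustration; only GetDataPartition and its signature come from the example.

        // Hypothetical caller: draw a random 50% subsample of the training partition's groups.
        Random rand = new Random(17);
        DataSet trainSubset = data.GetDataPartition(DataPartitionType.Train, 0.5F, rand);

        // Passing null for the Random argument takes the first 50% of the groups instead of a random sample.
        DataSet headSubset = data.GetDataPartition(DataPartitionType.Train, 0.5F, null);
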
Example #2
        /// <summary>
        /// Reducing the training data used to build a boosted tree
        /// </summary>
        /// <param name="inDataSet">The current work training data set</param>
        /// <param name="k">The index of the tree to be built</param>
        /// <param name="m">The iteration of the current boosting process</param>
        /// <returns>The data index used to build the boosted tree after triming</returns>
        public int[] TrimIndex(DataSet inDataSet, int k, int m)
        {
            #if !DoTrimIndex
            return inDataSet.DataIndex;
            #else
            const float maxTrimRate = 0.8F;
            const int minNonTrimIter = 30;

            float trimQuantile = 0.10F; //Make it scalable to any number of classes
            //float[] trimQuantile = { 0.10F, 0.10F, 0.10F, 0.10F, 0.10F };

            // Weight trimming discards the portion of samples in the lower quantile with respect to their weights.
            // I find this not only speeds up the computation but also yields better results.
            int[] trimIndex;
            int[] index = inDataSet.DataIndex;
            if (m < minNonTrimIter)
            {
                trimIndex = index;
            }
            else
            {
                float[] weightsL = new float[index.Length];
                float sumWeights = 0;

                for (int i = 0; i < index.Length; i++)
                {
                    weightsL[i] = this.weights[index[i]];
                    sumWeights += weightsL[i];
                }

                int[] weightIndex = Vector.IndexArray(index.Length);
                Array.Sort(weightsL, weightIndex);

                int trimLen = 0;
                float partialSumWeights = 0;
                float targetSumWeights = trimQuantile * sumWeights - float.Epsilon;
                while (partialSumWeights < targetSumWeights && trimLen < index.Length * maxTrimRate)
                    partialSumWeights += weightsL[trimLen++];

                trimIndex = new int[index.Length - trimLen];

                for (int i = 0; i < trimIndex.Length; i++)
                    trimIndex[i] = index[weightIndex[i + trimLen]];

                // We find empirically that sorting the indexes actually speeds up accessing the data matrix
                Array.Sort(trimIndex);

                if (m >= minNonTrimIter)
                    Console.WriteLine("\t" + k.ToString() + "\t" + (1.0 * trimIndex.Length / index.Length).ToString());
            }
            return trimIndex;
            #endif //!DoTrimIndex
        }
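
The trimming branch above leans on the library's Vector helpers. Below is a self-contained sketch of the same quantile logic on plain arrays, using Enumerable.Range and LINQ in place of Vector.IndexArray; TrimSketch and TrimByWeight are hypothetical names, and the float.Epsilon guard is omitted for brevity.

        using System;
        using System.Linq;

        static class TrimSketch
        {
            public static int[] TrimByWeight(int[] index, float[] weights, float trimQuantile, float maxTrimRate)
            {
                // Gather the weights of the active samples and sort a position array by weight (ascending).
                float[] w = index.Select(i => weights[i]).ToArray();
                int[] order = Enumerable.Range(0, index.Length).ToArray();
                Array.Sort(w, order);

                // Drop the lowest-weight samples until their mass reaches trimQuantile of the total weight,
                // but never drop more than maxTrimRate of the samples.
                float target = trimQuantile * w.Sum();
                int trimLen = 0;
                float partial = 0;
                while (partial < target && trimLen < index.Length * maxTrimRate)
                    partial += w[trimLen++];

                // Keep the remaining samples, mapped back to their original data indices.
                int[] kept = order.Skip(trimLen).Select(pos => index[pos]).ToArray();
                Array.Sort(kept); // sorted indices speed up access to the data matrix
                return kept;
            }
        }
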
Example #3
        /// <summary>
        /// Compute the point-wise pseudo-response of the loss function to be optimized
        /// It is used to build the decision tree - except for computing the response value of a leaf node
        /// </summary>    
        /// <param name="dataSet">all training data</param>
        public void ComputePseudoResponse(DataSet dataSet)
        {
            // Reset/(zero out) pseudoResponse and weights for a new iteration
            ResetParameters();

            for (int qIdx = 0; qIdx < dataSet.NumGroups; qIdx++)
            {
                DataGroup queryGroup = dataSet.GetDataGroup(qIdx);
                RankPairGenerator rankPairs = new RankPairGenerator(queryGroup, this.labels);
                Query query = this.qcAccum.queries[dataSet.GroupIndex[qIdx]];
                query.UpdateScores(this.score, queryGroup.iStart);
                query.ComputeRank();
                foreach (RankPair rankPair in rankPairs)
                {
                    float scoreH_minus_scoreL = this.score[rankPair.IdxH] - this.score[rankPair.IdxL];
                    //compute the cross-entropy gradient of the pair
                    float gradient = RankPair.CrossEntropyDerivative(scoreH_minus_scoreL);
                    //compute the absolute change in NDCG if we swap the pair in the current ordering
                    float absDeltaPosition = AbsDeltaPosition(rankPair, queryGroup, query);

                    // Marginalize the pair-wise gradient to get the point-wise gradient.  The point with the higher
                    // relevance label (IdxH) always gets a positive push (i.e. upwards).
                    this.pseudoResponses[rankPair.IdxH] += gradient * absDeltaPosition;
                    this.pseudoResponses[rankPair.IdxL] -= gradient * absDeltaPosition;

                    // Note that the weights are automatically always positive
                    float weight = absDeltaPosition * RankPair.CrossEntropy2ndDerivative(scoreH_minus_scoreL);
                    this.weights[rankPair.IdxH] += weight;
                    this.weights[rankPair.IdxL] += weight;
                }
            }

            for (int i = 0; i < dataSet.NumSamples; i++)
            {
                int dataIdx = dataSet.DataIndex[i];
                //incorporating the gradient of the label
                this.pseudoResponses[dataIdx] = (1 - this.labelWeight) * this.pseudoResponses[dataIdx] + this.labelWeight * (this.labels[dataIdx] - this.score[dataIdx]);
                this.weights[dataIdx] = (1 - this.labelWeight) * this.weights[dataIdx] + this.labelWeight * 1;
            }
        }
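
The loop above treats RankPair.CrossEntropyDerivative and RankPair.CrossEntropy2ndDerivative as black boxes. The sketch below shows one standard form of these derivatives (the LambdaRank form for the pairwise loss log(1 + exp(-(sH - sL)))), consistent with the comment that IdxH always receives a positive push; the actual RankPair implementation may use a different scaling, and CrossEntropySketch is a hypothetical name.

        using System;

        static class CrossEntropySketch
        {
            // Negative first derivative of the pairwise loss with respect to the higher score:
            // 1 / (1 + exp(sH - sL)), always in (0, 1), so the higher-labeled point is pushed up.
            public static float Derivative(float scoreH_minus_scoreL)
            {
                return 1.0F / (1.0F + (float)Math.Exp(scoreH_minus_scoreL));
            }

            // Second derivative: rho * (1 - rho) with rho = 1 / (1 + exp(sH - sL)); always positive,
            // which is why the accumulated weights above are automatically positive.
            public static float SecondDerivative(float scoreH_minus_scoreL)
            {
                float rho = 1.0F / (1.0F + (float)Math.Exp(scoreH_minus_scoreL));
                return rho * (1.0F - rho);
            }
        }
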
Example #4
 /// <summary>
 /// Compute the point-wise pseudo-response of the loss function to be optimized
 /// It is used to build the decision tree - except for computing the response value of a leaf node
 /// </summary>
 /// <param name="dataSet">all training data</param>
 public void ComputePseudoResponse(DataSet dataSet)
 {
     for (int k = 0; k < this.numClass; k++)
     {
         for (int j = 0; j < dataSet.NumSamples; j++)
         {
             int i = dataSet.DataIndex[j];
             this.pseudoResponses[k][i] = this.classLabelsMatrix[k][i] - this.classProb[k][i];
             // qiangwu: pingli has assured me that the following are equivalent:
             // qiangwu: weights[i] = abs(responses[k][i]) * (1 - abs(responses[k][i]));
             // qiangwu: which is shown in the paper - Algorithm 6: Lk-TreeBoost
             this.weights[k][i] = this.classProb[k][i] * (1 - this.classProb[k][i]);
         }
     }
 }
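
Example #4 assumes classProb already holds per-class probabilities. Below is a hedged sketch of how those probabilities could be derived from per-class model scores with a softmax, as in Friedman's multi-class (Lk) TreeBoost; the names classScores, classProb and SoftmaxSketch are assumptions, not part of the example.

 using System;

 static class SoftmaxSketch
 {
     // Fill classProb[k][i] with softmax(classScores[.][i]) for every sample i.
     public static void UpdateClassProb(float[][] classScores, float[][] classProb, int numClass, int numSamples)
     {
         for (int i = 0; i < numSamples; i++)
         {
             // Subtract the per-sample maximum score before exponentiating, for numerical stability.
             float max = float.MinValue;
             for (int k = 0; k < numClass; k++)
                 if (classScores[k][i] > max) max = classScores[k][i];

             float sum = 0;
             for (int k = 0; k < numClass; k++)
                 sum += (float)Math.Exp(classScores[k][i] - max);

             for (int k = 0; k < numClass; k++)
                 classProb[k][i] = (float)Math.Exp(classScores[k][i] - max) / sum;
         }
     }
 }
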
Example #5
        /// <summary>
        /// Reducing the training data used to build a boosted tree
        /// Influence trimming discards a small fraction (the lower quantile) of the samples.
        /// This was proposed previously as a heuristic for speeding up training.
        /// I happened to notice that trimming also helps achieve better NDCG.
        /// Intuitively, when the weight p*(1-p) is small, the sample is already classified pretty well, and
        /// further effort could be wasted. In any case, for ranking, we do not really need perfect classifications.
        /// </summary>
        /// <param name="inDataSet">The current work training data set</param>
        /// <param name="k">The index of the tree to be built</param>
        /// <param name="m">The iteration of the current boosting process</param>
        /// <returns>The data index used to build the boosted tree after triming</returns>
        public int[] TrimIndex(DataSet inDataSet, int k, int m)
        {
            const float maxTrimRate = 0.8F;  // Keep at least 1 - 0.8 = 20% of the samples after trimming
            const int minNonTrimIter = 30;   // Only perform weight trimming after 30 iterations

            // weight trimming plays a beneficial role for NDCG.
            // For now, it is probably safe to use 0.10F-0.20F.

            float trimQuantile = 0.10F; //Make it scalable to any number of classes

            //float[] trimQuantile = { 0.10F, 0.10F, 0.10F, 0.10F, 0.10F };
            //private float[] trimQuantile = { 0.15F, 0.15F, 0.15F, 0.15F, 0.15F};
            //private float[] trimQuantile = { 0.2F, 0.2F, 0.2F, 0.2F, 0.2F };
            //private float[] trimQuantile = { 0.25F, 0.25F, 0.25F, 0.25F, 0.25F };
            //private float[] trimQuantile = { 0.3F, 0.3F, 0.3F, 0.3F, 0.3F };
            //private float[] trimQuantile = { 0.25F, 0.20F, 0.15F, 0.10F, 0.05F };
            //private float[] trimQuantile = { 0.30F, 0.25F, 0.20F, 0.15F, 0.10F };
            //private float[] trimQuantile = { 0.35F, 0.30F, 0.25F, 0.20F, 0.15F };
            //private float[] trimQuantile = { 0.40F, 0.35F, 0.30F, 0.25F, 0.20F };
            //private float[] trimQuantile = { 0.45F, 0.40F, 0.35F, 0.30F, 0.25F };

            // Weight trimming discards the portion of samples in the lower quantile with respect to p*(1-p).
            // I find this not only speeds up the computation but also yields better results.
            int[] trimIndex;
            int[] index = inDataSet.DataIndex;
            if (m < minNonTrimIter || (this.numClass == 2 && k == 1))
            {
                trimIndex = index;
            }
            else
            {
                float[] weightsL = new float[index.Length];
                float sumWeights = 0;

                for (int i = 0; i < index.Length; i++)
                {
                    weightsL[i] = this.weights[k][index[i]];
                    sumWeights += weightsL[i];
                }

                int[] weightIndex = Vector.IndexArray(index.Length);
                Array.Sort(weightsL, weightIndex);

                int trimLen = 0;
                float partialSumWeights = 0;
                float targetSumWeights = trimQuantile * sumWeights - float.Epsilon;
                while (partialSumWeights < targetSumWeights && trimLen < index.Length * maxTrimRate)
                    partialSumWeights += weightsL[trimLen++];

                trimIndex = new int[index.Length - trimLen];

                for (int i = 0; i < trimIndex.Length; i++)
                    trimIndex[i] = index[weightIndex[i + trimLen]];

                // We find empirically that sorting the indexes actually speeds up accessing the data matrix
                Array.Sort(trimIndex);

                if (m >= minNonTrimIter)
                    Console.WriteLine("\t" + k.ToString() + "\t" + (1.0 * trimIndex.Length / index.Length).ToString());
            }
            return trimIndex;
        }
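
A heavily hedged sketch of where this TrimIndex might sit in the surrounding boosting loop: one tree per class per iteration, each built only on the indices that survive trimming. Everything except TrimIndex itself (boostTreeLoss, workDataSet, BuildTree, RegressionTree, numIter, numClass) is a hypothetical placeholder.

        for (int m = 0; m < numIter; m++)
        {
            for (int k = 0; k < numClass; k++)
            {
                // Trim low-weight samples for class k at iteration m, then fit the next tree on what remains.
                int[] activeIndex = boostTreeLoss.TrimIndex(workDataSet, k, m);
                RegressionTree tree = BuildTree(workDataSet, activeIndex, k);
                // ... update per-class scores/probabilities with the new tree ...
            }
        }
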
Example #6
 /// <summary>
 /// Reducing the training data used to build a boosted tree
 /// </summary>
 /// <param name="inDataSet">The current work training data set</param>
 /// <param name="k">The index of the tree to be built</param>
 /// <param name="m">The iteration of the current boosting process</param>
 /// <returns>The data index used to build the boosted tree after triming</returns>
 public int[] TrimIndex(DataSet inDataSet, int k, int m)
 {
     return inDataSet.DataIndex;
 }
Example #7
 /// <summary>
 /// Compute the point-wise pseudo-response of the loss function to be optimized
 /// It is used to build the decision tree - except for computing the response value of a leaf node
 /// </summary>       
 /// <param name="dataSet">all training data</param>
 public abstract void ComputePseudoResponse(DataSet dataSet);
Example #8
        public override void ComputePseudoResponse(DataSet dataSet)
        {
            this.ResetParameters();

            for (int j = 0; j < dataSet.NumSamples; j++)
            {
                int iDoc = dataSet.DataIndex[j];
                // Residual (label - current function value): the negative gradient of the squared-error loss
                this.pseudoResponses[iDoc] = this.labels[iDoc] - this.funValues[iDoc];
            }
        }
Example #9
        public override void ComputePseudoResponse(DataSet dataSet)
        {
            this.ResetParameters();

            for (int j = 0; j < dataSet.NumSamples; j++)
            {
                int iDoc = dataSet.DataIndex[j];
                // Careful: set the gradient at zero to zero.  Otherwise we're asking the tree to correct for something that's not an error.
                float delta = this.labels[iDoc] - this.funValues[iDoc];
                if (delta > 0)
                    this.pseudoResponses[iDoc] = 1;
                else if (delta < 0)
                    this.pseudoResponses[iDoc] = -1;
                else
                    this.pseudoResponses[iDoc] = 0;
            }
        }
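
For reference, a small sketch relating the two overrides above to the losses they appear to optimize (with funValues playing the role of the current model score f). The helper names are hypothetical; only the gradient formulas themselves are standard:
  squared error  L = 0.5 * (label - f)^2   =>  -dL/df = label - f           (Example #8)
  absolute error L = |label - f|           =>  -dL/df = sign(label - f)     (Example #9)

        static float SquaredErrorResponse(float label, float f)  { return label - f; }
        static float AbsoluteErrorResponse(float label, float f) { return System.Math.Sign(label - f); }
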
Example #10
        /// <summary>
        /// Compute the point-wise pseudo-response of the loss function to be optimized
        /// It is used to build the decision tree - except for computing the response value of a leaf node
        /// </summary>        
        /// <param name="dataSet">all training data</param>
        public void ComputePseudoResponse(DataSet dataSet)
        {
            ResetParameters();
            for (int qIdx = 0; qIdx < dataSet.NumGroups; qIdx++)
            {
                DataGroup query = dataSet.GetDataGroup(qIdx);
                RankPairGenerator rankPairs = new RankPairGenerator(query, this.labels);
                foreach (RankPair rankPair in rankPairs)
                {
                    float scoreH_minus_scoreL = this.score[rankPair.IdxH] - this.score[rankPair.IdxL];
                    float gradient = RankPair.CrossEntropyDerivative(scoreH_minus_scoreL);

                    this.pseudoResponses[rankPair.IdxH] += gradient;
                    this.pseudoResponses[rankPair.IdxL] -= gradient;

                    float weight = RankPair.CrossEntropy2ndDerivative(scoreH_minus_scoreL);

                    this.weights[rankPair.IdxH] += weight;
                    this.weights[rankPair.IdxL] += weight;

                }
                //this.labelFeatureData.PartitionData;
            }
        }