/// <summary>
/// Retrieve the data partition specified earlier as actual data indices
/// </summary>
/// <param name="pType">DataPartitionType</param>
/// <param name="subSamplePercent">the fraction (0..1) of the specified partition's groups to be returned</param>
/// <param name="r">random number generator used to subsample groups; if null, the first groups are taken in order</param>
/// <returns>Corresponding data indices</returns>
public DataSet GetDataPartition(DataPartitionType pType, float subSamplePercent, Random r)
{
    DataSet dataSet = null;
    if (groupPartitionIndexTbl != null && pType < DataPartitionType.cTypes)
    {
        int[] groupIndex = this.groupPartitionIndexTbl[(int)pType];
        int cGroups = groupIndex.Length;
        cGroups = (int)((float)cGroups * subSamplePercent);

        int[] sampleGroupIndex = null;
        if (r != null)
        {
            // Randomly sample cGroups positions out of the full group list
            sampleGroupIndex = Vector.RandomSample(groupIndex.Length, cGroups, r);
        }
        else
        {
            // Deterministic: take the first cGroups positions
            sampleGroupIndex = Vector.IndexArray(cGroups);
        }
        // Map sampled positions back to the original group indices
        for (int i = 0; i < cGroups; i++)
        {
            sampleGroupIndex[i] = groupIndex[sampleGroupIndex[i]];
        }

        dataSet = new DataSet(this, sampleGroupIndex);
    }
    return dataSet;
}
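// Hedged usage sketch (not from the source): how a caller might draw a bagged training subset
// each boosting iteration. The container name labelFeatureData, the enum member
// DataPartitionType.Train, and the 0.7F fraction are illustrative assumptions.
Random rand = new Random(12345);
// roughly 70% of the training groups, re-sampled with the supplied Random on each call
DataSet baggedTrainSet = labelFeatureData.GetDataPartition(DataPartitionType.Train, 0.7F, rand);
// deterministic: passing a null Random keeps the first groups of the partition in order
DataSet fullTrainSet = labelFeatureData.GetDataPartition(DataPartitionType.Train, 1.0F, null);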
/// <summary>
/// Reduce the training data used to build a boosted tree
/// </summary>
/// <param name="inDataSet">The current working training data set</param>
/// <param name="k">The index of the tree to be built</param>
/// <param name="m">The iteration of the current boosting process</param>
/// <returns>The data indices used to build the boosted tree after trimming</returns>
public int[] TrimIndex(DataSet inDataSet, int k, int m)
{
#if !DoTrimIndex
    return inDataSet.DataIndex;
#else
    const float maxTrimRate = 0.8F;
    const int minNonTrimIter = 30;
    float trimQuantile = 0.10F;
    //Make it scalable to any number of classes
    //float[] trimQuantile = { 0.10F, 0.10F, 0.10F, 0.10F, 0.10F };

    // Weight-trimming discards the samples in the lower quantile of the data with respect to their weights.
    // I find this not only speeds up the computations but also outputs better results.
    int[] trimIndex;
    int[] index = inDataSet.DataIndex;

    if (m < minNonTrimIter)
    {
        trimIndex = index;
    }
    else
    {
        float[] weightsL = new float[index.Length];
        float sumWeights = 0;
        for (int i = 0; i < index.Length; i++)
        {
            weightsL[i] = this.weights[index[i]];
            sumWeights += weightsL[i];
        }

        int[] weightIndex = Vector.IndexArray(index.Length);
        Array.Sort(weightsL, weightIndex);

        int trimLen = 0;
        float partialSumWeights = 0;
        float targetSumWeights = trimQuantile * sumWeights - float.Epsilon;
        while (partialSumWeights < targetSumWeights && trimLen < index.Length * maxTrimRate)
            partialSumWeights += weightsL[trimLen++];

        trimIndex = new int[index.Length - trimLen];
        for (int i = 0; i < trimIndex.Length; i++)
            trimIndex[i] = index[weightIndex[i + trimLen]];

        // We find empirically that sorting the indices actually speeds up accessing the data matrix
        Array.Sort(trimIndex);

        if (m >= minNonTrimIter)
            Console.WriteLine("\t" + k.ToString() + "\t" + (1.0 * trimIndex.Length / index.Length).ToString());
    }
    return trimIndex;
#endif //!DoTrimIndex
}
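// Build note (an assumption about the intended setup, not stated in the source): because of the
// #if !DoTrimIndex guard above, trimming is compiled in only when the DoTrimIndex symbol is defined,
// for example with
//     #define DoTrimIndex
// at the very top of this source file, or project-wide via
//     <DefineConstants>DoTrimIndex</DefineConstants>
// in the .csproj; otherwise the method simply returns the untrimmed DataIndex.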
/// <summary>
/// Compute the point-wise pseudo-response of the loss function to be optimized.
/// It is used to build the decision tree - apart from computing the response value of a leaf node.
/// </summary>
/// <param name="dataSet">all training data</param>
public void ComputePseudoResponse(DataSet dataSet)
{
    // Reset (zero out) pseudoResponses and weights for a new iteration
    ResetParameters();

    for (int qIdx = 0; qIdx < dataSet.NumGroups; qIdx++)
    {
        DataGroup queryGroup = dataSet.GetDataGroup(qIdx);
        RankPairGenerator rankPairs = new RankPairGenerator(queryGroup, this.labels);
        Query query = this.qcAccum.queries[dataSet.GroupIndex[qIdx]];
        query.UpdateScores(this.score, queryGroup.iStart);
        query.ComputeRank();

        foreach (RankPair rankPair in rankPairs)
        {
            float scoreH_minus_scoreL = this.score[rankPair.IdxH] - this.score[rankPair.IdxL];
            // Compute the cross-entropy gradient of the pair
            float gradient = RankPair.CrossEntropyDerivative(scoreH_minus_scoreL);
            // Compute the absolute change in NDCG if we swap the pair in the current ordering
            float absDeltaPosition = AbsDeltaPosition(rankPair, queryGroup, query);

            // Marginalize the pair-wise gradient to get a point-wise gradient. The point with the higher
            // relevance label (IdxH) always gets a positive push (i.e. upwards).
            this.pseudoResponses[rankPair.IdxH] += gradient * absDeltaPosition;
            this.pseudoResponses[rankPair.IdxL] -= gradient * absDeltaPosition;

            // Note that the weights are automatically always positive
            float weight = absDeltaPosition * RankPair.CrossEntropy2ndDerivative(scoreH_minus_scoreL);
            this.weights[rankPair.IdxH] += weight;
            this.weights[rankPair.IdxL] += weight;
        }
    }

    for (int i = 0; i < dataSet.NumSamples; i++)
    {
        int dataIdx = dataSet.DataIndex[i];
        // Incorporate the gradient of the label
        this.pseudoResponses[dataIdx] = (1 - this.labelWeight) * this.pseudoResponses[dataIdx]
                                        + this.labelWeight * (this.labels[dataIdx] - this.score[dataIdx]);
        this.weights[dataIdx] = (1 - this.labelWeight) * this.weights[dataIdx] + this.labelWeight * 1;
    }
}
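// Hedged sketch (an assumption inferred from the call sites above, not the actual RankPair source):
// for the pairwise logistic loss log(1 + exp(-(scoreH - scoreL))), CrossEntropyDerivative(x)
// presumably returns the magnitude of the negative gradient, 1 / (1 + exp(x)), and
// CrossEntropy2ndDerivative(x) its curvature, exp(x) / (1 + exp(x))^2, which is why both
// quantities are non-negative and shrink as the pair becomes correctly ordered.
static float CrossEntropyDerivative(float scoreH_minus_scoreL)
{
    return (float)(1.0 / (1.0 + Math.Exp(scoreH_minus_scoreL)));
}

static float CrossEntropy2ndDerivative(float scoreH_minus_scoreL)
{
    double e = Math.Exp(scoreH_minus_scoreL);
    return (float)(e / ((1.0 + e) * (1.0 + e)));
}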
/// <summary>
/// Compute the point-wise pseudo-response of the loss function to be optimized.
/// It is used to build the decision tree - apart from computing the response value of a leaf node.
/// </summary>
/// <param name="dataSet">all training data</param>
public void ComputePseudoResponse(DataSet dataSet)
{
    for (int k = 0; k < this.numClass; k++)
    {
        for (int j = 0; j < dataSet.NumSamples; j++)
        {
            int i = dataSet.DataIndex[j];
            this.pseudoResponses[k][i] = this.classLabelsMatrix[k][i] - this.classProb[k][i];
            // qiangwu: pingli has assured me that the following are equivalent:
            // qiangwu: weights[i] = abs(responses[k][i]) * (1 - abs(responses[k][i]));
            // qiangwu: which is shown in the paper - Algorithm 6: L_K-TreeBoost
            this.weights[k][i] = this.classProb[k][i] * (1 - this.classProb[k][i]);
        }
    }
}
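// Hedged sketch (an assumption, not from the source): the classProb[k][i] consumed above is
// presumably the softmax of the per-class model scores, as in Friedman's multi-class
// L_K-TreeBoost. classScore is a hypothetical [numClass][numSamples] array standing in for
// whatever per-class function values this class actually stores.
void UpdateClassProb(float[][] classScore, float[][] classProb, int numClass, int numSamples)
{
    for (int i = 0; i < numSamples; i++)
    {
        // subtract the max score for numerical stability before exponentiating
        double maxScore = double.MinValue;
        for (int k = 0; k < numClass; k++)
            maxScore = Math.Max(maxScore, classScore[k][i]);

        double sum = 0;
        for (int k = 0; k < numClass; k++)
            sum += Math.Exp(classScore[k][i] - maxScore);

        for (int k = 0; k < numClass; k++)
            classProb[k][i] = (float)(Math.Exp(classScore[k][i] - maxScore) / sum);
    }
}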
/// <summary>
/// Reduce the training data used to build a boosted tree.
/// Influence trimming discards a small fraction (the lower quantile) of the samples.
/// This was proposed previously as a heuristic for speeding up training.
/// I happen to notice that trimming also helps produce better NDCG.
/// Intuitively, when the weight p*(1-p) is small, the sample is already classified pretty well, and
/// further effort could be wasted. In any case, for ranking we do not really need perfect classifications.
/// </summary>
/// <param name="inDataSet">The current working training data set</param>
/// <param name="k">The index of the tree to be built</param>
/// <param name="m">The iteration of the current boosting process</param>
/// <returns>The data indices used to build the boosted tree after trimming</returns>
public int[] TrimIndex(DataSet inDataSet, int k, int m)
{
    const float maxTrimRate = 0.8F;   // keep at least a (1 - 0.8) fraction of the samples after trimming
    const int minNonTrimIter = 30;    // only perform weight trimming after 30 iterations

    // Weight trimming plays a beneficial role for NDCG.
    // For now, it is probably safe to use 0.10F-0.20F.
    float trimQuantile = 0.10F;
    //Make it scalable to any number of classes
    //float[] trimQuantile = { 0.10F, 0.10F, 0.10F, 0.10F, 0.10F };
    //private float[] trimQuantile = { 0.15F, 0.15F, 0.15F, 0.15F, 0.15F };
    //private float[] trimQuantile = { 0.2F, 0.2F, 0.2F, 0.2F, 0.2F };
    //private float[] trimQuantile = { 0.25F, 0.25F, 0.25F, 0.25F, 0.25F };
    //private float[] trimQuantile = { 0.3F, 0.3F, 0.3F, 0.3F, 0.3F };
    //private float[] trimQuantile = { 0.25F, 0.20F, 0.15F, 0.10F, 0.05F };
    //private float[] trimQuantile = { 0.30F, 0.25F, 0.20F, 0.15F, 0.10F };
    //private float[] trimQuantile = { 0.35F, 0.30F, 0.25F, 0.20F, 0.15F };
    //private float[] trimQuantile = { 0.40F, 0.35F, 0.30F, 0.25F, 0.20F };
    //private float[] trimQuantile = { 0.45F, 0.40F, 0.35F, 0.30F, 0.25F };

    // Weight-trimming discards the samples in the lower quantile of p*(1-p).
    // I find this not only speeds up the computations but also outputs better results.
    int[] trimIndex;
    int[] index = inDataSet.DataIndex;

    if (m < minNonTrimIter || (this.numClass == 2 && k == 1))
    {
        trimIndex = index;
    }
    else
    {
        float[] weightsL = new float[index.Length];
        float sumWeights = 0;
        for (int i = 0; i < index.Length; i++)
        {
            weightsL[i] = this.weights[k][index[i]];
            sumWeights += weightsL[i];
        }

        int[] weightIndex = Vector.IndexArray(index.Length);
        Array.Sort(weightsL, weightIndex);

        int trimLen = 0;
        float partialSumWeights = 0;
        float targetSumWeights = trimQuantile * sumWeights - float.Epsilon;
        while (partialSumWeights < targetSumWeights && trimLen < index.Length * maxTrimRate)
            partialSumWeights += weightsL[trimLen++];

        trimIndex = new int[index.Length - trimLen];
        for (int i = 0; i < trimIndex.Length; i++)
            trimIndex[i] = index[weightIndex[i + trimLen]];

        // We find empirically that sorting the indices actually speeds up accessing the data matrix
        Array.Sort(trimIndex);

        if (m >= minNonTrimIter)
            Console.WriteLine("\t" + k.ToString() + "\t" + (1.0 * trimIndex.Length / index.Length).ToString());
    }
    return trimIndex;
}
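// Worked illustration (numbers invented for this example, not from the source): suppose a class k
// has five active samples whose weights p*(1-p), sorted ascending, are
// {0.05, 0.05, 0.10, 0.30, 0.50}, so sumWeights = 1.00 and the 10% target is about 0.10.
// The loop accumulates 0.05 + 0.05 = 0.10, reaching the target, so trimLen = 2: the two
// lowest-weight samples are dropped and the tree for this class is fit on the remaining three.
// The maxTrimRate guard (here 5 * 0.8 = 4) caps trimLen at 4 even if the low-weight tail
// carried less than 10% of the total weight.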
/// <summary>
/// Reduce the training data used to build a boosted tree.
/// This implementation performs no trimming and returns the full data index.
/// </summary>
/// <param name="inDataSet">The current working training data set</param>
/// <param name="k">The index of the tree to be built</param>
/// <param name="m">The iteration of the current boosting process</param>
/// <returns>The data indices used to build the boosted tree</returns>
public int[] TrimIndex(DataSet inDataSet, int k, int m)
{
    return inDataSet.DataIndex;
}
/// <summary>
/// Compute the point-wise pseudo-response of the loss function to be optimized.
/// It is used to build the decision tree - apart from computing the response value of a leaf node.
/// </summary>
/// <param name="dataSet">all training data</param>
public abstract void ComputePseudoResponse(DataSet dataSet);
public override void ComputePseudoResponse(DataSet dataSet)
{
    this.ResetParameters();
    for (int j = 0; j < dataSet.NumSamples; j++)
    {
        int iDoc = dataSet.DataIndex[j];
        // For squared-error loss the pseudo-response is simply the residual label - funValue
        this.pseudoResponses[iDoc] = this.labels[iDoc] - this.funValues[iDoc];
    }
}
public override void ComputePseudoResponse(DataSet dataSet)
{
    this.ResetParameters();
    for (int j = 0; j < dataSet.NumSamples; j++)
    {
        int iDoc = dataSet.DataIndex[j];
        // Careful: set the gradient at zero to zero. Otherwise we are asking the tree to
        // correct for something that is not an error.
        float delta = this.labels[iDoc] - this.funValues[iDoc];
        if (delta > 0)
            this.pseudoResponses[iDoc] = 1;
        else if (delta < 0)
            this.pseudoResponses[iDoc] = -1;
        else
            this.pseudoResponses[iDoc] = 0;
    }
}
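// Hedged companion sketch (an assumption, not from the source): with the sign pseudo-response
// above, the corresponding least-absolute-deviation leaf value is the median of the raw residuals
// labels[i] - funValues[i] over the documents that fall into that leaf, as in Friedman's
// LAD-TreeBoost. leafDocs is a hypothetical index list for one leaf.
float ComputeLadLeafValue(int[] leafDocs, float[] labels, float[] funValues)
{
    float[] residuals = new float[leafDocs.Length];
    for (int i = 0; i < leafDocs.Length; i++)
        residuals[i] = labels[leafDocs[i]] - funValues[leafDocs[i]];

    Array.Sort(residuals);
    int mid = residuals.Length / 2;
    // even count: average the two middle residuals
    return (residuals.Length % 2 == 1) ? residuals[mid]
                                       : 0.5F * (residuals[mid - 1] + residuals[mid]);
}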
/// <summary>
/// Compute the point-wise pseudo-response of the loss function to be optimized.
/// It is used to build the decision tree - apart from computing the response value of a leaf node.
/// </summary>
/// <param name="dataSet">all training data</param>
public void ComputePseudoResponse(DataSet dataSet)
{
    ResetParameters();
    for (int qIdx = 0; qIdx < dataSet.NumGroups; qIdx++)
    {
        DataGroup query = dataSet.GetDataGroup(qIdx);
        RankPairGenerator rankPairs = new RankPairGenerator(query, this.labels);
        foreach (RankPair rankPair in rankPairs)
        {
            float scoreH_minus_scoreL = this.score[rankPair.IdxH] - this.score[rankPair.IdxL];
            float gradient = RankPair.CrossEntropyDerivative(scoreH_minus_scoreL);
            this.pseudoResponses[rankPair.IdxH] += gradient;
            this.pseudoResponses[rankPair.IdxL] -= gradient;

            float weight = RankPair.CrossEntropy2ndDerivative(scoreH_minus_scoreL);
            this.weights[rankPair.IdxH] += weight;
            this.weights[rankPair.IdxL] += weight;
        }
        //this.labelFeatureData.PartitionData;
    }
}
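// Hedged sketch (an assumption, not from the source): with the first-order pseudoResponses and
// second-order weights accumulated above, the usual one-step Newton leaf response is the sum of
// pseudo-responses divided by the sum of weights over the documents in the leaf. leafDocs is a
// hypothetical index list for one leaf; the small constant guarding the denominator is also an
// assumption.
float ComputeNewtonLeafValue(int[] leafDocs)
{
    float numerator = 0;
    float denominator = 0;
    foreach (int iDoc in leafDocs)
    {
        numerator += this.pseudoResponses[iDoc];
        denominator += this.weights[iDoc];
    }
    return numerator / (denominator + 1e-6F);
}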