/// <summary>
/// Constructs a BoostTree trainer. The training data are preprocessed (quantized/coded)
/// so that tree building can avoid repeatedly sorting feature values.
/// </summary>
/// <param name="labelFeatureDataCoded">coded (quantized) training data</param>
/// <param name="subModelScore">pre-computed scores of the sub-model on the training data</param>
/// <param name="subModel">existing sub-model whose output the boosted trees will correct</param>
/// <param name="boostTreeLoss">loss object used during boosting</param>
/// <param name="saveTreeBinFile">output path for the binary serialization of the trees</param>
/// <param name="saveTreeTextFile">output path for the text serialization; ".xml" is appended to derive the XML path</param>
public BoostTree(LabelFeatureDataCoded labelFeatureDataCoded, LabelFeatureData subModelScore,
                 Model subModel, BoostTreeLoss boostTreeLoss,
                 string saveTreeBinFile, string saveTreeTextFile)
{
    this.labelFeatureDataCoded = labelFeatureDataCoded;
    this.subModelScore = subModelScore;
    // NOTE(review): UnpackData() runs before subModel/boostTreeLoss are assigned —
    // presumably it only needs the two fields set above; confirm before reordering.
    UnpackData();
    this.subModel = subModel;
    this.boostTreeLoss = boostTreeLoss;
    this.featureNames = labelFeatureDataCoded.FeatureNames;
    this.saveTreeBinFile = saveTreeBinFile;
    this.saveTreeTextFile = saveTreeTextFile;
    this.saveTreeXmlFile = saveTreeTextFile + ".xml";
}
/// <summary>
/// Builds a coded composite from an array of data objects. By convention the
/// first element carries the coded (quantized) data.
/// </summary>
/// <param name="labelFeatureDataElements">data partitions; element 0 must be a LabelFeatureDataCoded</param>
private CLabelFeatureDataCodedComposite(LabelFeatureData[] labelFeatureDataElements)
    : base(labelFeatureDataElements)
{
    //the LabelFeatureDataCoded object is always the first one in the input array
    this.labelFeatureDataCoded = (LabelFeatureDataCoded)labelFeatureDataElements[0];
}
/// <summary>
/// Creates a train/valid/test composite. The first element is always the
/// training data, which is always coded; valid/test may be null.
/// Returns null when no training data is supplied.
/// </summary>
/// <param name="trainLabelFeatureDataCoded">coded training data (required)</param>
/// <param name="validLabelFeatureData">validation data, or null</param>
/// <param name="testLabelFeatureData">test data, or null</param>
public static CLabelFeatureDataCodedComposite Create(LabelFeatureDataCoded trainLabelFeatureDataCoded, LabelFeatureData validLabelFeatureData, LabelFeatureData testLabelFeatureData)
{
    if (trainLabelFeatureDataCoded == null)
    {
        return null;
    }

    int cPartition = (int)DataPartitionType.cTypes;
    List<LabelFeatureData> partitions = new List<LabelFeatureData>(cPartition);
    partitions.Add(trainLabelFeatureDataCoded);
    partitions.Add(validLabelFeatureData);
    partitions.Add(testLabelFeatureData);
    LabelFeatureData[] partitionArray = partitions.ToArray();

    CLabelFeatureDataCodedComposite composite = new CLabelFeatureDataCodedComposite(partitionArray);

    //number of data groups contributed by each partition (absent partitions contribute 0)
    int[] cDataGroups = new int[cPartition];
    for (int i = 0; i < cPartition; i++)
    {
        cDataGroups[i] = (partitionArray[i] == null) ? 0 : partitionArray[i].DataGroups.GroupCounts;
    }

    //train/valid/test data partition
    composite.DataGroups.PartitionData(cDataGroups);
    return composite;
}
/// <summary>
/// Constructs a single uniform LabelFeatureData from an array of such data objects.
/// Features: the features of the unified object are those of the first object in the input array;
/// a feature missing from a subsequent object maps to index -1 (its value is treated as zero).
/// DataPoints/index: the data points are aggregated and indexed in the same order as the input array.
/// DataGroups/index: the data groups are likewise aggregated and indexed in the same order as the input array.
/// </summary>
/// <param name="labelFeatureDataElements">the data objects to merge; element 0 must be non-null and defines the feature set</param>
public CLabelFeatureDataComposite(LabelFeatureData[] labelFeatureDataElements)
{
    this.labelFeatureDataElements = labelFeatureDataElements;

    //number of the datapoints is the sum of the ones in the input object array
    numDataPoint = 0;
    for (int i = 0; i < this.labelFeatureDataElements.Length; i++)
    {
        if (this.labelFeatureDataElements[i] != null)
        {
            this.numDataPoint += this.labelFeatureDataElements[i].NumDataPoint;
        }
    }

    //featureNames is set to the feature names of the first element
    this.featureNames = this.labelFeatureDataElements[0].FeatureNames;

    //index map used to look up the feature id of the original input object from that of the unified object
    this.idxMaps = new int[labelFeatureDataElements.Length][];
    this.accGroups = new int[labelFeatureDataElements.Length];
    for (int i = 0; i < this.labelFeatureDataElements.Length; i++)
    {
        this.idxMaps[i] = null;
        LabelFeatureData element = this.labelFeatureDataElements[i];
        if (element != null)
        {
            // Build a name->index table once per element so each feature lookup is O(1)
            // instead of the previous linear scan (O(#features^2) per element).
            // NOTE: uses OrdinalIgnoreCase rather than the culture-sensitive ignore-case
            // compare the original performed; ordinal is the recommended comparison for
            // identifier-like strings (CA1309).
            Dictionary<string, int> nameToIdx = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
            for (int k = 0; k < element.FeatureNames.Length; k++)
            {
                //first occurrence wins, matching the original forward scan order
                if (!nameToIdx.ContainsKey(element.FeatureNames[k]))
                {
                    nameToIdx[element.FeatureNames[k]] = k;
                }
            }

            this.idxMaps[i] = new int[this.featureNames.Length];
            for (int j = 0; j < this.featureNames.Length; j++)
            {
                string name = this.featureNames[j];
                int k;
                if (nameToIdx.TryGetValue(name, out k))
                {
                    this.idxMaps[i][j] = k;
                }
                else
                {
                    this.idxMaps[i][j] = -1;
                    Console.WriteLine("Feature " + name + " does not exist");
                }
            }
        }

        //running total of data groups up to and including element i (null elements contribute 0)
        this.accGroups[i] = ((i == 0) ? 0 : this.accGroups[i - 1])
            + ((element == null) ? 0 : element.DataGroups.GroupCounts);
    }
}
/// <summary>
/// Marks, within every data group, up to N data points whose sub-model score
/// (feature 0) is highest; every other data point is left unmarked.
/// </summary>
/// <param name="subModelScore">data whose feature 0 holds the sub-model score</param>
/// <param name="N">maximum number of data points to keep per group</param>
public TopNSet(LabelFeatureData subModelScore, int N)
{
    this.Datakept = new bool[subModelScore.NumDataPoint];
    for (int g = 0; g < subModelScore.DataGroups.GroupCounts; g++)
    {
        DataGroup group = subModelScore.DataGroups[g];
        float[] negScores = new float[group.cSize];
        int[] order = new int[group.cSize];
        for (int j = 0; j < group.cSize; j++)
        {
            this.Datakept[group.iStart + j] = false;
            float[] features = subModelScore.GetFeature(group.iStart + j);
            //negate so that the ascending sort below puts the highest scores first
            negScores[j] = 0 - features[0];
            order[j] = j;
        }
        //sort the within-group indices by (negated) score, best first
        Array.Sort(negScores, order);

        int cKeep = (N < group.cSize) ? N : group.cSize;
        for (int j = 0; j < cKeep; j++)
        {
            this.Datakept[group.iStart + order[j]] = true;
        }
    }
}
/// <summary>
/// Compute and store the scores of the input data given a model.
/// Priority: pre-computed scores, then model evaluation, then all-zero scores.
/// </summary>
/// <param name="model">the model to be evaluated</param>
/// <param name="labelFeatureData">input data</param>
/// <param name="subModelScore">pre-computed scores for the input data</param>
public void ModelEval(Model model, LabelFeatureData labelFeatureData, LabelFeatureData subModelScore)
{
    if (subModelScore != null)
    {
        //scores were pre-computed: copy feature 0 of every data point
        Debug.Assert(this.numSamples == subModelScore.NumDataPoint);
        for (int i = 0; i < this.numSamples; i++)
        {
            this.score[i] = subModelScore.GetFeature(0, i);
        }
        return;
    }

    if (model != null && labelFeatureData != null)
    {
        //evaluate the model on every data point
        Debug.Assert(this.numSamples == labelFeatureData.NumDataPoint);
        float[] buffer = new float[1];
        for (int i = 0; i < this.numSamples; i++)
        {
            model.Evaluate(labelFeatureData.GetFeature(i), buffer);
            this.score[i] = buffer[0];
        }
        return;
    }

    //no model and no pre-computed scores: default every score to zero
    for (int i = 0; i < this.numSamples; i++)
    {
        this.score[i] = 0.0F;
    }
}
/// <summary>
/// Compute and store the per-class scores of the input data given a model.
/// Priority: pre-computed scores, then model evaluation, then a uniform
/// default of 1/numClass for every class.
/// </summary>
/// <param name="model">the model to be evaluated</param>
/// <param name="labelFeatureData">input data</param>
/// <param name="subModelScore">pre-computed scores for the input data</param>
public void ModelEval(Model model, LabelFeatureData labelFeatureData, LabelFeatureData subModelScore)
{
    if (subModelScore != null)
    {
        //pre-computed scores: feature k of data point i is the class-k score
        Debug.Assert(this.numSamples == subModelScore.NumDataPoint);
        for (int i = 0; i < this.numSamples; i++)
        {
            for (int k = 0; k < this.numClass; k++)
            {
                this.ModelScores[k][i] = subModelScore.GetFeature(k, i);
            }
        }
        return;
    }

    //REVIEW by CJCB: this branch was originally a plain 'if'; it is meant to be
    //mutually exclusive with the pre-computed-score branch above
    if (model != null && labelFeatureData != null)
    {
        Debug.Assert(this.numSamples == labelFeatureData.NumDataPoint);
        float[] classScores = new float[this.numClass];
        for (int i = 0; i < this.numSamples; i++)
        {
            model.Evaluate(labelFeatureData.GetFeature(i), classScores);
            for (int k = 0; k < this.numClass; k++)
            {
                this.ModelScores[k][i] = classScores[k];
            }
        }
        return;
    }

    //TODO: qiangwu - should we always require subModel to exist
    //to make the default model value computation explicit?
    //It is probably safer that way; revisit the issue later.
    float uniform = (float)1.0 / (float)this.numClass;
    for (int i = 0; i < this.numSamples; i++)
    {
        for (int k = 0; k < this.numClass; k++)
        {
            this.ModelScores[k][i] = uniform;
        }
    }
}
/// <summary>
/// Predicts scores for the test data using the optimal iteration count
/// determined during training (this.optIter).
/// </summary>
/// <param name="testData">data to score</param>
/// <returns>per-class model scores</returns>
public float[][] Predict(LabelFeatureData testData)
{
    return Predict(testData, this.optIter);
}
/// <summary>
/// Predicts scores for the test data using the first numIter boosted trees and
/// the loss configured at training time; returns the resulting model scores.
/// </summary>
/// <param name="testData">data to score</param>
/// <param name="numIter">number of boosting iterations to apply</param>
/// <returns>per-class model scores held by the training-time loss object</returns>
public float[][] Predict(LabelFeatureData testData, int numIter)
{
    Predict(testData, numIter, this.boostTreeLoss, null, false);
    return this.boostTreeLoss.ModelScores;
}
/// <summary>
/// Predicts scores for the test data with metric reporting, using the loss
/// configured at training time.
/// </summary>
/// <param name="testData">data to score</param>
/// <param name="numIter">number of boosting iterations to apply</param>
/// <param name="metrics">metrics computed after each iteration</param>
/// <param name="silent">if true, only report results on the last iteration</param>
public void Predict(LabelFeatureData testData, int numIter, Metrics metrics, bool silent)
{
    Predict(testData, numIter, this.boostTreeLoss, metrics, silent);
}
/// <summary>
/// Predicts scores for the test data with metric reporting on every iteration
/// (non-silent), using the loss configured at training time.
/// </summary>
/// <param name="testData">data to score</param>
/// <param name="numIter">number of boosting iterations to apply</param>
/// <param name="metrics">metrics computed after each iteration</param>
public void Predict(LabelFeatureData testData, int numIter, Metrics metrics)
{
    Predict(testData, numIter, this.boostTreeLoss, metrics, false);
}
/// <summary>
/// Predicts scores for the test data with an explicitly supplied loss object
/// and no metric reporting.
/// </summary>
/// <param name="testData">data to score</param>
/// <param name="numIter">number of boosting iterations to apply</param>
/// <param name="boostTreeLoss">loss object that accumulates and transforms the scores</param>
public void Predict(LabelFeatureData testData, int numIter, BoostTreeLoss boostTreeLoss)
{
    Predict(testData, numIter, boostTreeLoss, null, false);
}
/// <summary>
/// Runs the full model on labelFeatureData: evaluates the sub-model, then
/// accumulates the function-value gain of each boosted regression tree for
/// numIter iterations, optionally computing/reporting metrics per iteration.
/// The final scores are left in boostTreeLoss.ModelScores.
/// </summary>
/// <param name="labelFeatureData">input data to score</param>
/// <param name="numIter">number of boosting iterations to apply (clamped to this.TotalIter)</param>
/// <param name="boostTreeLoss">loss object that holds and transforms the accumulated scores</param>
/// <param name="metrics">if non-null, metrics are computed after each iteration and scores are saved to "DataScores.txt"</param>
/// <param name="silent">if true, only report results on the last iteration</param>
public void Predict(LabelFeatureData labelFeatureData, int numIter,
                    BoostTreeLoss boostTreeLoss,
                    Metrics metrics, //reporting the error for each iteration if the following are set
                    bool silent // If true, only report results on the last iteration
                    )
{
    //clamp to the number of iterations actually trained
    if (numIter > this.TotalIter)
        numIter = this.TotalIter;

    boostTreeLoss.Reset(labelFeatureData.NumDataPoint);

    //(1) compute the probabilities produced by the sub-model
    boostTreeLoss.ModelEval(this.subModel, labelFeatureData, null);

    //(2) compute the corresponding function values;
    boostTreeLoss.ModelScoresToFuncValues();

    if (metrics != null)
    {
        //iteration 0 = sub-model only
        metrics.ComputeMetrics(boostTreeLoss.ModelScores, 0, this.optIter == 0);
#if VERBOSE
        Console.WriteLine(metrics.ResultsHeaderStr());
        Console.WriteLine(metrics.ResultsStr(0));
#endif
    }

    //(3) accumulate the function values for each boosted regression tree
    int numSamples = labelFeatureData.NumDataPoint;
    float[] funValueGain = new float[numSamples];

#if GET_PER_DOC_PER_ITER_SCORES
    //rows: [0]=group id, [1]=label, [m+2]=scores after iteration m
    float[][] saveScores = ArrayUtils.FloatMatrix(numIter+2, labelFeatureData.NumDataPoint); // We will take transpose when we print
    for (int i = 0; i < labelFeatureData.NumDataPoint; ++i)
    {
        saveScores[0][i] = labelFeatureData.GetGroupId(i);
        saveScores[1][i] = labelFeatureData.GetLabel(i);
    }
#endif

    for (int m = 0; m < numIter; m++)
    {
        // apply the residual model (regression trees) fit from the pseudo-response
        // to compensate the error of the current system
        for (int k = 0; k < boostTreeLoss.NumTreesPerIteration; k++)
        {
            // NOTE(review): only tree [m, 0] is null-checked before tree [m, k] is
            // used below — presumably all trees of an iteration are null together;
            // confirm against the training code.
            if (this.regressionTrees[m, 0] == null)
                break;
#if GET_PER_DOC_PER_ITER_SCORES
            this.regressionTrees[m, k].PredictFunValueNKeepScores(labelFeatureData, this.Train2TestIdx, funValueGain, saveScores[m+2]);
#else
            this.regressionTrees[m, k].PredictFunValue(labelFeatureData, this.Train2TestIdx, funValueGain);
#endif
            boostTreeLoss.AccFuncValueGain(funValueGain, 1.0f, k);
        }

        if (metrics != null)
        {
            //compute the metrics of the current system
            boostTreeLoss.FuncValuesToModelScores();
            metrics.ComputeMetrics(boostTreeLoss.ModelScores, m + 1, this.optIter == m + 1);
            if (m == numIter - 1 || !silent)
                Console.WriteLine(metrics.ResultsStr(m + 1));
        }
    }

#if GET_PER_DOC_PER_ITER_SCORES
    //dump the per-document, per-iteration score matrix (transposed) as TSV
    using (StreamWriter sw = new StreamWriter("allScores.tsv"))
    {
        sw.Write("m:QueryID\tm:Rating"); // Write the header (with no tab at the end!)
        for (int j = 1; j < numIter + 1; ++j)
            sw.Write("\tFtr_" + j.ToString("0000"));
        sw.WriteLine();
        for (int j = 0; j < labelFeatureData.NumDataPoint; ++j)
        {
            sw.Write("{0}\t{1}", saveScores[0][j], saveScores[1][j]); // Write the query ID and label
            for (int m = 2; m < numIter + 2; ++m)
                sw.Write("\t{0:G6}", saveScores[m][j]);
            sw.WriteLine();
        }
    }
#endif

    if (metrics == null)
    {
        //when metrics were computed above, the scores are already up to date;
        //otherwise refresh them from the accumulated function values now
        boostTreeLoss.FuncValuesToModelScores();
    }
    else
        metrics.SaveScores("DataScores.txt", boostTreeLoss.ModelScores);
}
//dataTypes==null <=> all the data are used in one partition
/// <summary>
/// Precision/recall metric over the given data partitions. Allocates the
/// 2x2 count table this.stats (presumably TP/FP/FN/TN — confirm against
/// the accumulation code).
/// </summary>
/// <param name="labelFeatureData">data to evaluate on</param>
/// <param name="labelConvert">converter applied to the raw labels</param>
/// <param name="dataTypes">partitions to evaluate; null means one partition over all data</param>
public PrecRecall(LabelFeatureData labelFeatureData, LabelConverter labelConvert, DataPartitionType[] dataTypes)
    : base(labelFeatureData, labelConvert, dataTypes)
{
    this.stats = new int[2, 2];
}
/// <summary>
/// Evaluates the tree on every data point, writing the predicted function
/// value into funValue and mirroring the same value into keepScores.
/// </summary>
/// <param name="data">data points to evaluate</param>
/// <param name="Train2TestIdx">feature-index remapping from training to test data</param>
/// <param name="funValue">output: predicted function value per data point</param>
/// <param name="keepScores">output: copy of funValue, retained by the caller</param>
public void PredictFunValueNKeepScores(LabelFeatureData data, int[] Train2TestIdx, float[] funValue, float[] keepScores)
{
    Debug.Assert(data.NumDataPoint == funValue.Length);
    for (int i = 0; i < data.NumDataPoint; i++)
    {
        float value = PredictFunValue(data.GetFeature(i), Train2TestIdx);
        funValue[i] = value;
        keepScores[i] = value;
    }
}
/// <summary>
/// Evaluates the tree on every data point, writing the predicted function
/// value for each into funValue.
/// </summary>
/// <param name="data">data points to evaluate</param>
/// <param name="Train2TestIdx">feature-index remapping from training to test data</param>
/// <param name="funValue">output: predicted function value per data point</param>
public void PredictFunValue(LabelFeatureData data, int[] Train2TestIdx, float[] funValue)
{
    Debug.Assert(data.NumDataPoint == funValue.Length);
    int i = 0;
    while (i < data.NumDataPoint)
    {
        //per-point tree traversal is delegated to the single-point overload
        funValue[i] = PredictFunValue(data.GetFeature(i), Train2TestIdx);
        i++;
    }
}