/// <summary>
/// Fill a sparse DMatrix using CSR compression.
/// See http://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html.
/// </summary>
/// <param name="ch">Channel used to check group contiguity.</param>
/// <param name="nbDim">Number of features (columns) of the DMatrix.</param>
/// <param name="nbRows">Upper bound on the number of rows to read.</param>
/// <param name="data">Training data; weight and group columns are optional.</param>
/// <param name="labels">One label per kept row; rows with a NaN label are skipped.</param>
/// <param name="groupCount">Per-group row counts when a group column exists, otherwise null.</param>
private DMatrix FillSparseMatrix(IChannel ch, int nbDim, long nbRows, RoleMappedData data,
    out Float[] labels, out uint[] groupCount)
{
    // Allocation. Initial guess is two non-zero features per row on average;
    // the feature/index arrays grow geometrically below when that is too small.
    if ((2 * nbRows) >= Utils.ArrayMaxSize)
    {
        throw _host.Except("The training dataset is too big to hold in memory. " +
            "2 features multiplied by the number of rows must be less than {0}.", Utils.ArrayMaxSize);
    }

    var features = new Float[nbRows * 2];
    var indices = new uint[features.Length];
    var indptr = new ulong[nbRows + 1];
    long nelem = 0;
    labels = new Float[nbRows];
    var hasWeights = data.Schema.Weight != null;
    var hasGroup = data.Schema.Group != null;
    var weights = hasWeights ? new Float[nbRows] : null;
    var groupsML = hasGroup ? new uint[nbRows] : null;
    groupCount = hasGroup ? new uint[nbRows] : null;
    var groupId = hasGroup ? new HashSet<uint>() : null;

    int count = 0;
    int lastGroup = -1;
    var flags = CursOpt.Features | CursOpt.Label | CursOpt.AllowBadEverything | CursOpt.Weight | CursOpt.Group;

    var featureVector = default(VBuffer<float>);
    var labelProxy = float.NaN;
    var groupProxy = ulong.MaxValue;
    using (var cursor = data.CreateRowCursor(flags, null))
    {
        var featureGetter = cursor.GetFeatureFloatVectorGetter(data);
        var labelGetter = cursor.GetLabelFloatGetter(data);
        var weighGetter = cursor.GetOptWeightFloatGetter(data);
        var groupGetter = cursor.GetOptGroupGetter(data);

        while (cursor.MoveNext())
        {
            featureGetter(ref featureVector);
            labelGetter(ref labelProxy);
            labels[count] = labelProxy;
            // Rows without a valid label are dropped entirely.
            if (Single.IsNaN(labels[count]))
                continue;

            indptr[count] = (ulong)nelem;
            int nbValues = featureVector.Count;
            if (nbValues > 0)
            {
                if (nelem + nbValues > features.Length)
                {
                    // FIX: compute the doubled size in long arithmetic; the original
                    // "features.Length * 2" was evaluated in int and could overflow
                    // to a negative value before being widened by Math.Max.
                    long newSize = Math.Max(nelem + nbValues, (long)features.Length * 2);
                    if (newSize >= Utils.ArrayMaxSize)
                    {
                        throw _host.Except("The training dataset is too big to hold in memory. " +
                            "It should be half of {0}.", Utils.ArrayMaxSize);
                    }
                    Array.Resize(ref features, (int)newSize);
                    Array.Resize(ref indices, (int)newSize);
                }

                Array.Copy(featureVector.Values, 0, features, nelem, nbValues);
                if (featureVector.IsDense)
                {
                    for (int i = 0; i < nbValues; ++i)
                        indices[nelem++] = (uint)i;
                }
                else
                {
                    for (int i = 0; i < nbValues; ++i)
                        indices[nelem++] = (uint)featureVector.Indices[i];
                }
            }

            if (hasWeights)
                weighGetter(ref weights[count]);
            if (hasGroup)
            {
                groupGetter(ref groupProxy);
                if (groupProxy >= uint.MaxValue)
                    throw _host.Except($"Group is above {uint.MaxValue}");
                groupsML[count] = (uint)groupProxy;
                // Group ids must come in contiguous runs; seeing an id again after
                // a different group started is an error.
                if (count == 0 || groupsML[count - 1] != groupsML[count])
                {
                    groupCount[++lastGroup] = 1;
                    ch.Check(!groupId.Contains(groupsML[count]), "Group Id are not contiguous.");
                    groupId.Add(groupsML[count]);
                }
                else
                    ++groupCount[lastGroup];
            }
            ++count;
        }
    }

    // FIX: close the CSR row-pointer array with the full ulong value; the original
    // cast to uint could truncate the final offset for very large matrices.
    indptr[count] = (ulong)nelem;

    // Shrink the buffers when the over-allocation is significant (>25% slack).
    if (nelem < features.Length * 3 / 4)
    {
        Array.Resize(ref features, (int)nelem);
        Array.Resize(ref indices, (int)nelem);
    }

    PostProcessLabelsBeforeCreatingXGBoostContainer(ch, data, labels);

    // We create a DMatrix.
    DMatrix dtrain = new DMatrix((uint)nbDim, indptr, indices, features, (uint)count, (uint)nelem,
        labels: labels, weights: weights, groups: groupCount);
    return dtrain;
}
/// <summary>
/// Predicts for one observation through the customized one-off prediction path.
/// In DEBUG builds with MORE_CHECKING defined, the result is cross-checked
/// against the official XGBoost API (slow: every prediction runs twice and the
/// second call is lock-protected).
/// </summary>
public void Predict(ref VBuffer<Float> features,
                ref VBuffer<Float> predictedValues,
                ref XGBoostTreeBuffer internalBuffer,
                bool outputMargin = true,
                int ntreeLimit = 0)
{
    PredictOneOff(ref features, ref predictedValues, ref internalBuffer, outputMargin, ntreeLimit);

#if (DEBUG && MORE_CHECKING)
    // This part checks that the function PredictOneOff which relies on a customized version
    // of XGBoost produces the same result as the official API.
    // This makes the prediction terribly slow as the prediction are called twice
    // and the second call (PredictN) cannot be parallelized (lock protected).
    VBuffer<Float> check = new VBuffer<float>();
    DMatrix data;
    if (features.IsDense)
        data = new DMatrix(features.Values, 1, (uint)features.Count);
    else
    {
        // Build a one-row CSR DMatrix from the sparse vector.
        int nb = features.Count;
        var indptr = new ulong[] { 0, (uint)nb };
        var indices = new uint[nb];
        for (int i = 0; i < nb; ++i)
            indices[i] = (uint)features.Indices[i];
        data = new DMatrix((uint)features.Length, indptr, indices, features.Values, 1, (uint)nb);
    }

    PredictN(data, ref check, outputMargin, ntreeLimit);
    if (check.Count != predictedValues.Count)
    {
        string message = string.Format(
            "Count={0} Length={1} IsDense={2}\nValues={3}\nIndices={4}\nCustom Ouput={5}\nOfficial API={6}",
            features.Count, features.Length, features.IsDense,
            features.Values == null ? "" : string.Join(", ", features.Values.Select(c => c.ToString()).ToArray()),
            features.Indices == null ? "" : string.Join(", ", features.Indices.Select(c => c.ToString()).ToArray()),
            predictedValues.Values == null ? "" : string.Join(", ", predictedValues.Values.Select(c => c.ToString()).ToArray()),
            check.Values == null ? "" : string.Join(", ", check.Values.Select(c => c.ToString()).ToArray()));
        throw Contracts.Except("Mismatch between official API and custom API (dimension).\n" + message);
    }

    for (int i = 0; i < check.Count; ++i)
    {
        // FIX: compare element i; the original compared index 0 on every
        // iteration, so mismatches past the first output were never detected.
        if (Math.Abs(check.Values[i] - predictedValues.Values[i]) > 1e-5)
        {
            string message = string.Format(
                "Count={0} Length={1} IsDense={2}\nValues={3}\nIndices={4}\nCustom Ouput={5}\nOfficial API={6}",
                features.Count, features.Length, features.IsDense,
                features.Values == null ? "" : string.Join(", ", features.Values.Select(c => c.ToString()).ToArray()),
                features.Indices == null ? "" : string.Join(", ", features.Indices.Select(c => c.ToString()).ToArray()),
                predictedValues.Values == null ? "" : string.Join(", ", predictedValues.Values.Select(c => c.ToString()).ToArray()),
                check.Values == null ? "" : string.Join(", ", check.Values.Select(c => c.ToString()).ToArray()));
            // Re-run the custom path so the second result can be logged too.
            PredictOneOff(ref features, ref predictedValues, ref internalBuffer, outputMargin, ntreeLimit);
            message += string.Format("\nSecond computation\n{0}",
                predictedValues.Values == null ? "" : string.Join(", ", predictedValues.Values.Select(c => c.ToString()).ToArray()));
            throw Contracts.Except("Mismatch between official API and custom API (output).\n" + message);
        }
    }
#endif
}
/// <summary>
/// Fill a dense, row-major DMatrix (count rows x nbDim columns).
/// </summary>
/// <param name="ch">Channel used to check group contiguity.</param>
/// <param name="nbDim">Number of features (columns) of the DMatrix.</param>
/// <param name="nbRows">Upper bound on the number of rows to read.</param>
/// <param name="data">Training data; weight and group columns are optional.</param>
/// <param name="labels">One label per kept row; rows with a NaN label are skipped.</param>
/// <param name="groupCount">Per-group row counts when a group column exists, otherwise null.</param>
private DMatrix FillDenseMatrix(IChannel ch, int nbDim, long nbRows, RoleMappedData data,
    out Float[] labels, out uint[] groupCount)
{
    // Allocation.
    string errorMessageGroup = string.Format("Group is above {0}.", uint.MaxValue);
    if (nbDim * nbRows >= Utils.ArrayMaxSize)
    {
        throw _host.Except("The training dataset is too big to hold in memory. " +
            "Number of features ({0}) multiplied by the number of rows ({1}) must be less than {2}.",
            nbDim, nbRows, Utils.ArrayMaxSize);
    }

    var features = new Float[nbDim * nbRows];
    labels = new Float[nbRows];
    var hasWeights = data.Schema.Weight != null;
    var hasGroup = data.Schema.Group != null;
    var weights = hasWeights ? new Float[nbRows] : null;
    var groupsML = hasGroup ? new uint[nbRows] : null;
    groupCount = hasGroup ? new uint[nbRows] : null;
    var groupId = hasGroup ? new HashSet<uint>() : null;

    int count = 0;
    int lastGroup = -1;
    int fcount = 0;
    var flags = CursOpt.Features | CursOpt.Label | CursOpt.AllowBadEverything | CursOpt.Weight | CursOpt.Group;

    var featureVector = default(VBuffer<float>);
    var labelProxy = float.NaN;
    var groupProxy = ulong.MaxValue;
    using (var cursor = data.CreateRowCursor(flags, null))
    {
        var featureGetter = cursor.GetFeatureFloatVectorGetter(data);
        var labelGetter = cursor.GetLabelFloatGetter(data);
        var weighGetter = cursor.GetOptWeightFloatGetter(data);
        var groupGetter = cursor.GetOptGroupFloatGetter(data);

        while (cursor.MoveNext())
        {
            featureGetter(ref featureVector);
            labelGetter(ref labelProxy);
            labels[count] = labelProxy;
            // Rows without a valid label are dropped entirely.
            if (Single.IsNaN(labels[count]))
                continue;

            featureVector.CopyTo(features, fcount, Single.NaN);
            // FIX: CopyTo writes the densified representation (Length slots, missing
            // entries filled with the default), so the write cursor must advance by
            // Length. The original advanced by Count, which equals Length only for
            // dense vectors and misaligned every subsequent row for sparse input.
            fcount += featureVector.Length;

            if (hasWeights)
                weighGetter(ref weights[count]);
            if (hasGroup)
            {
                groupGetter(ref groupProxy);
                _host.Check(groupProxy < uint.MaxValue, errorMessageGroup);
                groupsML[count] = (uint)groupProxy;
                // Group ids must come in contiguous runs; seeing an id again after
                // a different group started is an error.
                if (count == 0 || groupsML[count - 1] != groupsML[count])
                {
                    groupCount[++lastGroup] = 1;
                    ch.Check(!groupId.Contains(groupsML[count]), "Group Id are not contiguous.");
                    groupId.Add(groupsML[count]);
                }
                else
                    ++groupCount[lastGroup];
            }
            ++count;
        }
    }

    PostProcessLabelsBeforeCreatingXGBoostContainer(ch, data, labels);

    // We create a DMatrix.
    DMatrix dtrain = new DMatrix(features, (uint)count, (uint)nbDim,
        labels: labels, weights: weights, groups: groupCount);
    return dtrain;
}
/// <summary>
/// Train and returns a booster.
/// </summary>
/// <param name="ch">IChannel</param>
/// <param name="pch">IProgressChannel</param>
/// <param name="numberOfTrees">Number of trained trees</param>
/// <param name="parameters">Parameters see <see cref="XGBoostArguments"/></param>
/// <param name="dtrain">Training set</param>
/// <param name="numBoostRound">Number of trees to train</param>
/// <param name="obj">Custom objective</param>
/// <param name="maximize">Whether to maximize feval.</param>
/// <param name="verboseEval">Requires at least one item in evals.
/// If "verbose_eval" is True then the evaluation metric on the validation set is
/// printed at each boosting stage.</param>
/// <param name="xgbModel">For continuous training.</param>
/// <param name="saveBinaryDMatrix">Save DMatrix in binary format (for debugging purpose).</param>
public static Booster Train(IChannel ch, IProgressChannel pch, out int numberOfTrees,
                        Dictionary<string, string> parameters, DMatrix dtrain,
                        int numBoostRound = 10,
                        Booster.FObjType obj = null,
                        bool maximize = false,
                        bool verboseEval = true,
                        Booster xgbModel = null,
                        string saveBinaryDMatrix = null)
{
#if (!XGBOOST_RABIT)
    if (WrappedXGBoostInterface.RabitIsDistributed() == 1)
    {
        var pname = WrappedXGBoostInterface.RabitGetProcessorName();
        ch.Info("[WrappedXGBoostTraining.Train] start {0}:{1}", pname, WrappedXGBoostInterface.RabitGetRank());
    }
#endif

    if (!string.IsNullOrEmpty(saveBinaryDMatrix))
        dtrain.SaveBinary(saveBinaryDMatrix);

    Booster bst = new Booster(parameters, dtrain, xgbModel);
    int numParallelTree = 1;
    int nboost = 0;

    if (parameters != null && parameters.ContainsKey("num_parallel_tree"))
    {
        numParallelTree = Convert.ToInt32(parameters["num_parallel_tree"]);
        nboost /= numParallelTree;
    }
    // FIX: guard against a null dictionary; the lookup above tolerated null
    // but this one dereferenced parameters unconditionally.
    if (parameters != null && parameters.ContainsKey("num_class"))
    {
        int numClass = Convert.ToInt32(parameters["num_class"]);
        nboost /= numClass;
    }

    var prediction = new VBuffer<Float>();
    var grad = new VBuffer<Float>();
    var hess = new VBuffer<Float>();
    var start = DateTime.Now;

#if (!XGBOOST_RABIT)
    int version = bst.LoadRabitCheckpoint();
    ch.Check(WrappedXGBoostInterface.RabitGetWorldSize() != 1 || version == 0);
#else
    int version = 0;
#endif
    int startIteration = version / 2;
    nboost += startIteration;

    // Compute a power of ten used to throttle how often the training error is
    // evaluated (evaluating it every iteration is expensive).
    int logten = 0;
    int temp = numBoostRound * 5;
    while (temp > 0)
    {
        logten += 1;
        temp /= 10;
    }
    temp = Math.Max(logten - 2, 0);
    logten = 1;
    while (temp-- > 0)
        logten *= 10;

    var metrics = new List<string>() { "Iteration", "Training Time" };
    var units = new List<string>() { "iterations", "seconds" };
    if (verboseEval)
    {
        metrics.Add("Training Error");
        // FIX: avoid a KeyNotFoundException (or NRE) when no explicit
        // objective was supplied with the parameters.
        metrics.Add(parameters != null && parameters.ContainsKey("objective") ? parameters["objective"] : "objective");
    }

    var header = new ProgressHeader(metrics.ToArray(), units.ToArray());
    int iter = 0;
    double trainTime = 0;
    double trainError = double.NaN;

    pch.SetHeader(header, e =>
    {
        e.SetProgress(0, iter, numBoostRound - startIteration);
        e.SetProgress(1, trainTime);
        if (verboseEval)
            e.SetProgress(2, trainError);
    });

    for (iter = startIteration; iter < numBoostRound; ++iter)
    {
        if (version % 2 == 0)
        {
            bst.Update(dtrain, iter, ref grad, ref hess, ref prediction, obj);
#if (!XGBOOST_RABIT)
            bst.SaveRabitCheckpoint();
#endif
            version += 1;
        }

#if (!XGBOOST_RABIT)
        ch.Check(WrappedXGBoostInterface.RabitGetWorldSize() == 1 ||
                 version == WrappedXGBoostInterface.RabitVersionNumber());
#endif
        nboost += 1;

        trainTime = (DateTime.Now - start).TotalMilliseconds;

        if (verboseEval)
        {
            pch.Checkpoint(new double?[] { iter, trainTime, trainError });
            if (iter == startIteration || iter == numBoostRound - 1 || iter % logten == 0 ||
                (DateTime.Now - start) > TimeSpan.FromMinutes(2))
            {
                string strainError = bst.EvalSet(new[] { dtrain }, new[] { "Train" }, iter);
                // Example: "[0]\tTrain-error:0.028612"
                if (!string.IsNullOrEmpty(strainError) && strainError.Contains(":"))
                {
                    double val;
                    // FIX: xgboost formats the metric with '.' as the decimal
                    // separator; parse with the invariant culture so the value
                    // is not silently dropped under comma-decimal locales.
                    if (double.TryParse(strainError.Split(':').Last(),
                                        System.Globalization.NumberStyles.Float,
                                        System.Globalization.CultureInfo.InvariantCulture, out val))
                        trainError = val;
                }
            }
        }
        else
            pch.Checkpoint(new double?[] { iter, trainTime });

        version += 1;
    }

    numberOfTrees = numBoostRound * numParallelTree;

#if (!XGBOOST_RABIT)
    // FIX: the symmetric "start" log above is compiled out when XGBOOST_RABIT
    // is defined; guard the "end" log the same way for consistency.
    if (WrappedXGBoostInterface.RabitIsDistributed() == 1)
    {
        var pname = WrappedXGBoostInterface.RabitGetProcessorName();
        ch.Info("[WrappedXGBoostTraining.Train] end {0}:{1}", pname, WrappedXGBoostInterface.RabitGetRank());
    }
#endif
    return bst;
}