public uint GetNumCols()
{
    uint nb = 0;
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixNumCol(_handle, ref nb));
    return nb;
}
/// <summary>
/// Create a dense matrix used in XGBoost.
/// </summary>
/// <param name="data">Matrix as a Float array</param>
/// <param name="nrow">Number of rows</param>
/// <param name="ncol">Number of columns</param>
/// <param name="labels">Labels</param>
/// <param name="missing">Missing value</param>
/// <param name="weights">Vector of weights (can be null)</param>
/// <param name="groups">Vector of groups (can be null)</param>
/// <param name="featureNames">Names of the features.</param>
/// <param name="featureTypes">Types of the features.</param>
public DMatrix(Float[] data, uint nrow, uint ncol, Float[] labels = null, Float missing = Float.NaN,
    Float[] weights = null, uint[] groups = null,
    IEnumerable<string> featureNames = null, IEnumerable<string> featureTypes = null)
{
#if (DEBUG)
    _gcKeep = new GcKeep()
    {
        data = data,
        labels = labels,
        weights = weights,
        groups = groups
    };
#endif
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixCreateFromMat(data, nrow, ncol, missing, ref _handle));
    if (labels != null)
        SetLabel(labels, nrow);
    if (weights != null)
        SetWeight(weights, nrow);
    if (groups != null)
        SetGroups(groups, nrow);
    _featureNames = featureNames == null ? null : featureNames.ToArray();
    _featureTypes = featureTypes == null ? null : featureTypes.ToArray();
}
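// Usage sketch (illustrative only, not from the original source): builds a dense
// 2x3 DMatrix from a row-major Float array; the values below are made up.
//
//     var data = new Float[] { 1f, 2f, 3f, 4f, 5f, 6f };
//     var labels = new Float[] { 0f, 1f };
//     var dtrain = new DMatrix(data, nrow: 2, ncol: 3, labels: labels);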
/// <summary>
/// Set parameters on the Booster.
/// </summary>
/// <param name="parameters">List of parameters used by XGBoost. See <see cref="XGBoostArguments"/>.</param>
public void SetParam(Dictionary<string, string> parameters)
{
    foreach (var pair in parameters)
    {
        // Wrapped in Check so that a native error status is propagated, as elsewhere in this file.
        WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterSetParam(_handle, pair.Key, pair.Value));
    }
}
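// Usage sketch (illustrative only): parameter names and values are passed as
// strings and mirror the native XGBoost keys; the values below are arbitrary.
//
//     booster.SetParam(new Dictionary<string, string>()
//     {
//         { "objective", "binary:logistic" },
//         { "max_depth", "6" },
//         { "eta", "0.1" },
//     });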
/// <summary>
/// Initialize the Booster.
/// </summary>
/// <param name="parameters">Parameters for boosters. See <see cref="XGBoostArguments"/>.</param>
/// <param name="data">Training data, see <see cref="DMatrix"/>.</param>
/// <param name="continuousTraining">Already trained model to start from (can be null).</param>
public Booster(Dictionary<string, string> parameters, DMatrix data, Booster continuousTraining)
{
    _featureNames = null;
    _featureTypes = null;
    _numFeatures = (int)data.GetNumCols();
    Contracts.Assert(_numFeatures > 0);
    _handle = IntPtr.Zero;
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterCreate(new[] { data.Handle }, 1, ref _handle));
    if (continuousTraining != null)
    {
        // There should be a cheaper way than serializing then reloading the model.
        var saved = continuousTraining.SaveRaw();
        unsafe
        {
            fixed (byte* buf = saved)
                WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterLoadModelFromBuffer(_handle, buf, (uint)saved.Length));
        }
    }
    if (parameters != null)
        SetParam(parameters);
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterLazyInit(_handle));
}
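// Usage sketch (illustrative only): `oldBooster` is a hypothetical previously
// trained model; passing null starts training from scratch.
//
//     var bst = new Booster(parameters, dtrain, null);         // fresh model
//     var warm = new Booster(parameters, dtrain, oldBooster);  // continued training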
/// <summary>
/// Boost the booster for one iteration, with customized gradient statistics.
/// </summary>
/// <param name="dtrain">DMatrix (training set)</param>
/// <param name="grad">Gradient as a vector of floats.</param>
/// <param name="hess">Hessian as a vector of floats.</param>
private void Boost(DMatrix dtrain, ref VBuffer<Float> grad, ref VBuffer<Float> hess)
{
    Contracts.Assert(grad.Length == hess.Length, string.Format("grad / hess length mismatch: {0} / {1}", grad.Length, hess.Length));
    ValidateFeatures(dtrain);
    Contracts.Assert(grad.IsDense, "grad");
    Contracts.Assert(hess.IsDense, "hess");
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterBoostOneIter(_handle, dtrain.Handle, grad.Values, hess.Values, (uint)grad.Length));
}
/// <summary>
/// Initialize the model by loading it from a rabit checkpoint.
/// </summary>
public int LoadRabitCheckpoint()
{
    int version = 0;
#if (!XGBOOST_RABIT)
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterLoadRabitCheckpoint(_handle, ref version));
#endif
    return version;
}
/// <summary>
/// Save the model to an in-memory buffer representation.
/// </summary>
public byte[] SaveRaw()
{
    unsafe
    {
        byte* buffer;
        uint size = 0;
        WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterGetModelRaw(_handle, ref size, out buffer));
        byte[] content = new byte[size];
        Marshal.Copy((IntPtr)buffer, content, 0, content.Length);
        return content;
    }
}
/// <summary>
/// Initialize the booster with a byte array obtained by serializing a Booster.
/// </summary>
public Booster(byte[] content, int numFeatures)
{
    Contracts.Assert(numFeatures > 0);
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterCreate(new IntPtr[] { }, 0, ref _handle));
    unsafe
    {
        fixed (byte* p = content)
            WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterLoadModelFromBuffer(_handle, p, (uint)content.Length));
        WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterLazyInit(_handle));
    }
    _numFeatures = numFeatures;
}
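// Usage sketch (illustrative only): round-trips a booster through its raw
// in-memory representation, e.g. to persist it and restore it later.
//
//     byte[] raw = bst.SaveRaw();
//     var restored = new Booster(raw, numFeatures);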
public static double XGBoosterGetNumInfo(IntPtr handle, string nameStr)
{
    double[] info = new double[1];
    unsafe
    {
        fixed (double* pd = info)
        {
            IntPtr ptr = (IntPtr)pd;
            // Forward the requested field instead of hardcoding "NumTrees".
            WrappedXGBoostInterface.XGBoosterGetNumInfoTest(handle, ptr, nameStr);
        }
    }
    return info[0];
}
/// <summary>
/// Evaluates a set of data and returns the result as a string.
/// Used by the training function to display intermediate results at each iteration.
/// </summary>
public string EvalSet(DMatrix[] dmats, string[] names, int iteration = 0)
{
    IntPtr outResult;
    for (int i = 0; i < dmats.Length; ++i)
        ValidateFeatures(dmats[i]);
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterEvalOneIter(Handle, iteration,
        dmats.Select(c => c.Handle).ToArray(), names, (uint)dmats.Length, out outResult));
    return WrappedXGBoostInterface.CastString(outResult);
}
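// Usage sketch (illustrative only): `dtrain` and `dtest` are hypothetical
// DMatrix instances; the result looks like "[7]\tTrain-error:0.02\tTest-error:0.05".
//
//     string eval = bst.EvalSet(new[] { dtrain, dtest }, new[] { "Train", "Test" }, iteration: 7);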
/// <summary>
/// Predict with data. Calls the official API of the XGBoost library.
/// The function is protected against concurrent calls.
/// </summary>
/// <param name="data">Data as DMatrix</param>
/// <param name="predictedValues">Results of the prediction</param>
/// <param name="outputMargin">Whether to output the raw untransformed margin value.</param>
/// <param name="ntreeLimit">Limit the number of trees used in the prediction; defaults to 0 (use all trees).</param>
public void PredictN(DMatrix data, ref VBuffer<Float> predictedValues, bool outputMargin = true, int ntreeLimit = 0)
{
    int optionMask = 0x00;
    if (outputMargin)
        optionMask |= 0x01;

    // REVIEW xadupre: see review in function PredictOneOff.
    ValidateFeatures(data);
    uint length = 0;
    IntPtr ppreds = IntPtr.Zero;

    unsafe
    {
        // XGBoost uses OMP to parallelize the computation of the output:
        // each observation is computed in a separate thread with a thread-specific context.
        // Read https://blogs.msdn.microsoft.com/oldnewthing/20101122-00/?p=12233.
        // This function is called from multiple threads in C# for the evaluation with an iterator,
        // and XGBoost parallelizes the computation for each evaluation (even if there is only one here).
        // It chooses the number of threads with: nthread = omp_get_num_threads() (gbtree.cc).
        // The lock nullifies the parallelization done by Microsoft.ML;
        // there is no parallelization done by XGBoost on one observation.
        // Without the lock, the program fails (null pointer or something similar).
        // This item is a request: https://github.com/dmlc/xgboost/issues/1449.
        // As a consequence, this function is only used during training to evaluate the model on a batch of observations.
        // The reason is that XGBoost uses caches in many places, assuming it is called from one unique thread.
        // That explains this lock.
        // This function relies only on the official API of XGBoost.
        lock (this)
        {
            int t = WrappedXGBoostInterface.XGBoosterPredict(_handle, data.Handle, optionMask,
                (uint)ntreeLimit, ref length, ref ppreds);
            WrappedXGBoostInterface.Check(t);
        }
        Float* preds = (Float*)ppreds;
        Contracts.Assert(0 < length && length < Int32.MaxValue);
        if (length > (ulong)predictedValues.Length)
            predictedValues = new VBuffer<Float>((int)length, new Float[length]);
        WrappedXGBoostInterface.Copy((IntPtr)preds, 0, predictedValues.Values, (int)length);
    }
}
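// Usage sketch (illustrative only): the same VBuffer can be reused across calls;
// PredictN reallocates it when it is too small.
//
//     var preds = new VBuffer<Float>();
//     bst.PredictN(dtest, ref preds, outputMargin: false);
//     Float first = preds.Values[0];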
/// <summary>
/// Update for one iteration, with the objective function calculated internally.
/// </summary>
/// <param name="dtrain">Training data</param>
/// <param name="iteration">Iteration number</param>
/// <param name="grad">Gradient (used if fobj != null)</param>
/// <param name="hess">Hessian (used if fobj != null)</param>
/// <param name="prediction">Predictions (used if fobj != null)</param>
/// <param name="fobj">Custom objective function; it returns the gradient and the Hessian for this objective.</param>
public void Update(DMatrix dtrain, int iteration, ref VBuffer<Float> grad, ref VBuffer<Float> hess,
    ref VBuffer<Float> prediction, FObjType fobj = null)
{
    ValidateFeatures(dtrain);
    if (fobj == null)
    {
        WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterUpdateOneIter(_handle, iteration, dtrain.Handle));
    }
    else
    {
        PredictN(dtrain, ref prediction);
        fobj(ref prediction, dtrain, ref grad, ref hess);
        Boost(dtrain, ref grad, ref hess);
    }
}
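// Usage sketch (illustrative only): a squared-error custom objective. The delegate
// shape is inferred from the call above; GetLabel is a hypothetical accessor for
// the labels stored in the DMatrix, and grad/hess are assumed dense and presized.
//
//     Booster.FObjType squaredError = (ref VBuffer<Float> pred, DMatrix dmat,
//         ref VBuffer<Float> grad, ref VBuffer<Float> hess) =>
//     {
//         Float[] labels = dmat.GetLabel();                 // hypothetical accessor
//         for (int i = 0; i < pred.Length; ++i)
//         {
//             grad.Values[i] = pred.Values[i] - labels[i];  // dL/dpred for L = 0.5 * (pred - label)^2
//             hess.Values[i] = 1f;                          // d2L/dpred^2
//         }
//     };
//     bst.Update(dtrain, iteration: 0, ref grad, ref hess, ref prediction, squaredError);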
/// <summary>
/// Create a sparse matrix used in XGBoost.
/// </summary>
/// <param name="numColumn">Number of features or columns</param>
/// <param name="indptr">Pointer to row headers</param>
/// <param name="indices">Column indices</param>
/// <param name="data">Matrix as a Float array</param>
/// <param name="nrow">Number of rows in the matrix</param>
/// <param name="nelem">Number of nonzero elements in the matrix</param>
/// <param name="labels">Labels</param>
/// <param name="weights">Vector of weights (can be null)</param>
/// <param name="groups">Vector of groups (can be null)</param>
/// <param name="featureNames">Names of the features.</param>
/// <param name="featureTypes">Types of the features.</param>
public DMatrix(/*bst_ulong*/ uint numColumn, /*size_t*/ ulong[] indptr, uint[] indices, Float[] data,
    uint nrow, uint nelem, Float[] labels = null, Float[] weights = null, uint[] groups = null,
    IEnumerable<string> featureNames = null, IEnumerable<string> featureTypes = null)
{
    Contracts.Assert(nrow + 1 == indptr.Length);
#if (DEBUG)
    _gcKeep = new GcKeep()
    {
        indptr = indptr,
        indices = indices,
        data = data,
        labels = labels,
        weights = weights,
        groups = groups
    };
#endif
#if (XGB_EXTENDED)
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixCreateFromCSREx(indptr, indices, data,
        (ulong)indptr.Length, nelem, numColumn, ref _handle));
#else
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixCreateFromCSR(indptr, indices, data,
        (uint)indptr.Length, nelem, ref _handle));
#endif
    if (labels != null)
        SetLabel(labels, nrow);
    if (weights != null)
        SetWeight(weights, nrow);
    if (groups != null)
        SetGroups(groups, nrow);
    _featureNames = featureNames == null ? null : featureNames.ToArray();
    _featureTypes = featureTypes == null ? null : featureTypes.ToArray();
    Contracts.Assert(nrow == (int)GetNumRows());
    Contracts.Assert((int)GetNumCols() == numColumn);
}
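// Usage sketch (illustrative only): the 2x4 sparse matrix
//     [ 1 0 2 0 ]
//     [ 0 3 0 4 ]
// in CSR form; indptr has nrow + 1 entries and indices/data hold the nonzeros row by row.
//
//     var indptr  = new ulong[] { 0, 2, 4 };
//     var indices = new uint[]  { 0, 2, 1, 3 };
//     var data    = new Float[] { 1f, 2f, 3f, 4f };
//     var dmat = new DMatrix(4, indptr, indices, data, nrow: 2, nelem: 4);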
/// <summary>
/// Check that the buffer can hold the current input, and resize it otherwise.
/// </summary>
/// <param name="numSparseFeatures">Number of sparse features (VBuffer.Count), can be different for every observation</param>
/// <param name="numFeatures">Number of features (VBuffer.Length), same for all observations</param>
public void ResizeEntries(uint numSparseFeatures, int numFeatures)
{
    uint xgboostEntriesSize = numSparseFeatures * (sizeof(float) + sizeof(uint));
    // Reallocate when the buffer is missing, too small, or more than twice as large as needed.
    if (_xgboostEntries == null || _xgboostEntries.Length < xgboostEntriesSize ||
        _xgboostEntries.Length > xgboostEntriesSize * 2)
    {
        _xgboostEntries = new byte[xgboostEntriesSize];
    }
#if (XGB_EXTENDED)
    if (_regTreeFVec == IntPtr.Zero || _regTreeFVecLength < numFeatures || _regTreeFVecLength > numFeatures * 2)
    {
        if (_regTreeFVec != IntPtr.Zero)
            WrappedXGBoostInterface.XGBoosterPredictNoInsideCacheFree(_regTreeFVec);
        WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterPredictNoInsideCacheAllocate(numFeatures, ref _regTreeFVec));
        _regTreeFVecLength = numFeatures;
    }
#endif
}
/// <summary>
/// Trains and returns a booster.
/// </summary>
/// <param name="ch">IChannel</param>
/// <param name="pch">IProgressChannel</param>
/// <param name="numberOfTrees">Number of trained trees</param>
/// <param name="parameters">Parameters, see <see cref="XGBoostArguments"/></param>
/// <param name="dtrain">Training set</param>
/// <param name="numBoostRound">Number of trees to train</param>
/// <param name="obj">Custom objective</param>
/// <param name="maximize">Whether to maximize the evaluation metric (feval).</param>
/// <param name="verboseEval">If true, the evaluation metric on the training set is
/// computed and displayed at regular boosting stages.</param>
/// <param name="xgbModel">For continuous training.</param>
/// <param name="saveBinaryDMatrix">Save DMatrix in binary format (for debugging purposes).</param>
public static Booster Train(IChannel ch, IProgressChannel pch, out int numberOfTrees,
    Dictionary<string, string> parameters, DMatrix dtrain, int numBoostRound = 10,
    Booster.FObjType obj = null, bool maximize = false, bool verboseEval = true,
    Booster xgbModel = null, string saveBinaryDMatrix = null)
{
#if (!XGBOOST_RABIT)
    if (WrappedXGBoostInterface.RabitIsDistributed() == 1)
    {
        var pname = WrappedXGBoostInterface.RabitGetProcessorName();
        ch.Info("[WrappedXGBoostTraining.Train] start {0}:{1}", pname, WrappedXGBoostInterface.RabitGetRank());
    }
#endif

    if (!string.IsNullOrEmpty(saveBinaryDMatrix))
        dtrain.SaveBinary(saveBinaryDMatrix);

    Booster bst = new Booster(parameters, dtrain, xgbModel);
    int numParallelTree = 1;
    int nboost = 0;

    if (parameters != null && parameters.ContainsKey("num_parallel_tree"))
    {
        numParallelTree = Convert.ToInt32(parameters["num_parallel_tree"]);
        nboost /= numParallelTree;
    }
    if (parameters != null && parameters.ContainsKey("num_class"))
    {
        int numClass = Convert.ToInt32(parameters["num_class"]);
        nboost /= numClass;
    }

    var prediction = new VBuffer<Float>();
    var grad = new VBuffer<Float>();
    var hess = new VBuffer<Float>();
    var start = DateTime.Now;

#if (!XGBOOST_RABIT)
    int version = bst.LoadRabitCheckpoint();
    ch.Check(WrappedXGBoostInterface.RabitGetWorldSize() != 1 || version == 0);
#else
    int version = 0;
#endif
    int startIteration = version / 2;
    nboost += startIteration;

    // Computes a power-of-ten interval based on numBoostRound,
    // used to decide how often the evaluation metric is recomputed.
    int logten = 0;
    int temp = numBoostRound * 5;
    while (temp > 0)
    {
        logten += 1;
        temp /= 10;
    }
    temp = Math.Max(logten - 2, 0);
    logten = 1;
    while (temp-- > 0)
        logten *= 10;

    var metrics = new List<string>() { "Iteration", "Training Time" };
    var units = new List<string>() { "iterations", "seconds" };
    if (verboseEval)
    {
        metrics.Add("Training Error");
        metrics.Add(parameters["objective"]);
    }
    var header = new ProgressHeader(metrics.ToArray(), units.ToArray());

    int iter = 0;
    double trainTime = 0;
    double trainError = double.NaN;

    pch.SetHeader(header, e =>
    {
        e.SetProgress(0, iter, numBoostRound - startIteration);
        e.SetProgress(1, trainTime);
        if (verboseEval)
            e.SetProgress(2, trainError);
    });

    for (iter = startIteration; iter < numBoostRound; ++iter)
    {
        if (version % 2 == 0)
        {
            bst.Update(dtrain, iter, ref grad, ref hess, ref prediction, obj);
#if (!XGBOOST_RABIT)
            bst.SaveRabitCheckpoint();
#endif
            version += 1;
        }

#if (!XGBOOST_RABIT)
        ch.Check(WrappedXGBoostInterface.RabitGetWorldSize() == 1 || version == WrappedXGBoostInterface.RabitVersionNumber());
#endif
        nboost += 1;

        // Reported in seconds, matching the declared unit above.
        trainTime = (DateTime.Now - start).TotalSeconds;

        if (verboseEval)
        {
            pch.Checkpoint(new double?[] { iter, trainTime, trainError });
            if (iter == startIteration || iter == numBoostRound - 1 || iter % logten == 0 ||
                (DateTime.Now - start) > TimeSpan.FromMinutes(2))
            {
                string strainError = bst.EvalSet(new[] { dtrain }, new[] { "Train" }, iter);
                // Example: "[0]\tTrain-error:0.028612"
                if (!string.IsNullOrEmpty(strainError) && strainError.Contains(":"))
                {
                    double val;
                    if (double.TryParse(strainError.Split(':').Last(), out val))
                        trainError = val;
                }
            }
        }
        else
        {
            pch.Checkpoint(new double?[] { iter, trainTime });
        }
        version += 1;
    }

    numberOfTrees = numBoostRound * numParallelTree;

#if (!XGBOOST_RABIT)
    if (WrappedXGBoostInterface.RabitIsDistributed() == 1)
    {
        var pname = WrappedXGBoostInterface.RabitGetProcessorName();
        ch.Info("[WrappedXGBoostTraining.Train] end {0}:{1}", pname, WrappedXGBoostInterface.RabitGetRank());
    }
#endif
    return bst;
}
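// Usage sketch (illustrative only): `ch` and `pch` come from the hosting
// Microsoft.ML environment; parameter values are arbitrary examples.
//
//     int numberOfTrees;
//     var parameters = new Dictionary<string, string>()
//     {
//         { "objective", "binary:logistic" },
//         { "max_depth", "6" },
//     };
//     var bst = Train(ch, pch, out numberOfTrees, parameters, dtrain, numBoostRound: 100);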
/// <summary>
/// Set a float-type property of the DMatrix.
/// </summary>
/// <param name="field">The field name of the information</param>
/// <param name="data">The array of data to be set</param>
/// <param name="nrow">Number of rows</param>
private void SetFloatInfo(string field, IEnumerable<Float> data, uint nrow)
{
    Float[] cont = data.ToArray();
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixSetFloatInfo(_handle, field, cont, nrow));
}
public void LazyInit()
{
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterLazyInit(_handle));
}
/// <summary>
/// Set group sizes of the DMatrix (used for ranking).
/// </summary>
public void SetGroups(IEnumerable<uint> group, uint nrow)
{
    var agroup = group.ToArray();
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixSetGroup(_handle, agroup, nrow));
}
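// Usage sketch (illustrative only): for ranking, each entry gives the number of
// consecutive rows belonging to one query; here 3 queries cover the 7 rows of a
// hypothetical dtrain. The second argument mirrors how the DMatrix constructors
// above call SetGroups.
//
//     var groups = new uint[] { 3, 2, 2 };
//     dtrain.SetGroups(groups, nrow: 7);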
/// <summary>
/// Predict with data.
/// This function uses a modified API which does not rely on caches.
/// </summary>
/// <param name="vbuf">One row</param>
/// <param name="predictedValues">Results of the prediction</param>
/// <param name="internalBuffer">Buffers allocated by Microsoft.ML and given to XGBoost to keep XGBoost from allocating caches on its own</param>
/// <param name="outputMargin">Whether to output the raw untransformed margin value.</param>
/// <param name="ntreeLimit">Limit the number of trees used in the prediction; defaults to 0 (use all trees).</param>
public void PredictOneOff(ref VBuffer<Float> vbuf, ref VBuffer<Float> predictedValues,
    ref XGBoostTreeBuffer internalBuffer, bool outputMargin = true, int ntreeLimit = 0)
{
    // REVIEW xadupre: XGBoost can produce an output per tree (pred_leaf=true).
    // When this option is on, the output is a matrix of (nsample, ntrees)
    // with each record indicating the predicted leaf index of each sample in each tree.
    // Note that the leaf index of a tree is unique per tree, so you may find leaf 1
    // in both tree 1 and tree 0.
    // if (pred_leaf)
    //     option_mask |= 0x02;
    // This might be an interesting feature to implement.

    int optionMask = 0x00;
    if (outputMargin)
        optionMask |= 0x01;

    Contracts.Check(internalBuffer != null);

    uint length = 0;
    uint lengthBuffer = 0;
    uint nb = (uint)vbuf.Count;

    // This function relies on a modified API. Instead of letting XGBoost handle its own caches,
    // it calls XGBoosterPredictOutputSize to know what cache size is required.
    // Microsoft.ML allocates the caches and gives them to XGBoost.
    // First, we allocate the cache for the features. Only then is XGBoost
    // able to know the required cache size.
#if (XGB_EXTENDED)
    internalBuffer.ResizeEntries(nb, vbuf.Length);
#else
    internalBuffer.ResizeEntries(nb);
#endif

    unsafe
    {
        fixed (float* p = vbuf.Values)
        fixed (int* i = vbuf.Indices)
        fixed (byte* entries = internalBuffer.XGBoostEntries)
        {
            WrappedXGBoostInterface.XGBoosterCopyEntries((IntPtr)entries, ref nb, p, vbuf.IsDense ? null : i, float.NaN);
            WrappedXGBoostInterface.XGBoosterPredictOutputSize(_handle, (IntPtr)entries, nb,
                optionMask, (uint)ntreeLimit, ref length, ref lengthBuffer);
        }
    }

    // Then we allocate the cache for the prediction.
    internalBuffer.ResizeOutputs(length, lengthBuffer, ref predictedValues);

    unsafe
    {
        fixed (byte* entries = internalBuffer.XGBoostEntries)
        fixed (float* ppreds = predictedValues.Values)
        fixed (float* ppredBuffer = internalBuffer.PredBuffer)
        fixed (uint* ppredCounter = internalBuffer.PredCounter)
        {
            WrappedXGBoostInterface.XGBoosterPredictNoInsideCache(_handle, (IntPtr)entries, nb,
                optionMask, (uint)ntreeLimit, length, lengthBuffer, ppreds, ppredBuffer, ppredCounter
#if (XGB_EXTENDED)
                , internalBuffer.RegTreeFVec
#endif
                );
        }
    }
}
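// Usage sketch (illustrative only): single-row scoring with caller-owned buffers;
// `row` is a hypothetical VBuffer<Float> holding one observation, and
// XGBoostTreeBuffer is assumed to be default-constructible.
//
//     var buffer = new XGBoostTreeBuffer();
//     var preds = new VBuffer<Float>();
//     bst.PredictOneOff(ref row, ref preds, ref buffer, outputMargin: false);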
public void SaveBinary(string name, int silent = 0)
{
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGDMatrixSaveBinary(_handle, name, silent));
}
public int GetNumTrees()
{
    double res = WrappedXGBoostInterface.XGBoosterGetNumInfo(_handle, "NumTrees");
    return (int)res;
}
/// <summary>
/// Save the current booster to a rabit checkpoint.
/// </summary>
public void SaveRabitCheckpoint()
{
#if (!XGBOOST_RABIT)
    WrappedXGBoostInterface.Check(WrappedXGBoostInterface.XGBoosterSaveRabitCheckpoint(_handle));
#endif
}