        /// <summary>
        /// Fills a sparse DMatrix using the CSR (compressed sparse row) format.
        /// See http://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html.
        /// </summary>
        private DMatrix FillSparseMatrix(IChannel ch, int nbDim, long nbRows, RoleMappedData data,
                                         out Float[] labels, out uint[] groupCount)
        {
            // Allocation.
            if ((2 * nbRows) >= Utils.ArrayMaxSize)
            {
                throw _host.Except("The training dataset is too big to hold in memory. " +
                                   "The initial buffer size (two values per row) must be less than {0}.", Utils.ArrayMaxSize);
            }

            var  features = new Float[nbRows * 2];
            var  indices  = new uint[features.Length];
            var  indptr   = new ulong[nbRows + 1];
            long nelem    = 0;

            labels = new Float[nbRows];
            var hasWeights = data.Schema.Weight != null;
            var hasGroup   = data.Schema.Group != null;
            var weights    = hasWeights ? new Float[nbRows] : null;
            var groupsML   = hasGroup ? new uint[nbRows] : null;

            groupCount = hasGroup ? new uint[nbRows] : null;
            var groupId = hasGroup ? new HashSet<uint>() : null;

            int count     = 0;
            int lastGroup = -1;
            var flags     = CursOpt.Features | CursOpt.Label | CursOpt.AllowBadEverything | CursOpt.Weight | CursOpt.Group;

            var featureVector = default(VBuffer<Float>);
            var labelProxy    = float.NaN;
            var groupProxy    = ulong.MaxValue;

            using (var cursor = data.CreateRowCursor(flags, null))
            {
                var featureGetter = cursor.GetFeatureFloatVectorGetter(data);
                var labelGetter   = cursor.GetLabelFloatGetter(data);
                var weighGetter   = cursor.GetOptWeightFloatGetter(data);
                var groupGetter   = cursor.GetOptGroupGetter(data);
                while (cursor.MoveNext())
                {
                    featureGetter(ref featureVector);
                    labelGetter(ref labelProxy);
                    labels[count] = labelProxy;
                    if (Single.IsNaN(labels[count]))
                    {
                        continue;
                    }

                    indptr[count] = (ulong)nelem;
                    int nbValues = featureVector.Count;
                    if (nbValues > 0)
                    {
                        if (nelem + nbValues > features.Length)
                        {
                            long newSize = Math.Max(nelem + nbValues, (long)features.Length * 2);
                            if (newSize >= Utils.ArrayMaxSize)
                            {
                                throw _host.Except("The training dataset is too big to hold in memory. " +
                                                   "The number of non-zero values must be less than {0}.", Utils.ArrayMaxSize);
                            }
                            Array.Resize(ref features, (int)newSize);
                            Array.Resize(ref indices, (int)newSize);
                        }

                        Array.Copy(featureVector.Values, 0, features, nelem, nbValues);
                        if (featureVector.IsDense)
                        {
                            for (int i = 0; i < nbValues; ++i)
                            {
                                indices[nelem++] = (uint)i;
                            }
                        }
                        else
                        {
                            for (int i = 0; i < nbValues; ++i)
                            {
                                indices[nelem++] = (uint)featureVector.Indices[i];
                            }
                        }
                    }

                    if (hasWeights)
                    {
                        weighGetter(ref weights[count]);
                    }
                    if (hasGroup)
                    {
                        groupGetter(ref groupProxy);
                        if (groupProxy >= uint.MaxValue)
                        {
                            throw _host.Except($"Group id must be less than {uint.MaxValue}.");
                        }
                        groupsML[count] = (uint)groupProxy;
                        if (count == 0 || groupsML[count - 1] != groupsML[count])
                        {
                            groupCount[++lastGroup] = 1;
                            ch.Check(!groupId.Contains(groupsML[count]), "Group ids are not contiguous.");
                            groupId.Add(groupsML[count]);
                        }
                        else
                        {
                            ++groupCount[lastGroup];
                        }
                    }
                    ++count;
                }
            }
            indptr[count] = (ulong)nelem;

            if (nelem < (long)features.Length * 3 / 4)
            {
                Array.Resize(ref features, (int)nelem);
                Array.Resize(ref indices, (int)nelem);
            }

            PostProcessLabelsBeforeCreatingXGBoostContainer(ch, data, labels);

            // We create a DMatrix.
            DMatrix dtrain = new DMatrix((uint)nbDim, indptr, indices, features, (uint)count, (uint)nelem, labels: labels, weights: weights, groups: groupCount);

            return dtrain;
        }
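
A minimal sketch of the CSR triplet (indptr, indices, values) assembled above, on a hypothetical 3x4 matrix; the DMatrix constructor arguments mirror the call in FillSparseMatrix, and the concrete values are illustrative only.

        //      | 1 0 2 0 |
        //  M = | 0 0 3 0 |
        //      | 4 5 0 0 |
        var values  = new Float[] { 1f, 2f, 3f, 4f, 5f };  // non-zero entries, row by row
        var indices = new uint[]  { 0, 2, 2, 0, 1 };       // column index of each value
        var indptr  = new ulong[] { 0, 2, 3, 5 };          // row i spans values[indptr[i]..indptr[i+1])
        var dmat    = new DMatrix(4u, indptr, indices, values, 3u, 5u);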
        public void Predict(ref VBuffer<Float> features,
                            ref VBuffer<Float> predictedValues,
                            ref XGBoostTreeBuffer internalBuffer,
                            bool outputMargin = true,
                            int ntreeLimit    = 0)
        {
            PredictOneOff(ref features, ref predictedValues, ref internalBuffer, outputMargin, ntreeLimit);

#if (DEBUG && MORE_CHECKING)
            // This part checks that the function PredictOneOff, which relies on a customized version
            // of XGBoost, produces the same result as the official API.
            // It makes the prediction terribly slow as the prediction is computed twice
            // and the second call (PredictN) cannot be parallelized (lock protected).
            VBuffer<Float> check = new VBuffer<Float>();
            DMatrix         data;
            if (features.IsDense)
            {
                data = new DMatrix(features.Values, 1, (uint)features.Count);
            }
            else
            {
                int nb      = features.Count;
                var indptr  = new ulong[] { 0, (uint)nb };
                var indices = new uint[nb];
                for (int i = 0; i < nb; ++i)
                {
                    indices[i] = (uint)features.Indices[i];
                }
                data = new DMatrix((uint)features.Length, indptr, indices, features.Values, 1, (uint)nb);
            }

            PredictN(data, ref check, outputMargin, ntreeLimit);
            if (check.Count != predictedValues.Count)
            {
                string message =
                    string.Format(
                        "Count={0} Length={1} IsDense={2}\nValues={3}\nIndices={4}\nCustom Ouput={5}\nOfficial API={6}",
                        features.Count, features.Length, features.IsDense,
                        features.Values == null
                            ? ""
                            : string.Join(", ", features.Values.Select(c => c.ToString()).ToArray()),
                        features.Indices == null
                            ? ""
                            : string.Join(", ", features.Indices.Select(c => c.ToString()).ToArray()),
                        predictedValues.Values == null
                            ? ""
                            : string.Join(", ", predictedValues.Values.Select(c => c.ToString()).ToArray()),
                        check.Values == null
                            ? ""
                            : string.Join(", ", check.Values.Select(c => c.ToString()).ToArray()));
                throw Contracts.Except("Mismatch between official API and custom API (dimension).\n" + message);
            }
            for (int i = 0; i < check.Count; ++i)
            {
                if (Math.Abs(check.Values[i] - predictedValues.Values[i]) > 1e-5)
                {
                    string message =
                        string.Format(
                            "Count={0} Length={1} IsDense={2}\nValues={3}\nIndices={4}\nCustom Ouput={5}\nOfficial API={6}",
                            features.Count, features.Length, features.IsDense,
                            features.Values == null
                                ? ""
                                : string.Join(", ", features.Values.Select(c => c.ToString()).ToArray()),
                            features.Indices == null
                                ? ""
                                : string.Join(", ", features.Indices.Select(c => c.ToString()).ToArray()),
                            predictedValues.Values == null
                                ? ""
                                : string.Join(", ", predictedValues.Values.Select(c => c.ToString()).ToArray()),
                            check.Values == null
                                ? ""
                                : string.Join(", ", check.Values.Select(c => c.ToString()).ToArray()));
                    PredictOneOff(ref features, ref predictedValues, ref internalBuffer, outputMargin, ntreeLimit);
                    message += string.Format("\nSecond computation\n{0}", predictedValues.Values == null
                        ? ""
                        : string.Join(", ", predictedValues.Values.Select(c => c.ToString()).ToArray()));
                    throw Contracts.Except("Mismatch between official API and custom API (output).\n" + message);
                }
            }
#endif
        }
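
A minimal usage sketch for Predict, assuming a trained wrapper instance (here called predictor) and that XGBoostTreeBuffer is default-constructible (an assumption); the VBuffer constructor used is the (length, values) overload.

        var features        = new VBuffer<Float>(4, new Float[] { 0.5f, 1.2f, 0f, 3.1f });
        var predictedValues = default(VBuffer<Float>);
        var buffer          = new XGBoostTreeBuffer();  // scratch buffer reused across calls (assumed default-constructible)
        predictor.Predict(ref features, ref predictedValues, ref buffer, outputMargin: false);
        // predictedValues.Values[0] now holds the score for this single row.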
        private DMatrix FillDenseMatrix(IChannel ch, int nbDim, long nbRows,
                                        RoleMappedData data, out Float[] labels, out uint[] groupCount)
        {
            // Allocation.
            string errorMessageGroup = string.Format("Group is above {0}.", uint.MaxValue);

            if (nbDim * nbRows >= Utils.ArrayMaxSize)
            {
                throw _host.Except("The training dataset is too big to hold in memory. " +
                                   "Number of features ({0}) multiplied by the number of rows ({1}) must be less than {2}.", nbDim, nbRows, Utils.ArrayMaxSize);
            }
            var features = new Float[nbDim * nbRows];

            labels = new Float[nbRows];
            var hasWeights = data.Schema.Weight != null;
            var hasGroup   = data.Schema.Group != null;
            var weights    = hasWeights ? new Float[nbRows] : null;
            var groupsML   = hasGroup ? new uint[nbRows] : null;

            groupCount = hasGroup ? new uint[nbRows] : null;
            var groupId = hasGroup ? new HashSet<uint>() : null;

            int count     = 0;
            int lastGroup = -1;
            int fcount    = 0;
            var flags     = CursOpt.Features | CursOpt.Label | CursOpt.AllowBadEverything | CursOpt.Weight | CursOpt.Group;

            var featureVector = default(VBuffer<Float>);
            var labelProxy    = float.NaN;
            var groupProxy    = ulong.MaxValue;

            using (var cursor = data.CreateRowCursor(flags, null))
            {
                var featureGetter = cursor.GetFeatureFloatVectorGetter(data);
                var labelGetter   = cursor.GetLabelFloatGetter(data);
                var weighGetter   = cursor.GetOptWeightFloatGetter(data);
                var groupGetter   = cursor.GetOptGroupGetter(data);

                while (cursor.MoveNext())
                {
                    featureGetter(ref featureVector);
                    labelGetter(ref labelProxy);

                    labels[count] = labelProxy;
                    if (Single.IsNaN(labels[count]))
                    {
                        continue;
                    }

                    // CopyTo densifies the row: it writes featureVector.Length values, filling
                    // missing entries with NaN (which XGBoost treats as a missing value),
                    // so the write position advances by Length, not by the non-zero Count.
                    featureVector.CopyTo(features, fcount, Single.NaN);
                    fcount += featureVector.Length;

                    if (hasWeights)
                    {
                        weighGetter(ref weights[count]);
                    }
                    if (hasGroup)
                    {
                        groupGetter(ref groupProxy);
                        _host.Check(groupProxy < uint.MaxValue, errorMessageGroup);
                        groupsML[count] = (uint)groupProxy;
                        if (count == 0 || groupsML[count - 1] != groupsML[count])
                        {
                            groupCount[++lastGroup] = 1;
                            ch.Check(!groupId.Contains(groupsML[count]), "Group ids are not contiguous.");
                            groupId.Add(groupsML[count]);
                        }
                        else
                        {
                            ++groupCount[lastGroup];
                        }
                    }
                    ++count;
                }
            }

            PostProcessLabelsBeforeCreatingXGBoostContainer(ch, data, labels);

            // We create a DMatrix.
            DMatrix dtrain = new DMatrix(features, (uint)count, (uint)nbDim, labels: labels, weights: weights, groups: groupCount);

            return dtrain;
        }
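
A sketch of the layout FillDenseMatrix produces: the buffer is row-major, and sparse rows are densified with Single.NaN, which XGBoost interprets as missing. The values below are hypothetical; the DMatrix arguments mirror the call above.

        // 2 rows x 3 features:
        //   row 0: [ 1.0, NaN, 2.0 ]
        //   row 1: [ NaN, 3.0, 4.0 ]
        var features = new Float[] { 1f, Single.NaN, 2f, Single.NaN, 3f, 4f };
        var dmat     = new DMatrix(features, 2u, 3u);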
        /// <summary>
        /// Trains and returns a booster.
        /// </summary>
        /// <param name="ch">IChannel</param>
        /// <param name="pch">IProgressChannel</param>
        /// <param name="numberOfTrees">Number of trained trees</param>
        /// <param name="parameters">Parameters, see <see cref="XGBoostArguments"/></param>
        /// <param name="dtrain">Training set</param>
        /// <param name="numBoostRound">Number of boosting rounds</param>
        /// <param name="obj">Custom objective</param>
        /// <param name="maximize">Whether to maximize feval.</param>
        /// <param name="verboseEval">If true, the evaluation metric on the training set
        ///     is printed at regular boosting stages.</param>
        /// <param name="xgbModel">Existing model, for continued training.</param>
        /// <param name="saveBinaryDMatrix">Save the DMatrix in binary format (for debugging purposes).</param>
        public static Booster Train(IChannel ch, IProgressChannel pch, out int numberOfTrees,
                                    Dictionary<string, string> parameters, DMatrix dtrain, int numBoostRound = 10,
                                    Booster.FObjType obj     = null, bool maximize    = false,
                                    bool verboseEval         = true, Booster xgbModel = null,
                                    string saveBinaryDMatrix = null)
        {
#if (!XGBOOST_RABIT)
            if (WrappedXGBoostInterface.RabitIsDistributed() == 1)
            {
                var pname = WrappedXGBoostInterface.RabitGetProcessorName();
                ch.Info("[WrappedXGBoostTraining.Train] start {0}:{1}", pname, WrappedXGBoostInterface.RabitGetRank());
            }
#endif

            if (!string.IsNullOrEmpty(saveBinaryDMatrix))
            {
                dtrain.SaveBinary(saveBinaryDMatrix);
            }

            Booster bst             = new Booster(parameters, dtrain, xgbModel);
            int     numParallelTree = 1;
            int     nboost          = 0;

            if (parameters != null && parameters.ContainsKey("num_parallel_tree"))
            {
                numParallelTree = Convert.ToInt32(parameters["num_parallel_tree"]);
                nboost         /= numParallelTree;
            }
            if (parameters != null && parameters.ContainsKey("num_class"))
            {
                int numClass = Convert.ToInt32(parameters["num_class"]);
                nboost /= numClass;
            }

            var prediction = new VBuffer<Float>();
            var grad       = new VBuffer<Float>();
            var hess       = new VBuffer<Float>();
            var start      = DateTime.Now;

#if (!XGBOOST_RABIT)
            int version = bst.LoadRabitCheckpoint();
            ch.Check(WrappedXGBoostInterface.RabitGetWorldSize() != 1 || version == 0);
#else
            int version = 0;
#endif
            int startIteration = version / 2;
            nboost += startIteration;
            // Compute the logging interval: 10^(d-2) iterations, where d is the
            // number of digits of (numBoostRound * 5).
            int logten = 0;
            int temp   = numBoostRound * 5;
            while (temp > 0)
            {
                logten += 1;
                temp   /= 10;
            }
            temp   = Math.Max(logten - 2, 0);
            logten = 1;
            while (temp-- > 0)
            {
                logten *= 10;
            }

            var metrics = new List<string>()
            {
                "Iteration", "Training Time"
            };
            var units = new List<string>()
            {
                "iterations", "seconds"
            };
            if (verboseEval)
            {
                metrics.Add("Training Error");
                metrics.Add(parameters != null && parameters.ContainsKey("objective") ? parameters["objective"] : "objective");
            }
            var header = new ProgressHeader(metrics.ToArray(), units.ToArray());

            int    iter       = 0;
            double trainTime  = 0;
            double trainError = double.NaN;

            pch.SetHeader(header, e =>
            {
                e.SetProgress(0, iter, numBoostRound - startIteration);
                e.SetProgress(1, trainTime);
                if (verboseEval)
                {
                    e.SetProgress(2, trainError);
                }
            });
            for (iter = startIteration; iter < numBoostRound; ++iter)
            {
                if (version % 2 == 0)
                {
                    bst.Update(dtrain, iter, ref grad, ref hess, ref prediction, obj);
#if (!XGBOOST_RABIT)
                    bst.SaveRabitCheckpoint();
#endif
                    version += 1;
                }

#if (!XGBOOST_RABIT)
                ch.Check(WrappedXGBoostInterface.RabitGetWorldSize() == 1 ||
                         version == WrappedXGBoostInterface.RabitVersionNumber());
#endif
                nboost += 1;

                trainTime = (DateTime.Now - start).TotalSeconds;

                if (verboseEval)
                {
                    pch.Checkpoint(new double?[] { iter, trainTime, trainError });
                    if (iter == startIteration || iter == numBoostRound - 1 || iter % logten == 0 ||
                        (DateTime.Now - start) > TimeSpan.FromMinutes(2))
                    {
                        string strainError = bst.EvalSet(new[] { dtrain }, new[] { "Train" }, iter);
                        // Example: "[0]\tTrain-error:0.028612"
                        if (!string.IsNullOrEmpty(strainError) && strainError.Contains(":"))
                        {
                            double val;
                            if (double.TryParse(strainError.Split(':').Last(), out val))
                            {
                                trainError = val;
                            }
                        }
                    }
                }
                else
                {
                    pch.Checkpoint(new double?[] { iter, trainTime });
                }

                version += 1;
            }
            numberOfTrees = numBoostRound * numParallelTree;
#if (!XGBOOST_RABIT)
            if (WrappedXGBoostInterface.RabitIsDistributed() == 1)
            {
                var pname = WrappedXGBoostInterface.RabitGetProcessorName();
                ch.Info("[WrappedXGBoostTraining.Train] end {0}:{1}", pname, WrappedXGBoostInterface.RabitGetRank());
            }
#endif
            return bst;
        }
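
A minimal usage sketch for Train, assuming ch and pch come from the host environment and dtrain was built with one of the Fill*Matrix methods above; the parameter names are standard XGBoost keys, and "objective" should be present because the progress header reads it.

        var parameters = new Dictionary<string, string>
        {
            { "objective", "binary:logistic" },
            { "max_depth", "6" },
            { "eta", "0.1" },
        };
        int numberOfTrees;
        Booster booster = Train(ch, pch, out numberOfTrees, parameters, dtrain, numBoostRound: 100);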