// Example #1
        private Dataset LoadTrainingData(IChannel ch, RoleMappedData trainData, out CategoricalMetaData catMetaData)
        {
            // Validate inputs before doing any work.
            Host.AssertValue(ch);
            ch.CheckValue(trainData, nameof(trainData));
            CheckDataValid(ch, trainData);

            // Gather metadata first: row count, labels, weights, groups and categorical layout.
            var factory = CreateCursorFactory(trainData);
            GetMetainfo(ch, factory, out int numRow, out float[] labels, out float[] weights, out int[] groups);
            catMetaData = GetCategoricalMetaData(ch, trainData, numRow);
            GetDefaultParameters(ch, numRow, catMetaData.CategoricalBoudaries != null, catMetaData.TotalCats);

            string param = LightGbmInterfaceUtils.JoinParameters(Options);

            // Only one sampling task may run at any given time, to keep peak memory low.
            Dataset dtrain;
            lock (LightGbmShared.SampleLock)
            {
                CreateDatasetFromSamplingData(ch, factory, numRow, param, labels, weights, groups, catMetaData, out dtrain);
            }

            // Stream the actual feature rows into the native dataset.
            LoadDataset(ch, factory, dtrain, numRow, Args.BatchSize, catMetaData);

            // Final parameter checks that depend on the loaded data.
            CheckAndUpdateParametersBeforeTraining(ch, trainData, labels, groups);
            return dtrain;
        }
 private CategoricalMetaData GetCategoricalMetaData(IChannel ch, RoleMappedData trainData, int numRow)
 {
     // Categorical handling is auto-disabled on small data sets to reduce overfitting,
     // unless the user set Args.UseCat explicitly.
     const int useCatThreshold = 50000;
     bool useCat = Args.UseCat ?? numRow > useCatThreshold;
     if (!Args.UseCat.HasValue)
         ch.Info("Auto-tuning parameters: " + nameof(Args.UseCat) + " = " + useCat);

     int[] categoricalFeatures = null;
     if (useCat)
     {
         var featureCol = trainData.Schema.Schema[DefaultColumnNames.Features];
         MetadataUtils.TryGetCategoricalFeatureIndices(trainData.Schema.Schema, featureCol.Index, out categoricalFeatures);
     }

     int rawNumCol = trainData.Schema.Feature.Value.Type.GetVectorSize();
     FeatureCount = rawNumCol;

     CategoricalMetaData catMetaData = new CategoricalMetaData();
     catMetaData.TotalCats = 0;
     if (categoricalFeatures != null)
     {
         // Collapse the one-hot ranges and tell LightGBM which columns are categorical.
         var catIndices = ConstructCategoricalFeatureMetaData(categoricalFeatures, rawNumCol, ref catMetaData);
         Options["categorical_feature"] = string.Join(",", catIndices);
     }
     else
     {
         catMetaData.CategoricalBoudaries = null;
         catMetaData.NumCol = rawNumCol;
     }
     return catMetaData;
 }
        /// <summary>
        /// Extracts the (index, value) pairs of the cursor's current sparse feature row,
        /// remapping one-hot categorical slots into single collapsed columns.
        /// When several slots of the same categorical group are hot, one of them is kept
        /// uniformly at random (reservoir sampling). Without categorical metadata the
        /// cursor's buffers are returned unchanged.
        /// </summary>
        private void GetFeatureValueSparse(IChannel ch, FloatLabelCursor cursor,
                                           CategoricalMetaData catMetaData, Random rand, out ReadOnlySpan <int> indices,
                                           out ReadOnlySpan <float> featureValues, out int cnt)
        {
            var cursorFeaturesValues  = cursor.Features.GetValues();
            var cursorFeaturesIndices = cursor.Features.GetIndices();

            if (catMetaData.CategoricalBoudaries != null)
            {
                List <int>   featureIndices = new List <int>();
                List <float> values         = new List <float>();
                int          lastIdx        = -1;
                int          nhot           = 0;
                for (int i = 0; i < cursorFeaturesValues.Length; ++i)
                {
                    float fv        = cursorFeaturesValues[i];
                    int   colIdx    = cursorFeaturesIndices[i];
                    // Raw slot -> collapsed column index.
                    int   newColIdx = catMetaData.OnehotIndices[colIdx];
                    if (catMetaData.IsCategoricalFeature[newColIdx])
                    {
                        // Categorical: the value becomes the 1-based category id (0 is reserved for all-zero).
                        fv = catMetaData.OnehotBias[colIdx] + 1;
                    }
                    if (newColIdx != lastIdx)
                    {
                        // First hot slot seen for this collapsed column.
                        featureIndices.Add(newColIdx);
                        values.Add(fv);
                        nhot = 1;
                    }
                    else
                    {
                        // Multi-hot: keep each candidate with probability 1/nhot so the
                        // final survivor is uniform over all hot slots (reservoir sampling).
                        ++nhot;
                        var prob = rand.NextSingle();
                        if (prob < 1.0f / nhot)
                        {
                            values[values.Count - 1] = fv;
                        }
                    }
                    lastIdx = newColIdx;
                }
                indices       = featureIndices.ToArray();
                featureValues = values.ToArray();
                cnt           = featureIndices.Count;
            }
            else
            {
                // No categorical remapping: pass the cursor's sparse buffers through.
                indices       = cursorFeaturesIndices;
                featureValues = cursorFeaturesValues;
                cnt           = cursorFeaturesValues.Length;
            }
        }
// Example #4
        /// <summary>
        /// Appends the non-zero entries of the cursor's current row to the CSR buffers,
        /// advancing numElem past everything written. Rows with no values are skipped.
        /// </summary>
        private void CopyToCsr(IChannel ch, FloatLabelCursor cursor,
                               int[] indices, float[] features, CategoricalMetaData catMetaData, IRandom rand, ref int numElem)
        {
            int numValue = cursor.Features.Count;
            if (numValue <= 0)
                return;

            // The caller sized the buffers; this row must fit behind what is already there.
            ch.Assert(indices.Length >= numElem + numValue);
            ch.Assert(features.Length >= numElem + numValue);

            if (cursor.Features.IsDense)
            {
                GetFeatureValueDense(ch, cursor, catMetaData, rand, out float[] featureValues);
                for (int col = 0; col < catMetaData.NumCol; ++col)
                {
                    float fv = featureValues[col];
                    if (fv != 0)
                    {
                        features[numElem] = fv;
                        indices[numElem]  = col;
                        ++numElem;
                    }
                }
            }
            else
            {
                GetFeatureValueSparse(ch, cursor, catMetaData, rand, out int[] featureIndices, out float[] featureValues, out int cnt);
                for (int pos = 0; pos < cnt; ++pos)
                {
                    float fv = featureValues[pos];
                    if (fv != 0)
                    {
                        features[numElem] = fv;
                        indices[numElem]  = featureIndices[pos];
                        ++numElem;
                    }
                }
            }
        }
// Example #5
        /// <summary>
        /// Runs the actual LightGBM training over the prepared datasets and stores the
        /// resulting model in TrainedEnsemble.
        /// </summary>
        private void TrainCore(IChannel ch, IProgressChannel pch, Dataset dtrain, CategoricalMetaData catMetaData, Dataset dvalid = null)
        {
            Host.AssertValue(ch);
            Host.AssertValue(pch);
            Host.AssertValue(dtrain);
            Host.AssertValueOrNull(dvalid);

            // Multiclass training cannot proceed without an explicit class count in the options.
            ch.Assert(PredictionKind != PredictionKind.MultiClassClassification || Options.ContainsKey("num_class"),
                      "LightGBM requires the number of classes to be specified in the parameters.");

            // Serialize training: only one trainer may run at a time.
            lock (LightGbmShared.LockForMultiThreadingInside)
            {
                ch.Info("LightGBM objective={0}", Options["objective"]);
                using (Booster booster = WrappedLightGbmTraining.Train(ch, pch, Options, dtrain,
                    dvalid: dvalid, numIteration: Args.NumBoostRound,
                    verboseEval: Args.VerboseEval, earlyStoppingRound: Args.EarlyStoppingRound))
                {
                    TrainedEnsemble = booster.GetModel(catMetaData.CategoricalBoudaries);
                }
            }
        }
        /// <summary>
        /// Produces the dense feature values of the cursor's current row in the collapsed
        /// column layout. For each categorical group, the hot slots within the group's
        /// boundary range are reduced to a single 1-based category id, chosen uniformly
        /// at random among the hot slots (reservoir sampling); 0 means no slot was hot.
        /// Without categorical metadata the cursor's values are returned unchanged.
        /// </summary>
        private void GetFeatureValueDense(IChannel ch, FloatLabelCursor cursor, CategoricalMetaData catMetaData, Random rand, out ReadOnlySpan <float> featureValues)
        {
            var cursorFeaturesValues = cursor.Features.GetValues();

            if (catMetaData.CategoricalBoudaries != null)
            {
                float[] featureValuesTemp = new float[catMetaData.NumCol];
                for (int i = 0; i < catMetaData.NumCol; ++i)
                {
                    // Default: first raw slot of column i (exact value for non-categorical columns).
                    float fv = cursorFeaturesValues[catMetaData.CategoricalBoudaries[i]];
                    if (catMetaData.IsCategoricalFeature[i])
                    {
                        // Start one before the group so that "no hot slot" maps to category 0 below.
                        int hotIdx = catMetaData.CategoricalBoudaries[i] - 1;
                        int nhot   = 0;
                        for (int j = catMetaData.CategoricalBoudaries[i]; j < catMetaData.CategoricalBoudaries[i + 1]; ++j)
                        {
                            if (cursorFeaturesValues[j] > 0)
                            {
                                // Reservoir Sampling: each hot slot survives with probability 1/nhot,
                                // making the final pick uniform over all hot slots.
                                nhot++;
                                var prob = rand.NextSingle();
                                if (prob < 1.0f / nhot)
                                {
                                    hotIdx = j;
                                }
                            }
                        }
                        // All-Zero is category 0.
                        fv = hotIdx - catMetaData.CategoricalBoudaries[i] + 1;
                    }
                    featureValuesTemp[i] = fv;
                }
                featureValues = featureValuesTemp;
            }
            else
            {
                // No categorical remapping: pass the cursor's dense values through.
                featureValues = cursorFeaturesValues;
            }
        }
// Example #7
 /// <summary>
 /// Writes one full dense row of catMetaData.NumCol values into features starting at
 /// numElem, then advances numElem by NumCol. Gaps in sparse input are zero-filled.
 /// </summary>
 private void CopyToArray(IChannel ch, FloatLabelCursor cursor, float[] features, CategoricalMetaData catMetaData, IRandom rand, ref int numElem)
 {
     ch.Assert(features.Length >= numElem + catMetaData.NumCol);

     if (catMetaData.CategoricalBoudaries == null)
     {
         // No categorical remapping: the cursor densifies straight into the buffer.
         cursor.Features.CopyTo(features, numElem, 0.0f);
         numElem += catMetaData.NumCol;
         return;
     }

     if (cursor.Features.IsDense)
     {
         GetFeatureValueDense(ch, cursor, catMetaData, rand, out float[] featureValues);
         for (int col = 0; col < catMetaData.NumCol; ++col)
             features[numElem + col] = featureValues[col];
         numElem += catMetaData.NumCol;
     }
     else
     {
         GetFeatureValueSparse(ch, cursor, catMetaData, rand, out int[] indices, out float[] featureValues, out int cnt);
         // Scatter the sparse entries in ascending slot order, zero-filling every gap.
         int pos = 0;
         for (int i = 0; i < cnt; i++)
         {
             int   slot = indices[i];
             float fv   = featureValues[i];
             Contracts.Assert(slot >= pos);
             while (pos < slot)
                 features[numElem + pos++] = 0.0f;
             Contracts.Assert(pos == slot);
             features[numElem + pos++] = fv;
         }
         // Zero-fill the tail past the last non-zero slot.
         while (pos < catMetaData.NumCol)
             features[numElem + pos++] = 0.0f;
         numElem += catMetaData.NumCol;
     }
 }
// Example #8
        /// <summary>
        /// Load dataset. Use row batch way to reduce peak memory cost.
        /// Rows are accumulated in a batch buffer and pushed into the native dataset in
        /// chunks; a dense layout is used when the estimated density is at least 0.5,
        /// otherwise a CSR (sparse) layout is used.
        /// </summary>
        private void LoadDataset(IChannel ch, FloatLabelCursor.Factory factory, Dataset dataset, int numRow, int batchSize, CategoricalMetaData catMetaData)
        {
            Host.AssertValue(ch);
            ch.AssertValue(factory);
            ch.AssertValue(dataset);
            ch.Assert(dataset.GetNumRows() == numRow);
            ch.Assert(dataset.GetNumCols() == catMetaData.NumCol);
            var rand = Host.Rand;

            // To avoid array resizes, the batch must hold at least one full row.
            batchSize = Math.Max(batchSize, catMetaData.NumCol);
            double density       = DetectDensity(factory);
            // numElem: values written into the current batch; totalRowCount: rows consumed
            // from the cursor so far; curRowCount: rows accumulated in the current batch.
            int    numElem       = 0;
            int    totalRowCount = 0;
            int    curRowCount   = 0;

            if (density >= 0.5)
            {
                // Dense path: a row-major float buffer of batchRow complete rows.
                int batchRow = batchSize / catMetaData.NumCol;
                batchRow = Math.Max(1, batchRow);
                if (batchRow > numRow)
                {
                    batchRow = numRow;
                }

                // This can only happen if the size of ONE example(row) exceeds the max array size. This looks like a very unlikely case.
                if ((long)catMetaData.NumCol * batchRow > Utils.ArrayMaxSize)
                {
                    throw ch.Except("Size of array exceeded the " + nameof(Utils.ArrayMaxSize));
                }

                float[] features = new float[catMetaData.NumCol * batchRow];

                using (var cursor = factory.Create())
                {
                    while (cursor.MoveNext())
                    {
                        ch.Assert(totalRowCount < numRow);
                        CopyToArray(ch, cursor, features, catMetaData, rand, ref numElem);
                        ++totalRowCount;
                        ++curRowCount;
                        // Flush once the batch buffer is full.
                        if (batchRow == curRowCount)
                        {
                            ch.Assert(numElem == curRowCount * catMetaData.NumCol);
                            // PushRows is run by multi-threading inside, so lock here.
                            lock (LightGbmShared.LockForMultiThreadingInside)
                                dataset.PushRows(features, curRowCount, catMetaData.NumCol, totalRowCount - curRowCount);
                            curRowCount = 0;
                            numElem     = 0;
                        }
                    }
                    ch.Assert(totalRowCount == numRow);
                    // Flush the final partial batch, if any.
                    if (curRowCount > 0)
                    {
                        ch.Assert(numElem == curRowCount * catMetaData.NumCol);
                        // PushRows is run by multi-threading inside, so lock here.
                        lock (LightGbmShared.LockForMultiThreadingInside)
                            dataset.PushRows(features, curRowCount, catMetaData.NumCol, totalRowCount - curRowCount);
                    }
                }
            }
            else
            {
                // Sparse path: CSR buffers (values, column indices, and per-row offsets).
                // Estimated rows per batch assumes the measured density holds.
                int esimateBatchRow = (int)(batchSize / (catMetaData.NumCol * density));
                esimateBatchRow = Math.Max(1, esimateBatchRow);
                float[] features = new float[batchSize];
                int[]   indices  = new int[batchSize];
                int[]   indptr   = new int[esimateBatchRow + 1];

                using (var cursor = factory.Create())
                {
                    while (cursor.MoveNext())
                    {
                        ch.Assert(totalRowCount < numRow);
                        // Flush before copying if the next row would overflow the value buffer.
                        if (numElem + cursor.Features.Count > features.Length)
                        {
                            // Mini batch size is greater than size of one row.
                            // So, at least we have the data of one row.
                            ch.Assert(curRowCount > 0);
                            Utils.EnsureSize(ref indptr, curRowCount + 1);
                            // Close the CSR row-offset array with the total value count.
                            indptr[curRowCount] = numElem;
                            // PushRows is run by multi-threading inside, so lock here.
                            lock (LightGbmShared.LockForMultiThreadingInside)
                            {
                                dataset.PushRows(indptr, indices, features,
                                                 curRowCount + 1, numElem, catMetaData.NumCol, totalRowCount - curRowCount);
                            }
                            curRowCount = 0;
                            numElem     = 0;
                        }
                        Utils.EnsureSize(ref indptr, curRowCount + 1);
                        // Record where this row's values start.
                        indptr[curRowCount] = numElem;
                        CopyToCsr(ch, cursor, indices, features, catMetaData, rand, ref numElem);
                        ++totalRowCount;
                        ++curRowCount;
                    }
                    ch.Assert(totalRowCount == numRow);
                    // Flush the final partial batch, if any.
                    if (curRowCount > 0)
                    {
                        Utils.EnsureSize(ref indptr, curRowCount + 1);
                        indptr[curRowCount] = numElem;
                        // PushRows is run by multi-threading inside, so lock here.
                        lock (LightGbmShared.LockForMultiThreadingInside)
                        {
                            dataset.PushRows(indptr, indices, features, curRowCount + 1,
                                             numElem, catMetaData.NumCol, totalRowCount - curRowCount);
                        }
                    }
                }
            }
        }
// Example #9
        /// <summary>
        /// Create a dataset from the sampling data.
        /// Rows are visited with a randomized stride so that roughly numSampleRow rows
        /// are sampled across the whole data, and the sampled non-zero values are
        /// accumulated per column before constructing the native Dataset.
        /// </summary>
        private void CreateDatasetFromSamplingData(IChannel ch, FloatLabelCursor.Factory factory,
                                                   int numRow, string param, float[] labels, float[] weights, int[] groups, CategoricalMetaData catMetaData,
                                                   out Dataset dataset)
        {
            Host.AssertValue(ch);

            int numSampleRow = GetNumSampleRow(numRow, FeatureCount);

            var    rand        = Host.Rand;
            double averageStep = (double)numRow / numSampleRow;
            int    totalIdx    = 0;  // Row index within the original data.
            int    sampleIdx   = 0;  // Row index within the sampled subset.
            double density     = DetectDensity(factory);

            // Column-major buffers for the sampled non-zero values and the sample-row
            // index each value came from.
            double[][] sampleValuePerColumn   = new double[catMetaData.NumCol][];
            int[][]    sampleIndicesPerColumn = new int[catMetaData.NumCol][];
            int[]      nonZeroCntPerColumn    = new int[catMetaData.NumCol];
            int        estimateNonZeroCnt     = Math.Max(1, (int)(numSampleRow * density));

            for (int i = 0; i < catMetaData.NumCol; i++)
            {
                nonZeroCntPerColumn[i]    = 0;
                sampleValuePerColumn[i]   = new double[estimateNonZeroCnt];
                sampleIndicesPerColumn[i] = new int[estimateNonZeroCnt];
            }
            using (var cursor = factory.Create())
            {
                // A random stride in [1, 2*averageStep - 1] keeps the expected step near averageStep.
                int step = 1;
                if (averageStep > 1)
                {
                    step = rand.Next((int)(2 * averageStep - 1)) + 1;
                }
                while (MoveMany(cursor, step))
                {
                    if (cursor.Features.IsDense)
                    {
                        GetFeatureValueDense(ch, cursor, catMetaData, rand, out float[] featureValues);
                        for (int i = 0; i < catMetaData.NumCol; ++i)
                        {
                            float fv = featureValues[i];
                            if (fv == 0)
                            {
                                continue;
                            }
                            int curNonZeroCnt = nonZeroCntPerColumn[i];
                            Utils.EnsureSize(ref sampleValuePerColumn[i], curNonZeroCnt + 1);
                            Utils.EnsureSize(ref sampleIndicesPerColumn[i], curNonZeroCnt + 1);
                            sampleValuePerColumn[i][curNonZeroCnt]   = fv;
                            sampleIndicesPerColumn[i][curNonZeroCnt] = sampleIdx;
                            nonZeroCntPerColumn[i] = curNonZeroCnt + 1;
                        }
                    }
                    else
                    {
                        GetFeatureValueSparse(ch, cursor, catMetaData, rand, out int[] featureIndices, out float[] featureValues, out int cnt);
                        for (int i = 0; i < cnt; ++i)
                        {
                            int   colIdx = featureIndices[i];
                            float fv     = featureValues[i];
                            if (fv == 0)
                            {
                                continue;
                            }
                            int curNonZeroCnt = nonZeroCntPerColumn[colIdx];
                            Utils.EnsureSize(ref sampleValuePerColumn[colIdx], curNonZeroCnt + 1);
                            Utils.EnsureSize(ref sampleIndicesPerColumn[colIdx], curNonZeroCnt + 1);
                            sampleValuePerColumn[colIdx][curNonZeroCnt]   = fv;
                            sampleIndicesPerColumn[colIdx][curNonZeroCnt] = sampleIdx;
                            nonZeroCntPerColumn[colIdx] = curNonZeroCnt + 1;
                        }
                    }
                    totalIdx += step;
                    ++sampleIdx;
                    if (numSampleRow == sampleIdx || numRow == totalIdx)
                    {
                        break;
                    }
                    // Recompute the stride so the remaining samples spread evenly over the remaining rows.
                    averageStep = (double)(numRow - totalIdx) / (numSampleRow - sampleIdx);
                    step        = 1;
                    if (averageStep > 1)
                    {
                        step = rand.Next((int)(2 * averageStep - 1)) + 1;
                    }
                }
            }
            dataset = new Dataset(sampleValuePerColumn, sampleIndicesPerColumn, catMetaData.NumCol, nonZeroCntPerColumn, sampleIdx, numRow, param, labels, weights, groups);
        }
// Example #10
        private Dataset LoadValidationData(IChannel ch, Dataset dtrain, RoleMappedData validData, CategoricalMetaData catMetaData)
        {
            // Sanity checks first.
            Host.AssertValue(ch);
            ch.CheckValue(validData, nameof(validData));
            CheckDataValid(ch, validData);

            // Pull row count, labels, weights and groups for the validation set.
            var factory = CreateCursorFactory(validData);
            GetMetainfo(ch, factory, out int numRow, out float[] labels, out float[] weights, out int[] groups);

            // Construct the validation dataset from the training dataset handle.
            var dvalid = new Dataset(dtrain, numRow, labels, weights, groups);

            // Stream the feature rows in.
            LoadDataset(ch, factory, dvalid, numRow, Args.BatchSize, catMetaData);
            return dvalid;
        }
// Example #11
        /// <summary>
        /// Collapses each categorical one-hot range into a single column and fills the
        /// bookkeeping arrays that map raw slots onto the collapsed layout. Returns the
        /// collapsed indices (as strings) of the genuinely categorical columns.
        /// </summary>
        private static List <string> ConstructCategoricalFeatureMetaData(int[] categoricalFeatures, int rawNumCol, ref CategoricalMetaData catMetaData)
        {
            List<int> catBoundaries = GetCategoricalBoundires(categoricalFeatures, rawNumCol);

            int numCol = catBoundaries.Count - 1;
            catMetaData.NumCol = numCol;
            catMetaData.CategoricalBoudaries = catBoundaries.ToArray();
            catMetaData.IsCategoricalFeature = new bool[numCol];
            catMetaData.OnehotIndices = new int[rawNumCol];
            catMetaData.OnehotBias = new int[rawNumCol];

            var catIndices = new List<string>();
            int rawSlot = 0;
            for (int col = 0; col < numCol; ++col)
            {
                int begin = catMetaData.CategoricalBoudaries[col];
                int end = catMetaData.CategoricalBoudaries[col + 1];
                int numCat = end - begin;
                if (numCat > 1)
                {
                    // A genuine categorical group: each raw slot in [begin, end) maps to this
                    // collapsed column, with its offset inside the group recorded as the bias.
                    catMetaData.TotalCats += numCat;
                    catMetaData.IsCategoricalFeature[col] = true;
                    catIndices.Add(col.ToString());
                    for (int k = begin; k < end; ++k)
                    {
                        catMetaData.OnehotIndices[rawSlot] = col;
                        catMetaData.OnehotBias[rawSlot] = k - begin;
                        ++rawSlot;
                    }
                }
                else
                {
                    // Plain numeric column: identity mapping with zero bias.
                    catMetaData.IsCategoricalFeature[col] = false;
                    catMetaData.OnehotIndices[rawSlot] = col;
                    catMetaData.OnehotBias[rawSlot] = 0;
                    ++rawSlot;
                }
            }
            return catIndices;
        }
        /// <summary>
        /// Create a dataset from the sampling data.
        /// Rows are visited with a randomized stride so that roughly numSampleRow rows
        /// are sampled across the whole data, and the sampled non-zero values are
        /// accumulated per column before constructing the native Dataset.
        /// </summary>
        private void CreateDatasetFromSamplingData(IChannel ch, FloatLabelCursor.Factory factory,
                                                   int numRow, string param, float[] labels, float[] weights, int[] groups, CategoricalMetaData catMetaData,
                                                   out Dataset dataset)
        {
            Host.AssertValue(ch);

            int numSampleRow = GetNumSampleRow(numRow, FeatureCount);

            var    rand        = Host.Rand;
            double averageStep = (double)numRow / numSampleRow;
            int    totalIdx    = 0;  // Row index within the original data.
            int    sampleIdx   = 0;  // Row index within the sampled subset.
            double density     = DetectDensity(factory);

            // Column-major buffers for the sampled non-zero values and the sample-row
            // index each value came from.
            double[][] sampleValuePerColumn   = new double[catMetaData.NumCol][];
            int[][]    sampleIndicesPerColumn = new int[catMetaData.NumCol][];
            int[]      nonZeroCntPerColumn    = new int[catMetaData.NumCol];
            int        estimateNonZeroCnt     = Math.Max(1, (int)(numSampleRow * density));

            for (int i = 0; i < catMetaData.NumCol; i++)
            {
                nonZeroCntPerColumn[i]    = 0;
                sampleValuePerColumn[i]   = new double[estimateNonZeroCnt];
                sampleIndicesPerColumn[i] = new int[estimateNonZeroCnt];
            }
            using (var cursor = factory.Create())
            {
                // A random stride in [1, 2*averageStep - 1] keeps the expected step near averageStep.
                int step = 1;
                if (averageStep > 1)
                {
                    step = rand.Next((int)(2 * averageStep - 1)) + 1;
                }
                while (MoveMany(cursor, step))
                {
                    if (cursor.Features.IsDense)
                    {
                        GetFeatureValueDense(ch, cursor, catMetaData, rand, out ReadOnlySpan <float> featureValues);
                        for (int i = 0; i < catMetaData.NumCol; ++i)
                        {
                            float fv = featureValues[i];
                            if (fv == 0)
                            {
                                continue;
                            }
                            int curNonZeroCnt = nonZeroCntPerColumn[i];
                            Utils.EnsureSize(ref sampleValuePerColumn[i], curNonZeroCnt + 1);
                            Utils.EnsureSize(ref sampleIndicesPerColumn[i], curNonZeroCnt + 1);
                            // sampleValuePerColumn[i][j] is the j-th non-zero value of the i-th feature
                            // found while scanning the sampled rows example-by-example.
                            sampleValuePerColumn[i][curNonZeroCnt] = fv;
                            // sampleIndicesPerColumn[i][j] is the sampled-row index that value came from.
                            sampleIndicesPerColumn[i][curNonZeroCnt] = sampleIdx;
                            // nonZeroCntPerColumn[i] counts the non-zero values seen for the i-th feature.
                            nonZeroCntPerColumn[i] = curNonZeroCnt + 1;
                        }
                    }
                    else
                    {
                        GetFeatureValueSparse(ch, cursor, catMetaData, rand, out ReadOnlySpan <int> featureIndices, out ReadOnlySpan <float> featureValues, out int cnt);
                        for (int i = 0; i < cnt; ++i)
                        {
                            int   colIdx = featureIndices[i];
                            float fv     = featureValues[i];
                            if (fv == 0)
                            {
                                continue;
                            }
                            int curNonZeroCnt = nonZeroCntPerColumn[colIdx];
                            Utils.EnsureSize(ref sampleValuePerColumn[colIdx], curNonZeroCnt + 1);
                            Utils.EnsureSize(ref sampleIndicesPerColumn[colIdx], curNonZeroCnt + 1);
                            sampleValuePerColumn[colIdx][curNonZeroCnt]   = fv;
                            sampleIndicesPerColumn[colIdx][curNonZeroCnt] = sampleIdx;
                            nonZeroCntPerColumn[colIdx] = curNonZeroCnt + 1;
                        }
                    }
                    // Actual row index sampled from the original data set.
                    totalIdx += step;
                    // Row index in the sub-sampled data created in this loop.
                    ++sampleIdx;
                    if (numSampleRow == sampleIdx || numRow == totalIdx)
                    {
                        break;
                    }
                    // Recompute the stride so the remaining samples spread evenly over the remaining rows.
                    averageStep = (double)(numRow - totalIdx) / (numSampleRow - sampleIdx);
                    step        = 1;
                    if (averageStep > 1)
                    {
                        step = rand.Next((int)(2 * averageStep - 1)) + 1;
                    }
                }
            }
            dataset = new Dataset(sampleValuePerColumn, sampleIndicesPerColumn, catMetaData.NumCol, nonZeroCntPerColumn, sampleIdx, numRow, param, labels, weights, groups);
        }