/// <summary>
/// Load the training data into a LightGBM dataset: gather metadata, build categorical
/// metadata, create the dataset from sampled rows, then push all rows into it.
/// </summary>
private Dataset LoadTrainingData(IChannel ch, RoleMappedData trainData, out CategoricalMetaData catMetaData)
{
    // Verifications.
    Host.AssertValue(ch);
    ch.CheckValue(trainData, nameof(trainData));
    CheckDataValid(ch, trainData);

    // Load metadata first.
    var factory = CreateCursorFactory(trainData);
    GetMetainfo(ch, factory, out int numRow, out float[] labels, out float[] weights, out int[] groups);
    catMetaData = GetCategoricalMetaData(ch, trainData, numRow);
    GetDefaultParameters(ch, numRow, catMetaData.CategoricalBoudaries != null, catMetaData.TotalCats);

    Dataset dtrain;
    string param = LightGbmInterfaceUtils.JoinParameters(Options);

    // To reduce peak memory usage, only enable one sampling task at any given time.
    lock (LightGbmShared.SampleLock)
    {
        CreateDatasetFromSamplingData(ch, factory, numRow, param, labels, weights, groups, catMetaData, out dtrain);
    }

    // Push rows into dataset.
    LoadDataset(ch, factory, dtrain, numRow, Args.BatchSize, catMetaData);

    // Some checks.
    CheckAndUpdateParametersBeforeTraining(ch, trainData, labels, groups);

    return dtrain;
}
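/// <summary>
/// Decides whether to treat features as categorical (auto-tuned by row count unless set explicitly)
/// and builds the categorical metadata (boundaries, one-hot indices and biases) for the feature column.
/// </summary>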
private CategoricalMetaData GetCategoricalMetaData(IChannel ch, RoleMappedData trainData, int numRow)
{
    CategoricalMetaData catMetaData = new CategoricalMetaData();
    int[] categoricalFeatures = null;
    const int useCatThreshold = 50000;
    // Disable categorical handling when the dataset is small, to reduce overfitting.
    bool useCat = Args.UseCat ?? numRow > useCatThreshold;
    if (!Args.UseCat.HasValue)
        ch.Info("Auto-tuning parameters: " + nameof(Args.UseCat) + " = " + useCat);
    if (useCat)
    {
        var featureCol = trainData.Schema.Schema[DefaultColumnNames.Features];
        MetadataUtils.TryGetCategoricalFeatureIndices(trainData.Schema.Schema, featureCol.Index, out categoricalFeatures);
    }
    var colType = trainData.Schema.Feature.Value.Type;
    int rawNumCol = colType.GetVectorSize();
    FeatureCount = rawNumCol;
    catMetaData.TotalCats = 0;
    if (categoricalFeatures == null)
    {
        catMetaData.CategoricalBoudaries = null;
        catMetaData.NumCol = rawNumCol;
    }
    else
    {
        var catIndices = ConstructCategoricalFeatureMetaData(categoricalFeatures, rawNumCol, ref catMetaData);
        // Set categorical features.
        Options["categorical_feature"] = string.Join(",", catIndices);
    }
    return catMetaData;
}
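/// <summary>
/// Extracts the feature values of a sparse row. When categorical boundaries are present, the
/// one-hot expanded slots are collapsed back into single categorical values; if several slots
/// of the same categorical feature are active (multi-hot), one is kept uniformly at random.
/// </summary>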
private void GetFeatureValueSparse(IChannel ch, FloatLabelCursor cursor, CategoricalMetaData catMetaData, Random rand,
    out ReadOnlySpan<int> indices, out ReadOnlySpan<float> featureValues, out int cnt)
{
    var cursorFeaturesValues = cursor.Features.GetValues();
    var cursorFeaturesIndices = cursor.Features.GetIndices();
    if (catMetaData.CategoricalBoudaries != null)
    {
        List<int> featureIndices = new List<int>();
        List<float> values = new List<float>();
        int lastIdx = -1;
        int nhot = 0;
        for (int i = 0; i < cursorFeaturesValues.Length; ++i)
        {
            float fv = cursorFeaturesValues[i];
            int colIdx = cursorFeaturesIndices[i];
            int newColIdx = catMetaData.OnehotIndices[colIdx];
            if (catMetaData.IsCategoricalFeature[newColIdx])
                fv = catMetaData.OnehotBias[colIdx] + 1;
            if (newColIdx != lastIdx)
            {
                featureIndices.Add(newColIdx);
                values.Add(fv);
                nhot = 1;
            }
            else
            {
                // Multi-hot.
                ++nhot;
                var prob = rand.NextSingle();
                if (prob < 1.0f / nhot)
                    values[values.Count - 1] = fv;
            }
            lastIdx = newColIdx;
        }
        indices = featureIndices.ToArray();
        featureValues = values.ToArray();
        cnt = featureIndices.Count;
    }
    else
    {
        indices = cursorFeaturesIndices;
        featureValues = cursorFeaturesValues;
        cnt = cursorFeaturesValues.Length;
    }
}
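/// <summary>
/// Appends one cursor row to the CSR buffers (parallel index/value arrays), skipping zero entries.
/// </summary>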
private void CopyToCsr(IChannel ch, FloatLabelCursor cursor, int[] indices, float[] features,
    CategoricalMetaData catMetaData, Random rand, ref int numElem)
{
    int numValue = cursor.Features.Count;
    if (numValue > 0)
    {
        ch.Assert(indices.Length >= numElem + numValue);
        ch.Assert(features.Length >= numElem + numValue);
        if (cursor.Features.IsDense)
        {
            GetFeatureValueDense(ch, cursor, catMetaData, rand, out ReadOnlySpan<float> featureValues);
            for (int i = 0; i < catMetaData.NumCol; ++i)
            {
                float fv = featureValues[i];
                if (fv == 0)
                    continue;
                features[numElem] = fv;
                indices[numElem] = i;
                ++numElem;
            }
        }
        else
        {
            GetFeatureValueSparse(ch, cursor, catMetaData, rand, out ReadOnlySpan<int> featureIndices, out ReadOnlySpan<float> featureValues, out int cnt);
            for (int i = 0; i < cnt; ++i)
            {
                int colIdx = featureIndices[i];
                float fv = featureValues[i];
                if (fv == 0)
                    continue;
                features[numElem] = fv;
                indices[numElem] = colIdx;
                ++numElem;
            }
        }
    }
}
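/// <summary>
/// Runs LightGBM training on the prepared datasets and stores the resulting model in TrainedEnsemble.
/// </summary>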
private void TrainCore(IChannel ch, IProgressChannel pch, Dataset dtrain, CategoricalMetaData catMetaData, Dataset dvalid = null)
{
    Host.AssertValue(ch);
    Host.AssertValue(pch);
    Host.AssertValue(dtrain);
    Host.AssertValueOrNull(dvalid);
    // For multiclass, the number of labels is required.
    ch.Assert(PredictionKind != PredictionKind.MultiClassClassification || Options.ContainsKey("num_class"),
        "LightGBM requires the number of classes to be specified in the parameters.");

    // Only allow one trainer to run at a time.
    lock (LightGbmShared.LockForMultiThreadingInside)
    {
        ch.Info("LightGBM objective={0}", Options["objective"]);
        using (Booster bst = WrappedLightGbmTraining.Train(ch, pch, Options, dtrain,
            dvalid: dvalid, numIteration: Args.NumBoostRound,
            verboseEval: Args.VerboseEval, earlyStoppingRound: Args.EarlyStoppingRound))
        {
            TrainedEnsemble = bst.GetModel(catMetaData.CategoricalBoudaries);
        }
    }
}
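/// <summary>
/// Extracts the feature values of a dense row. Each one-hot expanded slot group is collapsed into a
/// single categorical value; when several slots in a group are active, one is chosen uniformly via
/// reservoir sampling, and an all-zero group maps to category 0.
/// </summary>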
private void GetFeatureValueDense(IChannel ch, FloatLabelCursor cursor, CategoricalMetaData catMetaData, Random rand,
    out ReadOnlySpan<float> featureValues)
{
    var cursorFeaturesValues = cursor.Features.GetValues();
    if (catMetaData.CategoricalBoudaries != null)
    {
        float[] featureValuesTemp = new float[catMetaData.NumCol];
        for (int i = 0; i < catMetaData.NumCol; ++i)
        {
            float fv = cursorFeaturesValues[catMetaData.CategoricalBoudaries[i]];
            if (catMetaData.IsCategoricalFeature[i])
            {
                int hotIdx = catMetaData.CategoricalBoudaries[i] - 1;
                int nhot = 0;
                for (int j = catMetaData.CategoricalBoudaries[i]; j < catMetaData.CategoricalBoudaries[i + 1]; ++j)
                {
                    if (cursorFeaturesValues[j] > 0)
                    {
                        // Reservoir sampling.
                        nhot++;
                        var prob = rand.NextSingle();
                        if (prob < 1.0f / nhot)
                            hotIdx = j;
                    }
                }
                // All-zero is category 0.
                fv = hotIdx - catMetaData.CategoricalBoudaries[i] + 1;
            }
            featureValuesTemp[i] = fv;
        }
        featureValues = featureValuesTemp;
    }
    else
    {
        featureValues = cursorFeaturesValues;
    }
}
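/// <summary>
/// Appends one cursor row to a dense row-major buffer, expanding sparse rows with explicit zeros.
/// </summary>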
private void CopyToArray(IChannel ch, FloatLabelCursor cursor, float[] features, CategoricalMetaData catMetaData,
    Random rand, ref int numElem)
{
    ch.Assert(features.Length >= numElem + catMetaData.NumCol);
    if (catMetaData.CategoricalBoudaries != null)
    {
        if (cursor.Features.IsDense)
        {
            GetFeatureValueDense(ch, cursor, catMetaData, rand, out ReadOnlySpan<float> featureValues);
            for (int i = 0; i < catMetaData.NumCol; ++i)
                features[numElem + i] = featureValues[i];
            numElem += catMetaData.NumCol;
        }
        else
        {
            GetFeatureValueSparse(ch, cursor, catMetaData, rand, out ReadOnlySpan<int> indices, out ReadOnlySpan<float> featureValues, out int cnt);
            int lastIdx = 0;
            for (int i = 0; i < cnt; i++)
            {
                int slot = indices[i];
                float fv = featureValues[i];
                Contracts.Assert(slot >= lastIdx);
                // Fill the gap before this slot with explicit zeros.
                while (lastIdx < slot)
                    features[numElem + lastIdx++] = 0.0f;
                Contracts.Assert(lastIdx == slot);
                features[numElem + lastIdx++] = fv;
            }
            // Fill the tail of the row with zeros.
            while (lastIdx < catMetaData.NumCol)
                features[numElem + lastIdx++] = 0.0f;
            numElem += catMetaData.NumCol;
        }
    }
    else
    {
        cursor.Features.CopyTo(features, numElem, 0.0f);
        numElem += catMetaData.NumCol;
    }
}
/// <summary>
/// Load the dataset, pushing rows in batches to reduce peak memory cost.
/// </summary>
private void LoadDataset(IChannel ch, FloatLabelCursor.Factory factory, Dataset dataset, int numRow, int batchSize, CategoricalMetaData catMetaData)
{
    Host.AssertValue(ch);
    ch.AssertValue(factory);
    ch.AssertValue(dataset);
    ch.Assert(dataset.GetNumRows() == numRow);
    ch.Assert(dataset.GetNumCols() == catMetaData.NumCol);
    var rand = Host.Rand;
    // To avoid array resizing, the batch size should be at least the size of one row.
    batchSize = Math.Max(batchSize, catMetaData.NumCol);
    double density = DetectDensity(factory);
    int numElem = 0;
    int totalRowCount = 0;
    int curRowCount = 0;
    if (density >= 0.5)
    {
        // Dense path: push row-major batches.
        int batchRow = batchSize / catMetaData.NumCol;
        batchRow = Math.Max(1, batchRow);
        if (batchRow > numRow)
            batchRow = numRow;

        // This can only happen if the size of one example (row) exceeds the max array size,
        // which is a very unlikely case.
        if ((long)catMetaData.NumCol * batchRow > Utils.ArrayMaxSize)
            throw ch.Except("Size of array exceeded the " + nameof(Utils.ArrayMaxSize));

        float[] features = new float[catMetaData.NumCol * batchRow];

        using (var cursor = factory.Create())
        {
            while (cursor.MoveNext())
            {
                ch.Assert(totalRowCount < numRow);
                CopyToArray(ch, cursor, features, catMetaData, rand, ref numElem);
                ++totalRowCount;
                ++curRowCount;
                if (batchRow == curRowCount)
                {
                    ch.Assert(numElem == curRowCount * catMetaData.NumCol);
                    // PushRows is run by multi-threading inside, so lock here.
                    lock (LightGbmShared.LockForMultiThreadingInside)
                        dataset.PushRows(features, curRowCount, catMetaData.NumCol, totalRowCount - curRowCount);
                    curRowCount = 0;
                    numElem = 0;
                }
            }
            ch.Assert(totalRowCount == numRow);
            if (curRowCount > 0)
            {
                ch.Assert(numElem == curRowCount * catMetaData.NumCol);
                // PushRows is run by multi-threading inside, so lock here.
                lock (LightGbmShared.LockForMultiThreadingInside)
                    dataset.PushRows(features, curRowCount, catMetaData.NumCol, totalRowCount - curRowCount);
            }
        }
    }
    else
    {
        // Sparse path: push CSR batches.
        int estimateBatchRow = (int)(batchSize / (catMetaData.NumCol * density));
        estimateBatchRow = Math.Max(1, estimateBatchRow);
        float[] features = new float[batchSize];
        int[] indices = new int[batchSize];
        int[] indptr = new int[estimateBatchRow + 1];

        using (var cursor = factory.Create())
        {
            while (cursor.MoveNext())
            {
                ch.Assert(totalRowCount < numRow);
                // Push the accumulated rows to LightGBM when the buffer cannot hold the next row.
                if (numElem + cursor.Features.Count > features.Length)
                {
                    // The batch size is at least the size of one row,
                    // so we have the data of at least one row here.
                    ch.Assert(curRowCount > 0);
                    Utils.EnsureSize(ref indptr, curRowCount + 1);
                    indptr[curRowCount] = numElem;
                    // PushRows is run by multi-threading inside, so lock here.
                    lock (LightGbmShared.LockForMultiThreadingInside)
                    {
                        dataset.PushRows(indptr, indices, features, curRowCount + 1, numElem, catMetaData.NumCol, totalRowCount - curRowCount);
                    }
                    curRowCount = 0;
                    numElem = 0;
                }
                Utils.EnsureSize(ref indptr, curRowCount + 1);
                indptr[curRowCount] = numElem;
                CopyToCsr(ch, cursor, indices, features, catMetaData, rand, ref numElem);
                ++totalRowCount;
                ++curRowCount;
            }
            ch.Assert(totalRowCount == numRow);
            if (curRowCount > 0)
            {
                Utils.EnsureSize(ref indptr, curRowCount + 1);
                indptr[curRowCount] = numElem;
                // PushRows is run by multi-threading inside, so lock here.
                lock (LightGbmShared.LockForMultiThreadingInside)
                {
                    dataset.PushRows(indptr, indices, features, curRowCount + 1, numElem, catMetaData.NumCol, totalRowCount - curRowCount);
                }
            }
        }
    }
}
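/// <summary>
/// Load the validation data and construct the validation dataset from the training dataset,
/// then push its rows in.
/// </summary>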
private Dataset LoadValidationData(IChannel ch, Dataset dtrain, RoleMappedData validData, CategoricalMetaData catMetaData)
{
    // Verifications.
    Host.AssertValue(ch);
    ch.CheckValue(validData, nameof(validData));
    CheckDataValid(ch, validData);

    // Load meta info first.
    var factory = CreateCursorFactory(validData);
    GetMetainfo(ch, factory, out int numRow, out float[] labels, out float[] weights, out int[] groups);

    // Construct validation dataset.
    Dataset dvalid = new Dataset(dtrain, numRow, labels, weights, groups);

    // Push rows into dataset.
    LoadDataset(ch, factory, dvalid, numRow, Args.BatchSize, catMetaData);

    return dvalid;
}
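/// <summary>
/// Builds the mapping from raw (one-hot expanded) columns to collapsed columns: boundaries,
/// per-column categorical flags, and the one-hot index/bias lookup tables. Returns the collapsed
/// indices of the categorical columns, to be joined into LightGBM's "categorical_feature" parameter.
/// </summary>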
private static List<string> ConstructCategoricalFeatureMetaData(int[] categoricalFeatures, int rawNumCol, ref CategoricalMetaData catMetaData)
{
    List<int> catBoundaries = GetCategoricalBoundires(categoricalFeatures, rawNumCol);
    catMetaData.NumCol = catBoundaries.Count - 1;
    catMetaData.CategoricalBoudaries = catBoundaries.ToArray();
    catMetaData.IsCategoricalFeature = new bool[catMetaData.NumCol];
    catMetaData.OnehotIndices = new int[rawNumCol];
    catMetaData.OnehotBias = new int[rawNumCol];
    List<string> catIndices = new List<string>();
    int j = 0;
    for (int i = 0; i < catMetaData.NumCol; ++i)
    {
        var numCat = catMetaData.CategoricalBoudaries[i + 1] - catMetaData.CategoricalBoudaries[i];
        if (numCat > 1)
        {
            catMetaData.TotalCats += numCat;
            catMetaData.IsCategoricalFeature[i] = true;
            catIndices.Add(i.ToString());
            for (int k = catMetaData.CategoricalBoudaries[i]; k < catMetaData.CategoricalBoudaries[i + 1]; ++k)
            {
                catMetaData.OnehotIndices[j] = i;
                catMetaData.OnehotBias[j] = k - catMetaData.CategoricalBoudaries[i];
                ++j;
            }
        }
        else
        {
            catMetaData.IsCategoricalFeature[i] = false;
            catMetaData.OnehotIndices[j] = i;
            catMetaData.OnehotBias[j] = 0;
            ++j;
        }
    }
    return catIndices;
}
/// <summary>
/// Create a dataset from the sampling data.
/// </summary>
private void CreateDatasetFromSamplingData(IChannel ch, FloatLabelCursor.Factory factory, int numRow,
    string param, float[] labels, float[] weights, int[] groups, CategoricalMetaData catMetaData, out Dataset dataset)
{
    Host.AssertValue(ch);

    int numSampleRow = GetNumSampleRow(numRow, FeatureCount);

    var rand = Host.Rand;
    double averageStep = (double)numRow / numSampleRow;
    int totalIdx = 0;
    int sampleIdx = 0;
    double density = DetectDensity(factory);

    double[][] sampleValuePerColumn = new double[catMetaData.NumCol][];
    int[][] sampleIndicesPerColumn = new int[catMetaData.NumCol][];
    int[] nonZeroCntPerColumn = new int[catMetaData.NumCol];
    int estimateNonZeroCnt = (int)(numSampleRow * density);
    estimateNonZeroCnt = Math.Max(1, estimateNonZeroCnt);
    for (int i = 0; i < catMetaData.NumCol; i++)
    {
        nonZeroCntPerColumn[i] = 0;
        sampleValuePerColumn[i] = new double[estimateNonZeroCnt];
        sampleIndicesPerColumn[i] = new int[estimateNonZeroCnt];
    }
    using (var cursor = factory.Create())
    {
        int step = 1;
        if (averageStep > 1)
            step = rand.Next((int)(2 * averageStep - 1)) + 1;
        while (MoveMany(cursor, step))
        {
            if (cursor.Features.IsDense)
            {
                GetFeatureValueDense(ch, cursor, catMetaData, rand, out ReadOnlySpan<float> featureValues);
                for (int i = 0; i < catMetaData.NumCol; ++i)
                {
                    float fv = featureValues[i];
                    if (fv == 0)
                        continue;
                    int curNonZeroCnt = nonZeroCntPerColumn[i];
                    Utils.EnsureSize(ref sampleValuePerColumn[i], curNonZeroCnt + 1);
                    Utils.EnsureSize(ref sampleIndicesPerColumn[i], curNonZeroCnt + 1);
                    // sampleValuePerColumn[i][j] is the j-th non-zero value of the i-th feature found
                    // while scanning the sampled examples one by one.
                    sampleValuePerColumn[i][curNonZeroCnt] = fv;
                    // sampleIndicesPerColumn[i][j] is the sampled-row index at which that value was found;
                    // for fully dense data it is simply the j-th sampled example.
                    sampleIndicesPerColumn[i][curNonZeroCnt] = sampleIdx;
                    // nonZeroCntPerColumn[i] tracks the number of non-zero values of the i-th feature.
                    nonZeroCntPerColumn[i] = curNonZeroCnt + 1;
                }
            }
            else
            {
                GetFeatureValueSparse(ch, cursor, catMetaData, rand, out ReadOnlySpan<int> featureIndices, out ReadOnlySpan<float> featureValues, out int cnt);
                for (int i = 0; i < cnt; ++i)
                {
                    int colIdx = featureIndices[i];
                    float fv = featureValues[i];
                    if (fv == 0)
                        continue;
                    int curNonZeroCnt = nonZeroCntPerColumn[colIdx];
                    Utils.EnsureSize(ref sampleValuePerColumn[colIdx], curNonZeroCnt + 1);
                    Utils.EnsureSize(ref sampleIndicesPerColumn[colIdx], curNonZeroCnt + 1);
                    sampleValuePerColumn[colIdx][curNonZeroCnt] = fv;
                    sampleIndicesPerColumn[colIdx][curNonZeroCnt] = sampleIdx;
                    nonZeroCntPerColumn[colIdx] = curNonZeroCnt + 1;
                }
            }
            // Actual row index sampled from the original data set.
            totalIdx += step;
            // Row index in the sub-sampled data created in this loop.
            ++sampleIdx;
            if (numSampleRow == sampleIdx || numRow == totalIdx)
                break;
            averageStep = (double)(numRow - totalIdx) / (numSampleRow - sampleIdx);
            step = 1;
            if (averageStep > 1)
                step = rand.Next((int)(2 * averageStep - 1)) + 1;
        }
    }
    dataset = new Dataset(sampleValuePerColumn, sampleIndicesPerColumn, catMetaData.NumCol, nonZeroCntPerColumn,
        sampleIdx, numRow, param, labels, weights, groups);
}