Ejemplo n.º 1
0
        /// <summary>
        /// Create a <see cref="Dataset"/> for storing training and prediciton data under LightGBM framework. The main goal of this function
        /// is not marshaling ML.NET data set into LightGBM format but just creates a (unmanaged) container where examples can be pushed into by calling
        /// <see cref="PushRows(float[], int, int, int)"/>. It also pre-allocates memory so the actual size (number of examples and number of features)
        /// of the data set is required. A sub-sampled version of the original data set is passed in to compute some statictics needed by the training
        /// procedure. Note that we use "original" to indicate a property from the unsampled data set.
        /// </summary>
        /// <param name="sampleValuePerColumn">A 2-D array which encodes the sub-sampled data matrix. sampleValuePerColumn[i] stores
        /// all the non-zero values of the i-th feature. sampleValuePerColumn[i][j] is the j-th non-zero value of i-th feature encountered when scanning
        /// the values row-by-row (i.e., example-by-example) in the matrix and column-by-column (i.e., feature-by-feature) within one row. It is similar
        /// to CSC format for storing sparse matrix.</param>
        /// <param name="sampleIndicesPerColumn">A 2-D array which encodes sub-sampled example indexes of non-zero features stored in sampleValuePerColumn.
        /// The sampleIndicesPerColumn[i][j]-th example has a non-zero i-th feature whose value is sampleValuePerColumn[i][j].</param>
        /// <param name="numCol">Total number of features in the original data.</param>
        /// <param name="sampleNonZeroCntPerColumn">sampleNonZeroCntPerColumn[i] is the size of sampleValuePerColumn[i].</param>
        /// <param name="numSampleRow">The number of sampled examples in the sub-sampled data matrix.</param>
        /// <param name="numTotalRow">The number of original examples added using <see cref="PushRows(float[], int, int, int)"/>.</param>
        /// <param name="param">LightGBM parameter used in https://github.com/Microsoft/LightGBM/blob/c920e6345bcb41fc1ec6ac338f5437034b9f0d38/src/c_api.cpp#L421. </param>
        /// <param name="labels">Labels of the original data. labels[i] is the label of the i-th original example.</param>
        /// <param name="weights">Example weights of the original data. weights[i] is the weight of the i-th original example.</param>
        /// <param name="groups">Group identifiers of the original data. groups[i] is the group ID of the i-th original example.</param>
        public unsafe Dataset(double[][] sampleValuePerColumn,
                              int[][] sampleIndicesPerColumn,
                              int numCol,
                              int[] sampleNonZeroCntPerColumn,
                              int numSampleRow,
                              int numTotalRow,
                              string param, float[] labels, float[] weights = null, int[] groups = null)
        {
            _handle = null;

            // Use GCHandle to pin the memory, avoid the memory relocation.
            GCHandle[] gcValues  = new GCHandle[numCol];
            GCHandle[] gcIndices = new GCHandle[numCol];
            try
            {
                double *[] ptrArrayValues  = new double *[numCol];
                int *[]    ptrArrayIndices = new int *[numCol];
                for (int i = 0; i < numCol; i++)
                {
                    gcValues[i]        = GCHandle.Alloc(sampleValuePerColumn[i], GCHandleType.Pinned);
                    ptrArrayValues[i]  = (double *)gcValues[i].AddrOfPinnedObject().ToPointer();
                    gcIndices[i]       = GCHandle.Alloc(sampleIndicesPerColumn[i], GCHandleType.Pinned);
                    ptrArrayIndices[i] = (int *)gcIndices[i].AddrOfPinnedObject().ToPointer();
                }
                ;
                fixed(double **ptrValues = ptrArrayValues)
                fixed(int **ptrIndices = ptrArrayIndices)
                {
                    // Create container. Examples will pushed in later.
                    LightGbmInterfaceUtils.Check(WrappedLightGbmInterface.DatasetCreateFromSampledColumn(
                                                     (IntPtr)ptrValues, (IntPtr)ptrIndices, numCol, sampleNonZeroCntPerColumn, numSampleRow, numTotalRow,
                                                     param, out _handle));
                }
            }
            finally
            {
                for (int i = 0; i < numCol; i++)
                {
                    if (gcValues[i].IsAllocated)
                    {
                        gcValues[i].Free();
                    }
                    if (gcIndices[i].IsAllocated)
                    {
                        gcIndices[i].Free();
                    }
                }
                ;
            }
            // Before adding examples (i.e., feature vectors of the original data set), the original labels, weights, and groups are added.
            SetLabel(labels);
            SetWeights(weights);
            SetGroup(groups);

            Contracts.Assert(GetNumCols() == numCol);
            Contracts.Assert(GetNumRows() == numTotalRow);
        }
Ejemplo n.º 2
0
        public Dataset(Dataset reference, int numTotalRow, float[] labels, float[] weights = null, int[] groups = null)
        {
            WrappedLightGbmInterface.SafeDataSetHandle refHandle = reference?.Handle;

            LightGbmInterfaceUtils.Check(WrappedLightGbmInterface.DatasetCreateByReference(refHandle, numTotalRow, out _handle));

            SetLabel(labels);
            SetWeights(weights);
            SetGroup(groups);
        }
Ejemplo n.º 3
0
 public void Dispose()
 {
     _handle?.Dispose();
     _handle = null;
 }