Пример #1
0
        /// <summary>
        /// Trains a LightGBM binary classifier on the breast-cancer training file, captures the
        /// fitted predictor through the <c>onFit</c> callback, and sanity-checks both the number
        /// of feature weights and a few evaluation metrics.
        /// </summary>
        [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // LightGBM is 64-bit only
        public void LightGbmBinaryClassification()
        {
            var mlContext = new MLContext(seed: 0);
            var trainFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var source = new MultiFileSource(trainFile);
            var binaryContext = new BinaryClassificationContext(mlContext);

            // Column 0 is the boolean label; columns 1..9 are the float features.
            var reader = TextLoader.CreateReader(
                mlContext,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Populated by the onFit callback once training has run.
            IPredictorWithFeatureWeights<float> fittedPredictor = null;

            var estimator = reader.MakeNewEstimator().Append(
                r => (r.label,
                      preds: binaryContext.Trainers.LightGbm(
                          r.label,
                          r.features,
                          numBoostRound: 10,
                          numLeaves: 5,
                          learningRate: 0.01,
                          onFit: p => { fittedPredictor = p; })));

            var pipeline = reader.Append(estimator);

            // Nothing has been fitted yet, so the callback must not have fired.
            Assert.Null(fittedPredictor);
            var model = pipeline.Fit(source);
            Assert.NotNull(fittedPredictor);

            // Nine input features should yield nine weights.
            var featureWeights = new VBuffer<float>();
            fittedPredictor.GetFeatureWeights(ref featureWeights);
            Assert.Equal(9, featureWeights.Length);

            var scored = model.Read(source);
            var metrics = binaryContext.Evaluate(scored, r => r.label, r => r.preds);

            // Only verify that the reported metrics fall inside [0, 1].
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Пример #2
0
        /// <summary>
        /// Builds the managed multi-class predictor by creating one per-class predictor via
        /// <c>CreateBinaryPredictor</c> and combining them with <c>OvaPredictor.Create</c>.
        /// When the objective is not <c>ObjectiveType.MultiClass</c>, each per-class predictor
        /// is wrapped in a <c>CalibratedPredictor</c> sharing one <c>PlattCalibrator</c>.
        /// </summary>
        /// <returns>The combined predictor producing one output per class.</returns>
        /// <exception cref="Exception">
        /// The number of trained trees is not a multiple of the number of classes.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// A per-class predictor does not implement <c>IPredictorWithFeatureWeights&lt;double&gt;</c>.
        /// </exception>
        private protected override IPredictorWithFeatureWeights <double []> CreateManagedPredictor()
        {
            var numClass = Objective.NumClass;

            // Exception type kept as-is so existing callers observe the same failure.
            if (TrainedEnsemble.NumTrees % numClass != 0)
            {
                throw new Exception("Number of trees should be a multiple of number of classes.");
            }

            var isSoftMax = (Objective.Objective == ObjectiveType.MultiClass);

            IPredictorWithFeatureWeights <double>[] predictors = new IPredictorWithFeatureWeights <double> [numClass];
            var cali = isSoftMax ? null : new PlattCalibrator(-Objective.Sigmoid);

            for (int i = 0; i < numClass; ++i)
            {
                // Fail fast here instead of letting a null slip into the ensemble (or into the
                // calibrated wrapper) and surface later as a NullReferenceException.
                if (!(CreateBinaryPredictor(i) is IPredictorWithFeatureWeights <double> pred))
                {
                    throw new InvalidOperationException(
                        $"Binary predictor for class {i} does not implement IPredictorWithFeatureWeights<double>.");
                }

                predictors[i] = isSoftMax ? pred : new CalibratedPredictor(pred, cali);
            }
            return(OvaPredictor.Create(isSoftMax, predictors));
        }
Пример #3
0
        /// <summary>
        /// Trains a stochastic-gradient-descent binary classifier (single thread, no L2 weight)
        /// on the breast-cancer training file, captures the fitted predictor through the
        /// <c>onFit</c> callback, and sanity-checks the feature weight count and a few metrics.
        /// </summary>
        public void HogwildSGDBinaryClassification()
        {
            var mlContext = new MLContext(seed: 0);
            var trainFile = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var source = new MultiFileSource(trainFile);
            var binaryContext = new BinaryClassificationContext(mlContext);

            // Column 0 is the boolean label; columns 1..9 are the float features.
            var reader = TextLoaderStatic.CreateReader(
                mlContext,
                c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            // Populated by the onFit callback once training has run.
            IPredictorWithFeatureWeights<float> fittedPredictor = null;

            var estimator = reader.MakeNewEstimator().Append(
                r => (r.label,
                      preds: binaryContext.Trainers.StochasticGradientDescentClassificationTrainer(
                          r.label,
                          r.features,
                          l2Weight: 0,
                          onFit: p => { fittedPredictor = p; },
                          advancedSettings: s => s.NumThreads = 1)));

            var pipeline = reader.Append(estimator);

            // Nothing has been fitted yet, so the callback must not have fired.
            Assert.Null(fittedPredictor);
            var model = pipeline.Fit(source);
            Assert.NotNull(fittedPredictor);

            // Nine input features should yield nine weights.
            var featureWeights = new VBuffer<float>();
            fittedPredictor.GetFeatureWeights(ref featureWeights);
            Assert.Equal(9, featureWeights.Length);

            var scored = model.Read(source);
            var metrics = binaryContext.Evaluate(scored, r => r.label, r => r.preds);

            // Only verify that the reported metrics fall inside [0, 1].
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Пример #4
0
        /// <summary>
        /// Randomised regression training test. For each of five iterations it picks a random
        /// objective, column count and parameter set, trains a <c>RegressionTrainer</c> on random
        /// data (dense or sparse), and then checks that the managed predictor, the native
        /// predictor, their save/load round-trips and <c>trainer.Evaluate</c> all agree, and that
        /// managed and native feature weights match.
        /// </summary>
        /// <remarks>
        /// Every random decision is drawn from a single <c>Random</c> seeded with <c>Seed</c>, so
        /// the order of <c>rand</c> calls determines what each iteration exercises. Any failure is
        /// rethrown wrapped with the seed, the iteration index and the parameters.
        /// </remarks>
        public void TrainRegression()
        {
            var objectiveTypes =
                new ObjectiveType[] {
                ObjectiveType.Regression,
                ObjectiveType.RegressionL1,
                ObjectiveType.Huber,
                ObjectiveType.Fair,
                ObjectiveType.Poisson,
                ObjectiveType.Quantile,
                ObjectiveType.Mape,
                ObjectiveType.Gamma,
                ObjectiveType.Tweedie
            };

            var rand = new Random(Seed);

            for (int test = 0; test < 5; ++test)
            {
                int numColumns = rand.Next(1, 10);
                var objective  = objectiveTypes[rand.Next(objectiveTypes.Length)];
                var pms        = GenerateParameters(rand, objective, numColumns);
                if (rand.Next(2) == 0)
                {
                    pms.Objective.RegSqrt = true;
                }

                // Half of the time train with no schedule, otherwise decay the rate by 1% per iteration.
                var learningRateSchedule = (rand.Next(2) == 0) ? (Func <int, double>)null : (iter => pms.Learning.LearningRate * Math.Pow(0.99, iter));

                try
                {
                    Dictionary <int, int> categorical = null;
                    var trainData = CreateRandomDenseRegressionData(rand, ref categorical, pms.Dataset.UseMissing, numColumns);
                    // Validation data is always created when early stopping is enabled, otherwise at random.
                    var validData = (pms.Learning.EarlyStoppingRound > 0 || rand.Next(2) == 0) ? CreateRandomDenseRegressionData(rand, ref categorical, pms.Dataset.UseMissing, numColumns) : null;
                    pms.Dataset.CategoricalFeature = categorical.Keys.ToArray();

                    // make labels positive for certain objective types
                    if (objective == ObjectiveType.Poisson ||
                        objective == ObjectiveType.Gamma ||
                        objective == ObjectiveType.Tweedie)
                    {
                        for (var i = 0; i < trainData.Labels.Length; i++)
                        {
                            trainData.Labels[i] = Math.Abs(trainData.Labels[i]);
                        }

                        if (validData != null)
                        {
                            for (var i = 0; i < validData.Labels.Length; i++)
                            {
                                validData.Labels[i] = Math.Abs(validData.Labels[i]);
                            }
                        }
                    }

                    // uncomment to select particular iteration
                    //if (test != 3)
                    //    continue;

                    // Randomly exercise either the dense or the sparse dataset representation.
                    using (var datasets = (rand.Next(2) == 0) ? new Datasets(pms.Common, pms.Dataset, trainData, validData) :
                                          new Datasets(pms.Common, pms.Dataset, Dense2Sparse(trainData), Dense2Sparse(validData)))
                        using (var trainer = new RegressionTrainer(pms.Learning, pms.Objective))
                        {
                            //if (true)
                            //    trainer.ToCommandLineFiles(datasets);

                            var model = trainer.Train(datasets, learningRateSchedule);
                            model.Managed.MaxThreads = rand.Next(1, Environment.ProcessorCount);

                            // possibly use subset of trees
                            var numIterations = -1;
                            if (rand.Next(2) == 0)
                            {
                                numIterations             = rand.Next(1, model.Managed.MaxNumTrees);
                                model.Managed.MaxNumTrees = numIterations;
                                model.Native.MaxNumTrees  = numIterations;
                            }

                            // Round-trip the managed predictor through an in-memory stream; the
                            // loader must consume exactly the bytes that were written.
                            IPredictorWithFeatureWeights <double> model2 = null;
                            using (var ms = new System.IO.MemoryStream())
                                using (var writer = new System.IO.BinaryWriter(ms))
                                    using (var reader = new System.IO.BinaryReader(ms))
                                    {
                                        PredictorPersist.Save(model.Managed, writer);
                                        ms.Position = 0;
                                        model2      = PredictorPersist.Load <double>(reader);
                                        Assert.Equal(ms.Position, ms.Length);
                                    }

                            // Same round-trip for the native predictor.
                            IPredictorWithFeatureWeights <double> model2native = null;
                            using (var ms = new System.IO.MemoryStream())
                                using (var writer = new System.IO.BinaryWriter(ms))
                                    using (var reader = new System.IO.BinaryReader(ms))
                                    {
                                        NativePredictorPersist.Save(model.Native, writer);
                                        ms.Position  = 0;
                                        model2native = NativePredictorPersist.Load <double>(reader);
                                        Assert.Equal(ms.Position, ms.Length);
                                    }

                            // Batch evaluation through the trainer: one row of a single output per input row.
                            var output3s = trainer.Evaluate(Booster.PredictType.Normal, trainData.Features, numIterations);
                            Assert.Equal(trainData.Features.Length, output3s.GetLength(0));
                            Assert.Equal(1, output3s.GetLength(1));

                            // Batch evaluation through the native predictor: one output per input row.
                            // (Checks the native result; output3s is already pinned by the two asserts above.)
                            var output3natives = model.Native.GetOutputs(trainData.Features);
                            Assert.Equal(trainData.Features.Length, output3natives.Length);

                            for (int i = 0; i < trainData.Features.Length; i++)
                            {
                                var row = trainData.Features[i];

                                double output = 0;
                                var    input  = new VBuffer <float>(row.Length, row);
                                model.Managed.GetOutput(ref input, ref output);
                                Assert.False(double.IsNaN(output));

                                // The reloaded managed predictor must agree with the one it was saved from.
                                double output2 = 0;
                                model2.GetOutput(ref input, ref output2);
                                Compare(output, output2);

                                // Single-row evaluation must match both batch results exactly
                                // and the managed output via Compare.
                                var output3 = trainer.Evaluate(Booster.PredictType.Normal, row, numIterations);
                                Assert.Single(output3);
                                Assert.Equal(output3[0], output3s[i, 0]);
                                Assert.Equal(output3[0], output3natives[i]);
                                Compare(output, output3[0]);
                                //Console.WriteLine(trainer.GetModelString());
                                //throw new Exception($"Output mismatch {output} vs {output3[0]} (error: {Math.Abs(output - output3[0])}) input: {String.Join(", ", row)}");

                                double outputNative = 0;
                                model.Native.GetOutput(ref input, ref outputNative);
                                Assert.Equal(outputNative, output3[0]);

                                model2native.GetOutput(ref input, ref outputNative);
                                Assert.Equal(outputNative, output3[0]);
                            }

                            // Feature weights: managed and native must report the same features with
                            // non-negative, matching values.
                            var normalise   = rand.Next(2) == 0;
                            var getSplits   = rand.Next(2) == 0;
                            var gains       = model.Managed.GetFeatureWeights(normalise, getSplits);
                            var gainsNative = model.Native.GetFeatureWeights(normalise, getSplits);
                            Assert.Equal(gains.Count, gainsNative.Count);
                            foreach (var kv in gains)
                            {
                                Assert.True(0 <= kv.Key && kv.Key < trainData.NumColumns);
                                Assert.True(0.0 <= kv.Value);
                                Compare(kv.Value, gainsNative[kv.Key]);
                            }

                            // For raw (un-normalised) gains, the total must equal the sum of per-feature gains.
                            if (!getSplits && !normalise)
                            {
                                var totGain1 = gains.Values.Sum();
                                var totGain2 = Enumerable.Range(0, trainData.NumColumns).SelectMany(i => model.Managed.GetFeatureGains(i)).Sum();
                                Compare(totGain1, totGain2);
                            }
                        }
                }
                catch (Exception e)
                {
                    // Wrap with enough context (seed, iteration, parameters) to reproduce the failure.
                    throw new Exception($"Failed: {Seed} #{test} {pms}", e);
                }
            }
        }
Пример #5
0
 /// <summary>
 /// Creates the binary prediction transformer for a trained model.
 /// </summary>
 /// <param name="model">The trained predictor to wrap.</param>
 /// <param name="trainSchema">The schema passed through to the transformer.</param>
 /// <returns>A transformer built from the host, the model, the schema and the feature column name.</returns>
 protected override BinaryPredictionTransformer<IPredictorWithFeatureWeights<float>> MakeTransformer(IPredictorWithFeatureWeights<float> model, Schema trainSchema)
 {
     return new BinaryPredictionTransformer<IPredictorWithFeatureWeights<float>>(Host, model, trainSchema, FeatureColumn.Name);
 }
Пример #6
0
 /// <summary>
 /// Stores a managed predictor together with a native (vectorised) predictor.
 /// </summary>
 /// <param name="managed">Predictor assigned to <c>Managed</c>.</param>
 /// <param name="native">Predictor assigned to <c>Native</c>.</param>
 public Predictors(IPredictorWithFeatureWeights<TOutput> managed, IVectorisedPredictorWithFeatureWeights<TOutput> native)
 {
     this.Managed = managed;
     this.Native = native;
 }
Пример #7
0
 /// <summary>
 /// Stores a predictor together with the calibrator associated with it.
 /// </summary>
 /// <param name="predictor">Predictor assigned to <c>SubPredictor</c>.</param>
 /// <param name="calibrator">Calibrator assigned to <c>Calibrator</c>.</param>
 public CalibratedPredictor(IPredictorWithFeatureWeights<double> predictor, ICalibrator calibrator)
 {
     this.SubPredictor = predictor;
     this.Calibrator = calibrator;
 }