Example #1
        public void LogisticRegressionBinaryClassification()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var ctx        = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            IPredictorWithFeatureWeights <float> pred = null;
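            // onFit below captures the trained predictor into this local so the test can
            // inspect its weights; NumThreads = 1 keeps training single-threaded and deterministic.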

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: ctx.Trainers.LogisticRegressionBinaryClassifier(r.label, r.features,
                                                                                                    l1Weight: 10,
                                                                                                    onFit: (p) => { pred = p; },
                                                                                                    advancedSettings: s => s.NumThreads = 1)));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            // 9 input features, so we ought to have 9 weights.
            VBuffer <float> weights = new VBuffer <float>();

            pred.GetFeatureWeights(ref weights);
            Assert.Equal(9, weights.Length);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Example #2
        public void TestCustomTransformer()
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");
            var    source   = new MultiFileSource(dataPath);
            var    loader   = ML.Data.CreateTextLoader(new[] {
                new TextLoader.Column("Float1", DataKind.R4, 9),
                new TextLoader.Column("Float4", DataKind.R4, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) })
            }, hasHeader: true);

            var data = loader.Read(source);

            IDataView transformedData;
            // We create a temporary environment to instantiate the custom transformer. This is to ensure that we don't need the same
            // environment for saving and loading.
            var tempoEnv  = new MLContext();
            var customEst = new CustomMappingEstimator <MyInput, MyOutput>(tempoEnv, MyLambda.MyAction, "MyLambda");

            try
            {
                TestEstimatorCore(customEst, data);
                Assert.True(false, "Cannot work without MEF injection");
            }
            catch (InvalidOperationException ex)
            {
                if (!ex.IsMarked())
                {
                    throw;
                }
            }
            ML.CompositionContainer = new CompositionContainer(new TypeCatalog(typeof(MyLambda)));
            TestEstimatorCore(customEst, data);
            transformedData = customEst.Fit(data).Transform(data);

            var inputs  = ML.CreateEnumerable <MyInput>(transformedData, true);
            var outputs = ML.CreateEnumerable <MyOutput>(transformedData, true);

            Assert.True(inputs.Zip(outputs, (x, y) => y.Together == $"{x.Float1} + {string.Join(", ", x.Float4)}").All(x => x));

            Done();
        }
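The MyInput, MyOutput, and MyLambda types referenced above are defined elsewhere in the test suite. A minimal sketch consistent with how they are used here (the shapes and the factory-based registration are assumptions; this MEF-based variant would instead expose MyLambda through the CompositionContainer):

        public sealed class MyInput
        {
            public float Float1 { get; set; }

            [VectorType(4)]
            public float[] Float4 { get; set; }
        }

        public sealed class MyOutput
        {
            public string Together { get; set; }
        }

        [CustomMappingFactoryAttribute("MyLambda")]
        public class MyLambda : CustomMappingFactory <MyInput, MyOutput>
        {
            // Mirrors the assertion above: "Float1 + Float4[0], Float4[1], ...".
            public static void MyAction(MyInput input, MyOutput output)
                => output.Together = $"{input.Float1} + {string.Join(", ", input.Float4)}";

            public override Action <MyInput, MyOutput> GetMapping() => MyAction;
        }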
Example #3
        public void SaveCompositeLoaderAndLoad()
        {
            var file      = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
            var loader    = ML.Data.CreateTextLoader <InputData>(hasHeader: true, dataSample: file);
            var composite = loader.Append(ML.Transforms.Normalize("Features"));
            var loaderWithEmbeddedModel = composite.Fit(file);

            string modelPath = GetOutputPath(FullTestName + "-model.zip");

            ML.Model.Save(null, loaderWithEmbeddedModel, modelPath);

            Load(modelPath, out var loadedWithSchema, out var loadedSchema,
                 out var loadedWithLoader, out var loadedLoaderWithTransformer);
            // Because we saved the transform model as part of the composite loader, with no separate
            // transforms, the transformer that gets loaded back should be an empty chain, since the
            // "model," such as it is, has been combined with the loader.
            Assert.Empty(Assert.IsType <TransformerChain <ITransformer> >(loadedWithSchema));
            Assert.Empty(Assert.IsType <TransformerChain <ITransformer> >(loadedWithLoader));

            var expectedSchema = loaderWithEmbeddedModel.GetOutputSchema();

            Assert.True(expectedSchema.Count == 3);
            Assert.NotNull(expectedSchema.GetColumnOrNull("Label"));
            Assert.NotNull(expectedSchema.GetColumnOrNull("Features"));
            Assert.True(expectedSchema["Features"].HasSlotNames());

            CheckSameSchemas(loaderWithEmbeddedModel.GetOutputSchema(), loadedSchema);
            var schemaFromLoadedLoader = loadedLoaderWithTransformer.GetOutputSchema();

            CheckSameSchemas(loaderWithEmbeddedModel.GetOutputSchema(), schemaFromLoadedLoader);

            // The type of the loader itself should be a composite data loader, and its single transformer
            // should be the normalizing transformer.
            var compositeLoader = Assert.IsType <CompositeDataLoader <IMultiStreamSource, ITransformer> >(loadedLoaderWithTransformer);
            var chainFromLoader = compositeLoader.Transformer;

            Assert.IsType <NormalizingTransformer>(Assert.Single(compositeLoader.Transformer));

            Done();
        }
Example #4
        public void SdcaBinaryClassification()
        {
            var env        = new TlcEnvironment(seed: 0);
            var dataPath   = GetDataPath("breast-cancer.txt");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            LinearBinaryPredictor pred = null;
            ParameterMixingCalibratedPredictor cali = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: r.label.PredictSdcaBinaryClassification(r.features,
                                                                                            maxIterations: 2,
                                                                                            onFit: (p, c) => { pred = p; cali = c; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            Assert.Null(cali);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            Assert.NotNull(cali);
            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, pred.Weights2.Count);

            var data = model.Read(dataSource);

            // Just output some data on the schema for fun.
            var rows   = DataViewUtils.ComputeRowCount(data.AsDynamic);
            var schema = data.AsDynamic.Schema;

            for (int c = 0; c < schema.ColumnCount; ++c)
            {
                Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
            }
        }
Example #5
        public InferRecipesCommand(IHostEnvironment env, Arguments args)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register("InferRecipes", seed: 0, verbose: true);
            _host.CheckValue(args, nameof(args));

            var files = new MultiFileSource(args.DataFile);

            _host.CheckUserArg(files.Count > 0, nameof(args.DataFile), "dataFile is required");
            _dataFile = args.DataFile;
            if (!string.IsNullOrWhiteSpace(args.RspOutputFile))
            {
                Utils.CheckOptionalUserDirectory(args.RspOutputFile, nameof(args.RspOutputFile));
                _rspOutFile = args.RspOutputFile;
            }

            if (!string.IsNullOrWhiteSpace(args.SchemaDefinitionFile))
            {
                Utils.CheckOptionalUserDirectory(args.SchemaDefinitionFile, nameof(args.SchemaDefinitionFile));
                _schemaDefinitionFile = args.SchemaDefinitionFile;
            }
        }
Example #6
        public void AveragePerceptronNoCalibration()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.breastCancer.trainFilename);
            var dataSource = new MultiFileSource(dataPath);
            var ctx        = new BinaryClassificationContext(env);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

            LinearBinaryPredictor pred = null;

            var loss = new HingeLoss(new HingeLoss.Arguments()
            {
                Margin = 1
            });

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, preds: ctx.Trainers.AveragedPerceptron(r.label, r.features, lossFunction: loss,
                                                                                    numIterations: 2, onFit: p => pred = p)));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);
            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, pred.Weights2.Count);

            var data = model.Read(dataSource);

            var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);

            // Run a sanity check against a few of the metrics.
            Assert.InRange(metrics.Accuracy, 0, 1);
            Assert.InRange(metrics.Auc, 0, 1);
            Assert.InRange(metrics.Auprc, 0, 1);
        }
Example #7
        public void SaveTransformerAndSchemaAndLoad()
        {
            var file      = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
            var loader    = _ml.Data.CreateTextLoader <InputData>(hasHeader: true, dataSample: file);
            var estimator = _ml.Transforms.Normalize("Features");
            var model     = estimator.Fit(loader.Load(file));

            string modelPath = GetOutputPath(FullTestName + "-model.zip");

            _ml.Model.Save(model, loader.GetOutputSchema(), modelPath);

            Load(modelPath, out var loadedWithSchema, out var loadedSchema, out var loadedLoader,
                 out var loadedWithLoader, out var loadedLoaderWithTransformer);
            Assert.True(loadedWithSchema is NormalizingTransformer);
            Assert.True(loadedSchema.Count == 2 &&
                        loadedSchema.GetColumnOrNull("Label") != null &&
                        loadedSchema.GetColumnOrNull("Features") != null &&
                        loadedSchema["Features"].HasSlotNames());
            Assert.Null(loadedLoader);
            Assert.Null(loadedWithLoader);
            Assert.Null(loadedLoaderWithTransformer);
        }
Example #8
        public void ConcatWith()
        {
            var env      = new ConsoleEnvironment(seed: 0);
            var dataPath = GetDataPath("iris.data");
            var reader   = TextLoader.CreateReader(env,
                                                   c => (label: c.LoadText(4), values: c.LoadFloat(0, 3), value: c.LoadFloat(2)),
                                                   separator: ',');
            var dataSource = new MultiFileSource(dataPath);
            var data       = reader.Read(dataSource);

            var est = data.MakeNewEstimator()
                      .Append(r => (
                                  r.label, r.values, r.value,
                                  c0: r.label.AsVector(), c1: r.label.ConcatWith(r.label),
                                  c2: r.value.ConcatWith(r.values), c3: r.values.ConcatWith(r.value, r.values)));

            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            int[] idx = new int[4];
            for (int i = 0; i < idx.Length; ++i)
            {
                Assert.True(schema.TryGetColumnIndex("c" + i, out idx[i]), $"Could not find col c{i}");
            }
            var types = new VectorType[idx.Length];
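            // Expected lengths: c0 = label as a 1-slot vector; c1 = label + label (2);
            // c2 = value + values (1 + 4 = 5); c3 = values + value + values (4 + 1 + 4 = 9).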

            int[] expectedLen = new int[] { 1, 2, 5, 9 };
            for (int i = 0; i < idx.Length; ++i)
            {
                var type = schema.GetColumnType(idx[i]);
                Assert.True(type.VectorSize > 0, $"Col c{i} had unexpected type {type}");
                types[i] = type.AsVector;
                Assert.Equal(expectedLen[i], type.VectorSize);
            }
            Assert.Equal(TextType.Instance, types[0].ItemType);
            Assert.Equal(TextType.Instance, types[1].ItemType);
            Assert.Equal(NumberType.Float, types[2].ItemType);
            Assert.Equal(NumberType.Float, types[3].ItemType);
        }
Example #9
        public void PrincipalComponentAnalysis()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label,
                                    pca: r.features.ToPrincipalComponents(rank: 5)));
            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;
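            // ToPrincipalComponents(rank: 5) projects the 11 input features (columns 0-10)
            // onto 5 principal components, so pca should be a known-size numeric vector.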

            Assert.True(schema.TryGetColumnIndex("pca", out int pcaCol));
            var type = schema.GetColumnType(pcaCol);

            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
        }
Example #10
        public void TestSchemaPropagation()
        {
            string dataPath = GetDataPath("adult.test");
            var    source   = new MultiFileSource(dataPath);
            var    loader   = ML.Data.CreateTextLoader(new[] {
                new TextLoader.Column("Float1", DataKind.R4, 0),
                new TextLoader.Column("Float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
                new TextLoader.Column("Text1", DataKind.Text, 0)
            }, hasHeader: true, separatorChar: ',');

            var data = loader.Read(source);

            Action <MyInput, MyOutput> mapping = (input, output) => output.Together = input.Float1.ToString();
            var est = ML.Transforms.CustomMapping(mapping, null);

            // Make sure schema propagation works for valid data.
            est.GetOutputSchema(SchemaShape.Create(data.Schema));

            var badData1 = ML.Transforms.CopyColumns("Text1", "Float1").Fit(data).Transform(data);

            try
            {
                est.GetOutputSchema(SchemaShape.Create(badData1.Schema));
                Assert.True(false);
            }
            catch (Exception) { }

            var badData2 = ML.Transforms.SelectColumns(new[] { "Float1" }).Fit(data).Transform(data);
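            // Float4 has been dropped by SelectColumns, so the custom mapping's input schema
            // can no longer be satisfied and GetOutputSchema below should throw.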

            try
            {
                est.GetOutputSchema(SchemaShape.Create(badData2.Schema));
                Assert.True(false);
            }
            catch (Exception) { }

            Done();
        }
Example #11
        public void ToKey()
        {
            var env      = new ConsoleEnvironment(seed: 0);
            var dataPath = GetDataPath("iris.data");
            var reader   = TextLoader.CreateReader(env,
                                                   c => (label: c.LoadText(4), values: c.LoadFloat(0, 3)),
                                                   separator: ',');
            var dataSource = new MultiFileSource(dataPath);
            var data       = reader.Read(dataSource);

            var est = data.MakeNewEstimator()
                      .Append(r => (labelKey: r.label.ToKey(), valuesKey: r.values.ToKey(onFit: m => { })))
                      .Append(r => (r.labelKey, r.valuesKey, valuesKeyKey: r.valuesKey.ToKey()));

            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            Assert.True(schema.TryGetColumnIndex("labelKey", out int labelCol));
            Assert.True(schema.TryGetColumnIndex("valuesKey", out int valuesCol));
            Assert.True(schema.TryGetColumnIndex("valuesKeyKey", out int valuesKeyCol));

            Assert.Equal(3, schema.GetColumnType(labelCol).KeyCount);
            Assert.True(schema.GetColumnType(valuesCol).ItemType.IsKey);
            Assert.True(schema.GetColumnType(valuesKeyCol).ItemType.IsKey);

            var labelKeyType     = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, labelCol);
            var valuesKeyType    = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesCol);
            var valuesKeyKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesKeyCol);

            Assert.NotNull(labelKeyType);
            Assert.NotNull(valuesKeyType);
            Assert.NotNull(valuesKeyKeyType);
            Assert.True(labelKeyType.IsVector && labelKeyType.ItemType == TextType.Instance);
            Assert.True(valuesKeyType.IsVector && valuesKeyType.ItemType == NumberType.Float);
            Assert.True(valuesKeyKeyType.IsVector && valuesKeyKeyType.ItemType == NumberType.Float);
            // Because they're over exactly the same data, they ought to have the same cardinality and everything.
            Assert.True(valuesKeyKeyType.Equals(valuesKeyType));
        }
Example #12
        void New_FileBasedSavingOfData()
        {
            var ml        = new MLContext(seed: 1, conc: 1);
            var src       = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename));
            var trainData = ml.Data.TextReader(MakeSentimentTextLoaderArgs())
                            .Append(ml.Transforms.Text.FeaturizeText("SentimentText", "Features"))
                            .Fit(src).Read(src);

            var path = DeleteOutputPath("i.idv");

            using (var file = File.Create(path))
            {
                var saver = new BinarySaver(ml, new BinarySaver.Arguments());
                using (var ch = ((IHostEnvironment)ml).Start("SaveData"))
                    DataSaverUtils.SaveDataView(ch, saver, trainData, file);
            }

            var trainer         = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: s => s.NumThreads = 1);
            var loadedTrainData = new BinaryLoader(ml, new BinaryLoader.Arguments(), new MultiFileSource(path));

            // Train.
            var model = trainer.Fit(loadedTrainData);
        }
Example #13
        public void CrossValidate()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.iris.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var ctx    = new MulticlassClassificationContext(env);
            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

            var est = reader.MakeNewEstimator()
                      .Append(r => (label: r.label.ToKey(), r.features))
                      .Append(r => (r.label, preds: ctx.Trainers.Sdca(
                                        r.label,
                                        r.features,
                                        maxIterations: 2)));

            var results = ctx.CrossValidate(reader.Read(dataSource), est, r => r.label)
                          .Select(x => x.metrics).ToArray();
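            // Five folds is the CrossValidate default, hence five per-fold metric sets.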

            Assert.Equal(5, results.Length);
            Assert.True(results.All(x => x.LogLoss > 0));
        }
Example #14
        public void LpGcNormAndWhitening()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label,
                                    lpnorm: r.features.LpNormalize(),
                                    gcnorm: r.features.GlobalContrastNormalize(),
                                    zcawhitened: r.features.ZcaWhitening(),
                                    pcswhitened: r.features.PcaWhitening()));
            var tdata  = est.Fit(data).Transform(data);
            var schema = tdata.AsDynamic.Schema;

            Assert.True(schema.TryGetColumnIndex("lpnorm", out int lpnormCol));
            var type = schema.GetColumnType(lpnormCol);

            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

            Assert.True(schema.TryGetColumnIndex("gcnorm", out int gcnormCol));
            type = schema.GetColumnType(gcnormCol);
            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

            Assert.True(schema.TryGetColumnIndex("zcawhitened", out int zcawhitenedCol));
            type = schema.GetColumnType(zcawhitenedCol);
            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

            Assert.True(schema.TryGetColumnIndex("pcswhitened", out int pcswhitenedCol));
            type = schema.GetColumnType(pcswhitenedCol);
            Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
        }
Example #15
        public void TestCustomTransformer()
        {
            string dataPath = GetDataPath("adult.test");
            var    source   = new MultiFileSource(dataPath);
            var    loader   = ML.Data.TextReader(new[] {
                new TextLoader.Column("Float1", DataKind.R4, 0),
                new TextLoader.Column("Float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) })
            }, s => { s.Separator = ","; s.HasHeader = true; });

            var data = loader.Read(source);

            IDataView transformedData;
            // We create a temporary environment to instantiate the custom transformer. This is to ensure that we don't need the same
            // environment for saving and loading.
            var tempoEnv  = new MLContext();
            var customEst = new CustomMappingEstimator <MyInput, MyOutput>(tempoEnv, MyLambda.MyAction, "MyLambda");

            try
            {
                TestEstimatorCore(customEst, data);
                Assert.True(false, "Cannot work without MEF injection");
            }
            catch (Exception)
            {
                // REVIEW: we should have a common mechanism to make sure this is 'our' exception being thrown.
            }
            ML.CompositionContainer = new CompositionContainer(new TypeCatalog(typeof(MyLambda)));
            TestEstimatorCore(customEst, data);
            transformedData = customEst.Fit(data).Transform(data);

            var inputs  = transformedData.AsEnumerable <MyInput>(ML, true);
            var outputs = transformedData.AsEnumerable <MyOutput>(ML, true);

            Assert.True(inputs.Zip(outputs, (x, y) => y.Together == $"{x.Float1} + {string.Join(", ", x.Float4)}").All(x => x));

            Done();
        }
Example #16
        public void TestStatefulCustomMappingTransformer()
        {
            string dataPath = GetDataPath("breast-cancer.txt");
            var    source   = new MultiFileSource(dataPath);
            var    loader   = ML.Data.CreateTextLoader(new[] {
                new TextLoader.Column("Features", DataKind.Single, 1, 9),
                new TextLoader.Column("Label", DataKind.String, 0),
                new TextLoader.Column("Value", DataKind.Single, 2),
            });
            var data = loader.Load(source);

            // We create a temporary environment to instantiate the custom transformer. This is to ensure that we don't need the same
            // environment for saving and loading.
            var tempoEnv  = new MLContext();
            var customEst = tempoEnv.Transforms.StatefulCustomMapping <MyStatefulInput, MyStatefulOutput, MyState>(MyStatefulLambda.MyStatefulAction, MyStatefulLambda.MyStateInit, nameof(MyStatefulLambda));

            TestEstimatorCore(customEst, data);
            var transformedData = customEst.Fit(data).Transform(data);
            var outputs         = transformedData.GetColumn <bool>(transformedData.Schema[nameof(MyStatefulOutput.FirstAppearance)]);

            Assert.Equal(10, outputs.Count(output => output));

            Done();
        }
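The stateful types above are likewise defined outside this listing. A plausible sketch, under the assumption that the state tracks previously seen values of the Value column (which would account for the ten expected first appearances):

        public sealed class MyStatefulInput
        {
            public string Label { get; set; }
            public float Value { get; set; }
        }

        public sealed class MyStatefulOutput
        {
            public bool FirstAppearance { get; set; }
        }

        public sealed class MyState
        {
            public HashSet <float> Seen { get; set; }
        }

        public static class MyStatefulLambda
        {
            // HashSet.Add returns true only the first time a value is seen.
            public static void MyStatefulAction(MyStatefulInput input, MyStatefulOutput output, MyState state)
                => output.FirstAppearance = state.Seen.Add(input.Value);

            public static void MyStateInit(MyState state)
                => state.Seen = new HashSet <float>();
        }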
Example #17
        public void TestCustomTransformer(bool registerAssembly)
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");
            var    source   = new MultiFileSource(dataPath);
            var    loader   = ML.Data.CreateTextLoader(new[] {
                new TextLoader.Column("Float1", DataKind.Single, 9),
                new TextLoader.Column("Float4", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) })
            }, hasHeader: true);

            var data = loader.Load(source);

            IDataView transformedData;
            // We create a temporary environment to instantiate the custom transformer. This is to ensure that we don't need the same
            // environment for saving and loading.
            var tempoEnv  = new MLContext(1);
            var customEst = new CustomMappingEstimator <MyInput, MyOutput>(tempoEnv, MyLambda.MyAction, "MyLambda");

            // Before 1.5-preview3 it was required to register the assembly.
            // Now, the assembly information is automatically saved in the model and the assembly is registered
            // when loading.
            // This tests that the CustomTransformer still works even if you explicitly register the assembly.
            if (registerAssembly)
            {
                ML.ComponentCatalog.RegisterAssembly(typeof(MyLambda).Assembly);
            }

            TestEstimatorCore(customEst, data);
            transformedData = customEst.Fit(data).Transform(data);

            var inputs  = ML.Data.CreateEnumerable <MyInput>(transformedData, true);
            var outputs = ML.Data.CreateEnumerable <MyOutput>(transformedData, true);

            Assert.True(inputs.Zip(outputs, (x, y) => y.Together == $"{x.Float1} + {string.Join(", ", x.Float4)}").All(x => x));

            Done();
        }
Example #18
        public void Normalizer()
        {
            var env        = new ConsoleEnvironment(seed: 0);
            var dataPath   = GetDataPath("generated_regression_dataset.csv");
            var dataSource = new MultiFileSource(dataPath);

            var reader = TextLoader.CreateReader(env,
                                                 c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                 separator: ';', hasHeader: true);
            var data = reader.Read(dataSource);

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, r.features, bin: r.features.NormalizeByBinning(), mm: r.features.Normalize()));
            var tdata = est.Fit(data).Transform(data);

            var schema = tdata.AsDynamic.Schema;
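            // Both normalizers mark their output columns with the IsNormalized metadata flag;
            // the untouched "features" column does not carry it.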

            Assert.True(schema.TryGetColumnIndex("features", out int featCol));
            Assert.True(schema.TryGetColumnIndex("bin", out int binCol));
            Assert.True(schema.TryGetColumnIndex("mm", out int mmCol));
            Assert.False(schema.IsNormalized(featCol));
            Assert.True(schema.IsNormalized(binCol));
            Assert.True(schema.IsNormalized(mmCol));
        }
Example #19
        /// <summary>
        /// Reads a text file as an IDataView.
        /// Follows the pandas API.
        /// </summary>
        /// <param name="filename">filename</param>
        /// <param name="sep">column separator</param>
        /// <param name="header">has a header or not</param>
        /// <param name="names">column names (can be empty)</param>
        /// <param name="dtypes">column types (can be empty)</param>
        /// <param name="nrows">number of rows to read</param>
        /// <param name="guess_rows">number of rows used to guess types</param>
        /// <param name="encoding">text encoding</param>
        /// <param name="useThreads">specific to TextLoader</param>
        /// <param name="host">host</param>
        /// <param name="index">add a column to hold the index</param>
        /// <returns>an IDataView over the loaded files</returns>
        public static IDataView ReadCsvToTextLoader(string[] filenames,
                                                    char sep          = ',', bool header = true,
                                                    string[] names    = null, ColumnType[] dtypes = null,
                                                    int nrows         = -1, int guess_rows        = 10,
                                                    Encoding encoding = null, bool useThreads     = true,
                                                    bool index        = false, IHost host         = null)
        {
            var df = ReadCsv(filenames[0], sep: sep, header: header, names: names, dtypes: dtypes,
                             nrows: guess_rows, guess_rows: guess_rows, encoding: encoding, index: index);
            var sch  = df.Schema;
            var cols = new TextLoader.Column[sch.ColumnCount];

            for (int i = 0; i < cols.Length; ++i)
            {
                cols[i] = TextLoader.Column.Parse(df.NameType(i));
            }

            var args = new TextLoader.Arguments()
            {
                AllowQuoting   = false,
                Separator      = string.Format("{0}", sep),
                Column         = cols,
                TrimWhitespace = true,
                UseThreads     = useThreads,
                HasHeader      = header,
                MaxRows        = nrows > 0 ? (int?)nrows : null
            };

            if (host == null)
            {
                host = new ConsoleEnvironment().Register("TextLoader");
            }
            var multiSource = new MultiFileSource(filenames);

            return(new TextLoader(host, args, multiSource).Read(multiSource));
        }
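A minimal usage sketch (the file name below is an assumption):

            // var view   = ReadCsvToTextLoader(new[] { "train.csv" }, sep: ',', header: true);
            // var schema = view.Schema;
            // for (int i = 0; i < schema.ColumnCount; ++i)
            //     Console.WriteLine($"{schema.GetColumnName(i)}, {schema.GetColumnType(i)}");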
Example #20
        public void FastTreeRegressionRepresentationWithCategoricalSplit()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var catalog = new RegressionCatalog(env);

            var reader = TextLoaderStatic.CreateLoader(env,
                                                       c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
                                                       separator: ';', hasHeader: true);

            FastTreeRegressionModelParameters pred = null;

            var opts = new FastTreeRegressionTrainer.Options()
            {
                CategoricalSplit = true,
                NumTrees         = 3,
                NumLeaves        = 5,
                NumThreads       = 1,
                // This is the minimum number of samples required to form a split (i.e., to generate two
                // extra nodes/leaves). For a small data set we should set a small value; otherwise, the
                // trained trees could be empty.
                MinDocumentsInLeafs = 2
            };
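            // One-hot encoding the text features lets the trainer form categorical splits
            // over the generated indicator slots (CategoricalSplit = true above).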

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, features: r.features.OneHotEncoding()))
                      .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
                                                                              onFit: (p) => { pred = p; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            var treeCollection = pred.TrainedTreeEnsemble;

            Assert.Equal(0, treeCollection.Bias);
            Assert.Equal(3, treeCollection.Trees.Count);
            Assert.Equal(3, treeCollection.TreeWeights.Count);

            var trees = treeCollection.Trees;

            Assert.Equal(4, trees[0].NumNodes);

            var expectedGtChild = new int[] { 3, -3, -4, -5 };

            Assert.Equal(4, trees[0].GtChild.Count);
            Assert.Equal(expectedGtChild, trees[0].GtChild);

            var expectedLteChild = new int[] { 1, 2, -1, -2 };

            Assert.Equal(4, trees[0].LteChild.Count);
            Assert.Equal(expectedLteChild, trees[0].LteChild);

            var expectedCategoricalSplitFlags = new bool[] { true, true, true, true };

            Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
            Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

            var expectedNumericalSplitFeatureIndexes = new int[] { 5312, 2, 2126, 533 };

            Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
            Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

            var expectedNumericalSplitThresholds = new float[] { 0.5f, 0.5f, 0.5f, 0.5f };

            Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
            for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
            {
                Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);
            }

            var actualCategoricalRanges0 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0);

            Assert.Equal(actualCategoricalRanges0, new int[] { 5312, 5782 });

            var actualCategoricalRanges1 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(1);

            Assert.Equal(actualCategoricalRanges1, new int[] { 2, 417 });

            var actualCategoricalRanges2 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(2);

            Assert.Equal(actualCategoricalRanges2, new int[] { 2126, 2593 });

            var actualCategoricalRanges3 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(3);

            Assert.Equal(actualCategoricalRanges3, new int[] { 533, 983 });

            int[] expectedCounts = { 62, 52, 54, 22 };
            int[] expectedStarts = { 5315, 10, 2141, 533 };
            int[] expectedEnds   = { 5782, 401, 2558, 874 };
            for (int i = 0; i < trees[0].NumNodes; ++i)
            {
                // Retrieve i-th node's split features.
                var actualCategoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(i);
                Assert.Equal(expectedCounts[i], actualCategoricalSplitFeatures.Count);
                Assert.Equal(expectedStarts[i], actualCategoricalSplitFeatures[0]);
                Assert.Equal(expectedEnds[i], actualCategoricalSplitFeatures[expectedCounts[i] - 1]);
            }

            Assert.Equal(5, trees[0].NumLeaves);

            var expectedLeafValues = new double[] { 48.456055413607892, 86.584156799316418, 87.017326642027, 76.381184971185391, 117.68872643673058 };

            Assert.Equal(5, trees[0].LeafValues.Count);
            for (int i = 0; i < trees[0].LeafValues.Count; ++i)
            {
                Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
            }
        }
Example #21
        // This example shows all the ways to load data with TextLoader.
        public static void Example()
        {
            // Create 5 data files to illustrate different loading methods.
            var dataFiles         = new List <string>();
            var random            = new Random(1);
            var dataDirectoryName = "DataDir";

            Directory.CreateDirectory(dataDirectoryName);
            for (int i = 0; i < 5; i++)
            {
                var fileName = Path.Combine(dataDirectoryName, $"Data_{i}.csv");
                dataFiles.Add(fileName);
                using (var fs = File.CreateText(fileName))
                {
                    // Write 10 rows of 10 random columns, without a header, forcing
                    // approximately 80% of the values to be 0.
                    for (int line = 0; line < 10; line++)
                    {
                        var sb = new StringBuilder();
                        for (int pos = 0; pos < 10; pos++)
                        {
                            var value = random.NextDouble();
                            sb.Append((value < 0.8 ? 0 : value).ToString() + '\t');
                        }
                        fs.WriteLine(sb.ToString(0, sb.Length - 1));
                    }
                }
            }

            // Create a TextLoader.
            var mlContext = new MLContext();
            var loader    = mlContext.Data.CreateTextLoader(
                columns: new[]
            {
                new TextLoader.Column("Features", DataKind.Single, 0, 9)
            },
                hasHeader: false
                );

            // Load a single file from path.
            var singleFileData = loader.Load(dataFiles[0]);

            PrintRowCount(singleFileData);

            // Expected Output:
            //   10


            // Load all 5 files from path.
            var multipleFilesData = loader.Load(dataFiles.ToArray());

            PrintRowCount(multipleFilesData);

            // Expected Output:
            //   50


            // Load all files using path wildcard.
            var multipleFilesWildcardData =
                loader.Load(Path.Combine(dataDirectoryName, "Data_*.csv"));

            PrintRowCount(multipleFilesWildcardData);

            // Expected Output:
            //   50


            // Create a TextLoader with user defined type.
            var loaderWithCustomType =
                mlContext.Data.CreateTextLoader <Data>(hasHeader: false);

            // Load a single file from path.
            var singleFileCustomTypeData = loaderWithCustomType.Load(dataFiles[0]);

            PrintRowCount(singleFileCustomTypeData);

            // Expected Output:
            //   10


            // Create a TextLoader with unknown column length to illustrate
            // how a data sample may be used to infer column size.
            var dataSample = new MultiFileSource(dataFiles[0]);
            var loaderWithUnknownLength = mlContext.Data.CreateTextLoader(
                columns: new[]
            {
                new TextLoader.Column("Features",
                                      DataKind.Single,
                                      new[] { new TextLoader.Range(0, null) })
            },
                dataSample: dataSample
                );

            var dataWithInferredLength = loaderWithUnknownLength.Load(dataFiles[0]);
            var featuresColumn         = dataWithInferredLength.Schema.GetColumnOrNull("Features");

            if (featuresColumn.HasValue)
            {
                Console.WriteLine(featuresColumn.Value.ToString());
            }

            // Expected Output:
            //   Features: Vector<Single, 10>
            //
            // ML.NET infers the correct length of 10 for the Features column,
            // which is of type Vector<Single>.

            PrintRowCount(dataWithInferredLength);

            // Expected Output:
            //   10


            // Save the data with 10 rows to a text file to illustrate the use of
            // sparse format.
            var sparseDataFileName = Path.Combine(dataDirectoryName, "saved_data.tsv");

            using (FileStream stream = new FileStream(sparseDataFileName, FileMode.Create))
                mlContext.Data.SaveAsText(singleFileData, stream);

            // Since there are many zeroes in the data, it will be saved in a sparse
            // representation to save disk space. The data may be forced to be saved
            // in a dense representation by setting forceDense to true. The sparse
            // data will look like the following:
            //
            //   10 7:0.943862259
            //   10 3:0.989767134
            //   10 0:0.949778438   8:0.823028445   9:0.886469543
            //
            // The sparse representation of the first row indicates that there are
            // 10 columns, that column 7 (the 8th column) has value 0.943862259, and
            // that the omitted columns have value 0.
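            //
            // A sketch of forcing a dense save (the output file name is an assumption;
            // forceDense is an optional parameter of SaveAsText):
            //   using (var denseStream = new FileStream(
            //              Path.Combine(dataDirectoryName, "saved_data_dense.tsv"), FileMode.Create))
            //       mlContext.Data.SaveAsText(singleFileData, denseStream, forceDense: true);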

            // Create a TextLoader that allows sparse input.
            var sparseLoader = mlContext.Data.CreateTextLoader(
                columns: new[]
            {
                new TextLoader.Column("Features", DataKind.Single, 0, 9)
            },
                allowSparse: true
                );

            // Load the saved sparse data.
            var sparseData = sparseLoader.Load(sparseDataFileName);

            PrintRowCount(sparseData);

            // Expected Output:
            //   10


            // Create a TextLoader without any column schema using TextLoader.Options.
            // Since the sparse data file was saved with ML.NET, it has the schema
            // encoded in its header that the loader can understand:
            //
            // #@ TextLoader{
            // #@   sep=tab
            // #@   col=Features:R4:0-9
            // #@ }
            //
            // The schema syntax is unimportant since it is only used internally. In
            // short, it tells the loader that the values are separated by tabs, and
            // that columns 0-9 in the text file are to be read into one column named
            // "Features" of type Single (internal type R4).

            var options = new TextLoader.Options()
            {
                AllowSparse = true,
            };
            var dataSampleWithSchema   = new MultiFileSource(sparseDataFileName);
            var sparseLoaderWithSchema =
                mlContext.Data.CreateTextLoader(options, dataSample: dataSampleWithSchema);

            // Load the saved sparse data.
            var sparseDataWithSchema = sparseLoaderWithSchema.Load(sparseDataFileName);

            PrintRowCount(sparseDataWithSchema);

            // Expected Output:
            //   10
        }
Example #22
        public void MultiFileSourceUnitTest()
        {
            var fileSource = new MultiFileSource("adult.txt");

            Assert.True(fileSource.Count == 1);

            fileSource = new MultiFileSource("adult.tiny.with-schema.txt", "adult.tiny.with-schema.txt");
            Assert.True(fileSource.Count == 2, $"Error passing multiple paths to {nameof(MultiFileSource)}");

            // Create a directory with two files for the tests.
            var dirName = Directory.CreateDirectory("MultiFileSourceUnitTest").FullName;

            var file1 = Path.Combine(dirName, "a.txt");
            var file2 = Path.Combine(dirName, "b.txt");

            File.WriteAllText(file1, "Unit Test");
            File.WriteAllText(file2, "Unit Test");

            fileSource = new MultiFileSource($"{file1}+{file2}");
            Assert.True(fileSource.Count == 2, $"Error passing concatenated paths to {nameof(MultiFileSource)}");

            fileSource = new MultiFileSource(Path.Combine(dirName, "..."));
            Assert.True(fileSource.Count == 2, $"Error passing concatenated paths to {nameof(MultiFileSource)}");

            /* Create test directories and files with the following layout:
             * /MultiFileSourceUnitTest/Data
             * /MultiFileSourceUnitTest/Data/a.txt
             * /MultiFileSourceUnitTest/Data/b.txt
             * /MultiFileSourceUnitTest/DataFolder/
             * /MultiFileSourceUnitTest/DataFolder/SubFolder1
             * /MultiFileSourceUnitTest/DataFolder/SubFolder1/a.txt
             * /MultiFileSourceUnitTest/DataFolder/SubFolder2
             * /MultiFileSourceUnitTest/DataFolder/SubFolder2/b.txt
             */

            var dataDir = Directory.CreateDirectory("MultiFileSourceUnitTest/Data").FullName;

            var fileDataA = Path.Combine(dataDir, "a.txt");
            var fileDataB = Path.Combine(dataDir, "b.txt");

            File.WriteAllText(fileDataA, "Unit Test");
            File.WriteAllText(fileDataB, "Unit Test");

            var dataFolderDir = Directory.CreateDirectory("MultiFileSourceUnitTest/DataFolder").FullName;
            var subFolder1Dir = Directory.CreateDirectory("MultiFileSourceUnitTest/DataFolder/SubFolder1").FullName;
            var subFolder2Dir = Directory.CreateDirectory("MultiFileSourceUnitTest/DataFolder/SubFolder2").FullName;

            var fileDataSA = Path.Combine(subFolder1Dir, "a.txt");
            var fileDataSB = Path.Combine(subFolder2Dir, "b.txt");

            File.WriteAllText(fileDataSA, "Unit Test");
            File.WriteAllText(fileDataSB, "Unit Test");

            fileSource = new MultiFileSource(dataDir + "/*");
            Assert.True(fileSource.Count == 2, $"Error passing concatenated paths to {nameof(MultiFileSource)}");

            fileSource = new MultiFileSource(dataFolderDir + "/.../*");
            Assert.True(fileSource.Count == 2, $"Error passing concatenated paths to {nameof(MultiFileSource)}");

            //Delete test folder and files for test clean-up
            Directory.Delete(dirName, true);
        }
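Taken together, the path forms exercised by this test can be summarized in one sketch:

            var one      = new MultiFileSource("a.txt");            // single path
            var many     = new MultiFileSource("a.txt", "b.txt");   // multiple path arguments
            var plus     = new MultiFileSource("a.txt+b.txt");      // '+'-concatenated paths
            var wildcard = new MultiFileSource("Data/*");           // wildcard within a folder
            var deep     = new MultiFileSource("DataFolder/.../*"); // '...' recurses into subfolders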
Example #23
        void TestConcat()
        {
            string dataPath = GetDataPath("adult.tiny.with-schema.txt");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(ML, new TextLoader.Options
            {
                Columns = new[] {
                    new TextLoader.Column("float1", DataKind.Single, 9),
                    new TextLoader.Column("float4", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }),
                    new TextLoader.Column("float6", DataKind.Single, new[] { new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12, 14) }),
                    new TextLoader.Column("vfloat", DataKind.Single, new[] { new TextLoader.Range(14, null)
                                                                             {
                                                                                 AutoEnd = false, VariableEnd = true
                                                                             } })
                },
                Separator = "\t",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Load(source);

            DataViewType GetType(DataViewSchema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema[cIdx].Type);
            }

            var pipe = ML.Transforms.Concatenate("f1", "float1")
                       .Append(ML.Transforms.Concatenate("f2", "float1", "float1"))
                       .Append(ML.Transforms.Concatenate("f3", "float4", "float1"))
                       .Append(ML.Transforms.Concatenate("f4", "float6", "vfloat", "float1"));

            data = ML.Data.TakeRows(data, 10);
            data = pipe.Fit(data).Transform(data);

            DataViewType t;

            t = GetType(data.Schema, "f1");
            Assert.True(t is VectorDataViewType vt1 && vt1.ItemType == NumberDataViewType.Single && vt1.Size == 1);
            t = GetType(data.Schema, "f2");
            Assert.True(t is VectorDataViewType vt2 && vt2.ItemType == NumberDataViewType.Single && vt2.Size == 2);
            t = GetType(data.Schema, "f3");
            Assert.True(t is VectorDataViewType vt3 && vt3.ItemType == NumberDataViewType.Single && vt3.Size == 5);
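            // f4 includes the variable-length vfloat column, so the concatenated vector's
            // size is unknown and reported as 0.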
            t = GetType(data.Schema, "f4");
            Assert.True(t is VectorDataViewType vt4 && vt4.ItemType == NumberDataViewType.Single && vt4.Size == 0);

            data = ML.Transforms.SelectColumns("f1", "f2", "f3", "f4").Fit(data).Transform(data);

            var subdir     = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(ML, new TextSaver.Arguments {
                    Silent = true, Dense = true
                });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }
Example #24
        public void LoadModelAndExtractPredictor()
        {
            var mlContext = new MLContext(1);

            var file   = new MultiFileSource(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename));
            var loader = mlContext.Data.CreateTextLoader <InputData>(hasHeader: true, dataSample: file);
            var data   = loader.Load(file);

            // Pipeline.
            var pipeline = mlContext.BinaryClassification.Trainers.Gam();
            // Define the same pipeline starting with the loader.
            var pipeline1 = loader.Append(mlContext.BinaryClassification.Trainers.Gam());

            // Train.
            var transformerModel     = pipeline.Fit(data);
            var compositeLoaderModel = pipeline1.Fit(file);

            // Save and reload the "same" model with some differences in structure.

            // In this case we are saving the transformer model, but *not* the loader, just the schema from that loader.
            string modelAndSchemaPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-model-schema.zip");

            mlContext.Model.Save(transformerModel, data.Schema, modelAndSchemaPath);

            // In this case we have combined the loader with the transformer model to form a "composite" loader, and are just
            // saving that one loader to this file.
            string compositeLoaderModelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-composite-model.zip");

            mlContext.Model.Save(null, compositeLoaderModel, compositeLoaderModelPath);

            // In this case we are saving the transformer model, as well as the associated data loader.
            string loaderAndTransformerModelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-loader-transformer.zip");

            mlContext.Model.Save(transformerModel, loader, loaderAndTransformerModelPath);

            ITransformer loadedTransformerModel;
            IDataLoader <IMultiStreamSource> loadedCompositeLoader;
            ITransformer loadedTransformerModel1;

            using (var fs = File.OpenRead(modelAndSchemaPath))
                loadedTransformerModel = mlContext.Model.Load(fs, out var loadedSchema);
            using (var fs = File.OpenRead(compositeLoaderModelPath))
            {
                // This model can be loaded either as a composite data loader,
                // a transformer model + an input schema, or a transformer model + a data loader.
                var t = mlContext.Model.LoadWithDataLoader(fs, out loadedCompositeLoader);
                // This is a bit strange, as it seems to test that it can reload twice from the
                // same stream, opened only once, which as far as I know is not really a requirement
                // of the design or API, but we are nonetheless testing it. If this winds up failing,
                // I'm not sure we should really insist on this as a design requirement.
                var t1 = mlContext.Model.Load(fs, out var s);

                TestCommon.CheckSameSchemas(loadedCompositeLoader.GetOutputSchema(), s);
                // We combined the GAM with the loader, so the remaining chain should just be empty.
                Assert.Empty(Assert.IsType <TransformerChain <ITransformer> >(t));
                Assert.Empty(Assert.IsType <TransformerChain <ITransformer> >(t1));
            }
            using (var fs = File.OpenRead(loaderAndTransformerModelPath))
            {
                // This model can be loaded either as a composite data loader,
                // a transformer model + an input schema, or a transformer model + a data loader.
                var t = mlContext.Model.Load(fs, out var s);
                TestCommon.CheckSameSchemas(loader.GetOutputSchema(), s);

                loadedTransformerModel1 = mlContext.Model.LoadWithDataLoader(fs, out var l);
            }
        }
Example #25
        public GenerateSweepCandidatesCommand(IHostEnvironment env, Arguments args)
        {
            Contracts.CheckValue(env, nameof(env));
            _host = env.Register("GenerateCandidates");
            _host.CheckValue(args, nameof(args));

            var files = new MultiFileSource(args.DataFile);

            _host.CheckUserArg(files.Count > 0, nameof(args.DataFile), "dataFile is required");
            _dataFile = args.DataFile;

            _rspsOutFolder = Utils.CreateFolderIfNotExists(args.RspOutFolder);
            _host.CheckUserArg(_rspsOutFolder != null, nameof(args.RspOutFolder), "Provide a valid rspOutFolder (or 'out', the short name).");

            if (!string.IsNullOrWhiteSpace(args.SchemaDefinitionFile))
            {
                Utils.CheckOptionalUserDirectory(args.SchemaDefinitionFile, nameof(args.SchemaDefinitionFile));
                _schemaDefinitionFile = args.SchemaDefinitionFile;
            }

            if (!string.IsNullOrWhiteSpace(args.Sweeper))
            {
                var info = ComponentCatalog.GetLoadableClassInfo <SignatureSweeper>(args.Sweeper);
                _host.CheckUserArg(info?.SignatureTypes[0] == typeof(SignatureSweeper), nameof(args.Sweeper),
                                   "Please specify a valid sweeper.");
                _sweeper = args.Sweeper;
            }
            else
            {
                _sweeper = "kdo";
            }

            if (!string.IsNullOrWhiteSpace(args.Mode))
            {
                var info = ComponentCatalog.GetLoadableClassInfo <SignatureCommand>(args.Mode);
                _host.CheckUserArg(info?.Type == typeof(TrainCommand) ||
                                   info?.Type == typeof(TrainTestCommand) ||
                                   info?.Type == typeof(CrossValidationCommand), nameof(args.Mode), "Invalid mode.");
                _mode = args.Mode;
            }
            else
            {
                _mode = CrossValidationCommand.LoadName;
            }

            _indented = args.Indent;

            if (!string.IsNullOrWhiteSpace(args.TestFile))
            {
                files = new MultiFileSource(args.TestFile);
                _host.CheckUserArg(files.Count > 0, nameof(args.TestFile), "testFile needs to be a valid file, if provided.");
                _testFile = args.TestFile;
            }
            else
            {
                _host.CheckUserArg(_mode != TrainTestCommand.LoadName, nameof(args.TestFile), "testFile needs to be a valid file, for mode = TrainTest.");
            }

            _outputDataFolder = Utils.CreateFolderIfNotExists(args.OutputDataFolder);
            if (_outputDataFolder == null)
            {
                _outputDataFolder = _rspsOutFolder;
            }
        }
Example #26
        public void LoadModelAndExtractPredictor()
        {
            var file   = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
            var loader = ML.Data.CreateTextLoader <InputData>(hasHeader: true, dataSample: file);
            var data   = loader.Load(file);

            // Pipeline.
            var pipeline = ML.BinaryClassification.Trainers.Gam();
            // Define the same pipeline starting with the loader.
            var pipeline1 = loader.Append(ML.BinaryClassification.Trainers.Gam());

            // Train.
            var transformerModel     = pipeline.Fit(data);
            var compositeLoaderModel = pipeline1.Fit(file);

            // Save and reload the "same" model with some differences in structure.

            // In this case we are saving the transformer model, but *not* the loader, just the schema from that loader.
            string modelAndSchemaPath = GetOutputPath(FullTestName + "-model-schema.zip");

            ML.Model.Save(transformerModel, data.Schema, modelAndSchemaPath);

            // In this case we have combined the loader with the transformer model to form a "composite" loader, and are just
            // saving that one loader to this file.
            string compositeLoaderModelPath = GetOutputPath(FullTestName + "-composite-model.zip");

            ML.Model.Save(null, compositeLoaderModel, compositeLoaderModelPath);

            // In this case we are saving the transformer model, as well as the associated data loader.
            string loaderAndTransformerModelPath = GetOutputPath(FullTestName + "-loader-transformer.zip");

            ML.Model.Save(transformerModel, loader, loaderAndTransformerModelPath);

            ITransformer loadedTransformerModel;
            IDataLoader <IMultiStreamSource> loadedCompositeLoader;
            ITransformer loadedTransformerModel1;

            using (var fs = File.OpenRead(modelAndSchemaPath))
                loadedTransformerModel = ML.Model.Load(fs, out var loadedSchema);
            using (var fs = File.OpenRead(compositeLoaderModelPath))
            {
                // This model can be loaded either as a composite data loader,
                // a transformer model + an input schema, or a transformer model + a data loader.
                var t = ML.Model.LoadWithDataLoader(fs, out loadedCompositeLoader);
                // This is a bit strange: it tests that the model can be reloaded a second
                // time from the same stream, opened only once, which as far as I know is not
                // really a requirement of the design or API, but we are nonetheless testing it.
                // If this winds up failing, I'm not sure we should really insist on this as a
                // design requirement.
                var t1 = ML.Model.Load(fs, out var s);

                CheckSameSchemas(loadedCompositeLoader.GetOutputSchema(), s);
                // We combined the GAM with the loader, so the remaining chain should just be empty.
                Assert.Empty(Assert.IsType <TransformerChain <ITransformer> >(t));
                Assert.Empty(Assert.IsType <TransformerChain <ITransformer> >(t1));
            }
            using (var fs = File.OpenRead(loaderAndTransformerModelPath))
            {
                // This model can be loaded either as a composite data loader,
                // a transformer model + an input schema, or a transformer model + a data loader.
                var t = ML.Model.Load(fs, out var s);
                CheckSameSchemas(loader.GetOutputSchema(), s);

                loadedTransformerModel1 = ML.Model.LoadWithDataLoader(fs, out var l);
            }

            void AssertIsGam(ITransformer trans)
            {
                Assert.IsType <GamBinaryModelParameters>(
                    Assert.IsAssignableFrom <CalibratedModelParametersBase>(
                        Assert.IsAssignableFrom <ISingleFeaturePredictionTransformer <object> >(trans).Model).SubModel);
            }

            // In the case of the directly used transformer model, the thing we loaded should be itself the result from fitting GAM.
            AssertIsGam(loadedTransformerModel);

            // This case is quite similar; the only difference is that we omitted saving the loader and instead saved the input schema into the model itself.
            AssertIsGam(loadedTransformerModel1);

            // If we had combined the transformer with the loader, and then saved *that*, the resulting loaded "model"
            // will be empty (as tested above), but the loader itself will be a composite loader containing the result from
            // fitting GAM as the sole item in its transformer chain.
            var fromComposite = Assert.Single(Assert.IsType <TransformerChain <ITransformer> >(
                                                  Assert.IsType <CompositeDataLoader <IMultiStreamSource, ITransformer> >(loadedCompositeLoader).Transformer));

            AssertIsGam(fromComposite);

            Done();
        }
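        // A condensed recap of the three save/load pairings exercised above (paths shortened;
        // the calls are exactly the ones used in the test).
        private void SaveThreeWays(ITransformer transformerModel, IDataView data,
                                   IDataLoader<IMultiStreamSource> loader,
                                   IDataLoader<IMultiStreamSource> compositeLoaderModel)
        {
            // 1. Transformer plus the input schema: Load gives back the transformer and schema.
            ML.Model.Save(transformerModel, data.Schema, "model-schema.zip");
            // 2. Loader and transformer fused into a single composite loader; the transformer
            //    chain read back from this file is empty, since the work lives in the loader.
            ML.Model.Save(null, compositeLoaderModel, "composite-model.zip");
            // 3. Transformer saved side by side with its loader; LoadWithDataLoader returns
            //    the transformer and hands the loader back through an out parameter.
            ML.Model.Save(transformerModel, loader, "loader-transformer.zip");
        }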
Example #27
        // This example shows how to load data with SvmLightLoader.
        public static void Example()
        {
            // Create a random SVM light format file.
            var random            = new Random(42);
            var dataDirectoryName = "DataDir";

            Directory.CreateDirectory(dataDirectoryName);
            var fileName = Path.Combine(dataDirectoryName, "SVM_Data.csv");

            using (var fs = File.CreateText(fileName))
            {
                // Write random lines in SVM light format
                for (int line = 0; line < 10; line++)
                {
                    var sb = new StringBuilder();
                    if (random.NextDouble() > 0.5)
                    {
                        sb.Append("1 ");
                    }
                    else
                    {
                        sb.Append("-1 ");
                    }
                    if (line % 2 == 0)
                    {
                        // Trailing space keeps the example weight separate from the first feature.
                        sb.Append("cost:1 ");
                    }
                    else
                    {
                        sb.Append("cost:2 ");
                    }
                    for (int i = 1; i <= 10; i++)
                    {
                        if (random.NextDouble() > 0.5)
                        {
                            continue;
                        }
                        sb.Append($"{i}:{random.NextDouble()} ");
                    }
                    fs.WriteLine(sb.ToString());
                }
            }

            // Create an SvmLightLoader.
            var mlContext = new MLContext();
            var file      = new MultiFileSource(fileName);
            var loader    = mlContext.Data.CreateSvmLightLoader(dataSample: file);

            // Load a single file from path.
            var svmData = loader.Load(file);

            PrintSchema(svmData);

            // Expected Output:
            // Column Label type Single
            // Column Weight type Single
            // Column GroupId type Key<UInt64, 0 - 18446744073709551613>
            // Column Comment type String
            // Column Features type Vector<Single, 10>

            PrintData(svmData);

            // Expected Output:
            // 1 1 0 0 0.2625927 0 0 0.7612506 0.2573214 0 0.3809696 0.5174511
            // -1 1 0 0 0 0.7051522 0 0 0.7111546 0.9062127 0 0
            // -1 1 0 0 0 0.535722 0 0 0.1491191 0.05100901 0 0
            // -1 1 0 0.6481459 0.04449836 0 0 0.4203662 0 0 0.01325378 0.2674384
            // -1 1 0 0 0.7978093 0.5134962 0.008952909 0 0.003074009 0.6541431 0.9135142 0
            // -1 1 0 0.3727672 0.4369507 0 0 0.2973725 0 0 0 0.8816807
            // 1 1 0 0.1031429 0.3332489 0 0.1346936 0.5916625 0 0 0 0
            // 1 1 0 0 0 0.3454075 0 0.2197472 0.03848049 0.5923384 0.09373277 0
            // -1 1 0 0.7511514 0 0.0420841 0 0 0.9262196 0 0.545344 0
            // 1 1 0 0.02958358 0.9334617 0 0 0.8833956 0.2947684 0 0 0

            // If the loader is created without a data sample, we need to specify the number of features expected in the file.
            loader  = mlContext.Data.CreateSvmLightLoader(inputSize: 10);
            svmData = loader.Load(file);

            PrintSchema(svmData);
            PrintData(svmData);
        }
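        // A related variant, not exercised above: when the indices in the file are feature
        // names rather than 1-based numbers, a name-aware loader can be built from a data
        // sample. Hedged sketch: assumes the CreateSvmLightLoaderWithFeatureNames extension
        // is available on the same data catalog.
        private static void LoadWithFeatureNames(MLContext mlContext, MultiFileSource file)
        {
            // The loader infers the feature-name-to-slot mapping from the sample file.
            var namedLoader = mlContext.Data.CreateSvmLightLoaderWithFeatureNames(dataSample: file);
            var namedData   = namedLoader.Load(file);
        }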
Example #28
        void TestConcat()
        {
            string dataPath = GetDataPath("adult.test");

            var source = new MultiFileSource(dataPath);
            var loader = new TextLoader(Env, new TextLoader.Arguments
            {
                Column = new[] {
                    new TextLoader.Column("float1", DataKind.R4, 0),
                    new TextLoader.Column("float4", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }),
                    new TextLoader.Column("vfloat", DataKind.R4, new[] { new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10, null)
                                                                         {
                                                                             AutoEnd = false, VariableEnd = true
                                                                         } })
                },
                Separator = ",",
                HasHeader = true
            }, new MultiFileSource(dataPath));
            var data = loader.Read(source);

            ColumnType GetType(ISchema schema, string name)
            {
                Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
                return(schema.GetColumnType(cIdx));
            }

            var pipe = new ConcatEstimator(Env, "f1", "float1")
                       .Append(new ConcatEstimator(Env, "f2", "float1", "float1"))
                       .Append(new ConcatEstimator(Env, "f3", "float4", "float1"))
                       .Append(new ConcatEstimator(Env, "f4", "vfloat", "float1"));

            data = TakeFilter.Create(Env, data, 10);
            data = pipe.Fit(data).Transform(data);

            ColumnType t;

            t = GetType(data.Schema, "f1");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 1);
            t = GetType(data.Schema, "f2");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 2);
            t = GetType(data.Schema, "f3");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 5);
            t = GetType(data.Schema, "f4");
            Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 0);

            data = new ChooseColumnsTransform(Env, data, "f1", "f2", "f3", "f4");

            var subdir     = Path.Combine("Transform", "Concat");
            var outputPath = GetOutputPath(subdir, "Concat1.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true, Dense = true
                });
                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
            }

            CheckEquality(subdir, "Concat1.tsv");
            Done();
        }
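        // The same column fusions can be expressed against the current transforms catalog;
        // a minimal sketch, assuming a fresh MLContext instead of the Env host used above.
        private static IEstimator<ITransformer> BuildConcatPipeline(MLContext mlContext)
        {
            // Each Concatenate fuses its inputs into one vector column; the output sizes
            // follow the same rules asserted above (1, 2, 5, and variable-size for vfloat).
            return mlContext.Transforms.Concatenate("f1", "float1")
                .Append(mlContext.Transforms.Concatenate("f2", "float1", "float1"))
                .Append(mlContext.Transforms.Concatenate("f3", "float4", "float1"))
                .Append(mlContext.Transforms.Concatenate("f4", "vfloat", "float1"));
        }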
        static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, out IDataView sourceCtx)
        {
            sourceCtx = input;
            env.CheckValue(args.tag, "Tag cannot be empty.");
            if (TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.tag).Any())
            {
                throw env.Except("Tag '{0}' is already used.", args.tag);
            }
            env.CheckValue(args.selectTag, "Selected tag cannot be empty.");

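            // Two cases follow. Without a filename, the single existing view tagged with
            // 'selectTag' is located, cross-tagged with the input view, and returned.
            // With a filename, that file is loaded, the loaded view is tagged with
            // 'selectTag', cross-tagged with the input, and returned instead.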
            if (string.IsNullOrEmpty(args.filename))
            {
                var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag);
                if (!selected.Any())
                {
                    throw env.Except("Unable to find a view to select with tag '{0}'. Did you forget to specify a filename?", args.selectTag);
                }
                var first = selected.First();
                if (selected.Skip(1).Any())
                {
                    throw env.Except("Tag '{0}' is ambiguous, {1} views were found.", args.selectTag, selected.Count());
                }
                var tagged = input as ITaggedDataView;
                if (tagged == null)
                {
                    var ag = new TagViewTransform.Arguments {
                        tag = args.tag
                    };
                    tagged = new TagViewTransform(env, ag, input);
                }
                first.Item2.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.tag, tagged) });
                tagged.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.selectTag, first.Item2) });
#if (DEBUG_TIP)
                long count = DataViewUtils.ComputeRowCount(tagged);
                if (count == 0)
                {
                    throw env.Except("Replaced view is empty.");
                }
                count = DataViewUtils.ComputeRowCount(first.Item2);
                if (count == 0)
                {
                    throw env.Except("Selected view is empty.");
                }
#endif
                var tr = first.Item2 as IDataTransform;
                env.AssertValue(tr);
                return(tr);
            }
            else
            {
                if (!File.Exists(args.filename))
                {
                    throw env.Except("Unable to find file '{0}'.", args.filename);
                }
                var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag);
                if (selected.Any())
                {
                    throw env.Except("Tag '{0}' was already given. It cannot be assigned to the new file.", args.selectTag);
                }
                var loaderArgs   = new BinaryLoader.Arguments();
                var file         = new MultiFileSource(args.filename);
                var loadSettings = ScikitSubComponent <ILegacyDataLoader, SignatureDataLoader> .AsSubComponent(args.loaderSettings);

                IDataView loader = loadSettings.CreateInstance(env, file);

                var ag = new TagViewTransform.Arguments {
                    tag = args.selectTag
                };
                var newInput = new TagViewTransform(env, ag, loader);
                var tagged   = input as ITaggedDataView;
                if (tagged == null)
                {
                    ag = new TagViewTransform.Arguments {
                        tag = args.tag
                    };
                    tagged = new TagViewTransform(env, ag, input);
                }

                newInput.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.tag, tagged) });
                tagged.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.selectTag, newInput) });

                var schema = loader.Schema;
                if (schema.Count == 0)
                {
                    throw env.Except("The loaded view '{0}' is empty (empty schema).", args.filename);
                }
                return(newInput);
            }
        }
        public void FastTreeRegressionRepresentation()
        {
            var env        = new MLContext(seed: 0);
            var dataPath   = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
            var dataSource = new MultiFileSource(dataPath);

            var catalog = new RegressionCatalog(env);

            var reader = TextLoaderStatic.CreateLoader(env,
                                                       c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
                                                       separator: ';', hasHeader: true);

            var opts = new FastTreeRegressionTrainer.Options()
            {
                NumTrees   = 10,
                NumLeaves  = 5,
                NumThreads = 1
            };

            FastTreeRegressionModelParameters pred = null;

            var est = reader.MakeNewEstimator()
                      .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
                                                                              onFit: (p) => { pred = p; })));

            var pipe = reader.Append(est);

            Assert.Null(pred);
            var model = pipe.Fit(dataSource);

            Assert.NotNull(pred);

            var treeCollection = pred.TrainedTreeEnsemble;

            Assert.Equal(0, treeCollection.Bias);
            Assert.Equal(10, treeCollection.Trees.Count);
            Assert.Equal(10, treeCollection.TreeWeights.Count);

            var trees = treeCollection.Trees;

            Assert.Equal(4, trees[0].NumNodes);

            // Numerical split. There is no categorical split, so the following vector contains zero elements.
            var categoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(0);

            Assert.Equal(0, categoricalSplitFeatures.Count);

            // Numerical split. There is no categorical split, so the following vector contains zero elements.
            var categoricalSplitFeatureRange = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0);

            Assert.Equal(0, categoricalSplitFeatureRange.Count);

            var expectedGtChild = new int[] { 3, 2, -4, -5 };

            Assert.Equal(4, trees[0].GtChild.Count);
            Assert.Equal(expectedGtChild, trees[0].GtChild);

            var expectedLteChild = new int[] { 1, -1, -3, -2 };

            Assert.Equal(4, trees[0].LteChild.Count);
            Assert.Equal(expectedLteChild, trees[0].LteChild);

            var expectedCategoricalSplitFlags = new bool[] { false, false, false, false };

            Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
            Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

            var expectedNumericalSplitFeatureIndexes = new int[] { 0, 10, 2, 10 };

            Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
            Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

            var expectedNumericalSplitThresholds = new float[] { 0.14f, -0.645f, -0.095f, 0.31f };

            Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
            for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
            {
                Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);
            }

            Assert.Equal(5, trees[0].NumLeaves);

            var expectedLeafValues = new double[] { 40.159015006449692, 80.434805844435061, 57.072130551545513, 82.898710076162757, 104.17547955322266 };

            Assert.Equal(5, trees[0].LeafValues.Count);
            for (int i = 0; i < trees[0].LeafValues.Count; ++i)
            {
                Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
            }
        }
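        // The arrays asserted above fully describe the tree, so a single example can be
        // scored by hand. Minimal sketch; it assumes the usual encoding in which a negative
        // child value c denotes leaf ~c (so -1 is leaf 0 and -5 is leaf 4).
        private static double ScoreTree(RegressionTree tree, IReadOnlyList<float> features)
        {
            int node = 0;
            while (node >= 0)
            {
                // Route left (LteChild) when the split feature is at or below the threshold.
                int featureIndex = tree.NumericalSplitFeatureIndexes[node];
                node = features[featureIndex] <= tree.NumericalSplitThresholds[node]
                    ? tree.LteChild[node]
                    : tree.GtChild[node];
            }
            return tree.LeafValues[~node];
        }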