public void LogisticRegressionBinaryClassification()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var dataSource = new MultiFileSource(dataPath);
    var ctx = new BinaryClassificationContext(env);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    IPredictorWithFeatureWeights<float> pred = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: ctx.Trainers.LogisticRegressionBinaryClassifier(
            r.label, r.features,
            l1Weight: 10,
            onFit: (p) => { pred = p; },
            advancedSettings: s => s.NumThreads = 1)));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    // 9 input features, so we ought to have 9 weights.
    VBuffer<float> weights = new VBuffer<float>();
    pred.GetFeatureWeights(ref weights);
    Assert.Equal(9, weights.Length);

    var data = model.Read(dataSource);
    var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);
    // Run a sanity check against a few of the metrics.
    Assert.InRange(metrics.Accuracy, 0, 1);
    Assert.InRange(metrics.Auc, 0, 1);
    Assert.InRange(metrics.Auprc, 0, 1);
}
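// A minimal sketch of the same train-and-evaluate flow with the dynamic MLContext API instead
// of the statically typed reader above. The column names ("Label", "Features") and the choice
// of the SdcaLogisticRegression trainer are assumptions for illustration, not part of the test.
public static void TrainAndEvaluateSketch(string dataPath)
{
    var mlContext = new MLContext(seed: 0);

    // Column 0 of the breast-cancer data is the label; columns 1-9 become the feature vector.
    var loader = mlContext.Data.CreateTextLoader(new[]
    {
        new TextLoader.Column("Label", DataKind.Boolean, 0),
        new TextLoader.Column("Features", DataKind.Single, 1, 9)
    });
    var data = loader.Load(new MultiFileSource(dataPath));

    var pipeline = mlContext.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "Features");
    var model = pipeline.Fit(data);

    var metrics = mlContext.BinaryClassification.Evaluate(model.Transform(data));
    Console.WriteLine($"Accuracy: {metrics.Accuracy}, AUC: {metrics.AreaUnderRocCurve}");
}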
public void TestCustomTransformer()
{
    string dataPath = GetDataPath("adult.tiny.with-schema.txt");
    var source = new MultiFileSource(dataPath);
    var loader = ML.Data.CreateTextLoader(new[]
    {
        new TextLoader.Column("Float1", DataKind.R4, 9),
        new TextLoader.Column("Float4", DataKind.R4, new[]
        {
            new TextLoader.Range(9),
            new TextLoader.Range(10),
            new TextLoader.Range(11),
            new TextLoader.Range(12)
        })
    }, hasHeader: true);
    var data = loader.Read(source);

    IDataView transformedData;
    // We create a temporary environment to instantiate the custom transformer. This is to ensure
    // that we don't need the same environment for saving and loading.
    var tempoEnv = new MLContext();
    var customEst = new CustomMappingEstimator<MyInput, MyOutput>(tempoEnv, MyLambda.MyAction, "MyLambda");

    try
    {
        TestEstimatorCore(customEst, data);
        Assert.True(false, "Cannot work without MEF injection");
    }
    catch (InvalidOperationException ex)
    {
        if (!ex.IsMarked())
            throw;
    }

    ML.CompositionContainer = new CompositionContainer(new TypeCatalog(typeof(MyLambda)));
    TestEstimatorCore(customEst, data);
    transformedData = customEst.Fit(data).Transform(data);

    var inputs = ML.CreateEnumerable<MyInput>(transformedData, true);
    var outputs = ML.CreateEnumerable<MyOutput>(transformedData, true);
    Assert.True(inputs.Zip(outputs, (x, y) => y.Together == $"{x.Float1} + {string.Join(", ", x.Float4)}").All(x => x));

    Done();
}
public void SaveCompositeLoaderAndLoad()
{
    var file = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
    var loader = ML.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var composite = loader.Append(ML.Transforms.Normalize("Features"));
    var loaderWithEmbeddedModel = composite.Fit(file);

    string modelPath = GetOutputPath(FullTestName + "-model.zip");
    ML.Model.Save(null, loaderWithEmbeddedModel, modelPath);

    Load(modelPath, out var loadedWithSchema, out var loadedSchema,
        out var loadedWithLoader, out var loadedLoaderWithTransformer);

    // Because we saved the transform model as part of the composite loader, the transformer that
    // gets loaded should be an empty transformer chain: the "model," such as it is, has been
    // folded into the loader.
    Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(loadedWithSchema));
    Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(loadedWithLoader));

    var expectedSchema = loaderWithEmbeddedModel.GetOutputSchema();
    Assert.True(expectedSchema.Count == 3);
    Assert.NotNull(expectedSchema.GetColumnOrNull("Label"));
    Assert.NotNull(expectedSchema.GetColumnOrNull("Features"));
    Assert.True(expectedSchema["Features"].HasSlotNames());

    CheckSameSchemas(loaderWithEmbeddedModel.GetOutputSchema(), loadedSchema);
    var schemaFromLoadedLoader = loadedLoaderWithTransformer.GetOutputSchema();
    CheckSameSchemas(loaderWithEmbeddedModel.GetOutputSchema(), schemaFromLoadedLoader);

    // The loader itself should be a composite data loader, and its single transformer
    // should be the normalizing transformer.
    var compositeLoader = Assert.IsType<CompositeDataLoader<IMultiStreamSource, ITransformer>>(loadedLoaderWithTransformer);
    var chainFromLoader = compositeLoader.Transformer;
    Assert.IsType<NormalizingTransformer>(Assert.Single(compositeLoader.Transformer));

    Done();
}
public void SdcaBinaryClassification()
{
    var env = new TlcEnvironment(seed: 0);
    var dataPath = GetDataPath("breast-cancer.txt");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    LinearBinaryPredictor pred = null;
    ParameterMixingCalibratedPredictor cali = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: r.label.PredictSdcaBinaryClassification(
            r.features, maxIterations: 2,
            onFit: (p, c) => { pred = p; cali = c; })));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    Assert.Null(cali);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);
    Assert.NotNull(cali);
    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, pred.Weights2.Count);

    var data = model.Read(dataSource);
    // Just output some data on the schema for fun.
    var rows = DataViewUtils.ComputeRowCount(data.AsDynamic);
    var schema = data.AsDynamic.Schema;
    for (int c = 0; c < schema.ColumnCount; ++c)
        Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
}
public InferRecipesCommand(IHostEnvironment env, Arguments args)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register("InferRecipes", seed: 0, verbose: true);
    _host.CheckValue(args, nameof(args));

    var files = new MultiFileSource(args.DataFile);
    _host.CheckUserArg(files.Count > 0, nameof(args.DataFile), "dataFile is required");
    _dataFile = args.DataFile;

    if (!string.IsNullOrWhiteSpace(args.RspOutputFile))
    {
        Utils.CheckOptionalUserDirectory(args.RspOutputFile, nameof(args.RspOutputFile));
        _rspOutFile = args.RspOutputFile;
    }

    if (!string.IsNullOrWhiteSpace(args.SchemaDefinitionFile))
    {
        Utils.CheckOptionalUserDirectory(args.SchemaDefinitionFile, nameof(args.SchemaDefinitionFile));
        _schemaDefinitionFile = args.SchemaDefinitionFile;
    }
}
public void AveragePerceptronNoCalibration()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath(TestDatasets.breastCancer.trainFilename);
    var dataSource = new MultiFileSource(dataPath);
    var ctx = new BinaryClassificationContext(env);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    LinearBinaryPredictor pred = null;
    var loss = new HingeLoss(new HingeLoss.Arguments() { Margin = 1 });

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: ctx.Trainers.AveragedPerceptron(
            r.label, r.features, lossFunction: loss,
            numIterations: 2, onFit: p => pred = p)));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);
    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, pred.Weights2.Count);

    var data = model.Read(dataSource);
    var metrics = ctx.Evaluate(data, r => r.label, r => r.preds);
    // Run a sanity check against a few of the metrics.
    Assert.InRange(metrics.Accuracy, 0, 1);
    Assert.InRange(metrics.Auc, 0, 1);
    Assert.InRange(metrics.Auprc, 0, 1);
}
public void SaveTransformerAndSchemaAndLoad()
{
    var file = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
    var loader = _ml.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var estimator = _ml.Transforms.Normalize("Features");
    var model = estimator.Fit(loader.Load(file));

    string modelPath = GetOutputPath(FullTestName + "-model.zip");
    _ml.Model.Save(model, loader.GetOutputSchema(), modelPath);

    Load(modelPath, out var loadedWithSchema, out var loadedSchema, out var loadedLoader,
        out var loadedWithLoader, out var loadedLoaderWithTransformer);
    Assert.True(loadedWithSchema is NormalizingTransformer);
    Assert.True(loadedSchema.Count == 2 &&
        loadedSchema.GetColumnOrNull("Label") != null &&
        loadedSchema.GetColumnOrNull("Features") != null &&
        loadedSchema["Features"].HasSlotNames());
    Assert.Null(loadedLoader);
    Assert.Null(loadedWithLoader);
    Assert.Null(loadedLoaderWithTransformer);
}
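// A minimal sketch of reloading a model that was saved together with an input schema, using the
// MLContext.Model.Load overload that returns the stored DataViewSchema; the path is the one
// produced by the test above.
public static ITransformer LoadModelSketch(MLContext mlContext, string modelPath)
{
    // Load returns the transformer (chain) and hands back the input schema it was saved with.
    ITransformer model = mlContext.Model.Load(modelPath, out DataViewSchema inputSchema);
    Console.WriteLine($"Model expects {inputSchema.Count} input columns.");
    return model;
}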
public void ConcatWith()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("iris.data");
    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadText(4), values: c.LoadFloat(0, 3), value: c.LoadFloat(2)),
        separator: ',');
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    var est = data.MakeNewEstimator()
        .Append(r => (
            r.label, r.values, r.value,
            c0: r.label.AsVector(),
            c1: r.label.ConcatWith(r.label),
            c2: r.value.ConcatWith(r.values),
            c3: r.values.ConcatWith(r.value, r.values)));

    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;

    int[] idx = new int[4];
    for (int i = 0; i < idx.Length; ++i)
        Assert.True(schema.TryGetColumnIndex("c" + i, out idx[i]), $"Could not find col c{i}");

    var types = new VectorType[idx.Length];
    int[] expectedLen = new int[] { 1, 2, 5, 9 };
    for (int i = 0; i < idx.Length; ++i)
    {
        var type = schema.GetColumnType(idx[i]);
        Assert.True(type.VectorSize > 0, $"Col c{i} had unexpected type {type}");
        types[i] = type.AsVector;
        Assert.Equal(expectedLen[i], type.VectorSize);
    }

    Assert.Equal(TextType.Instance, types[0].ItemType);
    Assert.Equal(TextType.Instance, types[1].ItemType);
    Assert.Equal(NumberType.Float, types[2].ItemType);
    Assert.Equal(NumberType.Float, types[3].ItemType);
}
public void PrincipalComponentAnalysis()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("generated_regression_dataset.csv");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var data = reader.Read(dataSource);

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, pca: r.features.ToPrincipalComponents(rank: 5)));
    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;

    Assert.True(schema.TryGetColumnIndex("pca", out int pcaCol));
    var type = schema.GetColumnType(pcaCol);
    Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
}
public void TestSchemaPropagation()
{
    string dataPath = GetDataPath("adult.test");
    var source = new MultiFileSource(dataPath);
    var loader = ML.Data.CreateTextLoader(new[]
    {
        new TextLoader.Column("Float1", DataKind.R4, 0),
        new TextLoader.Column("Float4", DataKind.R4, new[]
        {
            new TextLoader.Range(0),
            new TextLoader.Range(2),
            new TextLoader.Range(4),
            new TextLoader.Range(10)
        }),
        new TextLoader.Column("Text1", DataKind.Text, 0)
    }, hasHeader: true, separatorChar: ',');
    var data = loader.Read(source);

    Action<MyInput, MyOutput> mapping = (input, output) => output.Together = input.Float1.ToString();
    var est = ML.Transforms.CustomMapping(mapping, null);

    // Make sure schema propagation works for valid data.
    est.GetOutputSchema(SchemaShape.Create(data.Schema));

    // Make sure schema propagation fails for invalid data. (The original try/catch with
    // Assert.True(false) inside would swallow its own assertion failure; asserting the
    // throw directly avoids that.)
    var badData1 = ML.Transforms.CopyColumns("Text1", "Float1").Fit(data).Transform(data);
    Assert.ThrowsAny<Exception>(() => est.GetOutputSchema(SchemaShape.Create(badData1.Schema)));

    var badData2 = ML.Transforms.SelectColumns(new[] { "Float1" }).Fit(data).Transform(data);
    Assert.ThrowsAny<Exception>(() => est.GetOutputSchema(SchemaShape.Create(badData2.Schema)));

    Done();
}
public void ToKey()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("iris.data");
    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadText(4), values: c.LoadFloat(0, 3)),
        separator: ',');
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    var est = data.MakeNewEstimator()
        .Append(r => (labelKey: r.label.ToKey(), valuesKey: r.values.ToKey(onFit: m => { })))
        .Append(r => (r.labelKey, r.valuesKey, valuesKeyKey: r.valuesKey.ToKey()));

    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;
    Assert.True(schema.TryGetColumnIndex("labelKey", out int labelCol));
    Assert.True(schema.TryGetColumnIndex("valuesKey", out int valuesCol));
    Assert.True(schema.TryGetColumnIndex("valuesKeyKey", out int valuesKeyCol));

    Assert.Equal(3, schema.GetColumnType(labelCol).KeyCount);
    Assert.True(schema.GetColumnType(valuesCol).ItemType.IsKey);
    Assert.True(schema.GetColumnType(valuesKeyCol).ItemType.IsKey);

    var labelKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, labelCol);
    var valuesKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesCol);
    var valuesKeyKeyType = schema.GetMetadataTypeOrNull(MetadataUtils.Kinds.KeyValues, valuesKeyCol);
    Assert.NotNull(labelKeyType);
    Assert.NotNull(valuesKeyType);
    Assert.NotNull(valuesKeyKeyType);
    Assert.True(labelKeyType.IsVector && labelKeyType.ItemType == TextType.Instance);
    Assert.True(valuesKeyType.IsVector && valuesKeyType.ItemType == NumberType.Float);
    Assert.True(valuesKeyKeyType.IsVector && valuesKeyKeyType.ItemType == NumberType.Float);
    // Because they're over exactly the same data, they ought to have the same cardinality and everything.
    Assert.True(valuesKeyKeyType.Equals(valuesKeyType));
}
void New_FileBasedSavingOfData()
{
    var ml = new MLContext(seed: 1, conc: 1);
    var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename));
    var trainData = ml.Data.TextReader(MakeSentimentTextLoaderArgs())
        .Append(ml.Transforms.Text.FeaturizeText("SentimentText", "Features"))
        .Fit(src).Read(src);

    var path = DeleteOutputPath("i.idv");
    using (var file = File.Create(path))
    {
        var saver = new BinarySaver(ml, new BinarySaver.Arguments());
        using (var ch = ((IHostEnvironment)ml).Start("SaveData"))
            DataSaverUtils.SaveDataView(ch, saver, trainData, file);
    }

    var trainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
        "Label", "Features", advancedSettings: s => s.NumThreads = 1);
    var loadedTrainData = new BinaryLoader(ml, new BinaryLoader.Arguments(), new MultiFileSource(path));

    // Train.
    var model = trainer.Fit(loadedTrainData);
}
public void CrossValidate()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.iris.trainFilename);
    var dataSource = new MultiFileSource(dataPath);
    var ctx = new MulticlassClassificationContext(env);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadText(0), features: c.LoadFloat(1, 4)));

    var est = reader.MakeNewEstimator()
        .Append(r => (label: r.label.ToKey(), r.features))
        .Append(r => (r.label, preds: ctx.Trainers.Sdca(r.label, r.features, maxIterations: 2)));

    var results = ctx.CrossValidate(reader.Read(dataSource), est, r => r.label)
        .Select(x => x.metrics).ToArray();
    Assert.Equal(5, results.Length);
    Assert.True(results.All(x => x.LogLoss > 0));
}
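// A minimal sketch of the same 5-fold cross-validation with the dynamic MLContext API, assuming
// the standard MulticlassClassification.CrossValidate method; the column names are illustrative.
public static void CrossValidateSketch(MLContext mlContext, IDataView data)
{
    var pipeline = mlContext.Transforms.Conversion.MapValueToKey("Label")
        .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy("Label", "Features"));

    // CrossValidate splits the data into folds, trains on each, and returns per-fold metrics.
    var results = mlContext.MulticlassClassification.CrossValidate(data, pipeline, numberOfFolds: 5);
    foreach (var fold in results)
        Console.WriteLine($"Fold {fold.Fold}: LogLoss = {fold.Metrics.LogLoss}");
}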
public void LpGcNormAndWhitening()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("generated_regression_dataset.csv");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var data = reader.Read(dataSource);

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label,
            lpnorm: r.features.LpNormalize(),
            gcnorm: r.features.GlobalContrastNormalize(),
            zcawhitened: r.features.ZcaWhitening(),
            pcswhitened: r.features.PcaWhitening()));
    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;

    Assert.True(schema.TryGetColumnIndex("lpnorm", out int lpnormCol));
    var type = schema.GetColumnType(lpnormCol);
    Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("gcnorm", out int gcnormCol));
    type = schema.GetColumnType(gcnormCol);
    Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("zcawhitened", out int zcawhitenedCol));
    type = schema.GetColumnType(zcawhitenedCol);
    Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("pcswhitened", out int pcswhitenedCol));
    type = schema.GetColumnType(pcswhitenedCol);
    Assert.True(type.IsVector && type.IsKnownSizeVector && type.ItemType.IsNumber);
}
public void TestCustomTransformer()
{
    string dataPath = GetDataPath("adult.test");
    var source = new MultiFileSource(dataPath);
    var loader = ML.Data.TextReader(new[]
    {
        new TextLoader.Column("Float1", DataKind.R4, 0),
        new TextLoader.Column("Float4", DataKind.R4, new[]
        {
            new TextLoader.Range(0),
            new TextLoader.Range(2),
            new TextLoader.Range(4),
            new TextLoader.Range(10)
        })
    }, s => { s.Separator = ","; s.HasHeader = true; });
    var data = loader.Read(source);

    IDataView transformedData;
    // We create a temporary environment to instantiate the custom transformer. This is to ensure
    // that we don't need the same environment for saving and loading.
    var tempoEnv = new MLContext();
    var customEst = new CustomMappingEstimator<MyInput, MyOutput>(tempoEnv, MyLambda.MyAction, "MyLambda");

    try
    {
        TestEstimatorCore(customEst, data);
        Assert.True(false, "Cannot work without MEF injection");
    }
    catch (Exception)
    {
        // REVIEW: we should have a common mechanism that will make sure this is 'our' exception thrown.
    }

    ML.CompositionContainer = new CompositionContainer(new TypeCatalog(typeof(MyLambda)));
    TestEstimatorCore(customEst, data);
    transformedData = customEst.Fit(data).Transform(data);

    var inputs = transformedData.AsEnumerable<MyInput>(ML, true);
    var outputs = transformedData.AsEnumerable<MyOutput>(ML, true);
    Assert.True(inputs.Zip(outputs, (x, y) => y.Together == $"{x.Float1} + {string.Join(", ", x.Float4)}").All(x => x));

    Done();
}
public void TestStatefulCustomMappingTransformer()
{
    string dataPath = GetDataPath("breast-cancer.txt");
    var source = new MultiFileSource(dataPath);
    var loader = ML.Data.CreateTextLoader(new[]
    {
        new TextLoader.Column("Features", DataKind.Single, 1, 9),
        new TextLoader.Column("Label", DataKind.String, 0),
        new TextLoader.Column("Value", DataKind.Single, 2),
    });
    var data = loader.Load(source);

    // We create a temporary environment to instantiate the custom transformer. This is to ensure
    // that we don't need the same environment for saving and loading.
    var tempoEnv = new MLContext();
    var customEst = tempoEnv.Transforms.StatefulCustomMapping<MyStatefulInput, MyStatefulOutput, MyState>(
        MyStatefulLambda.MyStatefulAction, MyStatefulLambda.MyStateInit, nameof(MyStatefulLambda));

    TestEstimatorCore(customEst, data);
    var transformedData = customEst.Fit(data).Transform(data);
    var outputs = transformedData.GetColumn<bool>(transformedData.Schema[nameof(MyStatefulOutput.FirstAppearance)]);
    Assert.Equal(10, outputs.Count(output => output));

    Done();
}
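// A hedged sketch of the custom types the stateful test above relies on; the real
// MyStatefulInput/MyStatefulOutput/MyState/MyStatefulLambda definitions are not shown in this
// snippet, so everything below is an assumption that merely matches the shape of the
// StatefulCustomMapping<TSrc, TDst, TState> call (the Assert.Equal(10, ...) above would only
// hold if the mapped column really has 10 distinct values).
public sealed class MyStatefulInput
{
    public float Value { get; set; }   // assumed: bound to the "Value" column loaded above
}

public sealed class MyStatefulOutput
{
    public bool FirstAppearance { get; set; }
}

public sealed class MyState
{
    public HashSet<float> Seen;
}

public static class MyStatefulLambda
{
    // State initializer: called once per cursor to set up the per-cursor state object.
    public static void MyStateInit(MyState state) => state.Seen = new HashSet<float>();

    // Mapping: flags the first time each value is encountered within the cursor.
    public static void MyStatefulAction(MyStatefulInput input, MyStatefulOutput output, MyState state)
        => output.FirstAppearance = state.Seen.Add(input.Value);
}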
public void TestCustomTransformer(bool registerAssembly)
{
    string dataPath = GetDataPath("adult.tiny.with-schema.txt");
    var source = new MultiFileSource(dataPath);
    var loader = ML.Data.CreateTextLoader(new[]
    {
        new TextLoader.Column("Float1", DataKind.Single, 9),
        new TextLoader.Column("Float4", DataKind.Single, new[]
        {
            new TextLoader.Range(9),
            new TextLoader.Range(10),
            new TextLoader.Range(11),
            new TextLoader.Range(12)
        })
    }, hasHeader: true);
    var data = loader.Load(source);

    IDataView transformedData;
    // We create a temporary environment to instantiate the custom transformer. This is to ensure
    // that we don't need the same environment for saving and loading.
    var tempoEnv = new MLContext(1);
    var customEst = new CustomMappingEstimator<MyInput, MyOutput>(tempoEnv, MyLambda.MyAction, "MyLambda");

    // Before 1.5-preview3 it was required to register the assembly. Now, the assembly
    // information is automatically saved in the model and the assembly is registered when
    // loading. This tests that the custom transformer still works even if the assembly is
    // registered explicitly.
    if (registerAssembly)
        ML.ComponentCatalog.RegisterAssembly(typeof(MyLambda).Assembly);

    TestEstimatorCore(customEst, data);
    transformedData = customEst.Fit(data).Transform(data);

    var inputs = ML.Data.CreateEnumerable<MyInput>(transformedData, true);
    var outputs = ML.Data.CreateEnumerable<MyOutput>(transformedData, true);
    Assert.True(inputs.Zip(outputs, (x, y) => y.Together == $"{x.Float1} + {string.Join(", ", x.Float4)}").All(x => x));

    Done();
}
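// A hedged sketch of the MyInput/MyOutput/MyLambda types the custom-mapping tests above rely on.
// The mapping body is inferred from the assertions (Together == "{Float1} + {Float4 joined}");
// the CustomMappingFactory plumbing follows the documented ML.NET pattern for making a custom
// mapping loadable from a saved model, and is an assumption about the real test code.
public sealed class MyInput
{
    public float Float1 { get; set; }
    [VectorType(4)]
    public float[] Float4 { get; set; }
}

public sealed class MyOutput
{
    public string Together { get; set; }
}

[CustomMappingFactoryAttribute("MyLambda")]
public class MyLambda : CustomMappingFactory<MyInput, MyOutput>
{
    // The static mapping passed directly to CustomMappingEstimator above.
    public static void MyAction(MyInput input, MyOutput output)
        => output.Together = $"{input.Float1} + {string.Join(", ", input.Float4)}";

    // Invoked when a saved model containing this mapping is loaded back.
    public override Action<MyInput, MyOutput> GetMapping() => MyAction;
}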
public void Normalizer()
{
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("generated_regression_dataset.csv");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);
    var data = reader.Read(dataSource);

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, r.features,
            bin: r.features.NormalizeByBinning(),
            mm: r.features.Normalize()));
    var tdata = est.Fit(data).Transform(data);
    var schema = tdata.AsDynamic.Schema;

    Assert.True(schema.TryGetColumnIndex("features", out int featCol));
    Assert.True(schema.TryGetColumnIndex("bin", out int binCol));
    Assert.True(schema.TryGetColumnIndex("mm", out int mmCol));
    Assert.False(schema.IsNormalized(featCol));
    Assert.True(schema.IsNormalized(binCol));
    Assert.True(schema.IsNormalized(mmCol));
}
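// A minimal dynamic-API sketch of the same two normalizers, assuming the standard
// NormalizeMinMax/NormalizeBinning catalog methods; the column names are illustrative.
public static void NormalizeSketch(MLContext mlContext, IDataView data)
{
    var pipeline = mlContext.Transforms.NormalizeMinMax("FeaturesMinMax", "Features")
        .Append(mlContext.Transforms.NormalizeBinning("FeaturesBinned", "Features"));

    var transformed = pipeline.Fit(data).Transform(data);
    // Both output columns should carry the IsNormalized annotation, like "bin" and "mm" above.
    Console.WriteLine(transformed.Schema["FeaturesMinMax"].IsNormalized());
    Console.WriteLine(transformed.Schema["FeaturesBinned"].IsNormalized());
}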
/// <summary>
/// Reads a text file as an IDataView.
/// Follows the pandas API.
/// </summary>
/// <param name="filenames">filenames</param>
/// <param name="sep">column separator</param>
/// <param name="header">has a header or not</param>
/// <param name="names">column names (can be empty)</param>
/// <param name="dtypes">column types (can be empty)</param>
/// <param name="nrows">number of rows to read</param>
/// <param name="guess_rows">number of rows used to guess types</param>
/// <param name="encoding">text encoding</param>
/// <param name="useThreads">specific to TextLoader</param>
/// <param name="index">add a column to hold the index</param>
/// <param name="host">host</param>
/// <returns>the loaded IDataView</returns>
public static IDataView ReadCsvToTextLoader(string[] filenames,
    char sep = ',', bool header = true,
    string[] names = null, ColumnType[] dtypes = null,
    int nrows = -1, int guess_rows = 10,
    Encoding encoding = null, bool useThreads = true,
    bool index = false, IHost host = null)
{
    // Read a sample of the first file to infer the schema.
    var df = ReadCsv(filenames[0], sep: sep, header: header, names: names, dtypes: dtypes,
        nrows: guess_rows, guess_rows: guess_rows, encoding: encoding, index: index);
    var sch = df.Schema;
    var cols = new TextLoader.Column[sch.ColumnCount];
    for (int i = 0; i < cols.Length; ++i)
        cols[i] = TextLoader.Column.Parse(df.NameType(i));

    var args = new TextLoader.Arguments()
    {
        AllowQuoting = false,
        Separator = string.Format("{0}", sep),
        Column = cols,
        TrimWhitespace = true,
        UseThreads = useThreads,
        HasHeader = header,
        MaxRows = nrows > 0 ? (int?)nrows : null
    };

    if (host == null)
        host = new ConsoleEnvironment().Register("TextLoader");

    var multiSource = new MultiFileSource(filenames);
    return new TextLoader(host, args, multiSource).Read(multiSource);
}
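// A hedged usage sketch for the helper above; the file names are hypothetical, and the schema
// is whatever ReadCsv infers from the first file.
public static void ReadCsvToTextLoaderSketch()
{
    // Read at most 100 rows from two (hypothetical) CSV parts, inferring column types from 20 rows.
    var view = ReadCsvToTextLoader(new[] { "train_part1.csv", "train_part2.csv" },
        sep: ',', header: true, nrows: 100, guess_rows: 20);
    Console.WriteLine(view.Schema.ColumnCount);
}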
public void FastTreeRegressionRepresentationWithCategoricalSplit()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var dataSource = new MultiFileSource(dataPath);
    var catalog = new RegressionCatalog(env);

    var reader = TextLoaderStatic.CreateLoader(env,
        c => (label: c.LoadFloat(11), features: c.LoadText(0, 10)),
        separator: ';', hasHeader: true);

    FastTreeRegressionModelParameters pred = null;

    var opts = new FastTreeRegressionTrainer.Options()
    {
        CategoricalSplit = true,
        NumTrees = 3,
        NumLeaves = 5,
        NumThreads = 1,
        // This is the minimum number of samples required to form a split (i.e., generating two
        // extra nodes/leaves). For a small data set, we should set a small value. Otherwise,
        // the trained trees could be empty.
        MinDocumentsInLeafs = 2
    };

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, features: r.features.OneHotEncoding()))
        .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
            onFit: (p) => { pred = p; })));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    var treeCollection = pred.TrainedTreeEnsemble;
    Assert.Equal(0, treeCollection.Bias);
    Assert.Equal(3, treeCollection.Trees.Count);
    Assert.Equal(3, treeCollection.TreeWeights.Count);

    var trees = treeCollection.Trees;
    Assert.Equal(4, trees[0].NumNodes);

    var expectedGtChild = new int[] { 3, -3, -4, -5 };
    Assert.Equal(4, trees[0].GtChild.Count);
    Assert.Equal(expectedGtChild, trees[0].GtChild);

    var expectedLteChild = new int[] { 1, 2, -1, -2 };
    Assert.Equal(4, trees[0].LteChild.Count);
    Assert.Equal(expectedLteChild, trees[0].LteChild);

    var expectedCategoricalSplitFlags = new bool[] { true, true, true, true };
    Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
    Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

    var expectedNumericalSplitFeatureIndexes = new int[] { 5312, 2, 2126, 533 };
    Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
    Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

    var expectedNumericalSplitThresholds = new float[] { 0.5f, 0.5f, 0.5f, 0.5f };
    Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
    for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
        Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);

    var actualCategoricalRanges0 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0);
    Assert.Equal(actualCategoricalRanges0, new int[] { 5312, 5782 });

    var actualCategoricalRanges1 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(1);
    Assert.Equal(actualCategoricalRanges1, new int[] { 2, 417 });

    var actualCategoricalRanges2 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(2);
    Assert.Equal(actualCategoricalRanges2, new int[] { 2126, 2593 });

    var actualCategoricalRanges3 = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(3);
    Assert.Equal(actualCategoricalRanges3, new int[] { 533, 983 });

    int[] expectedCounts = { 62, 52, 54, 22 };
    int[] expectedStarts = { 5315, 10, 2141, 533 };
    int[] expectedEnds = { 5782, 401, 2558, 874 };
    for (int i = 0; i < trees[0].NumNodes; ++i)
    {
        // Retrieve the i-th node's split features.
        var actualCategoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(i);
        Assert.Equal(expectedCounts[i], actualCategoricalSplitFeatures.Count);
        Assert.Equal(expectedStarts[i], actualCategoricalSplitFeatures[0]);
        Assert.Equal(expectedEnds[i], actualCategoricalSplitFeatures[expectedCounts[i] - 1]);
    }

    Assert.Equal(5, trees[0].NumLeaves);

    var expectedLeafValues = new double[] { 48.456055413607892, 86.584156799316418,
        87.017326642027, 76.381184971185391, 117.68872643673058 };
    Assert.Equal(5, trees[0].LeafValues.Count);
    for (int i = 0; i < trees[0].LeafValues.Count; ++i)
        Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
}
// This example shows all the ways to load data with TextLoader.
public static void Example()
{
    // Create 5 data files to illustrate different loading methods.
    var dataFiles = new List<string>();
    var random = new Random(1);
    var dataDirectoryName = "DataDir";
    Directory.CreateDirectory(dataDirectoryName);
    for (int i = 0; i < 5; i++)
    {
        var fileName = Path.Combine(dataDirectoryName, $"Data_{i}.csv");
        dataFiles.Add(fileName);
        using (var fs = File.CreateText(fileName))
        {
            // Write 10 rows of 10 random columns, without a header, forcing
            // approximately 80% of the values to be 0.
            for (int line = 0; line < 10; line++)
            {
                var sb = new StringBuilder();
                for (int pos = 0; pos < 10; pos++)
                {
                    var value = random.NextDouble();
                    sb.Append((value < 0.8 ? 0 : value).ToString() + '\t');
                }
                fs.WriteLine(sb.ToString(0, sb.Length - 1));
            }
        }
    }

    // Create a TextLoader.
    var mlContext = new MLContext();
    var loader = mlContext.Data.CreateTextLoader(
        columns: new[] { new TextLoader.Column("Features", DataKind.Single, 0, 9) },
        hasHeader: false
    );

    // Load a single file from path.
    var singleFileData = loader.Load(dataFiles[0]);
    PrintRowCount(singleFileData);
    // Expected Output:
    //   10

    // Load all 5 files from path.
    var multipleFilesData = loader.Load(dataFiles.ToArray());
    PrintRowCount(multipleFilesData);
    // Expected Output:
    //   50

    // Load all files using path wildcard.
    var multipleFilesWildcardData = loader.Load(Path.Combine(dataDirectoryName, "Data_*.csv"));
    PrintRowCount(multipleFilesWildcardData);
    // Expected Output:
    //   50

    // Create a TextLoader with a user-defined type.
    var loaderWithCustomType = mlContext.Data.CreateTextLoader<Data>(hasHeader: false);

    // Load a single file from path.
    var singleFileCustomTypeData = loaderWithCustomType.Load(dataFiles[0]);
    PrintRowCount(singleFileCustomTypeData);
    // Expected Output:
    //   10

    // Create a TextLoader with unknown column length to illustrate
    // how a data sample may be used to infer column size.
    var dataSample = new MultiFileSource(dataFiles[0]);
    var loaderWithUnknownLength = mlContext.Data.CreateTextLoader(
        columns: new[]
        {
            new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(0, null) })
        },
        dataSample: dataSample
    );

    var dataWithInferredLength = loaderWithUnknownLength.Load(dataFiles[0]);
    var featuresColumn = dataWithInferredLength.Schema.GetColumnOrNull("Features");
    if (featuresColumn.HasValue)
        Console.WriteLine(featuresColumn.Value.ToString());
    // Expected Output:
    //   Features: Vector<Single, 10>
    //
    // ML.NET infers the correct length of 10 for the Features column,
    // which is of type Vector<Single>.

    PrintRowCount(dataWithInferredLength);
    // Expected Output:
    //   10

    // Save the data with 10 rows to a text file to illustrate the use of sparse format.
    var sparseDataFileName = Path.Combine(dataDirectoryName, "saved_data.tsv");
    using (FileStream stream = new FileStream(sparseDataFileName, FileMode.Create))
        mlContext.Data.SaveAsText(singleFileData, stream);

    // Since there are many zeroes in the data, it will be saved in a sparse
    // representation to save disk space. The data may be forced to be saved
    // in a dense representation by setting forceDense to true. The sparse
    // data will look like the following:
    //
    //   10 7:0.943862259
    //   10 3:0.989767134
    //   10 0:0.949778438 8:0.823028445 9:0.886469543
    //
    // The sparse representation of the first row indicates that there are
    // 10 columns, that column 7 (the 8th column) has value 0.943862259, and
    // that the omitted columns have value 0.

    // Create a TextLoader that allows sparse input.
    var sparseLoader = mlContext.Data.CreateTextLoader(
        columns: new[] { new TextLoader.Column("Features", DataKind.Single, 0, 9) },
        allowSparse: true
    );

    // Load the saved sparse data.
    var sparseData = sparseLoader.Load(sparseDataFileName);
    PrintRowCount(sparseData);
    // Expected Output:
    //   10

    // Create a TextLoader without any column schema using TextLoader.Options.
    // Since the sparse data file was saved with ML.NET, it has the schema
    // encoded in its header that the loader can understand:
    //
    //   #@ TextLoader{
    //   #@   sep=tab
    //   #@   col=Features:R4:0-9
    //   #@ }
    //
    // The schema syntax is unimportant since it is only used internally. In
    // short, it tells the loader that the values are separated by tabs, and
    // that columns 0-9 in the text file are to be read into one column named
    // "Features" of type Single (internal type R4).
    var options = new TextLoader.Options()
    {
        AllowSparse = true,
    };
    var dataSampleWithSchema = new MultiFileSource(sparseDataFileName);
    var sparseLoaderWithSchema = mlContext.Data.CreateTextLoader(options, dataSample: dataSampleWithSchema);

    // Load the saved sparse data.
    var sparseDataWithSchema = sparseLoaderWithSchema.Load(sparseDataFileName);
    PrintRowCount(sparseDataWithSchema);
    // Expected Output:
    //   10
}
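// A small hedged sketch of the forceDense flag mentioned above. SaveAsText writes sparse rows
// by default whenever that saves space; forceDense: true writes every value explicitly. The
// path and data are supplied by the caller.
public static void SaveDenseSketch(MLContext mlContext, IDataView data, string path)
{
    using (var stream = new FileStream(path, FileMode.Create))
        mlContext.Data.SaveAsText(data, stream, forceDense: true);
}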
public void MultiFileSourceUnitTest()
{
    var fileSource = new MultiFileSource("adult.txt");
    Assert.True(fileSource.Count == 1);

    fileSource = new MultiFileSource("adult.tiny.with-schema.txt", "adult.tiny.with-schema.txt");
    Assert.True(fileSource.Count == 2, $"Error passing multiple paths to {nameof(MultiFileSource)}");

    // Creating a directory with two files for the tests.
    var dirName = Directory.CreateDirectory("MultiFileSourceUnitTest").FullName;
    var file1 = Path.Combine(dirName, "a.txt");
    var file2 = Path.Combine(dirName, "b.txt");
    File.WriteAllText(file1, "Unit Test");
    File.WriteAllText(file2, "Unit Test");

    fileSource = new MultiFileSource($"{file1}+{file2}");
    Assert.True(fileSource.Count == 2, $"Error passing concatenated paths to {nameof(MultiFileSource)}");

    fileSource = new MultiFileSource(Path.Combine(dirName, "..."));
    Assert.True(fileSource.Count == 2, $"Error passing recursive path to {nameof(MultiFileSource)}");

    /* Create test directories and files in the following specifications:
     * /MultiFileSourceUnitTest/Data
     * /MultiFileSourceUnitTest/Data/a.txt
     * /MultiFileSourceUnitTest/Data/b.txt
     * /MultiFileSourceUnitTest/DataFolder/
     * /MultiFileSourceUnitTest/DataFolder/SubFolder1
     * /MultiFileSourceUnitTest/DataFolder/SubFolder1/a.txt
     * /MultiFileSourceUnitTest/DataFolder/SubFolder2
     * /MultiFileSourceUnitTest/DataFolder/SubFolder2/b.txt
     */
    var dataDir = Directory.CreateDirectory("MultiFileSourceUnitTest/Data").FullName;
    var fileDataA = Path.Combine(dataDir, "a.txt");
    var fileDataB = Path.Combine(dataDir, "b.txt");
    File.WriteAllText(fileDataA, "Unit Test");
    File.WriteAllText(fileDataB, "Unit Test");

    var dataFolderDir = Directory.CreateDirectory("MultiFileSourceUnitTest/DataFolder").FullName;
    var subFolder1Dir = Directory.CreateDirectory("MultiFileSourceUnitTest/DataFolder/SubFolder1").FullName;
    var subFolder2Dir = Directory.CreateDirectory("MultiFileSourceUnitTest/DataFolder/SubFolder2").FullName;
    var fileDataSA = Path.Combine(subFolder1Dir, "a.txt");
    var fileDataSB = Path.Combine(subFolder2Dir, "b.txt");
    File.WriteAllText(fileDataSA, "Unit Test");
    File.WriteAllText(fileDataSB, "Unit Test");

    fileSource = new MultiFileSource(dataDir + "/*");
    Assert.True(fileSource.Count == 2, $"Error passing wildcard path to {nameof(MultiFileSource)}");

    fileSource = new MultiFileSource(dataFolderDir + "/.../*");
    Assert.True(fileSource.Count == 2, $"Error passing recursive wildcard path to {nameof(MultiFileSource)}");

    // Delete test folder and files for test clean-up.
    Directory.Delete(dirName, true);
}
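// A compact sketch of the path patterns MultiFileSource understands, as exercised by the test
// above; the "data" directory here is hypothetical.
public static void MultiFileSourcePatternsSketch()
{
    var explicitPaths = new MultiFileSource("data/a.txt", "data/b.txt"); // multiple explicit paths
    var concatenated = new MultiFileSource("data/a.txt+data/b.txt");     // '+'-concatenated paths
    var wildcard = new MultiFileSource("data/*");                        // wildcard within a folder
    var recursive = new MultiFileSource("data/.../*");                   // '...' recurses into subfolders
    Console.WriteLine(recursive.Count);
}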
void TestConcat()
{
    string dataPath = GetDataPath("adult.tiny.with-schema.txt");
    var source = new MultiFileSource(dataPath);
    var loader = new TextLoader(ML, new TextLoader.Options
    {
        Columns = new[]
        {
            new TextLoader.Column("float1", DataKind.Single, 9),
            new TextLoader.Column("float4", DataKind.Single, new[]
            {
                new TextLoader.Range(9),
                new TextLoader.Range(10),
                new TextLoader.Range(11),
                new TextLoader.Range(12)
            }),
            new TextLoader.Column("float6", DataKind.Single, new[]
            {
                new TextLoader.Range(9),
                new TextLoader.Range(10),
                new TextLoader.Range(11),
                new TextLoader.Range(12, 14)
            }),
            new TextLoader.Column("vfloat", DataKind.Single, new[]
            {
                new TextLoader.Range(14, null) { AutoEnd = false, VariableEnd = true }
            })
        },
        Separator = "\t",
        HasHeader = true
    }, new MultiFileSource(dataPath));
    var data = loader.Load(source);

    DataViewType GetType(DataViewSchema schema, string name)
    {
        Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
        return schema[cIdx].Type;
    }

    var pipe = ML.Transforms.Concatenate("f1", "float1")
        .Append(ML.Transforms.Concatenate("f2", "float1", "float1"))
        .Append(ML.Transforms.Concatenate("f3", "float4", "float1"))
        .Append(ML.Transforms.Concatenate("f4", "float6", "vfloat", "float1"));

    data = ML.Data.TakeRows(data, 10);
    data = pipe.Fit(data).Transform(data);

    DataViewType t;
    t = GetType(data.Schema, "f1");
    Assert.True(t is VectorDataViewType vt1 && vt1.ItemType == NumberDataViewType.Single && vt1.Size == 1);
    t = GetType(data.Schema, "f2");
    Assert.True(t is VectorDataViewType vt2 && vt2.ItemType == NumberDataViewType.Single && vt2.Size == 2);
    t = GetType(data.Schema, "f3");
    Assert.True(t is VectorDataViewType vt3 && vt3.ItemType == NumberDataViewType.Single && vt3.Size == 5);
    t = GetType(data.Schema, "f4");
    Assert.True(t is VectorDataViewType vt4 && vt4.ItemType == NumberDataViewType.Single && vt4.Size == 0);

    data = ML.Transforms.SelectColumns("f1", "f2", "f3", "f4").Fit(data).Transform(data);

    var subdir = Path.Combine("Transform", "Concat");
    var outputPath = GetOutputPath(subdir, "Concat1.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(ML, new TextSaver.Arguments { Silent = true, Dense = true });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
    }

    CheckEquality(subdir, "Concat1.tsv");
    Done();
}
public void LoadModelAndExtractPredictor()
{
    var mlContext = new MLContext(1);
    var file = new MultiFileSource(TestCommon.GetDataPath(DataDir, TestDatasets.adult.trainFilename));
    var loader = mlContext.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var data = loader.Load(file);

    // Pipeline.
    var pipeline = mlContext.BinaryClassification.Trainers.Gam();
    // Define the same pipeline starting with the loader.
    var pipeline1 = loader.Append(mlContext.BinaryClassification.Trainers.Gam());

    // Train.
    var transformerModel = pipeline.Fit(data);
    var compositeLoaderModel = pipeline1.Fit(file);

    // Save and reload the "same" model with some differences in structure.
    // In this case we are saving the transformer model, but *not* the loader, just the schema from that loader.
    string modelAndSchemaPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-model-schema.zip");
    mlContext.Model.Save(transformerModel, data.Schema, modelAndSchemaPath);

    // In this case we have combined the loader with the transformer model to form a "composite"
    // loader, and are just saving that one loader to this file.
    string compositeLoaderModelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-composite-model.zip");
    mlContext.Model.Save(null, compositeLoaderModel, compositeLoaderModelPath);

    // In this case we are saving the transformer model, as well as the associated data loader.
    string loaderAndTransformerModelPath = TestCommon.GetOutputPath(OutDir, FullTestName + "-loader-transformer.zip");
    mlContext.Model.Save(transformerModel, loader, loaderAndTransformerModelPath);

    ITransformer loadedTransformerModel;
    IDataLoader<IMultiStreamSource> loadedCompositeLoader;
    ITransformer loadedTransformerModel1;
    using (var fs = File.OpenRead(modelAndSchemaPath))
        loadedTransformerModel = mlContext.Model.Load(fs, out var loadedSchema);
    using (var fs = File.OpenRead(compositeLoaderModelPath))
    {
        // This model can be loaded either as a composite data loader,
        // a transformer model + an input schema, or a transformer model + a data loader.
        var t = mlContext.Model.LoadWithDataLoader(fs, out loadedCompositeLoader);
        // This is a bit strange, as it seems to test that it can reload from the same
        // stream twice opened only once, which as far as I know is not really a requirement
        // of the design or API, but we are nonetheless testing it. If this winds up failing,
        // I'm not sure we should really insist on this as a design requirement.
        var t1 = mlContext.Model.Load(fs, out var s);
        TestCommon.CheckSameSchemas(loadedCompositeLoader.GetOutputSchema(), s);
        // We combined the GAM with the loader, so the remaining chain should just be empty.
        Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(t));
        Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(t1));
    }
    using (var fs = File.OpenRead(loaderAndTransformerModelPath))
    {
        // This model can be loaded either as a composite data loader,
        // a transformer model + an input schema, or a transformer model + a data loader.
        var t = mlContext.Model.Load(fs, out var s);
        TestCommon.CheckSameSchemas(loader.GetOutputSchema(), s);
        loadedTransformerModel1 = mlContext.Model.LoadWithDataLoader(fs, out var l);
    }
public GenerateSweepCandidatesCommand(IHostEnvironment env, Arguments args)
{
    Contracts.CheckValue(env, nameof(env));
    _host = env.Register("GenerateCandidates");
    _host.CheckValue(args, nameof(args));

    var files = new MultiFileSource(args.DataFile);
    _host.CheckUserArg(files.Count > 0, nameof(args.DataFile), "dataFile is required");
    _dataFile = args.DataFile;

    _rspsOutFolder = Utils.CreateFolderIfNotExists(args.RspOutFolder);
    _host.CheckUserArg(_rspsOutFolder != null, nameof(args.RspOutFolder),
        "Provide a valid rspOutFolder (or 'out', the short name).");

    if (!string.IsNullOrWhiteSpace(args.SchemaDefinitionFile))
    {
        Utils.CheckOptionalUserDirectory(args.SchemaDefinitionFile, nameof(args.SchemaDefinitionFile));
        _schemaDefinitionFile = args.SchemaDefinitionFile;
    }

    if (!string.IsNullOrWhiteSpace(args.Sweeper))
    {
        var info = ComponentCatalog.GetLoadableClassInfo<SignatureSweeper>(args.Sweeper);
        _host.CheckUserArg(info?.SignatureTypes[0] == typeof(SignatureSweeper), nameof(args.Sweeper),
            "Please specify a valid sweeper.");
        _sweeper = args.Sweeper;
    }
    else
        _sweeper = "kdo";

    if (!string.IsNullOrWhiteSpace(args.Mode))
    {
        var info = ComponentCatalog.GetLoadableClassInfo<SignatureCommand>(args.Mode);
        _host.CheckUserArg(info?.Type == typeof(TrainCommand) || info?.Type == typeof(TrainTestCommand) ||
            info?.Type == typeof(CrossValidationCommand), nameof(args.Mode), "Invalid mode.");
        _mode = args.Mode;
    }
    else
        _mode = CrossValidationCommand.LoadName;

    _indented = args.Indent;

    if (!string.IsNullOrWhiteSpace(args.TestFile))
    {
        files = new MultiFileSource(args.TestFile);
        _host.CheckUserArg(files.Count > 0, nameof(args.TestFile),
            "testFile needs to be a valid file, if provided.");
        _testFile = args.TestFile;
    }
    else
        _host.CheckUserArg(_mode != TrainTestCommand.LoadName, nameof(args.TestFile),
            "testFile needs to be a valid file, for mode = TrainTest.");

    _outputDataFolder = Utils.CreateFolderIfNotExists(args.OutputDataFolder);
    if (_outputDataFolder == null)
        _outputDataFolder = _rspsOutFolder;
}
public void LoadModelAndExtractPredictor()
{
    var file = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
    var loader = ML.Data.CreateTextLoader<InputData>(hasHeader: true, dataSample: file);
    var data = loader.Load(file);

    // Pipeline.
    var pipeline = ML.BinaryClassification.Trainers.Gam();
    // Define the same pipeline starting with the loader.
    var pipeline1 = loader.Append(ML.BinaryClassification.Trainers.Gam());

    // Train.
    var transformerModel = pipeline.Fit(data);
    var compositeLoaderModel = pipeline1.Fit(file);

    // Save and reload the "same" model with some differences in structure.
    // In this case we are saving the transformer model, but *not* the loader, just the schema from that loader.
    string modelAndSchemaPath = GetOutputPath(FullTestName + "-model-schema.zip");
    ML.Model.Save(transformerModel, data.Schema, modelAndSchemaPath);

    // In this case we have combined the loader with the transformer model to form a "composite"
    // loader, and are just saving that one loader to this file.
    string compositeLoaderModelPath = GetOutputPath(FullTestName + "-composite-model.zip");
    ML.Model.Save(null, compositeLoaderModel, compositeLoaderModelPath);

    // In this case we are saving the transformer model, as well as the associated data loader.
    string loaderAndTransformerModelPath = GetOutputPath(FullTestName + "-loader-transformer.zip");
    ML.Model.Save(transformerModel, loader, loaderAndTransformerModelPath);

    ITransformer loadedTransformerModel;
    IDataLoader<IMultiStreamSource> loadedCompositeLoader;
    ITransformer loadedTransformerModel1;
    using (var fs = File.OpenRead(modelAndSchemaPath))
        loadedTransformerModel = ML.Model.Load(fs, out var loadedSchema);
    using (var fs = File.OpenRead(compositeLoaderModelPath))
    {
        // This model can be loaded either as a composite data loader,
        // a transformer model + an input schema, or a transformer model + a data loader.
        var t = ML.Model.LoadWithDataLoader(fs, out loadedCompositeLoader);
        // This is a bit strange, as it seems to test that it can reload from the same
        // stream twice opened only once, which as far as I know is not really a requirement
        // of the design or API, but we are nonetheless testing it. If this winds up failing,
        // I'm not sure we should really insist on this as a design requirement.
        var t1 = ML.Model.Load(fs, out var s);
        CheckSameSchemas(loadedCompositeLoader.GetOutputSchema(), s);
        // We combined the GAM with the loader, so the remaining chain should just be empty.
        Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(t));
        Assert.Empty(Assert.IsType<TransformerChain<ITransformer>>(t1));
    }
    using (var fs = File.OpenRead(loaderAndTransformerModelPath))
    {
        // This model can be loaded either as a composite data loader,
        // a transformer model + an input schema, or a transformer model + a data loader.
        var t = ML.Model.Load(fs, out var s);
        CheckSameSchemas(loader.GetOutputSchema(), s);
        loadedTransformerModel1 = ML.Model.LoadWithDataLoader(fs, out var l);
    }

    void AssertIsGam(ITransformer trans)
    {
        Assert.IsType<GamBinaryModelParameters>(
            Assert.IsAssignableFrom<CalibratedModelParametersBase>(
                Assert.IsAssignableFrom<ISingleFeaturePredictionTransformer<object>>(trans).Model).SubModel);
    }

    // In the case of the directly used transformer model, the thing we loaded should itself be
    // the result of fitting GAM.
    AssertIsGam(loadedTransformerModel);

    // This case is quite similar, except that we omitted saving the loader and instead saved
    // the input schema with the model itself.
    AssertIsGam(loadedTransformerModel1);

    // If we had combined the transformer with the loader, and then saved *that*, then the
    // resulting loaded "model" will be empty (as tested above), but the loader itself will be
    // a composite loader containing the result of fitting GAM as the sole item in its
    // transformer chain.
    var fromComposite = Assert.Single(Assert.IsType<TransformerChain<ITransformer>>(
        Assert.IsType<CompositeDataLoader<IMultiStreamSource, ITransformer>>(loadedCompositeLoader).Transformer));
    AssertIsGam(fromComposite);

    Done();
}
// This example shows how to load data with SvmLightLoader.
public static void Example()
{
    // Create a random SVM light format file.
    var random = new Random(42);
    var dataDirectoryName = "DataDir";
    Directory.CreateDirectory(dataDirectoryName);
    var fileName = Path.Combine(dataDirectoryName, "SVM_Data.csv");
    using (var fs = File.CreateText(fileName))
    {
        // Write random lines in SVM light format.
        for (int line = 0; line < 10; line++)
        {
            var sb = new StringBuilder();
            if (random.NextDouble() > 0.5)
                sb.Append("1 ");
            else
                sb.Append("-1 ");
            if (line % 2 == 0)
                sb.Append("cost:1");
            else
                sb.Append("cost:2");
            for (int i = 1; i <= 10; i++)
            {
                if (random.NextDouble() > 0.5)
                    continue;
                sb.Append($"{i}:{random.NextDouble()} ");
            }
            fs.WriteLine(sb.ToString());
        }
    }

    // Create an SvmLightLoader.
    var mlContext = new MLContext();
    var file = new MultiFileSource(fileName);
    var loader = mlContext.Data.CreateSvmLightLoader(dataSample: file);

    // Load a single file from path.
    var svmData = loader.Load(file);

    PrintSchema(svmData);
    // Expected Output:
    //   Column Label type Single
    //   Column Weight type Single
    //   Column GroupId type Key<UInt64, 0 - 18446744073709551613>
    //   Column Comment type String
    //   Column Features type Vector<Single, 10>

    PrintData(svmData);
    // Expected Output:
    //   1 1 0 0 0.2625927 0 0 0.7612506 0.2573214 0 0.3809696 0.5174511
    //   -1 1 0 0 0 0.7051522 0 0 0.7111546 0.9062127 0 0
    //   -1 1 0 0 0 0.535722 0 0 0.1491191 0.05100901 0 0
    //   -1 1 0 0.6481459 0.04449836 0 0 0.4203662 0 0 0.01325378 0.2674384
    //   -1 1 0 0 0.7978093 0.5134962 0.008952909 0 0.003074009 0.6541431 0.9135142 0
    //   -1 1 0 0.3727672 0.4369507 0 0 0.2973725 0 0 0 0.8816807
    //   1 1 0 0.1031429 0.3332489 0 0.1346936 0.5916625 0 0 0 0
    //   1 1 0 0 0 0.3454075 0 0.2197472 0.03848049 0.5923384 0.09373277 0
    //   -1 1 0 0.7511514 0 0.0420841 0 0 0.9262196 0 0.545344 0
    //   1 1 0 0.02958358 0.9334617 0 0 0.8833956 0.2947684 0 0 0

    // If the loader is created without a data sample, we need to specify
    // the number of features expected in the file.
    loader = mlContext.Data.CreateSvmLightLoader(inputSize: 10);
    svmData = loader.Load(file);

    PrintSchema(svmData);
    PrintData(svmData);
}
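// A hedged sketch of writing data back out in SVM-light format. This assumes a
// SaveInSvmLightFormat method on the data operations catalog (the save-side counterpart of
// CreateSvmLightLoader); if your ML.NET version does not expose it, treat this as illustrative only.
public static void SaveSvmLightSketch(MLContext mlContext, IDataView data, string path)
{
    using (var stream = new FileStream(path, FileMode.Create))
        mlContext.Data.SaveInSvmLightFormat(data, stream);
}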
void TestConcat()
{
    string dataPath = GetDataPath("adult.test");
    var source = new MultiFileSource(dataPath);
    var loader = new TextLoader(Env, new TextLoader.Arguments
    {
        Column = new[]
        {
            new TextLoader.Column("float1", DataKind.R4, 0),
            new TextLoader.Column("float4", DataKind.R4, new[]
            {
                new TextLoader.Range(0),
                new TextLoader.Range(2),
                new TextLoader.Range(4),
                new TextLoader.Range(10)
            }),
            new TextLoader.Column("vfloat", DataKind.R4, new[]
            {
                new TextLoader.Range(0),
                new TextLoader.Range(2),
                new TextLoader.Range(4),
                new TextLoader.Range(10, null) { AutoEnd = false, VariableEnd = true }
            })
        },
        Separator = ",",
        HasHeader = true
    }, new MultiFileSource(dataPath));
    var data = loader.Read(source);

    ColumnType GetType(ISchema schema, string name)
    {
        Assert.True(schema.TryGetColumnIndex(name, out int cIdx), $"Could not find '{name}'");
        return schema.GetColumnType(cIdx);
    }

    var pipe = new ConcatEstimator(Env, "f1", "float1")
        .Append(new ConcatEstimator(Env, "f2", "float1", "float1"))
        .Append(new ConcatEstimator(Env, "f3", "float4", "float1"))
        .Append(new ConcatEstimator(Env, "f4", "vfloat", "float1"));

    data = TakeFilter.Create(Env, data, 10);
    data = pipe.Fit(data).Transform(data);

    ColumnType t;
    t = GetType(data.Schema, "f1");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 1);
    t = GetType(data.Schema, "f2");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 2);
    t = GetType(data.Schema, "f3");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 5);
    t = GetType(data.Schema, "f4");
    Assert.True(t.IsVector && t.ItemType == NumberType.R4 && t.VectorSize == 0);

    data = new ChooseColumnsTransform(Env, data, "f1", "f2", "f3", "f4");

    var subdir = Path.Combine("Transform", "Concat");
    var outputPath = GetOutputPath(subdir, "Concat1.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true, Dense = true });
        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, data, fs, keepHidden: false);
    }

    CheckEquality(subdir, "Concat1.tsv");
    Done();
}
static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, out IDataView sourceCtx)
{
    sourceCtx = input;
    env.CheckValue(args.tag, "Tag cannot be empty.");
    if (TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.tag).Any())
        throw env.Except("Tag '{0}' is already used.", args.tag);
    env.CheckValue(args.selectTag, "Selected tag cannot be empty.");

    if (string.IsNullOrEmpty(args.filename))
    {
        var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag);
        if (!selected.Any())
            throw env.Except("Unable to find a view to select with tag '{0}'. Did you forget to specify a filename?", args.selectTag);
        var first = selected.First();
        if (selected.Skip(1).Any())
            throw env.Except("Tag '{0}' is ambiguous, {1} views were found.", args.selectTag, selected.Count());

        var tagged = input as ITaggedDataView;
        if (tagged == null)
        {
            var ag = new TagViewTransform.Arguments { tag = args.tag };
            tagged = new TagViewTransform(env, ag, input);
        }
        first.Item2.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.tag, tagged) });
        tagged.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.selectTag, first.Item2) });

#if (DEBUG_TIP)
        long count = DataViewUtils.ComputeRowCount(tagged);
        if (count == 0)
            throw env.Except("Replaced view is empty.");
        count = DataViewUtils.ComputeRowCount(first.Item2);
        if (count == 0)
            throw env.Except("Selected view is empty.");
#endif
        var tr = first.Item2 as IDataTransform;
        env.AssertValue(tr);
        return tr;
    }
    else
    {
        if (!File.Exists(args.filename))
            throw env.Except("Unable to find file '{0}'.", args.filename);
        var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag);
        if (selected.Any())
            throw env.Except("Tag '{0}' was already given. It cannot be assigned to the new file.", args.selectTag);

        var loaderArgs = new BinaryLoader.Arguments();
        var file = new MultiFileSource(args.filename);
        var loadSettings = ScikitSubComponent<ILegacyDataLoader, SignatureDataLoader>.AsSubComponent(args.loaderSettings);
        IDataView loader = loadSettings.CreateInstance(env, file);

        var ag = new TagViewTransform.Arguments { tag = args.selectTag };
        var newInput = new TagViewTransform(env, ag, loader);
        var tagged = input as ITaggedDataView;
        if (tagged == null)
        {
            ag = new TagViewTransform.Arguments { tag = args.tag };
            tagged = new TagViewTransform(env, ag, input);
        }

        newInput.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.tag, tagged) });
        tagged.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.selectTag, newInput) });

        var schema = loader.Schema;
        if (schema.Count == 0)
            throw env.Except("The loaded view '{0}' is empty (empty schema).", args.filename);
        return newInput;
    }
}
public void FastTreeRegressionRepresentation()
{
    var env = new MLContext(seed: 0);
    var dataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
    var dataSource = new MultiFileSource(dataPath);
    var catalog = new RegressionCatalog(env);

    var reader = TextLoaderStatic.CreateLoader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);

    var opts = new FastTreeRegressionTrainer.Options()
    {
        NumTrees = 10,
        NumLeaves = 5,
        NumThreads = 1
    };

    FastTreeRegressionModelParameters pred = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, score: catalog.Trainers.FastTree(r.label, r.features, null, opts,
            onFit: (p) => { pred = p; })));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);

    var treeCollection = pred.TrainedTreeEnsemble;
    Assert.Equal(0, treeCollection.Bias);
    Assert.Equal(10, treeCollection.Trees.Count);
    Assert.Equal(10, treeCollection.TreeWeights.Count);

    var trees = treeCollection.Trees;
    Assert.Equal(4, trees[0].NumNodes);

    // Numerical split. There is no categorical split, so the following vector contains zero elements.
    var categoricalSplitFeatures = trees[0].GetCategoricalSplitFeaturesAt(0);
    Assert.Equal(0, categoricalSplitFeatures.Count);

    // Numerical split. There is no categorical split, so the following vector contains zero elements.
    var categoricalSplitFeatureRange = trees[0].GetCategoricalCategoricalSplitFeatureRangeAt(0);
    Assert.Equal(0, categoricalSplitFeatureRange.Count);

    var expectedGtChild = new int[] { 3, 2, -4, -5 };
    Assert.Equal(4, trees[0].GtChild.Count);
    Assert.Equal(expectedGtChild, trees[0].GtChild);

    var expectedLteChild = new int[] { 1, -1, -3, -2 };
    Assert.Equal(4, trees[0].LteChild.Count);
    Assert.Equal(expectedLteChild, trees[0].LteChild);

    var expectedCategoricalSplitFlags = new bool[] { false, false, false, false };
    Assert.Equal(4, trees[0].CategoricalSplitFlags.Count);
    Assert.Equal(expectedCategoricalSplitFlags, trees[0].CategoricalSplitFlags);

    var expectedNumericalSplitFeatureIndexes = new int[] { 0, 10, 2, 10 };
    Assert.Equal(4, trees[0].NumericalSplitFeatureIndexes.Count);
    Assert.Equal(expectedNumericalSplitFeatureIndexes, trees[0].NumericalSplitFeatureIndexes);

    var expectedNumericalSplitThresholds = new float[] { 0.14f, -0.645f, -0.095f, 0.31f };
    Assert.Equal(4, trees[0].NumericalSplitThresholds.Count);
    for (int i = 0; i < trees[0].NumericalSplitThresholds.Count; ++i)
        Assert.Equal(expectedNumericalSplitThresholds[i], trees[0].NumericalSplitThresholds[i], 6);

    Assert.Equal(5, trees[0].NumLeaves);

    var expectedLeafValues = new double[] { 40.159015006449692, 80.434805844435061,
        57.072130551545513, 82.898710076162757, 104.17547955322266 };
    Assert.Equal(5, trees[0].LeafValues.Count);
    for (int i = 0; i < trees[0].LeafValues.Count; ++i)
        Assert.Equal(expectedLeafValues[i], trees[0].LeafValues[i], 6);
}