/// <summary>
/// Round-trips a "poly{col=X}" pipeline through serialization: trains on <c>inputs</c>,
/// transforms <c>inputs2</c>, checks the transformed frame (2 rows x 9 columns) against an
/// exact CSV string, saves the model with <c>pipe.Save(output, removeFirstTransform)</c>,
/// then reloads the model in a fresh environment and verifies the reloaded pipeline
/// produces byte-identical output on the same data.
/// </summary>
/// <param name="removeFirstTransform">Forwarded to <c>ScikitPipeline.Save</c>; presumably
/// drops the first transform from the saved model — TODO confirm against Save's contract.</param>
private void ScikitAPI_SimpleTransform_Load(bool removeFirstTransform) { var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name; var output = FileHelper.GetOutputFile($"model{removeFirstTransform}.zip", methodName); var inputs = new[] { new ExampleA() { X = new float[] { 1, 10, 100 } }, new ExampleA() { X = new float[] { 2, 3, 5 } } }; var inputs2 = new[] { new ExampleA() { X = new float[] { -1, -10, -100 } }, new ExampleA() { X = new float[] { -2, -3, -5 } } }; string expected = null; using (var host = EnvHelper.NewTestEnvironment(conc: 1)) { var data = host.CreateStreamingDataView(inputs); using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, host: host)) { var predictor = pipe.Train(data); Assert.IsTrue(predictor != null); var data2 = host.CreateStreamingDataView(inputs2); var predictions = pipe.Transform(data2); var df = DataFrameIO.ReadView(predictions); Assert.AreEqual(df.Shape, new Tuple <int, int>(2, 9)); var dfs = df.ToString(); var dfs2 = dfs.Replace("\n", ";"); expected = dfs2; Assert.AreEqual(dfs2, "X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8;-1,-10,-100,1,10,100,100,1000,10000;-2,-3,-5,4,6,10,9,15,25"); pipe.Save(output, removeFirstTransform); } } using (var host = EnvHelper.NewTestEnvironment(conc: 1)) { var data2 = host.CreateStreamingDataView(inputs2); using (var pipe2 = new ScikitPipeline(output, host)) { var predictions = pipe2.Transform(data2); var df = DataFrameIO.ReadView(predictions); Assert.AreEqual(df.Shape, new Tuple <int, int>(2, 9)); var dfs = df.ToString(); var dfs2 = dfs.Replace("\n", ";"); Assert.AreEqual(expected, dfs2); } } }
/// <summary>
/// Writes a small hand-made binary-classification dataset to disk, trains a logistic
/// regression on it through the MAML "Train" command line, and checks the command
/// produced console output (captured through a DelegateEnvironment's log writers).
/// </summary>
public void TestBcLrSameModel()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var output = FileHelper.GetOutputFile("bc-lr.zip", methodName);
    var name = FileHelper.GetOutputFile("bc.txt", methodName);
    var df = DataFrameIO.ReadStr("Label,X1,X2,X3,X4,X5,X6,X7,X8,X9\n" + "0,0.1,1.1,2.1,3.1,4.1,5.1,6.2,7.4,-5\n" + "1,1.1,1.1,2.1,3.1,4.1,5.1,6.2,7.4,-5\n" + "0,2.1,1.1,3.1,3.1,-4.1,5.1,6.2,7.4,-5\n" + "1,3.1,1.1,4.1,3.1,4.1,-5.1,6.2,7.4,-5\n" + "0,4.1,1.1,2.1,3.1,4.1,5.1,6.2,-7.4,-5");
    df.ToCsv(name);
    var cmd = string.Format("Train tr=lr data={0} out={1} loader=text{{col=Label:R4:0 col=Features:R4:1-* sep=, header=+}}", name, output);
    // Both stdout and stderr of the MAML script are folded into the same buffer.
    var stdout = new StringBuilder();
    ILogWriter logout = new LogWriter((string s) => { stdout.Append(s); });
    ILogWriter logerr = new LogWriter((string s) => { stdout.Append(s); });
    /*using (*/ var env = new DelegateEnvironment(seed: 0, verbose: 2, outWriter: logout, errWriter: logerr);
    MamlHelper.MamlScript(cmd, false, env);
    var stout = stdout.ToString();
    if (string.IsNullOrEmpty(stout))
    {
        // BUG FIX: the original threw `new Exception(stout)` where stout is known to be
        // empty here, producing an exception with no message. Report what actually failed.
        throw new Exception($"MAML command produced no output: {cmd}");
    }
}
/// <summary>
/// Lazily materializes <c>_input</c> into <c>_autoView</c>. The whole fill runs while
/// holding <c>_lock</c>; the <c>_autoView is null</c> test makes the method a no-op once
/// the cache is built, so concurrent callers fill it at most once.
/// When a sort column is configured (<c>_sortColumn &gt;= 0</c>), a single cursor reads
/// every value of that column, pairs each with its row position, sorts the pairs with
/// <c>CompareTo</c>, and reorders <c>_autoView</c> by the sorted positions.
/// </summary>
void FillCacheIfNotFilled() { lock (_lock) { if (!(_autoView is null)) { return; } _autoView = DataFrameIO.ReadView(_input, keepVectors: true, numThreads: _numThreads); if (_sortColumn >= 0) { var sortedPosition = new List <KeyValuePair <TValue, long> >(); long position = 0; TValue got = default(TValue); // We could use multithreading here but the cost of sorting // might be higher than going through an array in memory. using (var cursor = _autoView.GetRowCursor(_autoView.Schema.Where(c => c.Index == _sortColumn))) { var sortColumnGetter = cursor.GetGetter <TValue>( SchemaHelper._dc(_sortColumn, cursor)); while (cursor.MoveNext()) { sortColumnGetter(ref got); sortedPosition.Add(new KeyValuePair <TValue, long>(got, position)); ++position; } } sortedPosition.Sort(CompareTo); _autoView.Order(sortedPosition.Select(c => (int)c.Value).ToArray()); } } }
/// <summary>
/// Trains an "iova" (one-versus-all wrapping lr) multiclass predictor on the sparse
/// 28x28 digits sample, then runs the serialization round-trip test
/// (<c>TestTrainerHelper.FinalizeSerializationTest</c>) against the test file.
/// </summary>
/// <param name="singleColumn">Chooses the "sc=+/-" flag of the iova trainer (single
/// column vs vector output); also part of the output folder name.</param>
/// <param name="checkError">Forwarded to FinalizeSerializationTest; presumably enables
/// the error-rate check with ratio 0.1 — TODO confirm.</param>
static void TrainMultiToBinaryPredictorSparse(bool singleColumn, bool checkError) { var methodName = string.Format("{0}-{1}-V{2}", System.Reflection.MethodBase.GetCurrentMethod().Name, "lr", singleColumn ? "C" : "Vec"); var trainFile = FileHelper.GetTestFile("Train-28x28_small.txt"); var testFile = FileHelper.GetTestFile("Test-28x28_small.txt"); var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName); var outData = FileHelper.GetOutputFile("outData1.txt", methodName); var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName); /*using (*/ var env = EnvHelper.NewTestEnvironment(conc: 1); { var loader = env.CreateLoader("Text{col=Label:R4:0 col=Features:R4:1-784}", new MultiFileSource(trainFile)); var roles = env.CreateExamples(loader, "Features", "Label"); var df = DataFrameIO.ReadView(roles.Data); Assert.IsTrue(df.Shape[0] > 0); var iova = string.Format("iova{{p=lr sc={0} nt=1}}", singleColumn ? "+" : "-"); loader = env.CreateLoader("Text{col=Label:R4:0 col=Features:R4:1-784}", new MultiFileSource(testFile)); var trainer = env.CreateTrainer(iova); using (var ch = env.Start("train")) { var predictor = trainer.Train(env, ch, roles); TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, predictor, roles, outData, outData2, PredictionKind.MulticlassClassification, checkError, ratio: 0.1f); } } }
/// <summary>
/// Builds an entry-point pipeline on iris (concat + Scaler + PassThrough + SDCA
/// regressor), where the PassThrough step is configured to dump the intermediate
/// view to disk (<c>SaveOnDisk = true</c>). Checks the predictions have shape
/// (150, 8) and that the dump file was actually written.
/// </summary>
public void TestEP_PassThroughTransform() { var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name; var iris = FileHelper.GetTestFile("iris.txt"); var outPass = FileHelper.GetOutputFile("data.idv", methodName); var df = DataFrameIO.ReadCsv(iris, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 }); var importData = df.EPTextLoader(iris, sep: '\t', header: true); var learningPipeline = new GenericLearningPipeline(conc: 1); learningPipeline.Add(importData); learningPipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width")); learningPipeline.Add(new Scikit.ML.EntryPoints.Scaler("Features")); learningPipeline.Add(new Scikit.ML.EntryPoints.PassThrough() { Filename = outPass, SaveOnDisk = true }); learningPipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor()); var predictor = learningPipeline.Train(); var predictions = predictor.Predict(df); var dfout = DataFrameIO.ReadView(predictions); Assert.AreEqual(dfout.Shape, new Tuple <int, int>(150, 8)); Assert.IsTrue(File.Exists(outPass)); }
/// <summary>
/// Trains a TagTrainScore transform over iris with a custom scorer
/// (<c>MultiClassClassifierScorer{ex=AA}</c>) and checks that the scorer's "AA"
/// suffix shows up in the output column names (PredictedLabelAA, ScoreAA.*) and
/// that the prediction frame has shape (150, 11).
/// </summary>
public void TestTagTrainOrScoreTransformCustomScorer() { var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name; var dataFilePath = FileHelper.GetTestFile("mc_iris.txt"); var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName); var outData = FileHelper.GetOutputFile("outData1.txt", methodName); using (var env = EnvHelper.NewTestEnvironment()) { var loader = env.CreateLoader("Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=-}", new MultiFileSource(dataFilePath)); using (var pipe = new ScikitPipeline(new[] { "Concat{col=Feature:Slength,Swidth}", "TagTrainScore{tr=iova{p=ft{nl=10 iter=1}} lab=Label feat=Feature tag=model scorer=MultiClassClassifierScorer{ex=AA}}" }, host: env)) { pipe.Train(loader); var pred = pipe.Predict(loader); var df = DataFrameIO.ReadView(pred); Assert.AreEqual(df.Shape, new Tuple <int, int>(150, 11)); var dfs = df.Head().ToString(); Assert.IsTrue(dfs.StartsWith("Label,Slength,Swidth,Plength,Pwidth,Feature.0,Feature.1,PredictedLabelAA,ScoreAA.0,ScoreAA.1,ScoreAA.2")); } } }
/// <summary>
/// Reads a string as a IDataView.
/// Follows pandas API.
/// </summary>
public static DataFrame ReadStr(string content, char sep = ',', bool header = true, string[] names = null, int[] dtypes = null, int nrows = -1, int guess_rows = 10, bool index = false)
{
    // Translate the integer dtype codes into column types, then delegate.
    var columnKinds = IntToColumnTypes(dtypes);
    return DataFrameIO.ReadStr(content, sep, header, names, columnKinds, nrows, guess_rows, index);
}
/// <summary>
/// Trains a poly + km{k=2} clustering pipeline on four in-memory rows, predicts on a
/// second set wrapped in a StreamingDataFrame, and checks the prediction frame has
/// shape (4, 12) and starts with the expected header and first polynomial row.
/// The environment is intentionally not disposed (see the commented-out using).
/// </summary>
public void TestScikitAPI_SimplePredictor() { var inputs = new[] { new ExampleA() { X = new float[] { 1, 10, 100 } }, new ExampleA() { X = new float[] { 2, 3, 5 } }, new ExampleA() { X = new float[] { 2, 4, 5 } }, new ExampleA() { X = new float[] { 2, 4, 7 } }, }; var inputs2 = new[] { new ExampleA() { X = new float[] { -1, -10, -100 } }, new ExampleA() { X = new float[] { -2, -3, -5 } }, new ExampleA() { X = new float[] { 3, 4, 5 } }, new ExampleA() { X = new float[] { 3, 4, 7 } }, }; /*using (*/ var host = EnvHelper.NewTestEnvironment(conc: 1); { var data = DataViewConstructionUtils.CreateFromEnumerable(host, inputs); using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host)) { var predictor = pipe.Train(data, feature: "X"); Assert.IsTrue(predictor != null); var data2 = new StreamingDataFrame(DataViewConstructionUtils.CreateFromEnumerable(host, inputs2)); var predictions = pipe.Predict(data2); var df = DataFrameIO.ReadView(predictions); Assert.AreEqual(df.Shape, new Tuple <int, int>(4, 12)); var dfs = df.ToString(); var dfs2 = dfs.Replace("\n", ";"); Assert.IsTrue(dfs2.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000")); } } }
/// <summary>
/// Reads a file as a <see cref="DataFrame"/>.
/// Follows pandas API.
/// </summary>
public static DataFrame ReadCsv(string filename, char sep = ',', bool header = true, string[] names = null, int[] dtypes = null, int nrows = -1, int guess_rows = 10, string encoding = null, bool index = false)
{
    // Translate the integer dtype codes into column types.
    var columnKinds = IntToColumnTypes(dtypes);
    // A null encoding name means "let the reader pick its default".
    Encoding resolvedEncoding = null;
    if (encoding != null)
        resolvedEncoding = Encoding.GetEncoding(encoding);
    return DataFrameIO.ReadCsv(filename, sep, header, names, columnKinds, nrows, guess_rows, resolvedEncoding, index: index);
}
/// <summary>
/// Loads the dataset as ten R4 columns, concatenates them into a "Features"
/// column and trains an ftr{iter=10} pipeline, storing it in _pipeline.
/// </summary>
public void Train()
{
    using (var env = new ConsoleEnvironment())
    {
        var frame = DataFrameIO.ReadCsv(_dataset, sep: ',', dtypes: new ColumnType[] { NumberType.R4 });
        var pipeline = new ScikitPipeline(new[] { "Concat{col=Features:F0,F1,F2,F3,F4,F5,F6,F7,F8,F9}" }, "ftr{iter=10}");
        pipeline.Train(frame, "Features", "Label");
        _pipeline = pipeline;
    }
}
/// <summary>
/// Trains a concat + "mlr" pipeline on iris through PyPipelineHelper and checks
/// the store environment captured some log output.
/// </summary>
public void TestPipelineIris()
{
    var dataPath = FileHelper.GetTestFile("iris_data_id.txt");
    var frame = DataFrameIO.ReadCsv(dataPath, sep: ',', dtypes: new[] { NumberType.R4, NumberType.R4, NumberType.R4 });
    var storeEnv = PyEnvHelper.CreateStoreEnvironment();
    var transforms = new string[] { "Concat{col=Features:Slength,Swidth}" };
    var pipeline = PyPipelineHelper.CreateScikitPipeline(transforms, "mlr", storeEnv.Item1);
    pipeline.Train(frame, "Features", "Label");
    var captured = storeEnv.Item2.ToString();
    Assert.IsNotNull(captured);
}
/// <summary>
/// Trains a concat + "mlr" pipeline on iris (Label cast to R4 as "LabelI") and
/// checks the prediction frame has shape (150, 9).
/// </summary>
public void TestScikitAPI_TrainingWithIris()
{
    var irisPath = FileHelper.GetTestFile("iris.txt");
    var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t');
    frame.AddColumn("LabelI", frame["Label"].AsType(NumberType.R4));
    var concat = $"Concat{{col=Features:{frame.Columns[1]},{frame.Columns[2]}}}";
    var pipeline = new ScikitPipeline(new[] { concat }, "mlr");
    pipeline.Train(frame, "Features", "LabelI");
    DataFrame predictions = null;
    pipeline.Predict(frame, ref predictions);
    Assert.AreEqual(predictions.Shape, new ShapeType(150, 9));
}
/// <summary>
/// Trains a concat + "ols" pipeline on the diabete dataset (ten R4 feature
/// columns F0..F9) and checks the prediction frame has shape (83, 13).
/// </summary>
public void TestScikitAPI_TrainingDiabete()
{
    var dataPath = FileHelper.GetTestFile("diabete.csv");
    var featureTypes = Enumerable.Range(0, 10).Select(c => NumberType.R4).ToArray();
    var featureNames = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
    var frame = DataFrameIO.ReadCsv(dataPath, sep: ',', dtypes: featureTypes);
    var pipeline = new ScikitPipeline(new string[] { $"Concat{{col=Features:{featureNames}}}" }, "ols");
    pipeline.Train(frame, "Features", "Label");
    DataFrame predictions = null;
    pipeline.Predict(frame, ref predictions);
    Assert.AreEqual(predictions.Shape, new ShapeType(83, 13));
}
/// <summary>
/// Measures prediction timings for one engine / thread-count / cache configuration
/// and returns them as a DataFrame keyed by (N, engine, threads, call).
/// </summary>
public static DataFrame TestScikitAPI_EngineSimpleTrainAndPredict(string engine, int th, int N, int ncall, bool cacheScikit)
{
    var timings = new Dictionary <Tuple <int, string, int, int>, double>();
    var scorer = _TrainSentiment();
    var trscorer = _TrainSentiment2();
    foreach (var measure in _MeasureTime(th, engine, scorer, trscorer, N, ncall, cacheScikit))
    {
        var key = new Tuple <int, string, int, int>(measure.Item1, engine, th, measure.Item3);
        timings[key] = measure.Item2.TotalSeconds;
    }
    return DataFrameIO.Convert(timings, "N", "engine", "number of threads", "call", "time(s)");
}
/// <summary>
/// Benchmarks prediction across engines ("mlnet" vs "scikit"), caching strategies
/// ("extcache"/"viewcache" x "array"/"stream") and 1..3 threads, collecting timings
/// into a dictionary (expected 48 entries) that is written to a csv file. For each
/// configuration it also keeps the first call's prediction vector per engine and
/// computes the mean absolute difference between the two engines; note the
/// comparison assertion (`abs &lt;= 2`) is deliberately commented out.
/// </summary>
public void TestScikitAPI_EngineSimpleTrainAndPredict() { var dico = new Dictionary <Tuple <int, string, string, int, int>, double>(); var scorer = _TrainSentiment(); var trscorer = _TrainSentiment2Transformer(); foreach (var cache in new[] { false, true }) { for (int th = 1; th <= 3; ++th) { var memo = new Dictionary <string, float[]>(); foreach (var engine in new[] { "mlnet", "scikit" }) { foreach (var kind in new[] { "array", "stream" }) { var strat_ = new[] { cache ? "extcache" : "viewcache", kind, }; var strat = string.Join("+", strat_); foreach (var res in _MeasureTime(th, strat, engine, scorer, trscorer, 2)) { dico[new Tuple <int, string, string, int, int>(res.Item1, engine, strat, th, res.Item3)] = res.Item2.TotalSeconds; if (res.Item3 == 1) { memo[engine] = res.Item4; } } } } var p1 = memo["mlnet"]; var p2 = memo["scikit"]; Assert.AreEqual(p1.Length, p2.Length); var abs = 0.0; for (int ii = 0; ii < p1.Length; ++ii) { abs += Math.Abs(p1[ii] - p2[ii]); } abs /= p1.Length; // Assert.IsTrue(abs <= 2); } } var df = DataFrameIO.Convert(dico, "N", "engine", "strategy", "number of threads", "call", "time(s)"); var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name; var filename = FileHelper.GetOutputFile("benchmark_ValueMapperPredictionEngineMultiThread.txt", methodName); df.ToCsv(filename); Assert.AreEqual(dico.Count, 48); }
/// <summary>
/// Applies Poly{col=X} to a tiny two-column frame and compares the result to the
/// hand-computed polynomial expansion using DataFrame.AlmostEquals.
/// </summary>
public void TestI_PolynomialTransformNumericValues()
{
    using (var host = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var source = DataFrameIO.ReadStr("A,B\n1.0,2.0\n2.0,3.0\n10.0,11.0");
        source.SetShuffle(false);
        var concatenated = host.CreateTransform("concat{col=X:A,B}", source);
        var poly = host.CreateTransform("Poly{col=X}", concatenated);
        var actual = DataFrameIO.ReadView(poly);
        Assert.IsFalse(string.IsNullOrEmpty(actual.ToString()));
        var expected = DataFrameIO.ReadStr("A,B,X.0,X.1,X.2,X.3,X.4\n1.0,2.0,1.0,2.0,1.0,2.0,4.0\n2.0,3.0,2.0,3.0,4.0,6.0,9.0\n10.0,11.0,10.0,11.0,100.0,110.0,121.0");
        Assert.AreEqual(0, expected.AlmostEquals(actual, exc: true, printDf: true));
    }
}
/// <summary>
/// Applies Scaler{col=X scale=minMax} to a tiny two-column frame, estimates it,
/// and compares the scaled output to hand-computed min-max values.
/// </summary>
public void TestI_ScalerTransformNumericValuesMinMax()
{
    using (var host = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var source = DataFrameIO.ReadStr("A,B\n1.0,2.0\n2.0,3.0\n10.0,11.0");
        source.SetShuffle(false);
        var concatenated = host.CreateTransform("concat{col=X:A,B}", source);
        var scaled = host.CreateTransform("Scaler{col=X scale=minMax}", concatenated);
        (scaled as ITrainableTransform).Estimate();
        var actual = DataFrameIO.ReadView(scaled);
        var txt = actual.ToString(); // kept from the original (unused, but materializes the view's text)
        var expected = DataFrameIO.ReadStr("A,B,X.0,X.1\n1.0,2.0,0.0,0.0\n2.0,3.0,0.11111111,0.11111111\n10.0,11.0,1.0,1.0");
        Assert.AreEqual(0, expected.AlmostEquals(actual, exc: true, printDf: true));
    }
}
/// <summary>
/// Trains a concat + "ols" pipeline on diabete through PyPipelineHelper, checks
/// the store environment captured output, and runs FastPredictOrTransform.
/// </summary>
public void TestPipelineDiabete()
{
    var dataPath = FileHelper.GetTestFile("diabete.csv");
    var featureTypes = Enumerable.Range(0, 10).Select(c => NumberType.R4).ToArray();
    var featureNames = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
    var frame = DataFrameIO.ReadCsv(dataPath, sep: ',', dtypes: featureTypes);
    var storeEnv = PyEnvHelper.CreateStoreEnvironment();
    var pipeline = PyPipelineHelper.CreateScikitPipeline(new string[] { $"Concat{{col=Features:{featureNames}}}" }, "ols", storeEnv.Item1);
    pipeline.Train(frame, "Features", "Label");
    Assert.IsNotNull(storeEnv.Item2.ToString());
    DataFrame pred = PyPipelineHelper.FastPredictOrTransform(pipeline, frame);
    // NOTE(review): this asserts on the input frame, not on pred — possibly
    // intended pred.Shape[0]; original behavior preserved here.
    Assert.IsTrue(frame.Shape[0] > 0);
}
/// <summary>
/// Builds an entry-point pipeline on iris with a NearestNeighborsMultiClass
/// learner and checks the prediction frame has shape (150, 11).
/// </summary>
public void TestEP_NearestNeighborsLPMc()
{
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var irisPath = FileHelper.GetTestFile("iris.txt");
        var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
        var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);
        var pipeline = new GenericLearningPipeline(conc: 1);
        pipeline.Add(loader);
        pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
        pipeline.Add(new Scikit.ML.EntryPoints.NearestNeighborsMultiClass());
        var model = pipeline.Train();
        var scored = DataFrameIO.ReadView(model.Predict(frame));
        Assert.AreEqual(scored.Shape, new Tuple <int, int>(150, 11));
    }
}
/// <summary>
/// Loads a serialized bc-lr model and checks ValueMapperPredictionEngineFloat returns
/// identical predictions for every concurrency setting (conc in {2, 0, 1, 3}), against
/// a reference engine with conc=1. Also records per-configuration timings (8 entries)
/// and writes them to a csv file.
/// </summary>
public void TestValueMapperPredictionEngineMultiThread()
{
    var name = FileHelper.GetTestFile("bc-lr.zip");
    /*using (*/ var env = EnvHelper.NewTestEnvironment();
    using (var engine0 = new ValueMapperPredictionEngineFloat(env, name, conc: 1))
    {
        var feat = new float[] { 5, 1, 1, 1, 2, 1, 3, 1, 1 };
        // Reference predictions: vary only the first feature, check each is finite.
        var exp = new float[100];
        for (int i = 0; i < exp.Length; ++i)
        {
            feat[0] = i;
            exp[i] = engine0.Predict(feat);
            Assert.IsFalse(float.IsNaN(exp[i]));
            Assert.IsFalse(float.IsInfinity(exp[i]));
        }
        var dico = new Dictionary <Tuple <int, bool, int>, double>();
        foreach (var each in new[] { false, true })
        {
            foreach (int th in new int[] { 2, 0, 1, 3 })
            {
                // FIX: the engine is IDisposable and was never disposed in the
                // original code (only engine0 was); wrap each one in a using.
                using (var engine = new ValueMapperPredictionEngineFloat(env, name, conc: th))
                {
                    var sw = new Stopwatch();
                    sw.Start();
                    for (int i = 0; i < exp.Length; ++i)
                    {
                        feat[0] = i;
                        var res = engine.Predict(feat);
                        Assert.AreEqual(exp[i], res);
                    }
                    sw.Stop();
                    dico[new Tuple <int, bool, int>(exp.Length, each, th)] = sw.Elapsed.TotalSeconds;
                }
            }
        }
        Assert.AreEqual(dico.Count, 8);
        var df = DataFrameIO.Convert(dico, "N", "number of threads", "time(s)");
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var filename = FileHelper.GetOutputFile("benchmark_ValueMapperPredictionEngineMultiThread.txt", methodName);
        df.ToCsv(filename);
    }
}
/// <summary>
/// Applies the default (mean/variance) Scaler{col=X} to a tiny two-column frame,
/// estimates it, and compares the scaled output to hand-computed values
/// (sorted by column A before comparison).
/// </summary>
public void TestI_ScalerTransformNumericValuesMeanVar()
{
    // Environment intentionally left undisposed (see the commented-out using).
    /*using (*/ var host = EnvHelper.NewTestEnvironment(conc: 1);
    {
        var source = DataFrameIO.ReadStr("A,B\n1.0,2.0\n2.0,3.0\n10.0,11.0");
        source.SetShuffle(false);
        var concatenated = host.CreateTransform("concat{col=X:A,B}", source);
        var scaled = host.CreateTransform("Scaler{col=X}", concatenated);
        (scaled as ITrainableTransform).Estimate();
        var actual = DataFrameIO.ReadView(scaled);
        Assert.IsNotNull(actual.ToString());
        var expected = DataFrameIO.ReadStr("A,B,X.0,X.1\n1.0,2.0,-0.827605963,-0.827605963\n2.0,3.0,-0.5793242,-0.5793242\n10.0,11.0,1.40693,1.40693");
        Assert.AreEqual(0, expected.AlmostEquals(actual, exc: true, printDf: true, sortBy: "A"));
    }
}
/// <summary>
/// Builds an entry-point pipeline on iris (concat + Scaler + SDCA regressor) and
/// checks the prediction frame has shape (150, 8).
/// </summary>
public void TestEP_ScalerTransform()
{
    var irisPath = FileHelper.GetTestFile("iris.txt");
    var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
    var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);
    var pipeline = new GenericLearningPipeline(conc: 1);
    pipeline.Add(loader);
    pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
    pipeline.Add(new Scikit.ML.EntryPoints.Scaler("Features"));
    pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor());
    var model = pipeline.Train();
    var scored = DataFrameIO.ReadView(model.Predict(frame));
    Assert.AreEqual(scored.Shape, new Tuple <int, int>(150, 8));
}
/// <summary>
/// Trains a concat + "ols" pipeline (MKL-backed) on the diabete dataset and checks
/// the prediction shape (83, 13). On Unix, a missing or unsupported native MKL
/// library is tolerated (logged and skipped); on other platforms the exception
/// is rethrown after logging.
/// </summary>
public void TestScikitAPI_MKL_TrainingDiabete()
{
    try
    {
        var diab = FileHelper.GetTestFile("diabete.csv");
        var cols = Enumerable.Range(0, 10).Select(c => NumberDataViewType.Single).ToArray();
        var colsName = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
        var df = DataFrameIO.ReadCsv(diab, sep: ',', dtypes: cols);
        var pipe = new ScikitPipeline(new string[] { $"Concat{{col=Features:{colsName}}}" }, "ols");
        pipe.Train(df, "Features", "Label");
        DataFrame pred = null;
        pipe.Predict(df, ref pred);
        Assert.AreEqual(pred.Shape, new ShapeType(83, 13));
    }
    catch (DllNotFoundException e)
    {
        var os = Environment.OSVersion;
        if (os.Platform == PlatformID.Unix)
        {
            Console.WriteLine("FAIL(1): TestScikitAPI_MKL due to {0}", e.ToString());
            return;
        }
        else
        {
            Console.WriteLine("FAIL(1): TestScikitAPI_MKL, OS={0}", os.ToString());
            // FIX: was `throw e;`, which resets the stack trace; `throw;` preserves it.
            throw;
        }
    }
    catch (NotSupportedException e)
    {
        var os = Environment.OSVersion;
        if (os.Platform == PlatformID.Unix)
        {
            Console.WriteLine("FAIL(2): TestScikitAPI_MKL due to {0}", e.ToString());
            return;
        }
        else
        {
            Console.WriteLine("FAIL(2): TestScikitAPI_MKL, OS={0}", os.ToString());
            // FIX: was `throw e;`, which resets the stack trace; `throw;` preserves it.
            throw;
        }
    }
}
/// <summary>
/// Trains a "poly{col=X}" transform on two in-memory rows and checks the
/// transformed values of a second dataset against an exact CSV string (2 x 9).
/// </summary>
public void TestScikitAPI_SimpleTransform()
{
    var trainRows = new[] { new ExampleA() { X = new float[] { 1, 10, 100 } }, new ExampleA() { X = new float[] { 2, 3, 5 } } };
    var testRows = new[] { new ExampleA() { X = new float[] { -1, -10, -100 } }, new ExampleA() { X = new float[] { -2, -3, -5 } } };
    // Environment intentionally left undisposed (see the commented-out using).
    /*using (*/ var host = EnvHelper.NewTestEnvironment(conc: 1);
    {
        var trainView = DataViewConstructionUtils.CreateFromEnumerable(host, trainRows);
        using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, host: host))
        {
            var trained = pipe.Train(trainView);
            Assert.IsTrue(trained != null);
            var testView = DataViewConstructionUtils.CreateFromEnumerable(host, testRows);
            var transformed = DataFrameIO.ReadView(pipe.Transform(testView));
            Assert.AreEqual(transformed.Shape, new Tuple <int, int>(2, 9));
            var flat = transformed.ToString().Replace("\n", ";");
            Assert.AreEqual(flat, "X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8;-1,-10,-100,1,10,100,100,1000,10000;-2,-3,-5,4,6,10,9,15,25");
        }
    }
}
/// <summary>
/// Trains a TreeFeat transform (tree-path features over an ft trainer) on iris,
/// checks the output shape (150, 31), and writes the frame to a csv file.
/// </summary>
public void TestTreePathInnerAPI()
{
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var irisPath = FileHelper.GetTestFile("iris.txt");
        var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
        var steps = new[] { "Concat{col=Feature:Sepal_length,Sepal_width}", "TreeFeat{tr=ft{iter=2} lab=Label feat=Feature}" };
        // NOTE(review): the pipeline does not receive `env` as its host — confirm intended.
        using (var pipe = new ScikitPipeline(steps))
        {
            pipe.Train(frame);
            var scored = DataFrameIO.ReadView(pipe.Predict(frame));
            Assert.AreEqual(scored.Shape, new Tuple <int, int>(150, 31));
            var outfile = FileHelper.GetOutputFile("iris_path.txt", methodName);
            scored.ToCsv(outfile);
            Assert.IsTrue(File.Exists(outfile));
        }
    }
}
/// <summary>
/// Trains a concat + "mlr" pipeline on iris, then exports it to ONNX twice:
/// once in full ("model.onnx") and once skipping the first transform
/// (<c>ToOnnx(1)</c>, "model_vector.onnx"), checking both files exist.
/// Reloading the ONNX model is not implemented yet — see the commented-out block.
/// </summary>
public void TestOnnx_TrainingWithIris() { var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name; // direct call var iris = FileHelper.GetTestFile("iris.txt"); var df = DataFrameIO.ReadCsv(iris, sep: '\t'); df.AddColumn("LabelI", df["Label"].AsType(NumberDataViewType.Single)); var pipe = new ScikitPipeline(new[] { $"Concat{{col=Features:{df.Columns[1]},{df.Columns[2]}}}" }, "mlr"); pipe.Train(df, "Features", "LabelI"); DataFrame pred = null; pipe.Predict(df, ref pred); // Onnx Save var output = FileHelper.GetOutputFile("model.onnx", methodName); var model = pipe.ToOnnx(); model.Save(output); Assert.IsTrue(File.Exists(output)); // Onnx save no concat. output = FileHelper.GetOutputFile("model_vector.onnx", methodName); model = pipe.ToOnnx(1); model.Save(output); Assert.IsTrue(File.Exists(output)); // Onnx Load Not implemented yet. /* * var restored = new ScikitPipeline(output); * DataFrame pred2 = null; * restored.Predict(df, ref pred2); * pred.AssertAlmostEqual(pred2); */ }
/// <summary>
/// Builds an entry-point pipeline on iris (concat + SDCA regressor with 2
/// iterations), checks the prediction shape (150, 8) and writes the frame to csv.
/// </summary>
public void TestTreePathNewAPI()
{
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var irisPath = FileHelper.GetTestFile("iris.txt");
        var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
        var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);
        var pipeline = new GenericLearningPipeline();
        pipeline.Add(loader);
        pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
        pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor() { MaxIterations = 2 });
        var model = pipeline.Train();
        var scored = DataFrameIO.ReadView(model.Predict(frame));
        Assert.AreEqual(scored.Shape, new Tuple <int, int>(150, 8));
        var outfile = FileHelper.GetOutputFile("iris_path.txt", methodName);
        scored.ToCsv(outfile);
        Assert.IsTrue(File.Exists(outfile));
    }
}
/// <summary>
/// Trains a poly + km{k=2} pipeline on a DataFrame (vectors kept) materialized in a
/// first environment, predicts twice on two identical copies of the test frame in a
/// second environment, checks the first prediction's shape/content prefix, and then
/// verifies both prediction runs are almost equal (fast value-mapper determinism).
/// NOTE(review): the shape assertion expects (4, 3) while the content check expects a
/// 12-column header — presumably Predict here returns only the appended columns;
/// confirm against ScikitPipeline.Predict(DataFrame, ref DataFrame).
/// </summary>
public void TestScikitAPI_SimplePredictor_FastValueMapper() { var inputs = new[] { new ExampleA() { X = new float[] { 1, 10, 100 } }, new ExampleA() { X = new float[] { 2, 3, 5 } }, new ExampleA() { X = new float[] { 2, 4, 5 } }, new ExampleA() { X = new float[] { 2, 4, 7 } }, }; var inputs2 = new[] { new ExampleA() { X = new float[] { -1, -10, -100 } }, new ExampleA() { X = new float[] { -2, -3, -5 } }, new ExampleA() { X = new float[] { 3, 4, 5 } }, new ExampleA() { X = new float[] { 3, 4, 7 } }, }; DataFrame df1, df2, df3; using (var host = EnvHelper.NewTestEnvironment(conc: 1)) { var data = host.CreateStreamingDataView(inputs); var data2 = host.CreateStreamingDataView(inputs2); df1 = DataFrameIO.ReadView(data, env: host, keepVectors: true); df2 = DataFrameIO.ReadView(data2, env: host, keepVectors: true); df3 = DataFrameIO.ReadView(data2, env: host, keepVectors: true); } using (var host = EnvHelper.NewTestEnvironment(conc: 1)) { using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host)) { DataFrame pred = null, pred2 = null; var predictor = pipe.Train(df1, feature: "X"); Assert.IsTrue(predictor != null); pipe.Predict(df2, ref pred); Assert.AreEqual(pred.Shape, new Tuple <int, int>(4, 3)); var dfs = pred.ToString(); var dfs2 = dfs.Replace("\n", ";"); if (!dfs2.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000")) { throw new Exception($"Wrong starts\n{dfs2}"); } pipe.Predict(df3, ref pred2); pred.AssertAlmostEqual(pred2); } } }
// Smoke test usable outside any test framework: trains a poly + km{k=2} pipeline on
// four in-memory rows, predicts on a second set, and throws plain Exceptions (instead
// of Assert) when the predictor is null, the shape is not (4, 12), or the output does
// not start with the expected header/first row.
/// <summary> /// Runs a simple test. /// </summary> public static void TestScikitAPI() { var inputs = new[] { new ExampleVector() { X = new float[] { 1, 10, 100 } }, new ExampleVector() { X = new float[] { 2, 3, 5 } }, new ExampleVector() { X = new float[] { 2, 4, 5 } }, new ExampleVector() { X = new float[] { 2, 4, 7 } }, }; var inputs2 = new[] { new ExampleVector() { X = new float[] { -1, -10, -100 } }, new ExampleVector() { X = new float[] { -2, -3, -5 } }, new ExampleVector() { X = new float[] { 3, 4, 5 } }, new ExampleVector() { X = new float[] { 3, 4, 7 } }, }; using (var host = new ConsoleEnvironment(conc: 1)) { ComponentHelper.AddStandardComponents(host); var data = host.CreateStreamingDataView(inputs); using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host)) { var predictor = pipe.Train(data, feature: "X"); if (predictor == null) { throw new Exception("Test failed: no predictor."); } var data2 = host.CreateStreamingDataView(inputs2); var predictions = pipe.Predict(data2); var df = DataFrameIO.ReadView(predictions); if (df.Shape.Item1 != 4 || df.Shape.Item2 != 12) { throw new Exception("Test failed: prediction failed."); } var dfs = df.ToString(); var dfs2 = dfs.Replace("\n", ";"); if (!dfs2.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000")) { throw new Exception("Test failed: prediction failed (header)."); } } } }
/// <summary>
/// Trains the pipeline with data coming from a <see cref="IDataView"/>.
/// Instantiates every configured transform in order (improving the error message when
/// a transform's assembly is not registered), then either trains the configured
/// predictor with the given column roles, or — when there is no predictor — builds an
/// empty StepPredictor and reads one row to verify the transforms produce output.
/// </summary>
/// <param name="data">Input view the transforms are chained onto.</param>
/// <param name="feature">Name of the feature column.</param>
/// <param name="label">Optional label column.</param>
/// <param name="weight">Optional weight column.</param>
/// <param name="groupId">Optional group id column (ranking).</param>
public ScikitPipeline Train(IDataView data, string feature = "Feature", string label = null, string weight = null, string groupId = null)
{
    IDataView trans = data;
    using (var ch = _env.Start("Create transforms"))
    {
        for (int i = 0; i < _transforms.Length; ++i)
        {
            try
            {
                trans = _env.CreateTransform(_transforms[i].transformSettings, trans);
            }
            catch (Exception e)
            {
                if (e.ToString().Contains("Unknown loadable class"))
                {
                    // Enrich the error with the list of registered components so the
                    // missing assembly is easy to spot.
                    var nn = _env.ComponentCatalog.GetAllClasses().Length;
                    var filt = _env.ComponentCatalog.GetAllClasses()
                               .Select(c => c.UserName)
                               .OrderBy(c => c)
                               .Where(c => c.Trim().Length > 2);
                    var regis = string.Join("\n", filt);
                    throw Contracts.Except(e, $"Unable to create transform '{_transforms[i].transformSettings}', assembly not registered among {nn}\n{regis}");
                }
                // FIX: was `throw e;`, which resets the stack trace; `throw;` preserves it.
                throw;
            }
            _transforms[i].transform = trans as IDataTransform;
        }
    }
    if (_predictor != null)
    {
        using (var ch = _env.Start("Create Predictor"))
        {
            _predictor.trainer = TrainerHelper.CreateTrainer(_env, _predictor.trainerSettings);
            // Build the role mapping; optional roles are added only when a column name is given.
            _roles = new List <KeyValuePair <RoleMappedSchema.ColumnRole, string> >();
            _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Feature, feature));
            if (!string.IsNullOrEmpty(label))
            {
                _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Label, label));
            }
            if (!string.IsNullOrEmpty(groupId))
            {
                _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Group, groupId));
            }
            if (!string.IsNullOrEmpty(weight))
            {
                _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Weight, weight));
            }
            var roleMap = new RoleMappedData(trans, label, feature, group: groupId, weight: weight);
            _predictor.predictor = _predictor.trainer.Train(_env, ch, roleMap);
            _predictor.roleMapData = roleMap;
        }
    }
    else
    {
        _predictor = new StepPredictor()
        {
            predictor = null,
            trainer = null,
            trainerSettings = null,
            roleMapData = new RoleMappedData(trans)
        };
        // We predict one to make sure everything works fine.
        using (var ch = _env.Start("Compute one prediction."))
        {
            var df = DataFrameIO.ReadView(trans, 1, keepVectors: true, env: _env);
            if (df.Length == 0)
            {
                throw _env.ExceptEmpty("Something went wrong. The pipeline does not produce any output.");
            }
        }
    }
    return(this);
}