public void TestEP_PassThroughTransform()
{
    // The saved-data path is keyed on the current test method name.
    var testName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var irisPath = FileHelper.GetTestFile("iris.txt");
    var savedDataPath = FileHelper.GetOutputFile("data.idv", testName);

    var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
    var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);

    // Pipeline: concat -> scaler -> pass-through (persists the intermediate
    // view to disk) -> SDCA regressor.
    var pipeline = new GenericLearningPipeline(conc: 1);
    pipeline.Add(loader);
    pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
    pipeline.Add(new Scikit.ML.EntryPoints.Scaler("Features"));
    pipeline.Add(new Scikit.ML.EntryPoints.PassThrough() { Filename = savedDataPath, SaveOnDisk = true });
    pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor());

    var model = pipeline.Train();
    var scored = model.Predict(frame);
    var result = DataFrameIO.ReadView(scored);

    Assert.AreEqual(result.Shape, new Tuple<int, int>(150, 8));
    // The pass-through transform must have materialized the file.
    Assert.IsTrue(File.Exists(savedDataPath));
}
/// <summary>
/// Reads a file as a <see cref="DataFrame"/>.
/// Follows pandas API.
/// </summary>
public static DataFrame ReadCsv(string filename, char sep = ',', bool header = true, string[] names = null, int[] dtypes = null, int nrows = -1, int guess_rows = 10, string encoding = null, bool index = false)
{
    // Translate the integer dtype codes into column types, resolve the
    // optional encoding name, then delegate to the ColumnType-based overload.
    var columnKinds = IntToColumnTypes(dtypes);
    var resolvedEncoding = encoding == null ? null : Encoding.GetEncoding(encoding);
    return DataFrameIO.ReadCsv(filename, sep, header, names, columnKinds, nrows, guess_rows, resolvedEncoding, index: index);
}
public void Train()
{
    using (var env = new ConsoleEnvironment())
    {
        var frame = DataFrameIO.ReadCsv(_dataset, sep: ',', dtypes: new ColumnType[] { NumberType.R4 });
        // Concatenate the ten feature columns, then fit the "ftr{iter=10}" learner.
        var concatTransform = "Concat{col=Features:F0,F1,F2,F3,F4,F5,F6,F7,F8,F9}";
        var trained = new ScikitPipeline(new[] { concatTransform }, "ftr{iter=10}");
        trained.Train(frame, "Features", "Label");
        _pipeline = trained;
    }
}
public void TestPipelineIris()
{
    var irisPath = FileHelper.GetTestFile("iris_data_id.txt");
    var frame = DataFrameIO.ReadCsv(irisPath, sep: ',', dtypes: new[] { NumberType.R4, NumberType.R4, NumberType.R4 });

    // Train through the Python-helper pipeline against a store environment
    // so the log output can be inspected afterwards.
    var storeEnv = PyEnvHelper.CreateStoreEnvironment();
    var pipeline = PyPipelineHelper.CreateScikitPipeline(new string[] { "Concat{col=Features:Slength,Swidth}" }, "mlr", storeEnv.Item1);
    pipeline.Train(frame, "Features", "Label");

    var logged = storeEnv.Item2.ToString();
    Assert.IsNotNull(logged);
}
public void TestScikitAPI_TrainingDiabete()
{
    var dataPath = FileHelper.GetTestFile("diabete.csv");

    // Ten float feature columns named F0..F9.
    var featureTypes = Enumerable.Range(0, 10).Select(c => NumberType.R4).ToArray();
    var featureNames = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));

    var frame = DataFrameIO.ReadCsv(dataPath, sep: ',', dtypes: featureTypes);
    var pipeline = new ScikitPipeline(new string[] { $"Concat{{col=Features:{featureNames}}}" }, "ols");
    pipeline.Train(frame, "Features", "Label");

    DataFrame scored = null;
    pipeline.Predict(frame, ref scored);
    Assert.AreEqual(scored.Shape, new ShapeType(83, 13));
}
public void TestScikitAPI_TrainingWithIris()
{
    var irisPath = FileHelper.GetTestFile("iris.txt");
    var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t');

    // The trainer needs a numeric label, so cast "Label" into a new column.
    frame.AddColumn("LabelI", frame["Label"].AsType(NumberType.R4));

    var pipeline = new ScikitPipeline(new[] { $"Concat{{col=Features:{frame.Columns[1]},{frame.Columns[2]}}}" }, "mlr");
    pipeline.Train(frame, "Features", "LabelI");

    DataFrame scored = null;
    pipeline.Predict(frame, ref scored);
    Assert.AreEqual(scored.Shape, new ShapeType(150, 9));
}
public void TestEP_NearestNeighborsLPMc()
{
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var irisPath = FileHelper.GetTestFile("iris.txt");
        var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
        var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);

        // Pipeline: concat two sepal columns -> multi-class nearest neighbors.
        var pipeline = new GenericLearningPipeline(conc: 1);
        pipeline.Add(loader);
        pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
        pipeline.Add(new Scikit.ML.EntryPoints.NearestNeighborsMultiClass());

        var model = pipeline.Train();
        var scored = model.Predict(frame);
        var result = DataFrameIO.ReadView(scored);
        Assert.AreEqual(result.Shape, new Tuple<int, int>(150, 11));
    }
}
public void TestPipelineDiabete()
{
    var diab = FileHelper.GetTestFile("diabete.csv");
    // Ten float feature columns named F0..F9.
    var cols = Enumerable.Range(0, 10).Select(c => NumberType.R4).ToArray();
    var colsName = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
    var df = DataFrameIO.ReadCsv(diab, sep: ',', dtypes: cols);

    // Train through the Python-helper pipeline against a store environment
    // so the log output can be inspected afterwards.
    var env3 = PyEnvHelper.CreateStoreEnvironment();
    var pipe = PyPipelineHelper.CreateScikitPipeline(new string[] { $"Concat{{col=Features:{colsName}}}" }, "ols", env3.Item1);
    pipe.Train(df, "Features", "Label");
    var sout = env3.Item2.ToString();
    Assert.IsNotNull(sout);

    DataFrame pred = PyPipelineHelper.FastPredictOrTransform(pipe, df);
    // BUGFIX: the original asserted on the input frame (df.Shape[0]) instead
    // of the prediction output, leaving `pred` completely unchecked.
    Assert.IsTrue(pred.Shape[0] > 0);
}
public void TestEP_ScalerTransform()
{
    var irisPath = FileHelper.GetTestFile("iris.txt");
    var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
    var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);

    // Pipeline: concat -> scaler -> SDCA regressor.
    var pipeline = new GenericLearningPipeline(conc: 1);
    pipeline.Add(loader);
    pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
    pipeline.Add(new Scikit.ML.EntryPoints.Scaler("Features"));
    pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor());

    var model = pipeline.Train();
    var scored = model.Predict(frame);
    var result = DataFrameIO.ReadView(scored);
    Assert.AreEqual(result.Shape, new Tuple<int, int>(150, 8));
}
public void TestScikitAPI_MKL_TrainingDiabete()
{
    // The MKL-backed OLS trainer may be unavailable on Unix (missing native
    // library); in that case the test is skipped rather than failed.
    try
    {
        var diab = FileHelper.GetTestFile("diabete.csv");
        // Ten float feature columns named F0..F9.
        var cols = Enumerable.Range(0, 10).Select(c => NumberDataViewType.Single).ToArray();
        var colsName = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
        var df = DataFrameIO.ReadCsv(diab, sep: ',', dtypes: cols);
        var pipe = new ScikitPipeline(new string[] { $"Concat{{col=Features:{colsName}}}" }, "ols");
        pipe.Train(df, "Features", "Label");
        DataFrame pred = null;
        pipe.Predict(df, ref pred);
        Assert.AreEqual(pred.Shape, new ShapeType(83, 13));
    }
    catch (DllNotFoundException e)
    {
        var os = Environment.OSVersion;
        if (os.Platform == PlatformID.Unix)
        {
            Console.WriteLine("FAIL(1): TestScikitAPI_MKL due to {0}", e.ToString());
            return;
        }
        else
        {
            Console.WriteLine("FAIL(1): TestScikitAPI_MKL, OS={0}", os.ToString());
            // BUGFIX: use bare `throw;` instead of `throw e;` so the original
            // stack trace is preserved.
            throw;
        }
    }
    catch (NotSupportedException e)
    {
        var os = Environment.OSVersion;
        if (os.Platform == PlatformID.Unix)
        {
            Console.WriteLine("FAIL(2): TestScikitAPI_MKL due to {0}", e.ToString());
            return;
        }
        else
        {
            Console.WriteLine("FAIL(2): TestScikitAPI_MKL, OS={0}", os.ToString());
            // BUGFIX: use bare `throw;` instead of `throw e;` so the original
            // stack trace is preserved.
            throw;
        }
    }
}
public void TestTreePathInnerAPI()
{
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var testName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var irisPath = FileHelper.GetTestFile("iris.txt");
        var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });

        // Pipeline: concat -> tree-path featurization on a fast-tree learner.
        using (var pipeline = new ScikitPipeline(new[] { "Concat{col=Feature:Sepal_length,Sepal_width}", "TreeFeat{tr=ft{iter=2} lab=Label feat=Feature}" }))
        {
            pipeline.Train(frame);
            var scorer = pipeline.Predict(frame);
            var result = DataFrameIO.ReadView(scorer);
            Assert.AreEqual(result.Shape, new Tuple<int, int>(150, 31));

            // Round-trip the scored view to CSV and check the file landed.
            var csvPath = FileHelper.GetOutputFile("iris_path.txt", testName);
            result.ToCsv(csvPath);
            Assert.IsTrue(File.Exists(csvPath));
        }
    }
}
public void TestOnnx_TrainingWithIris()
{
    var testName = System.Reflection.MethodBase.GetCurrentMethod().Name;

    // Train a small logistic-regression pipeline directly on iris.
    var irisPath = FileHelper.GetTestFile("iris.txt");
    var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t');
    frame.AddColumn("LabelI", frame["Label"].AsType(NumberDataViewType.Single));
    var pipeline = new ScikitPipeline(new[] { $"Concat{{col=Features:{frame.Columns[1]},{frame.Columns[2]}}}" }, "mlr");
    pipeline.Train(frame, "Features", "LabelI");
    DataFrame scored = null;
    pipeline.Predict(frame, ref scored);

    // Export the full pipeline to ONNX.
    var onnxPath = FileHelper.GetOutputFile("model.onnx", testName);
    var onnxModel = pipeline.ToOnnx();
    onnxModel.Save(onnxPath);
    Assert.IsTrue(File.Exists(onnxPath));

    // Export again, skipping the concat stage.
    onnxPath = FileHelper.GetOutputFile("model_vector.onnx", testName);
    onnxModel = pipeline.ToOnnx(1);
    onnxModel.Save(onnxPath);
    Assert.IsTrue(File.Exists(onnxPath));

    // Onnx Load Not implemented yet.
    /*
     * var restored = new ScikitPipeline(output);
     * DataFrame pred2 = null;
     * restored.Predict(df, ref pred2);
     * pred.AssertAlmostEqual(pred2);
     */
}
public void TestTreePathNewAPI()
{
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var testName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var irisPath = FileHelper.GetTestFile("iris.txt");
        var frame = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
        var loader = frame.EPTextLoader(irisPath, sep: '\t', header: true);

        // Pipeline: concat -> SDCA regressor capped at two iterations.
        var pipeline = new GenericLearningPipeline();
        pipeline.Add(loader);
        pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
        pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor() { MaxIterations = 2 });

        var model = pipeline.Train();
        var scored = model.Predict(frame);
        var result = DataFrameIO.ReadView(scored);
        Assert.AreEqual(result.Shape, new Tuple<int, int>(150, 8));

        // Round-trip the scored view to CSV and check the file landed.
        var csvPath = FileHelper.GetOutputFile("iris_path.txt", testName);
        result.ToCsv(csvPath);
        Assert.IsTrue(File.Exists(csvPath));
    }
}