Beispiel #1
0
        /// <summary>
        /// Trains a <c>poly{col=X}</c> pipeline, saves it to disk, reloads it in a second
        /// test environment and checks that the reloaded pipeline renders the same
        /// transformed data as the freshly trained one.
        /// </summary>
        /// <param name="removeFirstTransform">Passed to <c>ScikitPipeline.Save</c>; it is also
        /// part of the output file name so each variant writes its own model file.</param>
        private void ScikitAPI_SimpleTransform_Load(bool removeFirstTransform)
        {
            var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var output = FileHelper.GetOutputFile($"model{removeFirstTransform}.zip", methodName);

            var trainRows = new[]
            {
                new ExampleA { X = new float[] { 1, 10, 100 } },
                new ExampleA { X = new float[] { 2, 3, 5 } }
            };

            var scoreRows = new[]
            {
                new ExampleA { X = new float[] { -1, -10, -100 } },
                new ExampleA { X = new float[] { -2, -3, -5 } }
            };

            // Text produced by the trained pipeline; the reloaded pipeline must reproduce it.
            string expected = null;

            using (var host = EnvHelper.NewTestEnvironment(conc: 1))
            {
                var data = host.CreateStreamingDataView(trainRows);
                using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, host: host))
                {
                    var predictor = pipe.Train(data);
                    Assert.IsTrue(predictor != null);

                    var data2 = host.CreateStreamingDataView(scoreRows);
                    var predictions = pipe.Transform(data2);
                    var df = DataFrameIO.ReadView(predictions);
                    Assert.AreEqual(df.Shape, new Tuple<int, int>(2, 9));

                    // Rows are joined with ';' so the whole frame compares as a single line.
                    var dfs2 = df.ToString().Replace("\n", ";");

                    // Expected value comes first so a failure reports expected/actual correctly.
                    Assert.AreEqual("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8;-1,-10,-100,1,10,100,100,1000,10000;-2,-3,-5,4,6,10,9,15,25", dfs2);
                    expected = dfs2;
                    pipe.Save(output, removeFirstTransform);
                }
            }

            using (var host = EnvHelper.NewTestEnvironment(conc: 1))
            {
                var data2 = host.CreateStreamingDataView(scoreRows);
                using (var pipe2 = new ScikitPipeline(output, host))
                {
                    var predictions = pipe2.Transform(data2);
                    var df = DataFrameIO.ReadView(predictions);
                    Assert.AreEqual(df.Shape, new Tuple<int, int>(2, 9));

                    var dfs2 = df.ToString().Replace("\n", ";");
                    Assert.AreEqual(expected, dfs2);
                }
            }
        }
        /// <summary>
        /// Writes a tiny dataset to disk, runs a MAML <c>Train tr=lr</c> command on it and
        /// fails if the command produced no log output at all.
        /// </summary>
        public void TestBcLrSameModel()
        {
            var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var output = FileHelper.GetOutputFile("bc-lr.zip", methodName);
            var name = FileHelper.GetOutputFile("bc.txt", methodName);
            var df = DataFrameIO.ReadStr("Label,X1,X2,X3,X4,X5,X6,X7,X8,X9\n" +
                                         "0,0.1,1.1,2.1,3.1,4.1,5.1,6.2,7.4,-5\n" +
                                         "1,1.1,1.1,2.1,3.1,4.1,5.1,6.2,7.4,-5\n" +
                                         "0,2.1,1.1,3.1,3.1,-4.1,5.1,6.2,7.4,-5\n" +
                                         "1,3.1,1.1,4.1,3.1,4.1,-5.1,6.2,7.4,-5\n" +
                                         "0,4.1,1.1,2.1,3.1,4.1,5.1,6.2,-7.4,-5");

            df.ToCsv(name);
            var cmd = $"Train tr=lr data={name} out={output} loader=text{{col=Label:R4:0 col=Features:R4:1-* sep=, header=+}}";

            // Both the output and the error writer append to the same buffer.
            var stdout = new StringBuilder();
            ILogWriter logout = new LogWriter((string s) => { stdout.Append(s); });
            ILogWriter logerr = new LogWriter((string s) => { stdout.Append(s); });
            var env = new DelegateEnvironment(seed: 0, verbose: 2, outWriter: logout, errWriter: logerr);

            MamlHelper.MamlScript(cmd, false, env);
            var stout = stdout.ToString();

            if (string.IsNullOrEmpty(stout))
            {
                // The captured text is empty on this path, so it cannot serve as the message;
                // report the command that stayed silent instead.
                throw new Exception("MamlScript produced no output for command: " + cmd);
            }
        }
Beispiel #3
0
            // Fills _autoView from _input the first time it is called (under _lock) and, when
            // _sortColumn is non-negative, reorders the cached rows by that column's values.
            // Calls made once _autoView is set return without doing anything.
            void FillCacheIfNotFilled()
            {
                lock (_lock)
                {
                    bool alreadyFilled = !(_autoView is null);
                    if (alreadyFilled)
                    {
                        return;
                    }

                    _autoView = DataFrameIO.ReadView(_input, keepVectors: true, numThreads: _numThreads);

                    // A negative sort column means the rows keep the order they were read in.
                    if (_sortColumn < 0)
                    {
                        return;
                    }

                    // Pairs of (sort key, original row position).
                    var keyed = new List<KeyValuePair<TValue, long>>();
                    TValue current = default(TValue);

                    // We could use multithreading here but the cost of sorting
                    // might be higher than going through an array in memory.
                    using (var cursor = _autoView.GetRowCursor(_autoView.Schema.Where(c => c.Index == _sortColumn)))
                    {
                        var getter = cursor.GetGetter<TValue>(SchemaHelper._dc(_sortColumn, cursor));
                        for (long row = 0; cursor.MoveNext(); ++row)
                        {
                            getter(ref current);
                            keyed.Add(new KeyValuePair<TValue, long>(current, row));
                        }
                    }

                    keyed.Sort(CompareTo);

                    // Order() takes int positions, so the long row numbers are narrowed here.
                    var order = new int[keyed.Count];
                    for (int i = 0; i < order.Length; ++i)
                    {
                        order[i] = (int)keyed[i].Value;
                    }
                    _autoView.Order(order);
                }
            }
        /// <summary>
        /// Trains an <c>iova{p=lr ...}</c> trainer on <c>Train-28x28_small.txt</c> and hands the
        /// resulting predictor to <c>TestTrainerHelper.FinalizeSerializationTest</c>.
        /// </summary>
        /// <param name="singleColumn">Selects <c>sc=+</c> or <c>sc=-</c> in the trainer settings
        /// and the "C"/"Vec" suffix of the output folder name.</param>
        /// <param name="checkError">Forwarded to <c>FinalizeSerializationTest</c>.</param>
        static void TrainMultiToBinaryPredictorSparse(bool singleColumn, bool checkError)
        {
            var suffix = singleColumn ? "C" : "Vec";
            var methodName = $"{System.Reflection.MethodBase.GetCurrentMethod().Name}-lr-V{suffix}";

            var trainFile = FileHelper.GetTestFile("Train-28x28_small.txt");
            var testFile = FileHelper.GetTestFile("Test-28x28_small.txt");
            var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
            var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
            var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

            var env = EnvHelper.NewTestEnvironment(conc: 1);

            var loader = env.CreateLoader("Text{col=Label:R4:0 col=Features:R4:1-784}", new MultiFileSource(trainFile));
            var roles = env.CreateExamples(loader, "Features", "Label");
            var df = DataFrameIO.ReadView(roles.Data);
            Assert.IsTrue(df.Shape[0] > 0);

            var sc = singleColumn ? "+" : "-";
            var iova = $"iova{{p=lr sc={sc} nt=1}}";

            // A loader over the test file is created here but is not read afterwards.
            loader = env.CreateLoader("Text{col=Label:R4:0 col=Features:R4:1-784}", new MultiFileSource(testFile));

            var trainer = env.CreateTrainer(iova);
            using (var ch = env.Start("train"))
            {
                var predictor = trainer.Train(env, ch, roles);
                TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, predictor, roles, outData, outData2,
                                                            PredictionKind.MulticlassClassification, checkError, ratio: 0.1f);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Builds a learning pipeline over iris.txt that contains a <c>PassThrough</c> step
        /// configured to save on disk, trains it, and checks both the prediction shape and
        /// that the pass-through file exists.
        /// </summary>
        public void TestEP_PassThroughTransform()
        {
            var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var irisPath = FileHelper.GetTestFile("iris.txt");
            var passThroughFile = FileHelper.GetOutputFile("data.idv", methodName);
            var df = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });

            var importData = df.EPTextLoader(irisPath, sep: '\t', header: true);
            var pipeline = new GenericLearningPipeline(conc: 1);

            pipeline.Add(importData);
            pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
            pipeline.Add(new Scikit.ML.EntryPoints.Scaler("Features"));

            // The pass-through step writes the intermediate data to passThroughFile.
            var passThrough = new Scikit.ML.EntryPoints.PassThrough
            {
                Filename = passThroughFile,
                SaveOnDisk = true
            };
            pipeline.Add(passThrough);
            pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor());

            var model = pipeline.Train();
            var scored = DataFrameIO.ReadView(model.Predict(df));

            Assert.AreEqual(scored.Shape, new Tuple<int, int>(150, 8));
            Assert.IsTrue(File.Exists(passThroughFile));
        }
        /// <summary>
        /// Runs a <c>Concat</c> + <c>TagTrainScore</c> pipeline with a custom scorer suffix
        /// (<c>ex=AA</c>) on mc_iris.txt and checks the shape and header of the predictions.
        /// </summary>
        public void TestTagTrainOrScoreTransformCustomScorer()
        {
            var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
            var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
            var outData = FileHelper.GetOutputFile("outData1.txt", methodName);

            var transforms = new[]
            {
                "Concat{col=Feature:Slength,Swidth}",
                "TagTrainScore{tr=iova{p=ft{nl=10 iter=1}} lab=Label feat=Feature tag=model scorer=MultiClassClassifierScorer{ex=AA}}"
            };

            using (var env = EnvHelper.NewTestEnvironment())
            {
                var source = new MultiFileSource(dataFilePath);
                var loader = env.CreateLoader("Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=-}",
                                              source);

                using (var pipe = new ScikitPipeline(transforms, host: env))
                {
                    pipe.Train(loader);
                    var scored = DataFrameIO.ReadView(pipe.Predict(loader));
                    Assert.AreEqual(scored.Shape, new Tuple<int, int>(150, 11));

                    // The scorer columns are expected to carry the "AA" suffix.
                    var head = scored.Head().ToString();
                    Assert.IsTrue(head.StartsWith("Label,Slength,Swidth,Plength,Pwidth,Feature.0,Feature.1,PredictedLabelAA,ScoreAA.0,ScoreAA.1,ScoreAA.2"));
                }
            }
        }
Beispiel #7
0
        /// <summary>
        /// Reads delimited text held in a string into a <see cref="DataFrame"/>.
        /// Follows pandas API. The integer codes in <paramref name="dtypes"/> are converted
        /// with <c>IntToColumnTypes</c> and every other argument is forwarded unchanged to
        /// <c>DataFrameIO.ReadStr</c>.
        /// </summary>
        public static DataFrame ReadStr(string content, char sep = ',', bool header = true,
                                        string[] names = null, int[] dtypes = null,
                                        int nrows = -1, int guess_rows = 10, bool index = false)
        {
            return DataFrameIO.ReadStr(content, sep, header, names, IntToColumnTypes(dtypes),
                                       nrows, guess_rows, index);
        }
Beispiel #8
0
        /// <summary>
        /// Trains a <c>poly{col=X}</c> + <c>km{k=2}</c> pipeline on four rows, predicts on four
        /// other rows and checks the shape and the beginning of the rendered predictions.
        /// </summary>
        public void TestScikitAPI_SimplePredictor()
        {
            var trainRows = new[]
            {
                new ExampleA { X = new float[] { 1, 10, 100 } },
                new ExampleA { X = new float[] { 2, 3, 5 } },
                new ExampleA { X = new float[] { 2, 4, 5 } },
                new ExampleA { X = new float[] { 2, 4, 7 } },
            };

            var scoreRows = new[]
            {
                new ExampleA { X = new float[] { -1, -10, -100 } },
                new ExampleA { X = new float[] { -2, -3, -5 } },
                new ExampleA { X = new float[] { 3, 4, 5 } },
                new ExampleA { X = new float[] { 3, 4, 7 } },
            };

            var host = EnvHelper.NewTestEnvironment(conc: 1);
            var trainView = DataViewConstructionUtils.CreateFromEnumerable(host, trainRows);

            using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host))
            {
                var predictor = pipe.Train(trainView, feature: "X");
                Assert.IsTrue(predictor != null);

                var scoreView = DataViewConstructionUtils.CreateFromEnumerable(host, scoreRows);
                var scoreFrame = new StreamingDataFrame(scoreView);
                var predictions = pipe.Predict(scoreFrame);
                var df = DataFrameIO.ReadView(predictions);
                Assert.AreEqual(df.Shape, new Tuple<int, int>(4, 12));

                // Only the header and the first row are pinned.
                var flattened = df.ToString().Replace("\n", ";");
                Assert.IsTrue(flattened.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000"));
            }
        }
Beispiel #9
0
        /// <summary>
        /// Reads a file into a <see cref="DataFrame"/>.
        /// Follows pandas API. The integer codes in <paramref name="dtypes"/> are converted
        /// with <c>IntToColumnTypes</c>; a non-null <paramref name="encoding"/> name is resolved
        /// with <see cref="Encoding.GetEncoding(string)"/>, while null is passed through as null.
        /// </summary>
        public static DataFrame ReadCsv(string filename, char sep = ',', bool header = true,
                                        string[] names = null, int[] dtypes = null,
                                        int nrows = -1, int guess_rows = 10, string encoding = null,
                                        bool index = false)
        {
            var columnTypes = IntToColumnTypes(dtypes);

            Encoding resolved = null;
            if (encoding != null)
            {
                resolved = Encoding.GetEncoding(encoding);
            }

            return DataFrameIO.ReadCsv(filename, sep, header, names, columnTypes, nrows, guess_rows,
                                       resolved, index: index);
        }
Beispiel #10
0
 /// <summary>
 /// Reads <c>_dataset</c>, trains a <c>Concat</c> + <c>ftr{iter=10}</c> pipeline on it and
 /// stores the trained pipeline in <c>_pipeline</c>.
 /// </summary>
 public void Train()
 {
     using (var env = new ConsoleEnvironment())
     {
         var columnTypes = new ColumnType[] { NumberType.R4 };
         var frame = DataFrameIO.ReadCsv(_dataset, sep: ',', dtypes: columnTypes);

         var transforms = new[] { "Concat{col=Features:F0,F1,F2,F3,F4,F5,F6,F7,F8,F9}" };
         var trained = new ScikitPipeline(transforms, "ftr{iter=10}");
         trained.Train(frame, "Features", "Label");

         // The field is only assigned once training succeeded.
         _pipeline = trained;
     }
 }
Beispiel #11
0
        /// <summary>
        /// Trains a <c>Concat</c> + <c>mlr</c> pipeline created through <c>PyPipelineHelper</c>
        /// on iris_data_id.txt and checks that the store environment's second item renders
        /// to a non-null string.
        /// </summary>
        public void TestPipelineIris()
        {
            var irisPath = FileHelper.GetTestFile("iris_data_id.txt");
            var frame = DataFrameIO.ReadCsv(irisPath, sep: ',', dtypes: new[] { NumberType.R4, NumberType.R4, NumberType.R4 });
            var store = PyEnvHelper.CreateStoreEnvironment();

            var transforms = new string[] { "Concat{col=Features:Slength,Swidth}" };
            var pipeline = PyPipelineHelper.CreateScikitPipeline(transforms, "mlr", store.Item1);
            pipeline.Train(frame, "Features", "Label");

            var captured = store.Item2.ToString();
            Assert.IsNotNull(captured);
        }
Beispiel #12
0
        /// <summary>
        /// Trains a <c>Concat</c> + <c>mlr</c> pipeline on iris.txt, using a copy of the
        /// <c>Label</c> column converted to <c>R4</c> as label, and checks the prediction shape.
        /// </summary>
        public void TestScikitAPI_TrainingWithIris()
        {
            var iris = FileHelper.GetTestFile("iris.txt");
            var df = DataFrameIO.ReadCsv(iris, sep: '\t');

            df.AddColumn("LabelI", df["Label"].AsType(NumberType.R4));

            // The pipeline is disposable; release it once the test is done with it.
            using (var pipe = new ScikitPipeline(new[] { $"Concat{{col=Features:{df.Columns[1]},{df.Columns[2]}}}" }, "mlr"))
            {
                pipe.Train(df, "Features", "LabelI");

                DataFrame pred = null;
                pipe.Predict(df, ref pred);
                Assert.AreEqual(pred.Shape, new ShapeType(150, 9));
            }
        }
Beispiel #13
0
        /// <summary>
        /// Trains a <c>Concat</c> + <c>ols</c> pipeline on diabete.csv (features F0..F9 read
        /// as <c>R4</c>) and checks the prediction shape.
        /// </summary>
        public void TestScikitAPI_TrainingDiabete()
        {
            var diab = FileHelper.GetTestFile("diabete.csv");
            var cols = Enumerable.Range(0, 10).Select(c => NumberType.R4).ToArray();
            var colsName = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
            var df = DataFrameIO.ReadCsv(diab, sep: ',', dtypes: cols);

            // The pipeline is disposable; release it once the test is done with it.
            using (var pipe = new ScikitPipeline(new string[] { $"Concat{{col=Features:{colsName}}}" }, "ols"))
            {
                pipe.Train(df, "Features", "Label");

                DataFrame pred = null;
                pipe.Predict(df, ref pred);
                Assert.AreEqual(pred.Shape, new ShapeType(83, 13));
            }
        }
        /// <summary>
        /// Collects the measurements yielded by <c>_MeasureTime</c> for one engine and thread
        /// count and returns them as a <see cref="DataFrame"/> with columns
        /// "N", "engine", "number of threads", "call" and "time(s)".
        /// </summary>
        public static DataFrame TestScikitAPI_EngineSimpleTrainAndPredict(string engine, int th, int N, int ncall, bool cacheScikit)
        {
            var timings = new Dictionary<Tuple<int, string, int, int>, double>();
            var scorer = _TrainSentiment();
            var trscorer = _TrainSentiment2();

            foreach (var measure in _MeasureTime(th, engine, scorer, trscorer, N, ncall, cacheScikit))
            {
                var key = new Tuple<int, string, int, int>(measure.Item1, engine, th, measure.Item3);
                timings[key] = measure.Item2.TotalSeconds;
            }

            return DataFrameIO.Convert(timings, "N", "engine", "number of threads", "call", "time(s)");
        }
        /// <summary>
        /// Measures prediction time for every combination of cache strategy, thread count
        /// (1 to 3), engine ("mlnet", "scikit") and input kind ("array", "stream"), writes the
        /// timings to a CSV file and checks how many measurements were collected.
        /// </summary>
        public void TestScikitAPI_EngineSimpleTrainAndPredict()
        {
            var dico = new Dictionary<Tuple<int, string, string, int, int>, double>();
            var scorer = _TrainSentiment();
            var trscorer = _TrainSentiment2Transformer();

            foreach (var cache in new[] { false, true })
            {
                for (int th = 1; th <= 3; ++th)
                {
                    // Predictions kept per engine, taken from the measurement whose Item3 is 1.
                    var memo = new Dictionary<string, float[]>();
                    foreach (var engine in new[] { "mlnet", "scikit" })
                    {
                        foreach (var kind in new[] { "array", "stream" })
                        {
                            var strat = string.Join("+", cache ? "extcache" : "viewcache", kind);
                            foreach (var res in _MeasureTime(th, strat, engine, scorer, trscorer, 2))
                            {
                                dico[new Tuple<int, string, string, int, int>(res.Item1, engine, strat, th, res.Item3)] = res.Item2.TotalSeconds;
                                if (res.Item3 == 1)
                                {
                                    memo[engine] = res.Item4;
                                }
                            }
                        }
                    }

                    var p1 = memo["mlnet"];
                    var p2 = memo["scikit"];
                    Assert.AreEqual(p1.Length, p2.Length);

                    // Mean absolute difference between both engines. It is computed but not
                    // asserted: the check below is disabled in the original test.
                    var abs = 0.0;
                    for (int ii = 0; ii < p1.Length; ++ii)
                    {
                        abs += Math.Abs(p1[ii] - p2[ii]);
                    }
                    abs /= p1.Length;
                    // Assert.IsTrue(abs <= 2);
                }
            }

            var df = DataFrameIO.Convert(dico, "N", "engine", "strategy", "number of threads", "call", "time(s)");
            var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var filename = FileHelper.GetOutputFile("benchmark_ValueMapperPredictionEngineMultiThread.txt", methodName);

            df.ToCsv(filename);

            // Expected value comes first so a failure reports expected/actual correctly.
            Assert.AreEqual(48, dico.Count);
        }
 /// <summary>
 /// Applies <c>concat</c> then <c>Poly{col=X}</c> to a three-row frame and compares the
 /// result with the expected values using <c>AlmostEquals</c>.
 /// </summary>
 public void TestI_PolynomialTransformNumericValues()
 {
     using (var host = EnvHelper.NewTestEnvironment(conc: 1))
     {
         var source = DataFrameIO.ReadStr("A,B\n1.0,2.0\n2.0,3.0\n10.0,11.0");
         source.SetShuffle(false);

         var concatenated = host.CreateTransform("concat{col=X:A,B}", source);
         var polynomial = host.CreateTransform("Poly{col=X}", concatenated);
         var actual = DataFrameIO.ReadView(polynomial);

         var rendered = actual.ToString();
         Assert.IsFalse(string.IsNullOrEmpty(rendered));

         var expectedText = "A,B,X.0,X.1,X.2,X.3,X.4\n1.0,2.0,1.0,2.0,1.0,2.0,4.0\n2.0,3.0,2.0,3.0,4.0,6.0,9.0\n10.0,11.0,10.0,11.0,100.0,110.0,121.0";
         var expectedFrame = DataFrameIO.ReadStr(expectedText);
         Assert.AreEqual(0, expectedFrame.AlmostEquals(actual, exc: true, printDf: true));
     }
 }
 /// <summary>
 /// Applies <c>concat</c> then <c>Scaler{col=X scale=minMax}</c> to a three-row frame,
 /// estimates the scaler and compares the result with the expected values.
 /// </summary>
 public void TestI_ScalerTransformNumericValuesMinMax()
 {
     using (var host = EnvHelper.NewTestEnvironment(conc: 1))
     {
         var source = DataFrameIO.ReadStr("A,B\n1.0,2.0\n2.0,3.0\n10.0,11.0");
         source.SetShuffle(false);

         var concatenated = host.CreateTransform("concat{col=X:A,B}", source);
         var scaled = host.CreateTransform("Scaler{col=X scale=minMax}", concatenated);

         // The scaler is estimated explicitly before its output is read.
         var trainable = scaled as ITrainableTransform;
         trainable.Estimate();

         var actual = DataFrameIO.ReadView(scaled);
         var rendered = actual.ToString();

         var expectedText = "A,B,X.0,X.1\n1.0,2.0,0.0,0.0\n2.0,3.0,0.11111111,0.11111111\n10.0,11.0,1.0,1.0";
         var expectedFrame = DataFrameIO.ReadStr(expectedText);
         Assert.AreEqual(0, expectedFrame.AlmostEquals(actual, exc: true, printDf: true));
     }
 }
Beispiel #18
0
        /// <summary>
        /// Trains a <c>Concat</c> + <c>ols</c> pipeline created through <c>PyPipelineHelper</c>
        /// on diabete.csv and runs <c>FastPredictOrTransform</c> on the same data.
        /// </summary>
        public void TestPipelineDiabete()
        {
            var diabetePath = FileHelper.GetTestFile("diabete.csv");
            var columnTypes = Enumerable.Range(0, 10).Select(c => NumberType.R4).ToArray();
            var featureNames = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
            var df = DataFrameIO.ReadCsv(diabetePath, sep: ',', dtypes: columnTypes);
            var store = PyEnvHelper.CreateStoreEnvironment();

            var transforms = new string[] { $"Concat{{col=Features:{featureNames}}}" };
            var pipeline = PyPipelineHelper.CreateScikitPipeline(transforms, "ols", store.Item1);
            pipeline.Train(df, "Features", "Label");

            var captured = store.Item2.ToString();
            Assert.IsNotNull(captured);

            DataFrame pred = PyPipelineHelper.FastPredictOrTransform(pipeline, df);

            // NOTE(review): this checks the input frame `df`, not `pred` — confirm it is intended.
            Assert.IsTrue(df.Shape[0] > 0);
        }
Beispiel #19
0
 /// <summary>
 /// Trains a learning pipeline ending with <c>NearestNeighborsMultiClass</c> on iris.txt
 /// and checks the shape of the predictions.
 /// </summary>
 public void TestEP_NearestNeighborsLPMc()
 {
     using (var env = EnvHelper.NewTestEnvironment(conc: 1))
     {
         var irisPath = FileHelper.GetTestFile("iris.txt");
         var df = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
         var importData = df.EPTextLoader(irisPath, sep: '\t', header: true);

         var pipeline = new GenericLearningPipeline(conc: 1);
         pipeline.Add(importData);
         pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
         pipeline.Add(new Scikit.ML.EntryPoints.NearestNeighborsMultiClass());

         var model = pipeline.Train();
         var scored = DataFrameIO.ReadView(model.Predict(df));
         Assert.AreEqual(scored.Shape, new Tuple<int, int>(150, 11));
     }
 }
        /// <summary>
        /// Computes reference predictions with a single-threaded engine, then checks that
        /// engines created with other <c>conc</c> values return the same predictions, timing
        /// each run and writing the timings to a CSV file.
        /// </summary>
        public void TestValueMapperPredictionEngineMultiThread()
        {
            var name = FileHelper.GetTestFile("bc-lr.zip");
            var env = EnvHelper.NewTestEnvironment();

            using (var engine0 = new ValueMapperPredictionEngineFloat(env, name, conc: 1))
            {
                // Only the first feature varies from one prediction to the next.
                var feat = new float[] { 5, 1, 1, 1, 2, 1, 3, 1, 1 };
                var exp = new float[100];
                for (int i = 0; i < exp.Length; ++i)
                {
                    feat[0] = i;
                    exp[i] = engine0.Predict(feat);
                    Assert.IsFalse(float.IsNaN(exp[i]));
                    Assert.IsFalse(float.IsInfinity(exp[i]));
                }

                var dico = new Dictionary<Tuple<int, bool, int>, double>();

                foreach (var each in new[] { false, true })
                {
                    foreach (int th in new int[] { 2, 0, 1, 3 })
                    {
                        // Each engine is disposed after its run, like engine0 above.
                        using (var engine = new ValueMapperPredictionEngineFloat(env, name, conc: th))
                        {
                            var sw = Stopwatch.StartNew();
                            for (int i = 0; i < exp.Length; ++i)
                            {
                                feat[0] = i;
                                var res = engine.Predict(feat);
                                Assert.AreEqual(exp[i], res);
                            }
                            sw.Stop();
                            dico[new Tuple<int, bool, int>(exp.Length, each, th)] = sw.Elapsed.TotalSeconds;
                        }
                    }
                }

                // Expected value comes first so a failure reports expected/actual correctly.
                Assert.AreEqual(8, dico.Count);

                // NOTE(review): the key tuple has three components but only two key column
                // names ("N", "number of threads") are supplied — confirm Convert accepts this.
                var df = DataFrameIO.Convert(dico, "N", "number of threads", "time(s)");
                var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
                var filename = FileHelper.GetOutputFile("benchmark_ValueMapperPredictionEngineMultiThread.txt", methodName);
                df.ToCsv(filename);
            }
        }
 /// <summary>
 /// Applies <c>concat</c> then <c>Scaler{col=X}</c> (default settings) to a three-row frame,
 /// estimates the scaler and compares the result, sorted by column A, with the expected values.
 /// </summary>
 public void TestI_ScalerTransformNumericValuesMeanVar()
 {
     var host = EnvHelper.NewTestEnvironment(conc: 1);

     var source = DataFrameIO.ReadStr("A,B\n1.0,2.0\n2.0,3.0\n10.0,11.0");
     source.SetShuffle(false);

     var concatenated = host.CreateTransform("concat{col=X:A,B}", source);
     var scaled = host.CreateTransform("Scaler{col=X}", concatenated);

     // The scaler is estimated explicitly before its output is read.
     var trainable = scaled as ITrainableTransform;
     trainable.Estimate();

     var actual = DataFrameIO.ReadView(scaled);
     var rendered = actual.ToString();
     Assert.IsNotNull(rendered);

     var expectedText = "A,B,X.0,X.1\n1.0,2.0,-0.827605963,-0.827605963\n2.0,3.0,-0.5793242,-0.5793242\n10.0,11.0,1.40693,1.40693";
     var expectedFrame = DataFrameIO.ReadStr(expectedText);
     Assert.AreEqual(0, expectedFrame.AlmostEquals(actual, exc: true, printDf: true, sortBy: "A"));
 }
        /// <summary>
        /// Trains a learning pipeline (concatenation, <c>Scaler</c>, SDCA regressor) on
        /// iris.txt and checks the shape of the predictions.
        /// </summary>
        public void TestEP_ScalerTransform()
        {
            var irisPath = FileHelper.GetTestFile("iris.txt");
            var df = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
            var importData = df.EPTextLoader(irisPath, sep: '\t', header: true);

            var pipeline = new GenericLearningPipeline(conc: 1);
            pipeline.Add(importData);
            pipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
            pipeline.Add(new Scikit.ML.EntryPoints.Scaler("Features"));
            pipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor());

            var model = pipeline.Train();
            var scored = DataFrameIO.ReadView(model.Predict(df));

            Assert.AreEqual(scored.Shape, new Tuple<int, int>(150, 8));
        }
Beispiel #23
0
 /// <summary>
 /// Trains a <c>Concat</c> + <c>ols</c> pipeline on diabete.csv and checks the prediction
 /// shape. When a <see cref="DllNotFoundException"/> or <see cref="NotSupportedException"/>
 /// is raised on Unix, the failure is written to the console and the test returns without
 /// failing; on any other platform the exception is rethrown.
 /// </summary>
 public void TestScikitAPI_MKL_TrainingDiabete()
 {
     try
     {
         var diab     = FileHelper.GetTestFile("diabete.csv");
         var cols     = Enumerable.Range(0, 10).Select(c => NumberDataViewType.Single).ToArray();
         var colsName = string.Join(',', Enumerable.Range(0, 10).Select(c => $"F{c}"));
         var df       = DataFrameIO.ReadCsv(diab, sep: ',', dtypes: cols);
         var pipe     = new ScikitPipeline(new string[] { $"Concat{{col=Features:{colsName}}}" }, "ols");
         pipe.Train(df, "Features", "Label");
         DataFrame pred = null;
         pipe.Predict(df, ref pred);
         Assert.AreEqual(pred.Shape, new ShapeType(83, 13));
     }
     catch (DllNotFoundException e)
     {
         var os = Environment.OSVersion;
         if (os.Platform == PlatformID.Unix)
         {
             // Tolerated on Unix: log and let the test end normally.
             Console.WriteLine("FAIL(1): TestScikitAPI_MKL due to {0}", e.ToString());
             return;
         }

         Console.WriteLine("FAIL(1): TestScikitAPI_MKL, OS={0}", os.ToString());
         // A bare rethrow keeps the original stack trace ("throw e;" would reset it).
         throw;
     }
     catch (NotSupportedException e)
     {
         var os = Environment.OSVersion;
         if (os.Platform == PlatformID.Unix)
         {
             // Tolerated on Unix: log and let the test end normally.
             Console.WriteLine("FAIL(2): TestScikitAPI_MKL due to {0}", e.ToString());
             return;
         }

         Console.WriteLine("FAIL(2): TestScikitAPI_MKL, OS={0}", os.ToString());
         // A bare rethrow keeps the original stack trace ("throw e;" would reset it).
         throw;
     }
 }
Beispiel #24
0
        /// <summary>
        /// Trains a <c>poly{col=X}</c> pipeline on two rows, transforms two other rows and
        /// checks the shape and the exact rendered content of the result.
        /// </summary>
        public void TestScikitAPI_SimpleTransform()
        {
            var trainRows = new[]
            {
                new ExampleA { X = new float[] { 1, 10, 100 } },
                new ExampleA { X = new float[] { 2, 3, 5 } }
            };

            var scoreRows = new[]
            {
                new ExampleA { X = new float[] { -1, -10, -100 } },
                new ExampleA { X = new float[] { -2, -3, -5 } }
            };

            var host = EnvHelper.NewTestEnvironment(conc: 1);
            var data = DataViewConstructionUtils.CreateFromEnumerable(host, trainRows);

            using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, host: host))
            {
                var predictor = pipe.Train(data);
                Assert.IsTrue(predictor != null);

                var data2 = DataViewConstructionUtils.CreateFromEnumerable(host, scoreRows);
                var predictions = pipe.Transform(data2);
                var df = DataFrameIO.ReadView(predictions);
                Assert.AreEqual(df.Shape, new Tuple<int, int>(2, 9));

                // Rows are joined with ';' so the whole frame compares as a single line.
                var dfs2 = df.ToString().Replace("\n", ";");

                // Expected value comes first so a failure reports expected/actual correctly.
                Assert.AreEqual("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8;-1,-10,-100,1,10,100,100,1000,10000;-2,-3,-5,4,6,10,9,15,25", dfs2);
            }
        }
 /// <summary>
 /// Trains a <c>Concat</c> + <c>TreeFeat</c> pipeline on iris.txt, checks the shape of the
 /// predictions and writes them to a CSV file whose existence is then verified.
 /// </summary>
 public void TestTreePathInnerAPI()
 {
     using (var env = EnvHelper.NewTestEnvironment(conc: 1))
     {
         var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
         var irisPath = FileHelper.GetTestFile("iris.txt");
         var df = DataFrameIO.ReadCsv(irisPath, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });

         var transforms = new[]
         {
             "Concat{col=Feature:Sepal_length,Sepal_width}",
             "TreeFeat{tr=ft{iter=2} lab=Label feat=Feature}"
         };

         using (var pipe = new ScikitPipeline(transforms))
         {
             pipe.Train(df);
             var scored = DataFrameIO.ReadView(pipe.Predict(df));
             Assert.AreEqual(scored.Shape, new Tuple<int, int>(150, 31));

             var csvPath = FileHelper.GetOutputFile("iris_path.txt", methodName);
             scored.ToCsv(csvPath);
             Assert.IsTrue(File.Exists(csvPath));
         }
     }
 }
Beispiel #26
0
        /// <summary>
        /// Trains a <c>Concat</c> + <c>mlr</c> pipeline on iris.txt, exports it to ONNX twice
        /// (<c>ToOnnx()</c> and <c>ToOnnx(1)</c>) and checks that both files were written.
        /// </summary>
        public void TestOnnx_TrainingWithIris()
        {
            var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;

            // direct call
            var iris = FileHelper.GetTestFile("iris.txt");
            var df = DataFrameIO.ReadCsv(iris, sep: '\t');

            df.AddColumn("LabelI", df["Label"].AsType(NumberDataViewType.Single));

            // The pipeline is disposable; release it once the exports are done.
            using (var pipe = new ScikitPipeline(new[] { $"Concat{{col=Features:{df.Columns[1]},{df.Columns[2]}}}" }, "mlr"))
            {
                pipe.Train(df, "Features", "LabelI");
                DataFrame pred = null;

                pipe.Predict(df, ref pred);

                // Onnx Save
                var output = FileHelper.GetOutputFile("model.onnx", methodName);
                var model = pipe.ToOnnx();

                model.Save(output);
                Assert.IsTrue(File.Exists(output));

                // Onnx save no concat.
                output = FileHelper.GetOutputFile("model_vector.onnx", methodName);
                model = pipe.ToOnnx(1);
                model.Save(output);
                Assert.IsTrue(File.Exists(output));
            }

            // Onnx Load Not implemented yet: once it is, the saved model should be reloaded
            // and its predictions compared with `pred`.
        }
 // Builds a legacy learning pipeline (text loader, column concatenation, SDCA
 // regressor limited to two iterations) on the iris data set, checks the shape
 // of the predictions and dumps them to a CSV file in the test output folder.
 public void TestTreePathNewAPI()
 {
     using (var env = EnvHelper.NewTestEnvironment(conc: 1))
     {
         var methodName       = System.Reflection.MethodBase.GetCurrentMethod().Name;
         var iris             = FileHelper.GetTestFile("iris.txt");
         var df               = DataFrameIO.ReadCsv(iris, sep: '\t', dtypes: new ColumnType[] { NumberType.R4 });
         var importData       = df.EPTextLoader(iris, sep: '\t', header: true);
         var learningPipeline = new GenericLearningPipeline();
         learningPipeline.Add(importData);
         learningPipeline.Add(new Legacy.Transforms.ColumnConcatenator("Features", "Sepal_length", "Sepal_width"));
         learningPipeline.Add(new Legacy.Trainers.StochasticDualCoordinateAscentRegressor()
         {
             MaxIterations = 2
         });
         var predictor   = learningPipeline.Train();
         var predictions = predictor.Predict(df);
         var dfout       = DataFrameIO.ReadView(predictions);

         // Expected value goes first so that a failure message labels both sides correctly.
         Assert.AreEqual(new Tuple <int, int>(150, 8), dfout.Shape);

         // The predictions are also written to disk; the test only checks the file exists.
         var outfile = FileHelper.GetOutputFile("iris_path.txt", methodName);
         dfout.ToCsv(outfile);
         Assert.IsTrue(File.Exists(outfile));
     }
 }
Beispiel #28
0
        // Trains a "poly{col=X}" transform followed by a "km{k=2}" predictor on a small
        // in-memory data set held in DataFrames, then checks that:
        //  - the prediction has the expected shape and starts with the expected text,
        //  - predicting a second time on an identical DataFrame gives the same result.
        public void TestScikitAPI_SimplePredictor_FastValueMapper()
        {
            // Training rows.
            var inputs = new[] {
                new ExampleA()
                {
                    X = new float[] { 1, 10, 100 }
                },
                new ExampleA()
                {
                    X = new float[] { 2, 3, 5 }
                },
                new ExampleA()
                {
                    X = new float[] { 2, 4, 5 }
                },
                new ExampleA()
                {
                    X = new float[] { 2, 4, 7 }
                },
            };

            // Rows used for prediction.
            var inputs2 = new[] {
                new ExampleA()
                {
                    X = new float[] { -1, -10, -100 }
                },
                new ExampleA()
                {
                    X = new float[] { -2, -3, -5 }
                },
                new ExampleA()
                {
                    X = new float[] { 3, 4, 5 }
                },
                new ExampleA()
                {
                    X = new float[] { 3, 4, 7 }
                },
            };
            DataFrame df1, df2, df3;

            // Materialize the inputs as DataFrames in a first environment.
            // df2 and df3 are two independent copies of the same prediction data.
            using (var host = EnvHelper.NewTestEnvironment(conc: 1))
            {
                var data  = host.CreateStreamingDataView(inputs);
                var data2 = host.CreateStreamingDataView(inputs2);
                df1 = DataFrameIO.ReadView(data, env: host, keepVectors: true);
                df2 = DataFrameIO.ReadView(data2, env: host, keepVectors: true);
                df3 = DataFrameIO.ReadView(data2, env: host, keepVectors: true);
            }

            // Train and predict in a second, fresh environment.
            using (var host = EnvHelper.NewTestEnvironment(conc: 1))
            {
                using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host))
                {
                    DataFrame pred = null, pred2 = null;
                    var       predictor = pipe.Train(df1, feature: "X");
                    Assert.IsTrue(predictor != null);

                    pipe.Predict(df2, ref pred);

                    // Expected value goes first so that a failure message labels both sides correctly.
                    Assert.AreEqual(new Tuple <int, int>(4, 3), pred.Shape);
                    var dfs  = pred.ToString();
                    var dfs2 = dfs.Replace("\n", ";");

                    // The text is machine-formatted, so it is compared ordinally rather than
                    // with the rules of the current culture.
                    if (!dfs2.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000", StringComparison.Ordinal))
                    {
                        throw new Exception($"Wrong starts\n{dfs2}");
                    }

                    // A second prediction on identical data must match the first one.
                    pipe.Predict(df3, ref pred2);
                    pred.AssertAlmostEqual(pred2);
                }
            }
        }
        /// <summary>
        /// Runs a simple test: trains a "poly{col=X}" transform followed by a "km{k=2}"
        /// predictor on four in-memory rows, predicts on four other rows and checks the
        /// shape and the beginning of the text form of the predictions.
        /// Throws an <see cref="Exception"/> as soon as one of the checks fails.
        /// </summary>
        public static void TestScikitAPI()
        {
            // Training rows.
            var inputs = new[] {
                new ExampleVector()
                {
                    X = new float[] { 1, 10, 100 }
                },
                new ExampleVector()
                {
                    X = new float[] { 2, 3, 5 }
                },
                new ExampleVector()
                {
                    X = new float[] { 2, 4, 5 }
                },
                new ExampleVector()
                {
                    X = new float[] { 2, 4, 7 }
                },
            };

            // Rows used for prediction.
            var inputs2 = new[] {
                new ExampleVector()
                {
                    X = new float[] { -1, -10, -100 }
                },
                new ExampleVector()
                {
                    X = new float[] { -2, -3, -5 }
                },
                new ExampleVector()
                {
                    X = new float[] { 3, 4, 5 }
                },
                new ExampleVector()
                {
                    X = new float[] { 3, 4, 7 }
                },
            };

            using (var host = new ConsoleEnvironment(conc: 1))
            {
                // Components are registered in the environment before the pipeline is created.
                ComponentHelper.AddStandardComponents(host);
                var data = host.CreateStreamingDataView(inputs);
                using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host))
                {
                    var predictor = pipe.Train(data, feature: "X");
                    if (predictor == null)
                    {
                        throw new Exception("Test failed: no predictor.");
                    }
                    var data2       = host.CreateStreamingDataView(inputs2);
                    var predictions = pipe.Predict(data2);
                    var df          = DataFrameIO.ReadView(predictions);

                    // Four rows and twelve columns are expected.
                    if (df.Shape.Item1 != 4 || df.Shape.Item2 != 12)
                    {
                        throw new Exception("Test failed: prediction failed.");
                    }
                    var dfs  = df.ToString();
                    var dfs2 = dfs.Replace("\n", ";");

                    // The text is machine-formatted, so it is compared ordinally rather than
                    // with the rules of the current culture.
                    if (!dfs2.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000", StringComparison.Ordinal))
                    {
                        throw new Exception("Test failed: prediction failed (header).");
                    }
                }
            }
        }
        /// <summary>
        /// Trains the pipeline with data coming from a <see cref="IDataView"/>.
        /// The transforms are created in order, each one being applied to the output of
        /// the previous one. If the pipeline has a predictor, its trainer is then created
        /// and trained on the transformed data. Otherwise one row is read from the
        /// transformed data to make sure the pipeline produces an output.
        /// </summary>
        /// <param name="data">data the pipeline is trained on</param>
        /// <param name="feature">name of the column used as features</param>
        /// <param name="label">name of the column used as label (optional)</param>
        /// <param name="weight">name of the column used as weight (optional)</param>
        /// <param name="groupId">name of the column used as group id (optional)</param>
        /// <returns>the pipeline itself</returns>
        public ScikitPipeline Train(IDataView data,
                                    string feature = "Feature", string label = null,
                                    string weight  = null, string groupId    = null)
        {
            IDataView trans = data;

            using (var ch = _env.Start("Create transforms"))
            {
                for (int i = 0; i < _transforms.Length; ++i)
                {
                    try
                    {
                        trans = _env.CreateTransform(_transforms[i].transformSettings, trans);
                    }
                    catch (Exception e)
                    {
                        if (e.ToString().Contains("Unknown loadable class"))
                        {
                            // The transform is unknown: the error message lists the names of
                            // the registered components to help find out what is missing.
                            var classes = _env.ComponentCatalog.GetAllClasses();
                            var nn      = classes.Length;
                            var filt    = classes
                                          .Select(c => c.UserName)
                                          .Where(c => c.Trim().Length > 2)
                                          .OrderBy(c => c);
                            var regis = string.Join("\n", filt);
                            throw Contracts.Except(e, $"Unable to create transform '{_transforms[i].transformSettings}', assembly not registered among {nn}\n{regis}");
                        }

                        // Rethrows without resetting the stack trace of the original exception.
                        throw;
                    }
                    _transforms[i].transform = trans as IDataTransform;
                }
            }

            if (_predictor != null)
            {
                using (var ch = _env.Start("Create Predictor"))
                {
                    _predictor.trainer = TrainerHelper.CreateTrainer(_env, _predictor.trainerSettings);

                    // The roles are stored in the pipeline; the feature role is always
                    // recorded, the other ones only when a column name was given.
                    _roles             = new List <KeyValuePair <RoleMappedSchema.ColumnRole, string> >();
                    _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Feature, feature));
                    if (!string.IsNullOrEmpty(label))
                    {
                        _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Label, label));
                    }
                    if (!string.IsNullOrEmpty(groupId))
                    {
                        _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Group, groupId));
                    }
                    if (!string.IsNullOrEmpty(weight))
                    {
                        _roles.Add(new KeyValuePair <RoleMappedSchema.ColumnRole, string>(RoleMappedSchema.ColumnRole.Weight, weight));
                    }
                    var roleMap = new RoleMappedData(trans, label, feature, group: groupId, weight: weight);
                    _predictor.predictor   = _predictor.trainer.Train(_env, ch, roleMap);
                    _predictor.roleMapData = roleMap;
                }
            }
            else
            {
                // No trainer: the pipeline only holds transforms. A placeholder step keeps
                // the transformed data available through roleMapData.
                _predictor = new StepPredictor()
                {
                    predictor       = null,
                    trainer         = null,
                    trainerSettings = null,
                    roleMapData     = new RoleMappedData(trans)
                };

                // We predict one to make sure everything works fine.
                using (var ch = _env.Start("Compute one prediction."))
                {
                    var df = DataFrameIO.ReadView(trans, 1, keepVectors: true, env: _env);
                    if (df.Length == 0)
                    {
                        throw _env.ExceptEmpty("Something went wrong. The pipeline does not produce any output.");
                    }
                }
            }
            return(this);
        }