public void TestTagTrainOrScoreTransformCustomScorer()
{
    // Output locations are derived from the test method name.
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    using (var env = EnvHelper.NewTestEnvironment())
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=-}",
            new MultiFileSource(dataFilePath));
        var steps = new[]
        {
            "Concat{col=Feature:Slength,Swidth}",
            "TagTrainScore{tr=iova{p=ft{nl=10 iter=1}} lab=Label feat=Feature tag=model scorer=MultiClassClassifierScorer{ex=AA}}"
        };
        using (var pipe = new ScikitPipeline(steps, host: env))
        {
            pipe.Train(loader);
            var scored = pipe.Predict(loader);
            var frame = DataFrameIO.ReadView(scored);
            // 150 rows, 11 columns once the scorer appends its outputs.
            Assert.AreEqual(frame.Shape, new Tuple<int, int>(150, 11));
            var head = frame.Head().ToString();
            // The "AA" suffix on predicted columns comes from the scorer's ex=AA option.
            Assert.IsTrue(head.StartsWith("Label,Slength,Swidth,Plength,Pwidth,Feature.0,Feature.1,PredictedLabelAA,ScoreAA.0,ScoreAA.1,ScoreAA.2"));
        }
    }
}
public void TestI_ResampleSerialization()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("iris.txt");
    var outputDataFilePath = FileHelper.GetOutputFile("outputDataFilePath.txt", methodName);
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment(conc: 1);
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 " +
            "col=Pwidth:R4:4 header=+ sep=tab}",
            new MultiFileSource(dataFilePath));
        var resampled = env.CreateTransform("resample{lambda=1 c=-}", loader);
        DataViewHelper.ToCsv(env, resampled, outputDataFilePath);

        var lines = File.ReadAllLines(outputDataFilePath);
        // Skip everything before the header line.
        int headerIndex = 0;
        while (headerIndex < lines.Length && !lines[headerIndex].StartsWith("Label"))
            ++headerIndex;
        lines = lines.Skip(headerIndex).ToArray();

        // Order the lines and verify the first characters are non-decreasing.
        var linesSorted = lines.OrderBy(c => c).ToArray();
        for (int i = 1; i < linesSorted.Length; ++i)
        {
            if (linesSorted[i - 1][0] > linesSorted[i][0])
                throw new Exception("The output is not sorted.");
        }
    }
}
public void TestI_DescribeTransformSaveDataAndZip()
{
    using (var env = EnvHelper.NewTestEnvironment())
    {
        var inputs = InputOutput.CreateInputs();
        var data = env.CreateStreamingDataView(inputs);
        var args = new DescribeTransform.Arguments() { columns = new[] { "X" } };
        var transform = new DescribeTransform(env, args, data);

        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;

        // Predictions computed directly from the live transform.
        var outputDataFilePath = FileHelper.GetOutputFile("outputDataFilePath.txt", methodName);
        StreamHelper.SavePredictions(env, transform, outputDataFilePath);
        Assert.IsTrue(File.Exists(outputDataFilePath));

        // Serialize the transform into a zip model.
        var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
        StreamHelper.SaveModel(env, transform, outModelFilePath);
        Assert.IsTrue(File.Exists(outModelFilePath));

        // Predictions computed from the reloaded model must match the direct ones.
        var outputDataFilePath2 = FileHelper.GetOutputFile("outputDataFilePath2.txt", methodName);
        StreamHelper.SavePredictions(env, outModelFilePath, outputDataFilePath2, data);
        Assert.IsTrue(File.Exists(outputDataFilePath2));

        var direct = File.ReadAllText(outputDataFilePath);
        Assert.IsTrue(direct.Length > 0);
        var reloaded = File.ReadAllText(outputDataFilePath2);
        Assert.AreEqual(direct, reloaded);
    }
}
public void TestChainTransformSerialize()
{
    using (var host = EnvHelper.NewTestEnvironment())
    {
        var samples = new[]
        {
            new ExampleA() { X = new float[] { 1, 10, 100 } },
            new ExampleA() { X = new float[] { 2, 3, 5 } }
        };
        IDataView loader = host.CreateStreamingDataView(samples);
        IDataTransform data = host.CreateTransform("Scaler{col=X4:X}", loader);
        data = host.CreateTransform("ChainTrans{ xf1=Scaler{col=X2:X} xf2=Poly{col=X3:X2} }", data);

        // We create a specific folder in build/UnitTest which will contain the output.
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
        var outData = FileHelper.GetOutputFile("outData.txt", methodName);
        var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

        // Serializes the pipeline, reloads it, and checks both produce the same output.
        TestTransformHelper.SerializationTestTransform(host, outModelFilePath, data, loader, outData, outData2);
    }
}
public void TestTagViewTransform()
{
    using (var host = EnvHelper.NewTestEnvironment())
    {
        var samples = new[]
        {
            new ExampleA() { X = new float[] { 0, 1 } },
            new ExampleA() { X = new float[] { 2, 3 } }
        };
        IDataView loader = host.CreateStreamingDataView(samples);
        var pipeline = host.CreateTransform("Scaler{col=X1:X}", loader);
        pipeline = host.CreateTransform("tag{t=memory}", pipeline);

        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
        var outData = FileHelper.GetOutputFile("outData.txt", methodName);
        var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

        // Round-trips the pipeline through serialization and compares outputs.
        TestTransformHelper.SerializationTestTransform(host, outModelFilePath, pipeline, loader, outData, outData2);
    }
}
public void TestLambdaColumnPassThroughTransform() { using (var host = EnvHelper.NewTestEnvironment()) { var inputs = new InputOutputU[] { new InputOutputU() { X = new float[] { 0.1f, 1.1f }, Y = 0 }, new InputOutputU() { X = new float[] { 0.2f, 1.2f }, Y = 1 }, new InputOutputU() { X = new float[] { 0.3f, 1.3f }, Y = 2 } }; var data = host.CreateStreamingDataView(inputs); var lambdaView = LambdaColumnHelper.Create <VBuffer <float>, VBuffer <float> >(host, "Lambda", data, "X", "XX", new VectorType(NumberType.R4, 2), new VectorType(NumberType.R4, 2), (in VBuffer <float> src, ref VBuffer <float> dst) => { dst = new VBuffer <float>(2, new float[2]); dst.Values[0] = src.Values[0] + 1f; dst.Values[1] = src.Values[1] + 1f; });
public void TestI_DescribeTransformCode()
{
    using (var env = EnvHelper.NewTestEnvironment())
    {
        var inputs = InputOutput.CreateInputs();
        var data = env.CreateStreamingDataView(inputs);
        var args = new DescribeTransform.Arguments() { columns = new[] { "X" } };
        var transform = new DescribeTransform(env, args, data);

        // Pull every value of column 1 through a cursor.
        var collected = new List<int>();
        using (var cursor = transform.GetRowCursor(i => true))
        {
            var getter = cursor.GetGetter<int>(1);
            while (cursor.MoveNext())
            {
                int current = 0;
                getter(ref current);
                collected.Add((int)current);
            }
        }
        Assert.AreEqual(collected.Count, 4);
    }
}
static void TrainPrePostProcessTrainer(string modelName, bool checkError, int threads, bool addpre)
{
    var methodName = string.Format("{0}-{1}-T{2}", System.Reflection.MethodBase.GetCurrentMethod().Name, modelName, threads);
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var trainFile = FileHelper.GetOutputFile("iris_train.idv", methodName);
    var testFile = FileHelper.GetOutputFile("iris_test.idv", methodName);
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    using (var env = EnvHelper.NewTestEnvironment(conc: threads == 1 ? 1 : 0))
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        // We shuffle because Iris is order by label.
        var pipeline = env.CreateTransform("shuffle{force=+}", loader);
        pipeline = env.CreateTransform("concat{col=Features:Slength,Swidth}", pipeline);
        var roles = env.CreateExamples(pipeline, "Features", "Label");

        // "___" is a placeholder substituted by the requested model name.
        string pred = addpre
            ? "PrePost{pre=poly{col=Features} p=___ pret=Take{n=80}}"
            : "PrePost{p=___ pret=Take{n=80}}";
        pred = pred.Replace("___", modelName);
        var trainer = env.CreateTrainer(pred);
        using (var ch = env.Start("Train"))
        {
            var predictor = trainer.Train(env, ch, roles);
            TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, predictor, roles, outData, outData2,
                                                        PredictionKind.MultiClassClassification, checkError, ratio: 0.15f);
        }
    }
}
public void TestI_ScalerTransformSerialize()
{
    using (var host = EnvHelper.NewTestEnvironment())
    {
        var samples = new[]
        {
            new ExampleA() { X = new float[] { 1, 10, 100 } },
            new ExampleA() { X = new float[] { 2, 3, 5 } }
        };
        IDataView loader = host.CreateStreamingDataView(samples);
        var data = host.CreateTransform("Scaler{col=X}", loader);
        (data as ITrainableTransform).Estimate();

        // We create a specific folder in build/UnitTest which will contain the output.
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
        var outData = FileHelper.GetOutputFile("outData.txt", methodName);
        var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

        var rowCount = DataViewUtils.ComputeRowCount(data);
        if (rowCount < 1)
            throw new Exception("empty view");

        // This function serializes the output data twice, once before saving the pipeline, once after loading the pipeline.
        // It checks it gives the same result.
        TestTransformHelper.SerializationTestTransform(host, outModelFilePath, data, loader, outData, outData2);
    }
}
public void TestDataViewCacheDataFrameSerialization()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var outputDataFilePath = FileHelper.GetOutputFile("outputDataFilePath.txt", methodName);
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment();
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        var cached = env.CreateTransform("cachedf", loader);
        StreamHelper.SaveModel(env, cached, outModelFilePath);
        using (var fs = File.OpenRead(outModelFilePath))
        {
            // Reload the serialized transforms and dump them back as text.
            var deserializedData = ModelFileUtils.LoadTransforms(env, loader, fs);
            var saver = env.CreateSaver("Text");
            using (var fs2 = File.Create(outputDataFilePath))
                saver.SaveData(fs2, deserializedData,
                               StreamHelper.GetColumnsIndex(deserializedData.Schema,
                                                            new[] { "Label", "Slength", "Swidth", "Plength", "Pwidth" }));
        }
    }
}
public static void TrainkNNMultiClassification(int k, NearestNeighborsWeights weight, int threads, float ratio = 0.2f, string distance = "L2")
{
    var methodName = string.Format("{0}-k{1}-W{2}-T{3}-D{4}", System.Reflection.MethodBase.GetCurrentMethod().Name,
                                   k, weight, threads, distance);
    var dataFilePath = FileHelper.GetTestFile("iris.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    // k == 1 forces a single-threaded environment.
    var env = k == 1 ? EnvHelper.NewTestEnvironment(conc: 1) : EnvHelper.NewTestEnvironment();
    using (env)
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        var concat = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
        var roles = env.CreateExamples(concat, "Features", "Label");
        var modelDef = string.Format("knnmc{{k={0} weighting={1} nt={2} distance={3}}}", k,
                                     weight == NearestNeighborsWeights.distance ? "distance" : "uniform",
                                     threads, distance);
        var trainer = env.CreateTrainer(modelDef);
        using (var ch = env.Start("test"))
        {
            var pred = trainer.Train(env, ch, roles);
            TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, pred, roles, outData, outData2,
                                                        PredictionKind.MultiClassClassification, true, ratio: ratio);
        }
    }
}
public void TestI_DescribeTransformCode()
{
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment();
    {
        var inputs = InputOutput.CreateInputs();
        var data = DataViewConstructionUtils.CreateFromEnumerable(env, inputs);
        var args = new DescribeTransform.Arguments() { columns = new[] { "X" } };
        var transform = new DescribeTransform(env, args, data);

        // Pull every value of column 1 through a cursor.
        var collected = new List<int>();
        using (var cursor = transform.GetRowCursor(transform.Schema))
        {
            var getter = cursor.GetGetter<int>(SchemaHelper._dc(1, cursor));
            while (cursor.MoveNext())
            {
                int current = 0;
                getter(ref current);
                collected.Add((int)current);
            }
        }
        Assert.AreEqual(collected.Count, 4);
    }
}
public static void TrainMultiToRankerPredictorSparse(bool singleColumn, bool checkError)
{
    var methodName = string.Format("{0}-{1}-V{2}", System.Reflection.MethodBase.GetCurrentMethod().Name,
                                   "lr", singleColumn ? "C" : "Vec");
    var trainFile = FileHelper.GetTestFile("Train-28x28_small.txt");
    var testFile = FileHelper.GetTestFile("Test-28x28_small.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment();
    {
        var loader = env.CreateLoader("Text", new MultiFileSource(trainFile));
        var roles = env.CreateExamples(loader, "Features", "Label");
        var iova = string.Format("iovark{{p=ftrank sc={0}}}", singleColumn ? "+" : "-");
        // NOTE(review): this test-file loader is created but never passed anywhere — confirm intended.
        loader = env.CreateLoader("Text", new MultiFileSource(testFile));
        var trainer = env.CreateTrainer(iova);
        using (var ch = env.Start("train"))
        {
            var predictor = trainer.Train(env, ch, roles);
            TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, predictor, roles, outData, outData2,
                                                        PredictionKind.MulticlassClassification, checkError, ratio: 0.1f);
        }
    }
}
static void TrainMultiToBinaryPredictorSparse(bool singleColumn, bool checkError)
{
    var methodName = string.Format("{0}-{1}-V{2}", System.Reflection.MethodBase.GetCurrentMethod().Name,
                                   "lr", singleColumn ? "C" : "Vec");
    var trainFile = FileHelper.GetTestFile("Train-28x28_small.txt");
    var testFile = FileHelper.GetTestFile("Test-28x28_small.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment(conc: 1);
    {
        var loader = env.CreateLoader("Text{col=Label:R4:0 col=Features:R4:1-784}", new MultiFileSource(trainFile));
        var roles = env.CreateExamples(loader, "Features", "Label");
        var df = DataFrameIO.ReadView(roles.Data);
        Assert.IsTrue(df.Shape[0] > 0);
        var iova = string.Format("iova{{p=lr sc={0} nt=1}}", singleColumn ? "+" : "-");
        // NOTE(review): this test-file loader is created but never passed anywhere — confirm intended.
        loader = env.CreateLoader("Text{col=Label:R4:0 col=Features:R4:1-784}", new MultiFileSource(testFile));
        var trainer = env.CreateTrainer(iova);
        using (var ch = env.Start("train"))
        {
            var predictor = trainer.Train(env, ch, roles);
            TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, predictor, roles, outData, outData2,
                                                        PredictionKind.MulticlassClassification, checkError, ratio: 0.1f);
        }
    }
}
static void TrainMultiToRankerPredictorDense(string modelName, int threads, bool checkError, bool singleColumn, bool shift, bool useUint)
{
    var methodName = string.Format("{0}-{1}-V{2}-T{3}-S{4}", System.Reflection.MethodBase.GetCurrentMethod().Name,
                                   modelName, singleColumn ? "C" : "Vec", threads, shift ? "shift" : "std");
    var dataFilePath = shift ? FileHelper.GetTestFile("mc_iris_shift.txt") : FileHelper.GetTestFile("mc_iris.txt");
    var trainFile = FileHelper.GetOutputFile("iris_train.idv", methodName);
    var testFile = FileHelper.GetOutputFile("iris_test.idv", methodName);
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment(conc: threads == 1 ? 1 : 0);
    {
        // Labels may be loaded either as unsigned keys or as floats.
        string labelType = useUint ? "U4[0-2]" : "R4";
        string loadSettings = string.Format(
            "Text{{col=Label:{0}:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}}",
            labelType);
        var loader = env.CreateLoader(loadSettings, new MultiFileSource(dataFilePath));
        var concat = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
        var roles = env.CreateExamples(concat, "Features", "Label");
        string modelDef = threads <= 0 ? modelName : string.Format("{0}{{t={1}}}", modelName, threads);
        string additionnal = modelName.Contains("xgbrk") ? " u4=+" : "";
        string iova = string.Format("iovark{{p={0} sc={1}{2}}}", modelDef, singleColumn ? "+" : "-", additionnal);
        var trainer = env.CreateTrainer(iova);
        using (var ch = env.Start("train"))
        {
            var predictor = trainer.Train(env, ch, roles);
            TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, predictor, roles, outData, outData2,
                                                        PredictionKind.MulticlassClassification, checkError, ratio: 0.1f);
        }
    }
}
public void TestI_PolynomialTransformSerialize()
{
    using (var host = EnvHelper.NewTestEnvironment())
    {
        var samples = new[]
        {
            new ExampleA() { X = new float[] { 1, 10, 100 } },
            new ExampleA() { X = new float[] { 2, 3, 5 } }
        };
        IDataView loader = host.CreateStreamingDataView(samples);
        var data = host.CreateTransform("poly{col=poly:X d=3}", loader);

        // We create a specific folder in build/UnitTest which will contain the output.
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
        var outData = FileHelper.GetOutputFile("outData.txt", methodName);
        var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

        // This function serializes the output data twice, once before saving the pipeline, once after loading the pipeline.
        // It checks it gives the same result.
        TestTransformHelper.SerializationTestTransform(host, outModelFilePath, data, loader, outData, outData2);
    }
}
public void TestI_ULabelToR4LabelTransform()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("iris_binary.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var loader = env.CreateLoader(
            "Text{col=LabelText:TX:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        var concat = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
        // Text label -> key label -> float label.
        var labelTx = env.CreateTransform("TermTransform{col=LabelU4:LabelText}", concat);
        var labelR4 = env.CreateTransform("U2R4{col=Label:LabelU4}", labelTx);
        var roles = env.CreateExamples(labelR4, "Features", "Label");
        var trainer = env.CreateTrainer("lr");
        using (var ch = env.Start("test"))
        {
            var pred = trainer.Train(env, ch, roles);
            TestTrainerHelper.FinalizeSerializationTest(env, outModelFilePath, pred, roles, outData, outData2,
                                                        trainer.Trainer.PredictionKind, true, ratio: 0.8f);
        }
    }
}
public void TestDataViewCacheDataFrameSerializationCacheFile()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var outputDataFilePath = FileHelper.GetOutputFile("outputDataFilePath.txt", methodName);
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var cacheFile = FileHelper.GetOutputFile("cacheFile.idv", methodName);
    using (var env = EnvHelper.NewTestEnvironment())
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        var cached = env.CreateTransform(string.Format("cachedf{{r=+ df=- f={0}}}", cacheFile), loader);
        StreamHelper.SaveModel(env, cached, outModelFilePath);
        using (var fs = File.OpenRead(outModelFilePath))
        {
            var deserializedData = env.LoadTransforms(fs, loader);
            var saver = env.CreateSaver("Text");
            using (var fs2 = File.Create(outputDataFilePath))
                saver.SaveData(fs2, deserializedData,
                               StreamHelper.GetColumnsIndex(deserializedData.Schema,
                                                            new[] { "Label", "Slength", "Swidth", "Plength", "Pwidth" }));
        }
        // The transform is expected to have materialized its cache on disk.
        if (!File.Exists(cacheFile))
            throw new FileNotFoundException(cacheFile);
    }
}
private void ScikitAPI_SimpleTransform_Load(bool removeFirstTransform)
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var output = FileHelper.GetOutputFile($"model{removeFirstTransform}.zip", methodName);
    var inputs = new[]
    {
        new ExampleA() { X = new float[] { 1, 10, 100 } },
        new ExampleA() { X = new float[] { 2, 3, 5 } }
    };
    var inputs2 = new[]
    {
        new ExampleA() { X = new float[] { -1, -10, -100 } },
        new ExampleA() { X = new float[] { -2, -3, -5 } }
    };
    string expected = null;

    // First pass: train, transform, check the output, then save the pipeline.
    using (var host = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var data = host.CreateStreamingDataView(inputs);
        using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, host: host))
        {
            var predictor = pipe.Train(data);
            Assert.IsTrue(predictor != null);
            var data2 = host.CreateStreamingDataView(inputs2);
            var predictions = pipe.Transform(data2);
            var df = DataFrameIO.ReadView(predictions);
            Assert.AreEqual(df.Shape, new Tuple<int, int>(2, 9));
            var flat = df.ToString().Replace("\n", ";");
            expected = flat;
            Assert.AreEqual(flat, "X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8;-1,-10,-100,1,10,100,100,1000,10000;-2,-3,-5,4,6,10,9,15,25");
            pipe.Save(output, removeFirstTransform);
        }
    }

    // Second pass: reload the pipeline and verify it produces the same output.
    using (var host = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var data2 = host.CreateStreamingDataView(inputs2);
        using (var pipe2 = new ScikitPipeline(output, host))
        {
            var predictions = pipe2.Transform(data2);
            var df = DataFrameIO.ReadView(predictions);
            Assert.AreEqual(df.Shape, new Tuple<int, int>(2, 9));
            var flat = df.ToString().Replace("\n", ";");
            Assert.AreEqual(expected, flat);
        }
    }
}
public void TestLambdaColumnPassThroughTransform() { /*using (*/ var host = EnvHelper.NewTestEnvironment(); { var inputs = new InputOutputU[] { new InputOutputU() { X = new float[] { 0.1f, 1.1f }, Y = 0 }, new InputOutputU() { X = new float[] { 0.2f, 1.2f }, Y = 1 }, new InputOutputU() { X = new float[] { 0.3f, 1.3f }, Y = 2 } }; var data = DataViewConstructionUtils.CreateFromEnumerable(host, inputs); var lambdaView = LambdaColumnHelper.Create <VBuffer <float>, VBuffer <float> >(host, "Lambda", data, "X", "XX", new VectorDataViewType(NumberDataViewType.Single, 2), new VectorDataViewType(NumberDataViewType.Single, 2), (in VBuffer <float> src, ref VBuffer <float> dst) => { dst = new VBuffer <float>(2, new float[2]); dst.Values[0] = src.Values[0] + 1f; dst.Values[1] = src.Values[1] + 1f; });
// Trains a sentiment classifier on the wikipedia-detox sample using the legacy ML.NET API:
// text loader (Label:bool, SentimentText:text) -> text featurization (lowercase, drop
// diacritics and punctuation, predefined stop-word removal, char 3-grams, word 1/2-grams,
// L2 normalization when 'normalize' is true) -> SDCA binary trainer (single thread) over a
// cached view. Returns a scorer built on the featurized (non-cached) data.
// NOTE(review): 'normalize' is hard-coded to true, so the L2 branch is always taken here.
private IDataScorerTransform _TrainSentiment() { bool normalize = true; var args = new TextLoader.Arguments() { Separator = "tab", HasHeader = true, Column = new[] { new TextLoader.Column("Label", DataKind.BL, 0), new TextLoader.Column("SentimentText", DataKind.Text, 1) } }; var args2 = new TextFeaturizingEstimator.Arguments() { Column = new TextFeaturizingEstimator.Column { Name = "Features", Source = new[] { "SentimentText" } }, KeepDiacritics = false, KeepPunctuations = false, TextCase = TextNormalizingEstimator.CaseNormalizationMode.Lower, OutputTokens = true, UsePredefinedStopWordRemover = true, VectorNormalizer = normalize ? TextFeaturizingEstimator.TextNormKind.L2 : TextFeaturizingEstimator.TextNormKind.None, CharFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false }, WordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 2, AllLengths = true }, }; var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv"); using (var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1)) { // Pipeline var loader = new TextLoader(env, args).Read(new MultiFileSource(trainFilename)); var trans = TextFeaturizingEstimator.Create(env, args2, loader); // Train var trainer = new SdcaBinaryTrainer(env, new SdcaBinaryTrainer.Arguments { NumThreads = 1 }); var cached = new CacheDataView(env, trans, prefetch: null); var predictor = trainer.Fit(cached); var scoreRoles = new RoleMappedData(trans, label: "Label", feature: "Features"); var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features"); return(ScoreUtils.GetScorer(predictor.Model, scoreRoles, env, trainRoles.Schema)); } }
public PredictionEngineExample(string modelName)
{
    _env = EnvHelper.NewTestEnvironment();
    // NOTE(review): the model stream is never explicitly disposed — confirm LoadFromLegacy takes ownership.
    var chain = TransformerChain.LoadFromLegacy(_env, File.OpenRead(modelName));
    var catalog = new ModelOperationsCatalog(_env);
    _predictor = catalog.CreatePredictionEngine<FloatVectorInput, FloatOutput>(chain);
}
public void TestDBScanTransform()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("three_classes_2d.txt");
    var outputDataFilePath = FileHelper.GetOutputFile("outputDataFilePath.txt", methodName);
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        //var loader = env.CreateLoader("text{col=RowId:I4:0 col=Features:R4:1-2 header=+}", new MultiFileSource(dataFilePath));
        var loader = TextLoader.Create(env, new TextLoader.Arguments()
        {
            HasHeader = true,
            Column = new[]
            {
                TextLoader.Column.Parse("RowId:R4:0"),
                TextLoader.Column.Parse("Features:R4:1-2")
            }
        }, new MultiFileSource(dataFilePath));
        var dbscan = env.CreateTransform("DBScan{col=Features}", loader);

        // The transform must expose its clustering columns in the schema.
        string schema = SchemaHelper.ToString(dbscan.Schema);
        if (string.IsNullOrEmpty(schema))
            throw new Exception("Schema is null.");
        if (!schema.Contains("Cluster"))
            throw new Exception("Schema does not contain Cluster.");
        if (!schema.Contains("Score"))
            throw new Exception("Schema does not contain Score.");

        StreamHelper.SaveModel(env, dbscan, outModelFilePath);
        var saver = env.CreateSaver("Text{header=- schema=-}");
        using (var fs2 = File.Create(outputDataFilePath))
            saver.SaveData(fs2, TestTransformHelper.AddFlatteningTransform(env, dbscan),
                           StreamHelper.GetColumnsIndex(dbscan.Schema, new[] { "Features", "ClusterId", "Score" }));

        // Checking the values.
        var rows = File.ReadAllLines(outputDataFilePath).Select(c => c.Split('\t')).Where(c => c.Length == 4);
        if (!rows.Any())
            throw new Exception(string.Format("The output file is empty or not containing three columns '{0}'", outputDataFilePath));
        var clusters = rows.Select(c => c[1]).Distinct();
        if (clusters.Count() <= 1)
            throw new Exception("Only one cluster, this is unexpected.");

        // Serialization.
        var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
        var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
        TestTransformHelper.SerializationTestTransform(env, outModelFilePath, dbscan, loader, outData, outData2);
    }
}
public void TestScikitAPI_SimplePredictor()
{
    var trainSamples = new[]
    {
        new ExampleA() { X = new float[] { 1, 10, 100 } },
        new ExampleA() { X = new float[] { 2, 3, 5 } },
        new ExampleA() { X = new float[] { 2, 4, 5 } },
        new ExampleA() { X = new float[] { 2, 4, 7 } },
    };
    var testSamples = new[]
    {
        new ExampleA() { X = new float[] { -1, -10, -100 } },
        new ExampleA() { X = new float[] { -2, -3, -5 } },
        new ExampleA() { X = new float[] { 3, 4, 5 } },
        new ExampleA() { X = new float[] { 3, 4, 7 } },
    };
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var host = EnvHelper.NewTestEnvironment(conc: 1);
    {
        var data = DataViewConstructionUtils.CreateFromEnumerable(host, trainSamples);
        using (var pipe = new ScikitPipeline(new[] { "poly{col=X}" }, "km{k=2}", host))
        {
            var predictor = pipe.Train(data, feature: "X");
            Assert.IsTrue(predictor != null);
            var data2 = new StreamingDataFrame(DataViewConstructionUtils.CreateFromEnumerable(host, testSamples));
            var predictions = pipe.Predict(data2);
            var df = DataFrameIO.ReadView(predictions);
            // 4 rows; 9 polynomial features + PredictedLabel + 2 score columns = 12.
            Assert.AreEqual(df.Shape, new Tuple<int, int>(4, 12));
            var flat = df.ToString().Replace("\n", ";");
            Assert.IsTrue(flat.StartsWith("X.0,X.1,X.2,X.3,X.4,X.5,X.6,X.7,X.8,PredictedLabel,Score.0,Score.1;-1,-10,-100,1,10,100,100,1000,10000"));
        }
    }
}
public void TestSelectTagContactViewTransform()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var firstData = FileHelper.GetOutputFile("first.idv", methodName);
    var outData = FileHelper.GetOutputFile("outData.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    using (var env = EnvHelper.NewTestEnvironment())
    {
        var samples = new[]
        {
            new ExampleA() { X = new float[] { 0, 1, 4 } },
            new ExampleA() { X = new float[] { 2, 3, 7 } }
        };

        // Create IDV
        IDataView loader = env.CreateStreamingDataView(samples);
        var saver = ComponentCreation.CreateSaver(env, "binary");
        using (var ch = env.Start("save"))
        {
            using (var fs0 = env.CreateOutputFile(firstData))
                DataSaverUtils.SaveDataView(ch, saver, loader, fs0, true);

            // Create parallel pipeline
            loader = env.CreateStreamingDataView(samples);
            var data = env.CreateTransform("Scaler{col=X1:X}", loader);
            data = env.CreateTransform(string.Format("selecttag{{t=first s=second f={0}}}", firstData), data);
            data = env.CreateTransform("Scaler{col=X1:X}", data);
            var merged = env.CreateTransform("append{t=first}", data);

            // Save the outcome
            var text = env.CreateSaver("Text");
            var columnIndices = new int[merged.Schema.Count];
            for (int i = 0; i < columnIndices.Length; ++i)
                columnIndices[i] = i;
            using (var fs2 = File.Create(outData))
                text.SaveData(fs2, merged, columnIndices);

            // Final checking
            var savedLines = File.ReadAllLines(outData);
            if (!savedLines.Any())
                throw new Exception("Empty file.");
            if (savedLines.Length != 9)
                throw new Exception("Some lines are missing.");
        }
    }
}
// Trains a sentiment classifier on the wikipedia-detox sample using the newer ML.NET API:
// text loader (Label:bool, SentimentText:string) -> text featurization (lowercase, drop
// diacritics and punctuation, char 3-grams, word 1/2-grams, L2 norm when 'normalize' is
// true) -> SDCA logistic regression over a cached view. Returns a scorer built on the
// featurized (non-cached) data.
// NOTE(review): 'normalize' is hard-coded to true, so the L2 branch is always taken, and
// the environment is not wrapped in using() (see the commented-out "using (") — confirm intended.
private static IDataScorerTransform _TrainSentiment() { bool normalize = true; var args = new TextLoader.Options() { Separators = new[] { '\t' }, HasHeader = true, Columns = new[] { new TextLoader.Column("Label", DataKind.Boolean, 0), new TextLoader.Column("SentimentText", DataKind.String, 1) } }; var args2 = new TextFeaturizingEstimator.Options() { KeepDiacritics = false, KeepPunctuations = false, CaseMode = TextNormalizingEstimator.CaseMode.Lower, OutputTokensColumnName = "tokens", Norm = normalize ? TextFeaturizingEstimator.NormFunction.L2 : TextFeaturizingEstimator.NormFunction.None, CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false }, WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true }, }; var trainFilename = FileHelper.GetTestFile("wikipedia-detox-250-line-data.tsv"); /*using (*/ var env = EnvHelper.NewTestEnvironment(seed: 1, conc: 1); { // Pipeline var loader = new TextLoader(env, args).Load(new MultiFileSource(trainFilename)); var trans = TextFeaturizingEstimator.Create(env, args2, loader); // Train var trainer = new SdcaLogisticRegressionBinaryTrainer(env, new SdcaLogisticRegressionBinaryTrainer.Options { LabelColumnName = "Label", FeatureColumnName = "Features" }); var cached = new Microsoft.ML.Data.CacheDataView(env, trans, prefetch: null); var predictor = trainer.Fit(cached); var trainRoles = new RoleMappedData(cached, label: "Label", feature: "Features"); var scoreRoles = new RoleMappedData(trans, label: "Label", feature: "Features"); return(ScoreUtils.GetScorer(predictor.Model, scoreRoles, env, trainRoles.Schema)); } }
public void TrainTestPipelinePredictTransform()
{
    // Builds a tagged train/test split over iris, trains a multi-class logistic
    // regression (mlr) on the train partition, scores the test partition through
    // TagPredict, then checks the scored output and the serialization round-trip.
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);
    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        var pipe = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
        pipe = env.CreateTransform("SplitTrainTest{col=base tag=train tag=test}", pipe);
        pipe = env.CreateTransform("SelectTag{tag=unused selectTag=train}", pipe);
        pipe = env.CreateTransform(string.Format("TagTrainScore{{tag=trainP out={0} tr=mlr}}", outModelFilePath), pipe);
        pipe = env.CreateTransform("SelectTag{tag=scoredTrain selectTag=test}", pipe);
        pipe = env.CreateTransform("TagPredict{in=trainP}", pipe);

        // The schema exposed by the transform must match the one seen through a cursor.
        string schema = SchemaHelper.ToString(pipe.Schema);
        string schema2;
        // FIX: dispose the cursor (it was previously leaked).
        using (var cursor = pipe.GetRowCursor(i => true))
            schema2 = SchemaHelper.ToString(cursor.Schema);
        if (schema != schema2)
            throw new Exception("Schema mismatch.");

        long count = DataViewUtils.ComputeRowCount(pipe);
        if (count != 49)
            throw new Exception(string.Format("Unexpected number of rows {0}", count));

        // Checks the outputs.
        // FIX: removed a dead local (an array of column names was built but never used).
        var saver = env.CreateSaver("Text");
        using (var fs2 = File.Create(outData))
            saver.SaveData(fs2, pipe, StreamHelper.GetColumnsIndex(pipe.Schema));
        var lines = File.ReadAllLines(outData);
        if (lines.Length < 40)
            throw new Exception("Something is missing:" + string.Join("\n", lines));
        if (lines.Length > 70)
            throw new Exception("Too much data:" + string.Join("\n", lines));

        TestTransformHelper.SerializationTestTransform(env, outModelFilePath, pipe, loader, outData, outData2);
    }
}
public PredictionEngineExample(string modelName)
{
    _env = EnvHelper.NewTestEnvironment();
    // An empty view is only created to supply the input schema.
    var emptyView = DataViewConstructionUtils.CreateFromEnumerable(_env, new FloatVectorInput[] { });
    // NOTE(review): the model stream is never explicitly disposed — confirm LoadPipeWithPredictor takes ownership.
    var pipe = DataViewConstructionUtils.LoadPipeWithPredictor(_env, File.OpenRead(modelName),
                                                               new EmptyDataView(_env, emptyView.Schema));
    var wrapper = new TransformWrapper(_env, pipe);
    _predictor = _env.CreatePredictionEngine<FloatVectorInput, FloatOutput>(wrapper);
}
// Loads a float prediction engine for the given model file, exposing the "Probability" output.
// NOTE(review): the environment is disposed as soon as the engine is constructed —
// assumes ValueMapperPredictionEngineFloat does not keep using it afterwards; confirm.
public void Init(string modelName)
{
    try
    {
        using (var env = EnvHelper.NewTestEnvironment())
            engine = new ValueMapperPredictionEngineFloat(env, modelName, "Probability");
    }
    catch (Exception e)
    {
        // Any failure is wrapped with the original exception kept as the inner exception.
        throw new Exception("erreur", e);
    }
}
public void TestDataSplitTrainTestSerialization()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var cacheFile = FileHelper.GetOutputFile("outputDataFilePath.idv", methodName);
    var trainFile = FileHelper.GetOutputFile("iris_train.idv", methodName);
    var testFile = FileHelper.GetOutputFile("iris_test.idv", methodName);
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData.txt", methodName);
    // NOTE(review): the environment is not wrapped in using() here — presumably intentional; confirm.
    var env = EnvHelper.NewTestEnvironment();
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));
        var args = new SplitTrainTestTransform.Arguments
        {
            newColumn = "Part",
            cacheFile = cacheFile,
            filename = new string[] { trainFile, testFile },
            reuse = true
        };
        var transformedData = new SplitTrainTestTransform(env, args, loader);
        StreamHelper.SaveModel(env, transformedData, outModelFilePath);
        using (var fs = File.OpenRead(outModelFilePath))
        {
            var deserializedData = ModelFileUtils.LoadTransforms(env, loader, fs);
            var saver = env.CreateSaver("Text");
            var columnIndices = new int[deserializedData.Schema.Count];
            for (int i = 0; i < columnIndices.Length; ++i)
                columnIndices[i] = i;
            using (var fs2 = File.Create(outData))
                saver.SaveData(fs2, deserializedData, columnIndices);
        }
        // The split must have materialized its cache and both partition files.
        if (!File.Exists(cacheFile))
            throw new FileNotFoundException(cacheFile);
        if (!File.Exists(trainFile))
            throw new FileNotFoundException(trainFile);
        if (!File.Exists(testFile))
            throw new FileNotFoundException(testFile);
    }
}