/// <summary>
/// Trains a Scaler transform on a tiny in-memory dataset, then checks that the transformed
/// data is the same before saving the pipeline and after loading it back.
/// </summary>
/// <exception cref="Exception">
/// Thrown when the created transform is not trainable or when the transformed view has no rows.
/// </exception>
public void TestI_ScalerTransformSerialize()
        {
            using (var host = EnvHelper.NewTestEnvironment())
            {
                var inputs = new[] {
                    new ExampleA()
                    {
                        X = new float[] { 1, 10, 100 }
                    },
                    new ExampleA()
                    {
                        X = new float[] { 2, 3, 5 }
                    }
                };

                IDataView loader = host.CreateStreamingDataView(inputs);
                var       data   = host.CreateTransform("Scaler{col=X}", loader);

                // Fail with an explicit message rather than a NullReferenceException
                // when the transform does not expose the training interface.
                var trainable = data as ITrainableTransform;
                if (trainable == null)
                {
                    throw new Exception("Transform 'Scaler{col=X}' does not implement ITrainableTransform.");
                }
                trainable.Estimate();

                // We create a specific folder in build/UnitTest which will contain the output.
                var methodName       = System.Reflection.MethodBase.GetCurrentMethod().Name;
                var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
                var outData          = FileHelper.GetOutputFile("outData.txt", methodName);
                var outData2         = FileHelper.GetOutputFile("outData2.txt", methodName);

                // An empty view would make the serialization comparison below meaningless.
                var nb = DataViewUtils.ComputeRowCount(data);
                if (nb < 1)
                {
                    throw new Exception("empty view");
                }

                // This function serializes the output data twice, once before saving the pipeline, once after loading the pipeline.
                // It checks it gives the same result.
                TestTransformHelper.SerializationTestTransform(host, outModelFilePath, data, loader, outData, outData2);
            }
        }
예제 #2
0
        /// <summary>
        /// Fits an SDCA regression on the white wine quality file through the statically-typed
        /// pipeline and checks that the trained predictor is handed back through <c>onFit</c>
        /// with one weight per feature. The output schema is then printed to the console.
        /// </summary>
        public void SdcaRegression()
        {
            var env    = new TlcEnvironment(seed: 0);
            var source = new MultiFileSource(GetDataPath("external", "winequality-white.csv"));

            // Column 11 is loaded as the label and columns 0 to 10 as the feature vector.
            var reader = TextLoader.CreateReader(
                env,
                ctx => (label: ctx.LoadFloat(11), features: ctx.LoadFloat(0, 10)),
                separator: ';',
                hasHeader: true);

            LinearRegressionPredictor trained = null;

            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        score: row.label.PredictSdcaRegression(row.features, maxIterations: 2, onFit: fitted => trained = fitted)));

            var pipeline = reader.Append(estimator);

            // The callback must only fire once the pipeline has been fitted.
            Assert.Null(trained);
            var model = pipeline.Fit(source);
            Assert.NotNull(trained);

            // 11 input features, so we ought to have 11 weights.
            Assert.Equal(11, trained.Weights2.Count);

            var scored      = model.Read(source);
            var dynamicView = scored.AsDynamic;

            // The row count is computed as in the original test; its value is not checked.
            DataViewUtils.ComputeRowCount(dynamicView);

            // Just output some data on the schema for fun.
            var schema      = dynamicView.Schema;
            int columnIndex = 0;
            while (columnIndex < schema.ColumnCount)
            {
                var columnName = schema.GetColumnName(columnIndex);
                var columnType = schema.GetColumnType(columnIndex);
                Console.WriteLine($"{columnName}, {columnType}");
                ++columnIndex;
            }
        }
예제 #3
0
        /// <summary>
        /// Debugging helper: checks that the feature vectors of <paramref name="data"/> and
        /// <paramref name="td"/> have consistent dimensions, that both views contain rows and,
        /// when <paramref name="singleColumn"/> is false, that no label vector read from
        /// <paramref name="choose"/> is larger than <paramref name="count"/>.
        /// </summary>
        /// <param name="viewI">First view; it is cached in memory and must contain at least one row.</param>
        /// <param name="choose">Second view; it is cached in memory and must contain at least one row.</param>
        /// <param name="data">Role-mapped data providing the first feature type and the label column name.</param>
        /// <param name="td">Role-mapped data providing the second feature type.</param>
        /// <param name="count">Maximum accepted size of a label vector.</param>
        /// <param name="singleColumn">
        /// When true, the second feature vector must have exactly one more dimension than the first one.
        /// When false, it must have more than one extra dimension and the label vectors are scanned.
        /// </param>
        protected void DebugChecking1(IDataView viewI, IDataView choose, RoleMappedData data,
                                      RoleMappedData td, int count, bool singleColumn)
        {
            var cache1 = new MemoryDataView(Host, viewI, numThreads: 1);
            var cache2 = new MemoryDataView(Host, choose, numThreads: 1);
            var t1     = data.Schema.Feature.Type.AsVector();
            var t2     = td.Schema.Feature.Type.AsVector();

            if (t1.DimCount() != 1)
            {
                throw Host.Except("Expect only 1 dimension.");
            }
            if (t2.DimCount() != 1)
            {
                throw Host.Except("Expect only 1 dimension.");
            }
            if (singleColumn && t1.GetDim(0) != t2.GetDim(0) - 1)
            {
                throw Host.Except("Different dimension {0} != {1}-1", t1.GetDim(0), t2.GetDim(0));
            }
            if (!singleColumn && t1.GetDim(0) >= t2.GetDim(0) - 1)
            {
                // The message mirrors the condition which is actually tested (>=, not !=).
                throw Host.Except("Unexpected dimension {0} >= {1}-1", t1.GetDim(0), t2.GetDim(0));
            }

            var nb1 = DataViewUtils.ComputeRowCount(cache1);
            if (nb1 == 0)
            {
                throw Host.Except("empty view");
            }

            var nb2 = DataViewUtils.ComputeRowCount(cache2);
            if (nb2 == 0)
            {
                throw Host.Except("empty view");
            }

            if (singleColumn)
            {
                return;
            }

            using (var cursor = cache2.GetRowCursor(i => true))
            {
                int index;
                if (!cursor.Schema.TryGetColumnIndex(data.Schema.Label.Name, out index))
                {
                    // The schema is rendered only when the error is actually raised.
                    throw Host.Except("Unable to find '{0}' in\n{1}", data.Schema.Label.Name,
                                      SchemaHelper.ToString(cursor.Schema));
                }

                var getter = cursor.GetGetter <VBuffer <float> >(index);
                var buf    = new VBuffer <float>();
                while (cursor.MoveNext())
                {
                    getter(ref buf);
                    if (buf.Count > count || buf.Length > count)
                    {
                        throw Host.Except("Mismatch: label vector has Count={0} and Length={1} but at most {2} is expected.",
                                          buf.Count, buf.Length, count);
                    }
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Chains the tag transforms (SplitTrainTest, SelectTag, TagTrainScore, TagPredict) on the
        /// iris file, then validates the resulting schema, the number of rows, the size of the
        /// saved text output and finally the serialization of the whole pipeline.
        /// </summary>
        /// <exception cref="Exception">Thrown when one of the checks on the pipeline output fails.</exception>
        public void TrainTestPipelinePredictTransform()
        {
            var methodName       = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var dataFilePath     = FileHelper.GetTestFile("mc_iris.txt");
            var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
            var outData          = FileHelper.GetOutputFile("outData1.txt", methodName);
            var outData2         = FileHelper.GetOutputFile("outData2.txt", methodName);

            using (var env = EnvHelper.NewTestEnvironment(conc: 1))
            {
                var loader = env.CreateLoader("Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
                                              new MultiFileSource(dataFilePath));

                var pipe = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
                pipe = env.CreateTransform("SplitTrainTest{col=base tag=train tag=test}", pipe);
                pipe = env.CreateTransform("SelectTag{tag=unused selectTag=train}", pipe);
                pipe = env.CreateTransform(string.Format("TagTrainScore{{tag=trainP out={0} tr=mlr}}", outModelFilePath), pipe);
                pipe = env.CreateTransform("SelectTag{tag=scoredTrain selectTag=test}", pipe);
                pipe = env.CreateTransform("TagPredict{in=trainP}", pipe);

                // The view and its cursor must expose the same schema.
                string schema = SchemaHelper.ToString(pipe.Schema);
                string schema2;
                // The cursor is only needed for its schema: release it as soon as it has been read.
                using (var cursor = pipe.GetRowCursor(i => true))
                {
                    schema2 = SchemaHelper.ToString(cursor.Schema);
                }
                if (schema != schema2)
                {
                    throw new Exception("Schema mismatch.");
                }

                long count = DataViewUtils.ComputeRowCount(pipe);
                if (count != 49)
                {
                    throw new Exception(string.Format("Unexpected number of rows {0}", count));
                }

                // Checks the outputs.
                var saver = env.CreateSaver("Text");
                using (var fs2 = File.Create(outData))
                {
                    saver.SaveData(fs2, pipe, StreamHelper.GetColumnsIndex(pipe.Schema));
                }

                // The saved file is expected to hold between 40 and 70 lines.
                var lines = File.ReadAllLines(outData);
                if (lines.Length < 40)
                {
                    throw new Exception("Something is missing:" + string.Join("\n", lines));
                }
                if (lines.Length > 70)
                {
                    throw new Exception("Too much data:" + string.Join("\n", lines));
                }

                TestTransformHelper.SerializationTestTransform(env, outModelFilePath, pipe, loader, outData, outData2);
            }
        }
예제 #5
0
        /// <summary>
        /// Compares the slot cursor of a transposed view against a naive transposition of the
        /// same column computed from the original row-oriented view.
        /// </summary>
        /// <typeparam name="T">Expected raw item type of the column.</typeparam>
        /// <param name="view">Original view.</param>
        /// <param name="viewCol">Index of the checked column; the same index is used in <paramref name="trans"/>.</param>
        /// <param name="trans">Transposed view under test.</param>
        private static void TransposeCheckHelper <T>(IDataView view, int viewCol, ITransposeDataView trans)
        {
            Assert.NotNull(view);
            Assert.NotNull(trans);

            int transCol = viewCol;
            VectorDataViewType slotType     = trans.GetSlotType(transCol);
            DataViewType       transColType = trans.Schema[transCol].Type;

            // Name and type of the column must be the same in both views.
            Assert.Equal(view.Schema[viewCol].Name, trans.Schema[transCol].Name);
            Assert.Equal(view.Schema[viewCol].Type, transColType);

            string desc = $"Column {transCol} named '{trans.Schema[transCol].Name}'";

            // A slot holds one value per row of the original view.
            Assert.Equal(DataViewUtils.ComputeRowCount(view), slotType.Size);
            Assert.True(typeof(T) == slotType.ItemType.RawType, $"{desc} had wrong type for slot cursor");
            Assert.True(slotType.Size > 0, $"{desc} expected to be known sized vector but is not");

            // A non-vector column counts as a single slot.
            int valueCount    = 1;
            var vectorColType = transColType as VectorDataViewType;
            if (vectorColType != null)
            {
                valueCount = vectorColType.Size;
            }
            Assert.True(0 != valueCount, $"{desc} expected to have fixed size, but does not");

            int rc       = slotType.Size;
            T[] expected = NaiveTranspose <T>(view, viewCol);
            T[] actual   = new T[rc * valueCount];
            Contracts.Assert(actual.Length == expected.Length);

            using (var slotCursor = trans.GetSlotCursor(transCol))
            {
                var         slotGetter = slotCursor.GetGetter <T>();
                VBuffer <T> slot       = default(VBuffer <T>);
                int         written    = 0;
                for (; slotCursor.MoveNext(); written += rc)
                {
                    Assert.True(written < actual.Length, $"{desc} slot cursor went further than it should have");
                    slotGetter(ref slot);
                    Assert.True(rc == slot.Length, $"{desc} slot cursor yielded vector with unexpected length");
                    slot.CopyTo(actual, written);
                }
                Assert.True(valueCount == written / rc, $"{desc} slot cursor yielded fewer than expected values");
            }

            int position = 0;
            while (position < actual.Length)
            {
                Assert.Equal(expected[position], actual[position]);
                ++position;
            }
        }
예제 #6
0
        /// <summary>
        /// Reads one column of a view row by row and returns its values slot-major: the value of
        /// slot <c>s</c> at row <c>r</c> is stored at index <c>s * rowCount + r</c>.
        /// A scalar column is handled as a single slot.
        /// </summary>
        /// <typeparam name="T">Raw item type of the column.</typeparam>
        /// <param name="view">View to read.</param>
        /// <param name="col">Index of the column to transpose.</param>
        /// <returns>The transposed values; entries which are never written keep <c>default(T)</c>.</returns>
        private static T[] NaiveTranspose <T>(IDataView view, int col)
        {
            var colType  = view.Schema[col].Type;
            int rowCount = checked ((int)DataViewUtils.ComputeRowCount(view));
            var vecType  = colType as VectorDataViewType;
            var itemType = vecType?.ItemType ?? colType;

            Assert.Equal(typeof(T), itemType.RawType);
            Assert.NotEqual(0, vecType?.Size);

            int slotCount  = vecType?.Size ?? 1;
            T[] transposed = new T[rowCount * slotCount];

            using (var cursor = view.GetRowCursor(view.Schema[col]))
            {
                if (vecType == null)
                {
                    // Scalar column: the cursor position is directly the index in the output.
                    var scalarGetter = cursor.GetGetter <T>(cursor.Schema[col]);
                    while (cursor.MoveNext())
                    {
                        Assert.True(0 <= cursor.Position && cursor.Position < rowCount);
                        scalarGetter(ref transposed[(int)cursor.Position]);
                    }
                }
                else
                {
                    var         vectorGetter = cursor.GetGetter <VBuffer <T> >(cursor.Schema[col]);
                    VBuffer <T> buffer       = default;
                    for (int row = 0; cursor.MoveNext(); ++row)
                    {
                        Assert.True(0 <= row && row < rowCount && row == cursor.Position);
                        vectorGetter(ref buffer);

                        var  values  = buffer.GetValues();
                        var  indices = buffer.GetIndices();
                        bool dense   = buffer.IsDense;
                        for (int i = 0; i < values.Length; ++i)
                        {
                            // Sparse buffers carry the slot of each explicit value in their indices.
                            int slot = dense ? i : indices[i];
                            transposed[slot * rowCount + row] = values[i];
                        }
                    }
                }
            }
            return transposed;
        }
예제 #7
0
        /// <summary>
        /// Compares the slot cursor of a transposed view against a naive transposition of the
        /// same column computed from the original row-oriented view.
        /// </summary>
        /// <typeparam name="T">Expected raw item type of the column.</typeparam>
        /// <param name="view">Original view.</param>
        /// <param name="viewCol">Index of the checked column; the same index is used in <paramref name="trans"/>.</param>
        /// <param name="trans">Transposed view under test.</param>
        private static void TransposeCheckHelper <T>(IDataView view, int viewCol, ITransposeDataView trans)
        {
            int transCol     = viewCol;
            var slotType     = trans.TransposeSchema.GetSlotType(transCol);
            var transColType = trans.Schema.GetColumnType(transCol);

            Assert.Equal(view.Schema.GetColumnName(viewCol), trans.Schema.GetColumnName(transCol));

            // NOTE(review): the previous comment said Equals could not be used because column type
            // equality is a reference comparison, yet the check relies on Assert.Equal - confirm.
            Assert.Equal(view.Schema.GetColumnType(viewCol), transColType);

            // A slot holds one value per row of the original view.
            Assert.Equal(DataViewUtils.ComputeRowCount(view), (long)slotType.VectorSize);

            string desc = $"Column {transCol} named '{trans.Schema.GetColumnName(transCol)}'";

            Assert.True(typeof(T) == slotType.ItemType.RawType, $"{desc} had wrong type for slot cursor");
            Assert.True(slotType.IsVector, $"{desc} expected to be vector but is not");
            Assert.True(slotType.VectorSize > 0, $"{desc} expected to be known sized vector but is not");

            int valueCount = transColType.ValueCount;
            Assert.True(0 != valueCount, $"{desc} expected to have fixed size, but does not");

            int rc       = slotType.VectorSize;
            T[] expected = NaiveTranspose <T>(view, viewCol);
            T[] actual   = new T[rc * valueCount];
            Contracts.Assert(actual.Length == expected.Length);

            using (var slotCursor = trans.GetSlotCursor(transCol))
            {
                var         slotGetter = slotCursor.GetGetter <T>();
                VBuffer <T> slot       = default(VBuffer <T>);
                int         written    = 0;
                for (; slotCursor.MoveNext(); written += rc)
                {
                    Assert.True(written < actual.Length, $"{desc} slot cursor went further than it should have");
                    slotGetter(ref slot);
                    Assert.True(rc == slot.Length, $"{desc} slot cursor yielded vector with unexpected length");
                    slot.CopyTo(actual, written);
                }
                Assert.True(valueCount == written / rc, $"{desc} slot cursor yielded fewer than expected values");
            }

            int position = 0;
            while (position < actual.Length)
            {
                Assert.Equal(expected[position], actual[position]);
                ++position;
            }
        }
예제 #8
0
        /// <summary>
        /// Fits an SDCA binary classifier with a hinge loss on the breast cancer file through the
        /// statically-typed pipeline and checks that the trained predictor is handed back through
        /// <c>onFit</c> with one weight per feature. The output schema is then printed to the console.
        /// </summary>
        public void SdcaBinaryClassificationNoClaibration()
        {
            var env    = new TlcEnvironment(seed: 0);
            var source = new MultiFileSource(GetDataPath("breast-cancer.txt"));

            // Column 0 is loaded as the label and columns 1 to 9 as the feature vector.
            var reader = TextLoader.CreateReader(
                env,
                ctx => (label: ctx.LoadBool(0), features: ctx.LoadFloat(1, 9)));

            LinearBinaryPredictor trained = null;

            var lossArgs = new HingeLoss.Arguments()
            {
                Margin = 1
            };
            var hinge = new HingeLoss(lossArgs);

            // With a custom loss function we no longer get calibrated predictions.
            var estimator = reader.MakeNewEstimator().Append(
                row => (row.label,
                        preds: row.label.PredictSdcaBinaryClassification(
                            row.features,
                            maxIterations: 2,
                            loss: hinge,
                            onFit: fitted => trained = fitted)));

            var pipeline = reader.Append(estimator);

            // The callback must only fire once the pipeline has been fitted.
            Assert.Null(trained);
            var model = pipeline.Fit(source);
            Assert.NotNull(trained);

            // 9 input features, so we ought to have 9 weights.
            Assert.Equal(9, trained.Weights2.Count);

            var scored      = model.Read(source);
            var dynamicView = scored.AsDynamic;

            // The row count is computed as in the original test; its value is not checked.
            DataViewUtils.ComputeRowCount(dynamicView);

            // Just output some data on the schema for fun.
            var schema      = dynamicView.Schema;
            int columnIndex = 0;
            while (columnIndex < schema.ColumnCount)
            {
                var columnName = schema.GetColumnName(columnIndex);
                var columnType = schema.GetColumnType(columnIndex);
                Console.WriteLine($"{columnName}, {columnType}");
                ++columnIndex;
            }
        }
예제 #9
0
        /// <summary>
        /// Reads one column of a view row by row and returns its values slot-major: the value of
        /// slot <c>s</c> at row <c>r</c> is stored at index <c>s * rowCount + r</c>.
        /// A scalar column is handled as a single slot.
        /// </summary>
        /// <typeparam name="T">Raw item type of the column.</typeparam>
        /// <param name="view">View to read.</param>
        /// <param name="col">Index of the column to transpose.</param>
        /// <returns>The transposed values; entries which are never written keep <c>default(T)</c>.</returns>
        private static T[] NaiveTranspose <T>(IDataView view, int col)
        {
            var colType  = view.Schema.GetColumnType(col);
            int rowCount = checked ((int)DataViewUtils.ComputeRowCount(view));

            Assert.True(colType.ItemType.RawType == typeof(T));
            Assert.True(colType.ValueCount > 0);
            T[] transposed = new T[rowCount * colType.ValueCount];

            // Only the requested column is activated on the cursor.
            using (var cursor = view.GetRowCursor(c => c == col))
            {
                if (!colType.IsVector)
                {
                    // Scalar column: the cursor position is directly the index in the output.
                    var scalarGetter = cursor.GetGetter <T>(col);
                    while (cursor.MoveNext())
                    {
                        Assert.True(0 <= cursor.Position && cursor.Position < rowCount);
                        scalarGetter(ref transposed[(int)cursor.Position]);
                    }
                }
                else
                {
                    var         vectorGetter = cursor.GetGetter <VBuffer <T> >(col);
                    VBuffer <T> buffer       = default(VBuffer <T>);
                    for (int row = 0; cursor.MoveNext(); ++row)
                    {
                        Assert.True(0 <= row && row < rowCount && row == cursor.Position);
                        vectorGetter(ref buffer);

                        bool dense = buffer.IsDense;
                        for (int i = 0; i < buffer.Count; ++i)
                        {
                            // Sparse buffers carry the slot of each explicit value in their indices.
                            int slot = dense ? i : buffer.Indices[i];
                            transposed[slot * rowCount + row] = buffer.Values[i];
                        }
                    }
                }
            }
            return transposed;
        }
        /// <summary>
        /// Registers <paramref name="input"/> under <c>args.tag</c> and returns the view associated
        /// with <c>args.selectTag</c>. Without a filename, the selected view must already be tagged
        /// in the pipeline; with a filename, the view is loaded from that file and tagged.
        /// Both views are cross-referenced so that each one can reach the other by its tag.
        /// </summary>
        /// <param name="env">Host environment used for checks, exceptions and component creation.</param>
        /// <param name="args">Transform arguments (tag, selectTag, filename, loaderSettings).</param>
        /// <param name="input">View the transform is applied on.</param>
        /// <param name="sourceCtx">Receives <paramref name="input"/>.</param>
        /// <returns>The selected view.</returns>
        static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, out IDataView sourceCtx)
        {
            sourceCtx = input;

            // CheckValue expects the parameter name first and the message second.
            env.CheckValue(args.tag, nameof(args.tag), "Tag cannot be empty.");
            if (TagHelper.EnumerateTaggedView(true, input).Any(c => c.Item1 == args.tag))
            {
                throw env.Except("Tag '{0}' is already used.", args.tag);
            }
            env.CheckValue(args.selectTag, nameof(args.selectTag), "Selected tag cannot be empty.");

            if (string.IsNullOrEmpty(args.filename))
            {
                // Materialized once: the query would otherwise walk the tagged views again for every check below.
                var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag).ToList();
                if (selected.Count == 0)
                {
                    throw env.Except("Unable to find a view to select with tag '{0}'. Did you forget to specify a filename?", args.selectTag);
                }
                if (selected.Count > 1)
                {
                    throw env.Except("Tag '{0}' is ambiguous, {1} views were found.", args.selectTag, selected.Count);
                }
                var first = selected[0];

                // The input is wrapped only when it does not already handle tags.
                var tagged = input as ITaggedDataView;
                if (tagged == null)
                {
                    var ag = new TagViewTransform.Arguments {
                        tag = args.tag
                    };
                    tagged = new TagViewTransform(env, ag, input);
                }
                first.Item2.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.tag, tagged) });
                tagged.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.selectTag, first.Item2) });
#if (DEBUG_TIP)
                long count = DataViewUtils.ComputeRowCount(tagged);
                if (count == 0)
                {
                    throw env.Except("Replaced view is empty.");
                }
                count = DataViewUtils.ComputeRowCount(first.Item2);
                if (count == 0)
                {
                    throw env.Except("Selected view is empty.");
                }
#endif
                var tr = first.Item2 as IDataTransform;
                env.AssertValue(tr);
                return(tr);
            }
            else
            {
                if (!File.Exists(args.filename))
                {
                    throw env.Except("Unable to find file '{0}'.", args.filename);
                }
                if (TagHelper.EnumerateTaggedView(true, input).Any(c => c.Item1 == args.selectTag))
                {
                    throw env.Except("Tag '{0}' was already given. It cannot be assigned to the new file.", args.selectTag);
                }

                var file         = new MultiFileSource(args.filename);
                var loadSettings = ScikitSubComponent <ILegacyDataLoader, SignatureDataLoader> .AsSubComponent(args.loaderSettings);

                IDataView loader = loadSettings.CreateInstance(env, file);

                // Checked before any tag is registered so that a failure leaves the input untouched.
                if (loader.Schema.Count == 0)
                {
                    throw env.Except("The loaded view '{0}' is empty (empty schema).", args.filename);
                }

                var ag = new TagViewTransform.Arguments {
                    tag = args.selectTag
                };
                var newInput = new TagViewTransform(env, ag, loader);

                // The input is wrapped only when it does not already handle tags.
                var tagged = input as ITaggedDataView;
                if (tagged == null)
                {
                    ag = new TagViewTransform.Arguments {
                        tag = args.tag
                    };
                    tagged = new TagViewTransform(env, ag, input);
                }

                newInput.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.tag, tagged) });
                tagged.AddRange(new[] { new Tuple <string, ITaggedDataView>(args.selectTag, newInput) });
                return(newInput);
            }
        }
        /// <summary>
        /// Trains the XGBoost model: validates the training data, converts it into a dense or
        /// sparse <c>DMatrix</c>, completes the options from the data, runs the training and
        /// stores the serialized model together with the feature counts.
        /// </summary>
        /// <param name="ch">Channel used for checks and information messages.</param>
        /// <param name="pch">Progress channel forwarded to the training routine.</param>
        /// <param name="data">Training data; exactly one feature column is accepted.</param>
        /// <param name="predictor">
        /// Optional predictor whose booster is given to the training routine as the initial model; may be null.
        /// </param>
        private void TrainCore(IChannel ch, IProgressChannel pch, RoleMappedData data, TPredictor predictor)
        {
            // Verifications.
            _host.AssertValue(ch);
            ch.CheckValue(data, nameof(data));

            ValidateTrainInput(ch, data);

            var featureColumns = data.Schema.GetColumns(RoleMappedSchema.ColumnRole.Feature);

            ch.Check(featureColumns.Count == 1, "Only one vector of features is allowed.");

            // Data dimension.
            int fi      = data.Schema.Feature.Index;
            var colType = data.Schema.Schema.GetColumnType(fi);

            ch.Assert(colType.IsVector, "Feature must be a vector.");
            ch.Assert(colType.VectorSize > 0, "Feature dimension must be known.");
            int       nbDim  = colType.VectorSize;
            IDataView view   = data.Data;
            long      nbRows = DataViewUtils.ComputeRowCount(view);

            Float[] labels;
            uint[]  groupCount;
            DMatrix dtrain;
            // REVIEW xadupre: this can be avoided by using method XGDMatrixCreateFromDataIter from the XGBoost API.
            // XGBoost removes NaN values from a dense matrix and stores it in sparse format anyway.
            bool isDense = DetectDensity(data);

            // A stopwatch is monotonic: unlike DateTime.Now, the measured duration
            // cannot be distorted by a clock adjustment or a daylight saving change.
            var watch = System.Diagnostics.Stopwatch.StartNew();

            if (isDense)
            {
                dtrain = FillDenseMatrix(ch, nbDim, nbRows, data, out labels, out groupCount);
                ch.Info("Dense matrix created with nbFeatures={0} and nbRows={1} in {2}.", nbDim, nbRows, watch.Elapsed);
            }
            else
            {
                dtrain = FillSparseMatrix(ch, nbDim, nbRows, data, out labels, out groupCount);
                ch.Info("Sparse matrix created with nbFeatures={0} and nbRows={1} in {2}.", nbDim, nbRows, watch.Elapsed);
            }

            // Some options are filled based on the data.
            var options = _args.ToDict(_host);

            UpdateXGBoostOptions(ch, options, labels, groupCount);

            // For multi class, the number of labels is required.
            ch.Assert(PredictionKind != PredictionKind.MultiClassClassification || options.ContainsKey("num_class"),
                      "XGBoost requires the number of classes to be specified in the parameters.");

            ch.Info("XGBoost objective={0}", options["objective"]);

            // numTrees receives the number of trees added by this call, nbTrees is the total held by the booster.
            int     numTrees;
            Booster res = WrappedXGBoostTraining.Train(ch, pch, out numTrees, options, dtrain,
                                                       numBoostRound: _args.numBoostRound,
                                                       obj: null, verboseEval: _args.verboseEval,
                                                       xgbModel: predictor == null ? null : predictor.GetBooster(),
                                                       saveBinaryDMatrix: _args.saveXGBoostDMatrixAsBinary);

            int nbTrees = res.GetNumTrees();

            ch.Info("Training is complete. Number of added trees={0}, total={1}.", numTrees, nbTrees);

            // The feature count seen by XGBoost is kept next to the dimension of the input column.
            _model             = res.SaveRaw();
            _nbFeaturesXGboost = (int)dtrain.GetNumCols();
            _nbFeaturesML      = nbDim;
        }
예제 #12
0
        /// <summary>
        /// Runs the nearest neighbors transform with an id column on the iris file, checks for
        /// every row that the returned distances and ids are dense, of the same size, sorted by
        /// increasing distance and that ids are odd, then tests the pipeline serialization.
        /// </summary>
        /// <param name="k">Used in the output folder name; a value of 1 selects an environment created with <c>conc: 1</c>.</param>
        /// <param name="weight">Only used in the output folder name.</param>
        /// <param name="threads">Only used in the output folder name.</param>
        /// <param name="distance">Used in the output folder name; "cosine" adds a Scaler transform on the features.</param>
        public static void TrainkNNTransformId(int k, NearestNeighborsWeights weight, int threads, string distance = "L2")
        {
            // NOTE(review): the transform below is always created with k=5 and neither weight nor
            // threads reaches it - confirm whether the parameters were meant to be forwarded.
            var currentMethod    = System.Reflection.MethodBase.GetCurrentMethod().Name;
            var methodName       = $"{currentMethod}-k{k}-W{weight}-T{threads}-D{distance}";
            var dataFilePath     = FileHelper.GetTestFile("iris_binary_id.txt");
            var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
            var outData          = FileHelper.GetOutputFile("outData1.txt", methodName);
            var outData2         = FileHelper.GetOutputFile("outData2.txt", methodName);

            using (var env = k == 1 ? EnvHelper.NewTestEnvironment(conc: 1) : EnvHelper.NewTestEnvironment())
            {
                var loader = env.CreateLoader("Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 col=Uid:I8:5 header=+}",
                                              new MultiFileSource(dataFilePath));

                var pipeline = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
                if (distance == "cosine")
                {
                    pipeline = env.CreateTransform("Scaler{col=Features}", pipeline);
                }
                pipeline = env.CreateTransform("knntr{k=5 id=Uid}", pipeline);

                long rowCount = DataViewUtils.ComputeRowCount(pipeline);
                if (rowCount == 0)
                {
                    throw new System.Exception("Empty pipeline.");
                }

                using (var cursor = pipeline.GetRowCursor(i => true))
                {
                    // Columns 7 and 8 are read as the distances and the ids - presumably the
                    // columns added by the knn transform; verify against its output schema.
                    var distanceGetter = cursor.GetGetter <VBuffer <float> >(7);
                    var idGetter       = cursor.GetGetter <VBuffer <long> >(8);
                    var distances      = new VBuffer <float>();
                    var ids            = new VBuffer <long>();

                    while (cursor.MoveNext())
                    {
                        distanceGetter(ref distances);
                        idGetter(ref ids);

                        if (!(distances.IsDense && ids.IsDense))
                        {
                            throw new System.Exception("not dense");
                        }
                        if (distances.Count != ids.Count)
                        {
                            throw new System.Exception("not the same dimension");
                        }

                        // Starts at 1: each element is compared with the previous one, and the
                        // id at position 0 is not checked.
                        int position = 1;
                        while (position < distances.Count)
                        {
                            if (distances.Values[position - 1] > distances.Values[position])
                            {
                                throw new System.Exception("not sorted");
                            }
                            if (ids.Values[position] % 2 != 1)
                            {
                                throw new System.Exception("wrong id");
                            }
                            ++position;
                        }
                    }
                }

                TestTransformHelper.SerializationTestTransform(env, outModelFilePath, pipeline, loader, outData, outData2, false);
            }
        }