public void TestI_ScalerTransformSerialize()
{
    using (var host = EnvHelper.NewTestEnvironment())
    {
        var inputs = new[] {
            new ExampleA() { X = new float[] { 1, 10, 100 } },
            new ExampleA() { X = new float[] { 2, 3, 5 } }
        };

        IDataView loader = host.CreateStreamingDataView(inputs);
        var data = host.CreateTransform("Scaler{col=X}", loader);
        (data as ITrainableTransform).Estimate();

        // We create a specific folder in build/UnitTest which will contain the output.
        var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
        var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
        var outData = FileHelper.GetOutputFile("outData.txt", methodName);
        var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

        var nb = DataViewUtils.ComputeRowCount(data);
        if (nb < 1)
            throw new Exception("empty view");

        // This function serializes the output data twice: once before saving the pipeline,
        // once after reloading it. It then checks that both runs produce the same result.
        TestTransformHelper.SerializationTestTransform(host, outModelFilePath, data, loader, outData, outData2);
    }
}
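// Illustration only: per the comment above, SerializationTestTransform's contract is
// "same output before and after a save/reload round-trip". A minimal sketch of just the
// final comparison step, assuming both text dumps were already written (the helper's
// actual implementation is not shown here):
using System;
using System.IO;
using System.Linq;

public static class RoundTripCompareDemo
{
    public static void AssertSameOutput(string fileBefore, string fileAfter)
    {
        var before = File.ReadAllLines(fileBefore);
        var after = File.ReadAllLines(fileAfter);
        if (!before.SequenceEqual(after))
            throw new Exception($"Round-trip mismatch between '{fileBefore}' and '{fileAfter}'.");
    }
}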
public void SdcaRegression()
{
    var env = new TlcEnvironment(seed: 0);
    var dataPath = GetDataPath("external", "winequality-white.csv");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)),
        separator: ';', hasHeader: true);

    LinearRegressionPredictor pred = null;

    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, score: r.label.PredictSdcaRegression(r.features, maxIterations: 2, onFit: p => pred = p)));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);
    // 11 input features, so we ought to have 11 weights.
    Assert.Equal(11, pred.Weights2.Count);

    var data = model.Read(dataSource);

    // Just output some data on the schema for fun.
    var rows = DataViewUtils.ComputeRowCount(data.AsDynamic);
    var schema = data.AsDynamic.Schema;
    for (int c = 0; c < schema.ColumnCount; ++c)
        Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
}
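// Illustration only: the onFit callback above hands the fitted predictor out of the
// pipeline definition into a local variable. A minimal sketch of that capture pattern
// with a hypothetical trainer (Model and Trainer are assumptions, not ML.NET types):
using System;

public sealed class Model { }

public static class Trainer
{
    public static Model Fit(Action<Model> onFit)
    {
        var model = new Model();  // training would happen here
        onFit?.Invoke(model);     // surface the fitted model to the caller
        return model;
    }
}

public static class OnFitDemo
{
    public static void Main()
    {
        Model captured = null;                // null before Fit, as the test asserts
        Trainer.Fit(m => captured = m);
        Console.WriteLine(captured != null);  // True: the callback ran during Fit
    }
}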
protected void DebugChecking1(IDataView viewI, IDataView choose, RoleMappedData data, RoleMappedData td, int count, bool singleColumn)
{
    var cache1 = new MemoryDataView(Host, viewI, numThreads: 1);
    var cache2 = new MemoryDataView(Host, choose, numThreads: 1);
    var t1 = data.Schema.Feature.Type.AsVector();
    var t2 = td.Schema.Feature.Type.AsVector();

    if (t1.DimCount() != 1)
        throw Host.Except("Expect only 1 dimension.");
    if (t2.DimCount() != 1)
        throw Host.Except("Expect only 1 dimension.");
    if (singleColumn && t1.GetDim(0) != t2.GetDim(0) - 1)
        throw Host.Except("Different dimension {0} != {1}-1", t1.GetDim(0), t2.GetDim(0));
    if (!singleColumn && t1.GetDim(0) >= t2.GetDim(0) - 1)
        throw Host.Except("Different dimension {0} != {1}-1", t1.GetDim(0), t2.GetDim(0));

    var nb1 = DataViewUtils.ComputeRowCount(cache1);
    if (nb1 == 0)
        throw Host.Except("empty view");
    var nb2 = DataViewUtils.ComputeRowCount(cache2);
    if (nb2 == 0)
        throw Host.Except("empty view");

    if (!singleColumn)
    {
        using (var cursor = cache2.GetRowCursor(i => true))
        {
            string sch_ = SchemaHelper.ToString(cursor.Schema);
            int index;
            if (!cursor.Schema.TryGetColumnIndex(data.Schema.Label.Name, out index))
                throw Host.Except("Unable to find '{0}' in\n{1}", data.Schema.Label.Name, sch_);
            var getter = cursor.GetGetter<VBuffer<float>>(index);
            var buf = new VBuffer<float>();
            while (cursor.MoveNext())
            {
                getter(ref buf);
                if (buf.Count > count || buf.Length > count)
                    throw Contracts.Except("Mismatch");
            }
        }
    }
}
public void TrainTestPipelinePredictTransform()
{
    var methodName = System.Reflection.MethodBase.GetCurrentMethod().Name;
    var dataFilePath = FileHelper.GetTestFile("mc_iris.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

    using (var env = EnvHelper.NewTestEnvironment(conc: 1))
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 header=+}",
            new MultiFileSource(dataFilePath));

        var pipe = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
        pipe = env.CreateTransform("SplitTrainTest{col=base tag=train tag=test}", pipe);
        pipe = env.CreateTransform("SelectTag{tag=unused selectTag=train}", pipe);
        pipe = env.CreateTransform(string.Format("TagTrainScore{{tag=trainP out={0} tr=mlr}}", outModelFilePath), pipe);
        pipe = env.CreateTransform("SelectTag{tag=scoredTrain selectTag=test}", pipe);
        pipe = env.CreateTransform("TagPredict{in=trainP}", pipe);

        string schema = SchemaHelper.ToString(pipe.Schema);
        var cursor = pipe.GetRowCursor(i => true);
        string schema2 = SchemaHelper.ToString(cursor.Schema);
        if (schema != schema2)
            throw new Exception("Schema mismatch.");

        long count = DataViewUtils.ComputeRowCount(pipe);
        if (count != 49)
            throw new Exception(string.Format("Unexpected number of rows {0}", count));

        // Checks the outputs.
        var saver = env.CreateSaver("Text");
        var columns = new string[pipe.Schema.Count];
        for (int i = 0; i < columns.Length; ++i)
            columns[i] = pipe.Schema[i].Name;
        using (var fs2 = File.Create(outData))
            saver.SaveData(fs2, pipe, StreamHelper.GetColumnsIndex(pipe.Schema));

        var lines = File.ReadAllLines(outData);
        if (lines.Length < 40)
            throw new Exception("Something is missing:" + string.Join("\n", lines));
        if (lines.Length > 70)
            throw new Exception("Too much data:" + string.Join("\n", lines));

        TestTransformHelper.SerializationTestTransform(env, outModelFilePath, pipe, loader, outData, outData2);
    }
}
private static void TransposeCheckHelper<T>(IDataView view, int viewCol, ITransposeDataView trans)
{
    Assert.NotNull(view);
    Assert.NotNull(trans);

    int col = viewCol;
    VectorDataViewType type = trans.GetSlotType(col);
    DataViewType colType = trans.Schema[col].Type;
    Assert.Equal(view.Schema[viewCol].Name, trans.Schema[col].Name);
    DataViewType expectedType = view.Schema[viewCol].Type;
    Assert.Equal(expectedType, colType);

    string desc = string.Format("Column {0} named '{1}'", col, trans.Schema[col].Name);
    Assert.Equal(DataViewUtils.ComputeRowCount(view), type.Size);
    Assert.True(typeof(T) == type.ItemType.RawType, $"{desc} had wrong type for slot cursor");
    Assert.True(type.Size > 0, $"{desc} expected to be known sized vector but is not");

    int valueCount = (colType as VectorDataViewType)?.Size ?? 1;
    Assert.True(0 != valueCount, $"{desc} expected to have fixed size, but does not");

    int rc = type.Size;
    T[] expectedVals = NaiveTranspose<T>(view, viewCol);
    T[] vals = new T[rc * valueCount];
    Contracts.Assert(vals.Length == expectedVals.Length);

    using (var cursor = trans.GetSlotCursor(col))
    {
        var getter = cursor.GetGetter<T>();
        VBuffer<T> temp = default(VBuffer<T>);
        int offset = 0;
        while (cursor.MoveNext())
        {
            Assert.True(offset < vals.Length, $"{desc} slot cursor went further than it should have");
            getter(ref temp);
            Assert.True(rc == temp.Length, $"{desc} slot cursor yielded vector with unexpected length");
            temp.CopyTo(vals, offset);
            offset += rc;
        }
        Assert.True(valueCount == offset / rc, $"{desc} slot cursor yielded fewer than expected values");
    }

    for (int i = 0; i < vals.Length; ++i)
        Assert.Equal(expectedVals[i], vals[i]);
}
private static T[] NaiveTranspose<T>(IDataView view, int col)
{
    var type = view.Schema[col].Type;
    int rc = checked((int)DataViewUtils.ComputeRowCount(view));
    var vecType = type as VectorDataViewType;
    var itemType = vecType?.ItemType ?? type;
    Assert.Equal(typeof(T), itemType.RawType);
    Assert.NotEqual(0, vecType?.Size);

    // Column-major layout: slot s of row r lands at index s * rc + r.
    T[] retval = new T[rc * (vecType?.Size ?? 1)];

    using (var cursor = view.GetRowCursor(view.Schema[col]))
    {
        if (type is VectorDataViewType)
        {
            var getter = cursor.GetGetter<VBuffer<T>>(cursor.Schema[col]);
            VBuffer<T> temp = default;
            int offset = 0;
            while (cursor.MoveNext())
            {
                Assert.True(0 <= offset && offset < rc && offset == cursor.Position);
                getter(ref temp);
                var tempValues = temp.GetValues();
                var tempIndices = temp.GetIndices();
                for (int i = 0; i < tempValues.Length; ++i)
                    retval[(temp.IsDense ? i : tempIndices[i]) * rc + offset] = tempValues[i];
                offset++;
            }
        }
        else
        {
            var getter = cursor.GetGetter<T>(cursor.Schema[col]);
            while (cursor.MoveNext())
            {
                Assert.True(0 <= cursor.Position && cursor.Position < rc);
                getter(ref retval[(int)cursor.Position]);
            }
        }
    }
    return retval;
}
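// Illustration only (not part of the test suite): the naive transpose above stores its
// result column-major, so slot s of row r lands at index s * rowCount + r. A minimal
// self-contained sketch of that layout with a hard-coded 2-row, 3-slot matrix:
using System;

public static class TransposeLayoutDemo
{
    public static void Main()
    {
        float[][] rows = { new float[] { 1, 10, 100 }, new float[] { 2, 3, 5 } };
        int rc = rows.Length, slots = rows[0].Length;

        var transposed = new float[slots * rc];
        for (int r = 0; r < rc; ++r)
            for (int s = 0; s < slots; ++s)
                transposed[s * rc + r] = rows[r][s];

        // Prints "1 2 10 3 100 5": each contiguous block of rc values is one slot.
        Console.WriteLine(string.Join(" ", transposed));
    }
}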
private static void TransposeCheckHelper<T>(IDataView view, int viewCol, ITransposeDataView trans)
{
    int col = viewCol;
    var type = trans.TransposeSchema.GetSlotType(col);
    var colType = trans.Schema.GetColumnType(col);
    Assert.Equal(view.Schema.GetColumnName(viewCol), trans.Schema.GetColumnName(col));
    var expectedType = view.Schema.GetColumnType(viewCol);
    // Unfortunately can't use Equals because column type equality is a simple reference comparison. :P
    Assert.Equal(expectedType, colType);
    Assert.Equal(DataViewUtils.ComputeRowCount(view), (long)type.VectorSize);

    string desc = string.Format("Column {0} named '{1}'", col, trans.Schema.GetColumnName(col));
    Assert.True(typeof(T) == type.ItemType.RawType, $"{desc} had wrong type for slot cursor");
    Assert.True(type.IsVector, $"{desc} expected to be vector but is not");
    Assert.True(type.VectorSize > 0, $"{desc} expected to be known sized vector but is not");
    Assert.True(0 != colType.ValueCount, $"{desc} expected to have fixed size, but does not");

    int rc = type.VectorSize;
    T[] expectedVals = NaiveTranspose<T>(view, viewCol);
    T[] vals = new T[rc * colType.ValueCount];
    Contracts.Assert(vals.Length == expectedVals.Length);

    using (var cursor = trans.GetSlotCursor(col))
    {
        var getter = cursor.GetGetter<T>();
        VBuffer<T> temp = default(VBuffer<T>);
        int offset = 0;
        while (cursor.MoveNext())
        {
            Assert.True(offset < vals.Length, $"{desc} slot cursor went further than it should have");
            getter(ref temp);
            Assert.True(rc == temp.Length, $"{desc} slot cursor yielded vector with unexpected length");
            temp.CopyTo(vals, offset);
            offset += rc;
        }
        Assert.True(colType.ValueCount == offset / rc, $"{desc} slot cursor yielded fewer than expected values");
    }

    for (int i = 0; i < vals.Length; ++i)
        Assert.Equal(expectedVals[i], vals[i]);
}
public void SdcaBinaryClassificationNoCalibration()
{
    var env = new TlcEnvironment(seed: 0);
    var dataPath = GetDataPath("breast-cancer.txt");
    var dataSource = new MultiFileSource(dataPath);

    var reader = TextLoader.CreateReader(env,
        c => (label: c.LoadBool(0), features: c.LoadFloat(1, 9)));

    LinearBinaryPredictor pred = null;
    var loss = new HingeLoss(new HingeLoss.Arguments() { Margin = 1 });

    // With a custom loss function we no longer get calibrated predictions.
    var est = reader.MakeNewEstimator()
        .Append(r => (r.label, preds: r.label.PredictSdcaBinaryClassification(r.features, maxIterations: 2, loss: loss, onFit: p => pred = p)));
    var pipe = reader.Append(est);

    Assert.Null(pred);
    var model = pipe.Fit(dataSource);
    Assert.NotNull(pred);
    // 9 input features, so we ought to have 9 weights.
    Assert.Equal(9, pred.Weights2.Count);

    var data = model.Read(dataSource);

    // Just output some data on the schema for fun.
    var rows = DataViewUtils.ComputeRowCount(data.AsDynamic);
    var schema = data.AsDynamic.Schema;
    for (int c = 0; c < schema.ColumnCount; ++c)
        Console.WriteLine($"{schema.GetColumnName(c)}, {schema.GetColumnType(c)}");
}
private static T[] NaiveTranspose<T>(IDataView view, int col)
{
    var type = view.Schema.GetColumnType(col);
    int rc = checked((int)DataViewUtils.ComputeRowCount(view));
    Assert.True(type.ItemType.RawType == typeof(T));
    Assert.True(type.ValueCount > 0);

    T[] retval = new T[rc * type.ValueCount];

    using (var cursor = view.GetRowCursor(c => c == col))
    {
        if (type.IsVector)
        {
            var getter = cursor.GetGetter<VBuffer<T>>(col);
            VBuffer<T> temp = default(VBuffer<T>);
            int offset = 0;
            while (cursor.MoveNext())
            {
                Assert.True(0 <= offset && offset < rc && offset == cursor.Position);
                getter(ref temp);
                for (int i = 0; i < temp.Count; ++i)
                    retval[(temp.IsDense ? i : temp.Indices[i]) * rc + offset] = temp.Values[i];
                offset++;
            }
        }
        else
        {
            var getter = cursor.GetGetter<T>(col);
            while (cursor.MoveNext())
            {
                Assert.True(0 <= cursor.Position && cursor.Position < rc);
                getter(ref retval[(int)cursor.Position]);
            }
        }
    }
    return retval;
}
static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input, out IDataView sourceCtx)
{
    sourceCtx = input;
    env.CheckValue(args.tag, "Tag cannot be empty.");
    if (TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.tag).Any())
        throw env.Except("Tag '{0}' is already used.", args.tag);
    env.CheckValue(args.selectTag, "Selected tag cannot be empty.");

    if (string.IsNullOrEmpty(args.filename))
    {
        var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag);
        if (!selected.Any())
            throw env.Except("Unable to find a view to select with tag '{0}'. Did you forget to specify a filename?", args.selectTag);
        var first = selected.First();
        if (selected.Skip(1).Any())
            throw env.Except("Tag '{0}' is ambiguous, {1} views were found.", args.selectTag, selected.Count());

        var tagged = input as ITaggedDataView;
        if (tagged == null)
        {
            var ag = new TagViewTransform.Arguments { tag = args.tag };
            tagged = new TagViewTransform(env, ag, input);
        }
        first.Item2.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.tag, tagged) });
        tagged.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.selectTag, first.Item2) });

#if (DEBUG_TIP)
        long count = DataViewUtils.ComputeRowCount(tagged);
        if (count == 0)
            throw env.Except("Replaced view is empty.");
        count = DataViewUtils.ComputeRowCount(first.Item2);
        if (count == 0)
            throw env.Except("Selected view is empty.");
#endif

        var tr = first.Item2 as IDataTransform;
        env.AssertValue(tr);
        return tr;
    }
    else
    {
        if (!File.Exists(args.filename))
            throw env.Except("Unable to find file '{0}'.", args.filename);
        var selected = TagHelper.EnumerateTaggedView(true, input).Where(c => c.Item1 == args.selectTag);
        if (selected.Any())
            throw env.Except("Tag '{0}' was already given. It cannot be assigned to the new file.", args.selectTag);

        var loaderArgs = new BinaryLoader.Arguments();
        var file = new MultiFileSource(args.filename);
        var loadSettings = ScikitSubComponent<ILegacyDataLoader, SignatureDataLoader>.AsSubComponent(args.loaderSettings);
        IDataView loader = loadSettings.CreateInstance(env, file);

        var ag = new TagViewTransform.Arguments { tag = args.selectTag };
        var newInput = new TagViewTransform(env, ag, loader);
        var tagged = input as ITaggedDataView;
        if (tagged == null)
        {
            ag = new TagViewTransform.Arguments { tag = args.tag };
            tagged = new TagViewTransform(env, ag, input);
        }
        newInput.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.tag, tagged) });
        tagged.AddRange(new[] { new Tuple<string, ITaggedDataView>(args.selectTag, newInput) });

        var schema = loader.Schema;
        if (schema.Count == 0)
            throw env.Except("The loaded view '{0}' is empty (empty schema).", args.filename);
        return newInput;
    }
}
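// Illustration only: the tag lookup above must resolve to exactly one view. A minimal
// sketch of that "exactly one match" validation over a hypothetical (tag, view) list,
// using the same Any()/Skip(1).Any() idiom as the transform:
using System;
using System.Collections.Generic;
using System.Linq;

public static class TagLookupDemo
{
    public static object ResolveTag(List<Tuple<string, object>> taggedViews, string selectTag)
    {
        var selected = taggedViews.Where(c => c.Item1 == selectTag);
        if (!selected.Any())
            throw new Exception($"Unable to find a view with tag '{selectTag}'.");
        if (selected.Skip(1).Any())  // cheaper than Count() > 1: stops after the second match
            throw new Exception($"Tag '{selectTag}' is ambiguous, {selected.Count()} views were found.");
        return selected.First().Item2;
    }
}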
private void TrainCore(IChannel ch, IProgressChannel pch, RoleMappedData data, TPredictor predictor)
{
    // Verifications.
    _host.AssertValue(ch);
    ch.CheckValue(data, nameof(data));
    ValidateTrainInput(ch, data);

    var featureColumns = data.Schema.GetColumns(RoleMappedSchema.ColumnRole.Feature);
    ch.Check(featureColumns.Count == 1, "Only one vector of features is allowed.");

    // Data dimension.
    int fi = data.Schema.Feature.Index;
    var colType = data.Schema.Schema.GetColumnType(fi);
    ch.Assert(colType.IsVector, "Feature must be a vector.");
    ch.Assert(colType.VectorSize > 0, "Feature dimension must be known.");
    int nbDim = colType.VectorSize;
    IDataView view = data.Data;
    long nbRows = DataViewUtils.ComputeRowCount(view);

    Float[] labels;
    uint[] groupCount;
    DMatrix dtrain;
    // REVIEW xadupre: this can be avoided by using method XGDMatrixCreateFromDataIter from the XGBoost API.
    // XGBoost removes NaN values from a dense matrix and stores it in sparse format anyway.
    bool isDense = DetectDensity(data);
    var dt = DateTime.Now;
    if (isDense)
    {
        dtrain = FillDenseMatrix(ch, nbDim, nbRows, data, out labels, out groupCount);
        ch.Info("Dense matrix created with nbFeatures={0} and nbRows={1} in {2}.", nbDim, nbRows, DateTime.Now - dt);
    }
    else
    {
        dtrain = FillSparseMatrix(ch, nbDim, nbRows, data, out labels, out groupCount);
        ch.Info("Sparse matrix created with nbFeatures={0} and nbRows={1} in {2}.", nbDim, nbRows, DateTime.Now - dt);
    }

    // Some options are filled based on the data.
    var options = _args.ToDict(_host);
    UpdateXGBoostOptions(ch, options, labels, groupCount);

    // For multi-class, the number of labels is required.
    ch.Assert(PredictionKind != PredictionKind.MultiClassClassification || options.ContainsKey("num_class"),
        "XGBoost requires the number of classes to be specified in the parameters.");

    ch.Info("XGBoost objective={0}", options["objective"]);

    int numTrees;
    Booster res = WrappedXGBoostTraining.Train(ch, pch, out numTrees, options, dtrain,
        numBoostRound: _args.numBoostRound,
        obj: null, verboseEval: _args.verboseEval,
        xgbModel: predictor == null ? null : predictor.GetBooster(),
        saveBinaryDMatrix: _args.saveXGBoostDMatrixAsBinary);

    int nbTrees = res.GetNumTrees();
    ch.Info("Training is complete. Number of added trees={0}, total={1}.", numTrees, nbTrees);

    _model = res.SaveRaw();
    _nbFeaturesXGboost = (int)dtrain.GetNumCols();
    _nbFeaturesML = nbDim;
}
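// Illustration only: DetectDensity is not shown above, so this is an assumption about
// the idea rather than the actual implementation. A plausible heuristic is to compare
// the fraction of non-zero entries against a threshold and fill a dense matrix only
// when most entries carry a value:
using System;

public static class DensityDemo
{
    // Treat a matrix as "dense enough" when at least half of its entries are non-zero
    // (the 0.5 threshold is a hypothetical choice for this sketch).
    public static bool LooksDense(float[][] rows, double threshold = 0.5)
    {
        long stored = 0, total = 0;
        foreach (var row in rows)
        {
            total += row.Length;
            foreach (var v in row)
                if (v != 0f)
                    stored++;
        }
        return total == 0 || (double)stored / total >= threshold;
    }
}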
public static void TrainkNNTransformId(int k, NearestNeighborsWeights weight, int threads, string distance = "L2")
{
    var methodName = string.Format("{0}-k{1}-W{2}-T{3}-D{4}",
        System.Reflection.MethodBase.GetCurrentMethod().Name, k, weight, threads, distance);
    var dataFilePath = FileHelper.GetTestFile("iris_binary_id.txt");
    var outModelFilePath = FileHelper.GetOutputFile("outModelFilePath.zip", methodName);
    var outData = FileHelper.GetOutputFile("outData1.txt", methodName);
    var outData2 = FileHelper.GetOutputFile("outData2.txt", methodName);

    var env = k == 1 ? EnvHelper.NewTestEnvironment(conc: 1) : EnvHelper.NewTestEnvironment();
    using (env)
    {
        var loader = env.CreateLoader(
            "Text{col=Label:R4:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 col=Uid:I8:5 header=+}",
            new MultiFileSource(dataFilePath));

        var concat = env.CreateTransform("Concat{col=Features:Slength,Swidth}", loader);
        if (distance == "cosine")
            concat = env.CreateTransform("Scaler{col=Features}", concat);
        concat = env.CreateTransform("knntr{k=5 id=Uid}", concat);

        long nb = DataViewUtils.ComputeRowCount(concat);
        if (nb == 0)
            throw new System.Exception("Empty pipeline.");

        using (var cursor = concat.GetRowCursor(i => true))
        {
            var getdist = cursor.GetGetter<VBuffer<float>>(7);
            var getid = cursor.GetGetter<VBuffer<long>>(8);
            var ddist = new VBuffer<float>();
            var did = new VBuffer<long>();
            while (cursor.MoveNext())
            {
                getdist(ref ddist);
                getid(ref did);
                if (!ddist.IsDense || !did.IsDense)
                    throw new System.Exception("not dense");
                if (ddist.Count != did.Count)
                    throw new System.Exception("not the same dimension");
                for (int i = 1; i < ddist.Count; ++i)
                {
                    if (ddist.Values[i - 1] > ddist.Values[i])
                        throw new System.Exception("not sorted");
                    if (did.Values[i] % 2 != 1)
                        throw new System.Exception("wrong id");
                }
            }
        }

        TestTransformHelper.SerializationTestTransform(env, outModelFilePath, concat, loader, outData, outData2, false);
    }
}
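// Illustration only: the per-row invariant checked above is that neighbor distances
// come back in non-decreasing order. A minimal standalone version of that check:
using System;

public static class NeighborOrderDemo
{
    public static void CheckSorted(float[] distances)
    {
        for (int i = 1; i < distances.Length; ++i)
            if (distances[i - 1] > distances[i])
                throw new Exception($"not sorted at position {i}");
    }

    public static void Main()
    {
        CheckSorted(new float[] { 0.0f, 0.5f, 0.7f, 1.2f });  // passes: non-decreasing
    }
}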