/// <summary>
/// Round-trips a fitted one-hot hash encoding through the model save/load path.
/// Columns A, B and C of a two-row data set are encoded, the transformed data is
/// written to an in-memory stream with <c>TrainUtils.SaveModel</c>, and the transforms
/// are read back with <c>ModelFileUtils.LoadTransforms</c>.
/// The test makes no explicit assertions; it passes when none of these steps throws.
/// </summary>
public void TestOldSavingAndLoading()
{
    var rows = new[]
    {
        new TestClass { A = "1", B = "2", C = "3" },
        new TestClass { A = "4", B = "5", C = "6" }
    };
    var dataView = ComponentCreation.CreateDataView(Env, rows);

    // Each source column gets its own hashed output column.
    var columnInfos = new[]
    {
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatHashA"),
        new OneHotHashEncodingEstimator.ColumnInfo("B", "CatHashB"),
        new OneHotHashEncodingEstimator.ColumnInfo("C", "CatHashC")
    };
    var estimator = new OneHotHashEncodingEstimator(Env, columnInfos);

    var transformer = estimator.Fit(dataView);
    var transformed = transformer.Transform(dataView);
    var roleMapped = new RoleMappedData(transformed);

    using (var stream = new MemoryStream())
    {
        TrainUtils.SaveModel(Env, Env.Start("saving"), stream, null, roleMapped);

        // Rewind so the loader reads from the start of what was just written.
        stream.Position = 0;
        var reloadedView = ModelFileUtils.LoadTransforms(Env, dataView, stream);
    }
}
/// <summary>
/// Applies the one-hot hash encoding to a three-row <c>TestMeta</c> data set using every
/// output kind exercised here (Bag, Ind, Key and Bin), each with <c>invertHash: -1</c>,
/// and passes the transformed data to <c>ValidateMetadata</c>.
/// </summary>
public void TestMetadataPropagation()
{
    var rows = new[]
    {
        new TestMeta { A = new[] { "A", "B" }, B = "C", C = new[] { 1.0f, 2.0f }, D = 1.0f, E = new[] { "A", "D" }, F = "D" },
        new TestMeta { A = new[] { "A", "B" }, B = "C", C = new[] { 3.0f, 4.0f }, D = -1.0f, E = new[] { "E", "A" }, F = "E" },
        new TestMeta { A = new[] { "A", "B" }, B = "C", C = new[] { 5.0f, 6.0f }, D = 1.0f, E = new[] { "D", "E" }, F = "D" }
    };
    var dataView = ComponentCreation.CreateDataView(Env, rows);

    var columnInfos = new[]
    {
        // Bag output over columns A through D.
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatA", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
        new OneHotHashEncodingEstimator.ColumnInfo("B", "CatB", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
        new OneHotHashEncodingEstimator.ColumnInfo("C", "CatC", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
        new OneHotHashEncodingEstimator.ColumnInfo("D", "CatD", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),

        // Indicator output over columns E and F.
        new OneHotHashEncodingEstimator.ColumnInfo("E", "CatE", OneHotEncodingTransformer.OutputKind.Ind, invertHash: -1),
        new OneHotHashEncodingEstimator.ColumnInfo("F", "CatF", OneHotEncodingTransformer.OutputKind.Ind, invertHash: -1),

        // Key output over columns A and B.
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatG", OneHotEncodingTransformer.OutputKind.Key, invertHash: -1),
        new OneHotHashEncodingEstimator.ColumnInfo("B", "CatH", OneHotEncodingTransformer.OutputKind.Key, invertHash: -1),

        // Binary output over columns A and B.
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatI", OneHotEncodingTransformer.OutputKind.Bin, invertHash: -1),
        new OneHotHashEncodingEstimator.ColumnInfo("B", "CatJ", OneHotEncodingTransformer.OutputKind.Bin, invertHash: -1)
    };
    var estimator = new OneHotHashEncodingEstimator(Env, columnInfos);

    var transformer = estimator.Fit(dataView);
    var transformed = transformer.Transform(dataView);

    ValidateMetadata(transformed);
    Done();
}
/// <summary>
/// Builds a one-hot hash encoding that maps column A to four outputs, one per output
/// kind (Bag, Bin, Ind, Key), and runs it through <c>TestEstimatorCore</c> on a
/// two-row data set.
/// </summary>
public void CategoricalHashWorkout()
{
    var rows = new[]
    {
        new TestClass { A = "1", B = "2", C = "3" },
        new TestClass { A = "4", B = "5", C = "6" }
    };
    var dataView = ComponentCreation.CreateDataView(Env, rows);

    // Same input column each time; only the output kind and output name vary.
    var columnInfos = new[]
    {
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatA", OneHotEncodingTransformer.OutputKind.Bag),
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatB", OneHotEncodingTransformer.OutputKind.Bin),
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatC", OneHotEncodingTransformer.OutputKind.Ind),
        new OneHotHashEncodingEstimator.ColumnInfo("A", "CatD", OneHotEncodingTransformer.OutputKind.Key)
    };
    var estimator = new OneHotHashEncodingEstimator(Env, columnInfos);

    TestEstimatorCore(estimator, dataView);
    Done();
}
/// <summary>
/// Suggests one-hot encodings for columns whose item type is text and whose purpose is
/// <c>ColumnPurpose.CategoricalFeature</c>.
/// Columns for which <c>IsDictionaryOk</c> returns true are given to a
/// <c>OneHotEncodingEstimator</c>; the rest are given to a
/// <c>OneHotHashEncodingEstimator</c>. Each column is encoded in place (input and
/// output names are the same).
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// Up to three suggestions, in this order: the dictionary-based encoding, the hash-based
/// encoding, and the result of <c>InferenceHelpers.GetRemainingFeatures</c>. The last one
/// is produced only when <c>ExcludeFeaturesConcatTransforms</c> is false and at least one
/// dictionary-encoded column was found.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var dictionaryEncoded = new List<OneHotEncodingEstimator.ColumnInfo>();
    var hashEncoded = new List<OneHotHashEncodingEstimator.ColumnInfo>();
    var featureCols = new List<string>();

    foreach (var candidate in columns)
    {
        if (!candidate.Type.ItemType().IsText())
        {
            continue;
        }

        if (candidate.Purpose != ColumnPurpose.CategoricalFeature)
        {
            continue;
        }

        // Formatting (rather than reading the name directly) turns a null name into "".
        string name = string.Format("{0}", candidate.ColumnName);

        if (IsDictionaryOk(candidate, EstimatedSampleFraction))
        {
            dictionaryEncoded.Add(new OneHotEncodingEstimator.ColumnInfo(name, name));
        }
        else
        {
            hashEncoded.Add(new OneHotHashEncodingEstimator.ColumnInfo(name, name));
        }
    }

    if (dictionaryEncoded.Count > 0)
    {
        // Same column names on both sides; only the numeric flag differs.
        ColumnRoutingStructure.AnnotatedName[] sources = dictionaryEncoded
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Output })
            .ToArray();
        ColumnRoutingStructure.AnnotatedName[] destinations = dictionaryEncoded
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output })
            .ToArray();
        var routing = new ColumnRoutingStructure(sources, destinations);
        var estimator = new OneHotEncodingEstimator(Env, dictionaryEncoded.ToArray());

        featureCols.AddRange(dictionaryEncoded.Select(c => c.Output));
        yield return new SuggestedTransform(estimator, routing);
    }

    if (hashEncoded.Count > 0)
    {
        ColumnRoutingStructure.AnnotatedName[] sources = hashEncoded
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.HashInfo.Output })
            .ToArray();
        ColumnRoutingStructure.AnnotatedName[] destinations = hashEncoded
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.HashInfo.Output })
            .ToArray();
        var routing = new ColumnRoutingStructure(sources, destinations);
        var estimator = new OneHotHashEncodingEstimator(Env, hashEncoded.ToArray());

        // NOTE(review): unlike the dictionary branch, hash-encoded outputs are not added
        // to featureCols, so they never trigger the step below — confirm this is intended.
        yield return new SuggestedTransform(estimator, routing);
    }

    if (ExcludeFeaturesConcatTransforms || featureCols.Count == 0)
    {
        yield break;
    }

    yield return InferenceHelpers.GetRemainingFeatures(featureCols, columns, GetType(), IncludeFeaturesOverride);

    // Runs only once the caller resumes enumeration past the suggestion above.
    IncludeFeaturesOverride = true;
}
/// <summary>
/// Suggests a transform that produces the <c>DefaultColumnNames.GroupId</c> column from the
/// first column whose purpose is <c>ColumnPurpose.Group</c>.
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// Nothing when no group column exists, or when the group column is non-text and already
/// named <c>DefaultColumnNames.GroupId</c>. Otherwise a single suggestion: a one-hot hash
/// encoding for a text group column, or a column copy for any other type.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.Group);
    if (firstGroupColId < 0)
    {
        yield break;
    }

    var col = columns[firstGroupColId];

    // A null column name is treated as an empty string.
    string source = col.ColumnName ?? string.Empty;
    string dest = DefaultColumnNames.GroupId;

    if (col.Type.IsText())
    {
        // REVIEW: we could potentially apply HashJoin to vectors of text.
        // ColumnInfo takes the input column first and the output column second, so the
        // group column is read from `source` and written to `dest`, exactly as the routing
        // structure below declares.
        var input = new OneHotHashEncodingEstimator(Env, new OneHotHashEncodingEstimator.ColumnInfo(source, dest));
        var routingStructure = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });

        yield return new SuggestedTransform(input, routingStructure);
    }
    else if (col.ColumnName != DefaultColumnNames.GroupId)
    {
        // Non-text group column under a different name: copy it to the default name.
        var input = new ColumnCopyingEstimator(Env, source, dest);
        var routingStructure = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = source } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });

        yield return new SuggestedTransform(input, routingStructure);
    }
}