// Fits a one-hot hash encoding pipeline over three string columns, saves the
// transformed data with TrainUtils.SaveModel into an in-memory stream, and then
// reads the transforms back with ModelFileUtils.LoadTransforms. The test makes no
// explicit assertions; it passes when the save/load round trip does not throw.
public void TestOldSavingAndLoading()
        {
            var data = new[]
            {
                new TestClass() { A = "1", B = "2", C = "3" },
                new TestClass() { A = "4", B = "5", C = "6" }
            };
            var dataView = ComponentCreation.CreateDataView(Env, data);
            var pipe = new OneHotHashEncodingEstimator(Env, new[] {
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatHashA"),
                new OneHotHashEncodingEstimator.ColumnInfo("B", "CatHashB"),
                new OneHotHashEncodingEstimator.ColumnInfo("C", "CatHashC")
            });
            var result = pipe.Fit(dataView).Transform(dataView);
            var resultRoles = new RoleMappedData(result);

            using (var ms = new MemoryStream())
            {
                // The channel opened by Env.Start is disposable; scope it with
                // 'using' so it is closed once saving finishes instead of leaking.
                using (var ch = Env.Start("saving"))
                {
                    TrainUtils.SaveModel(Env, ch, ms, null, resultRoles);
                }

                // Rewind so the loader reads from the start of what was just written.
                ms.Position = 0;
                ModelFileUtils.LoadTransforms(Env, dataView, ms);
            }
        }
        // Builds three TestMeta rows, runs them through a one-hot hash encoding
        // estimator configured with every output kind used below (Bag, Ind, Key, Bin)
        // and invertHash set to -1, then passes the transformed view to
        // ValidateMetadata for checking.
        public void TestMetadataPropagation()
        {
            var rows = new[]
            {
                new TestMeta()
                {
                    A = new[] { "A", "B" },
                    B = "C",
                    C = new[] { 1.0f, 2.0f },
                    D = 1.0f,
                    E = new[] { "A", "D" },
                    F = "D"
                },
                new TestMeta()
                {
                    A = new[] { "A", "B" },
                    B = "C",
                    C = new[] { 3.0f, 4.0f },
                    D = -1.0f,
                    E = new[] { "E", "A" },
                    F = "E"
                },
                new TestMeta()
                {
                    A = new[] { "A", "B" },
                    B = "C",
                    C = new[] { 5.0f, 6.0f },
                    D = 1.0f,
                    E = new[] { "D", "E" },
                    F = "D"
                }
            };

            var input = ComponentCreation.CreateDataView(Env, rows);

            // One entry per (source column, output column, output kind) combination.
            var columnInfos = new[]
            {
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatA", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("B", "CatB", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("C", "CatC", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("D", "CatD", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("E", "CatE", OneHotEncodingTransformer.OutputKind.Ind, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("F", "CatF", OneHotEncodingTransformer.OutputKind.Ind, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatG", OneHotEncodingTransformer.OutputKind.Key, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("B", "CatH", OneHotEncodingTransformer.OutputKind.Key, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatI", OneHotEncodingTransformer.OutputKind.Bin, invertHash: -1),
                new OneHotHashEncodingEstimator.ColumnInfo("B", "CatJ", OneHotEncodingTransformer.OutputKind.Bin, invertHash: -1)
            };
            var estimator = new OneHotHashEncodingEstimator(Env, columnInfos);

            var transformed = estimator.Fit(input).Transform(input);

            ValidateMetadata(transformed);
            Done();
        }
        // Runs TestEstimatorCore over a one-hot hash encoding estimator that maps
        // the same source column "A" to four outputs, one per output kind
        // (Bag, Bin, Ind, Key).
        public void CategoricalHashWorkout()
        {
            var rows = new[]
            {
                new TestClass() { A = "1", B = "2", C = "3" },
                new TestClass() { A = "4", B = "5", C = "6" }
            };
            var input = ComponentCreation.CreateDataView(Env, rows);

            var columnInfos = new[]
            {
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatA", OneHotEncodingTransformer.OutputKind.Bag),
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatB", OneHotEncodingTransformer.OutputKind.Bin),
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatC", OneHotEncodingTransformer.OutputKind.Ind),
                new OneHotHashEncodingEstimator.ColumnInfo("A", "CatD", OneHotEncodingTransformer.OutputKind.Key)
            };
            var estimator = new OneHotHashEncodingEstimator(Env, columnInfos);

            TestEstimatorCore(estimator, input);
            Done();
        }
예제 #4
0
                /// <summary>
                /// Suggests encodings for text columns whose purpose is
                /// <c>ColumnPurpose.CategoricalFeature</c>. Columns accepted by
                /// <c>IsDictionaryOk</c> are routed to a <c>OneHotEncodingEstimator</c>;
                /// all other matching columns are routed to a
                /// <c>OneHotHashEncodingEstimator</c>. Each column is encoded in place
                /// (the output name equals the input name).
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// Lazily yields up to one suggestion per encoder, followed by the result of
                /// <c>InferenceHelpers.GetRemainingFeatures</c> when any column was encoded
                /// and <c>ExcludeFeaturesConcatTransforms</c> is false.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var catColumnsNew     = new List<OneHotEncodingEstimator.ColumnInfo>();
                    var catHashColumnsNew = new List<OneHotHashEncodingEstimator.ColumnInfo>();
                    var featureCols       = new List<string>();

                    foreach (var column in columns)
                    {
                        if (!column.Type.ItemType().IsText() || column.Purpose != ColumnPurpose.CategoricalFeature)
                        {
                            continue;
                        }

                        string columnName = column.ColumnName;

                        if (IsDictionaryOk(column, EstimatedSampleFraction))
                        {
                            catColumnsNew.Add(new OneHotEncodingEstimator.ColumnInfo(columnName, columnName));
                        }
                        else
                        {
                            catHashColumnsNew.Add(new OneHotHashEncodingEstimator.ColumnInfo(columnName, columnName));
                        }
                    }

                    if (catColumnsNew.Count > 0)
                    {
                        // The routing records the same column name on both sides,
                        // flagged non-numeric before encoding and numeric after.
                        ColumnRoutingStructure.AnnotatedName[] columnsSource =
                            catColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Output }).ToArray();
                        ColumnRoutingStructure.AnnotatedName[] columnsDest =
                            catColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output }).ToArray();
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                        var input = new OneHotEncodingEstimator(Env, catColumnsNew.ToArray());
                        featureCols.AddRange(catColumnsNew.Select(c => c.Output));

                        yield return new SuggestedTransform(input, routingStructure);
                    }

                    if (catHashColumnsNew.Count > 0)
                    {
                        ColumnRoutingStructure.AnnotatedName[] columnsSource =
                            catHashColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.HashInfo.Output }).ToArray();
                        ColumnRoutingStructure.AnnotatedName[] columnsDest =
                            catHashColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.HashInfo.Output }).ToArray();
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                        var input = new OneHotHashEncodingEstimator(Env, catHashColumnsNew.ToArray());

                        // Hash-encoded outputs are features too. Without this, a data set
                        // whose categorical columns all take the hash path would leave
                        // featureCols empty and skip the concat step below.
                        featureCols.AddRange(catHashColumnsNew.Select(c => c.HashInfo.Output));

                        yield return new SuggestedTransform(input, routingStructure);
                    }

                    if (!ExcludeFeaturesConcatTransforms && featureCols.Count > 0)
                    {
                        yield return InferenceHelpers.GetRemainingFeatures(featureCols, columns, GetType(), IncludeFeaturesOverride);

                        // Runs only when the caller resumes enumeration after the item
                        // above; the override value passed to GetRemainingFeatures is
                        // therefore the one in effect before this assignment.
                        IncludeFeaturesOverride = true;
                    }
                }
예제 #5
0
                /// <summary>
                /// Suggests a transform that produces the <c>DefaultColumnNames.GroupId</c>
                /// column from the first column whose purpose is <c>ColumnPurpose.Group</c>.
                /// A text column is passed through a <c>OneHotHashEncodingEstimator</c>; a
                /// non-text column is copied with a <c>ColumnCopyingEstimator</c> unless it
                /// is already named <c>DefaultColumnNames.GroupId</c>.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// At most one suggestion; nothing when there is no group column or when
                /// the non-text group column already has the target name.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.Group);

                    if (firstGroupColId < 0)
                    {
                        yield break;
                    }

                    var col = columns[firstGroupColId];

                    string source = col.ColumnName;
                    string dest   = DefaultColumnNames.GroupId;

                    if (col.Type.IsText())
                    {
                        // REVIEW: we could potentially apply HashJoin to vectors of text.
                        // ColumnInfo takes (input, output): read the group column and write
                        // GroupId, matching the routing structure declared below.
                        var input = new OneHotHashEncodingEstimator(Env, new OneHotHashEncodingEstimator.ColumnInfo(source, dest));

                        var routingStructure = new ColumnRoutingStructure(
                            new[]
                            {
                                new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source }
                            },
                            new[]
                            {
                                new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest }
                            });

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                    else if (col.ColumnName != DefaultColumnNames.GroupId)
                    {
                        var input = new ColumnCopyingEstimator(Env, source, dest);

                        var routingStructure = new ColumnRoutingStructure(
                            new[]
                            {
                                new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = source }
                            },
                            new[]
                            {
                                new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest }
                            });

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                }