예제 #1
0
                /// <summary>
                /// Builds a suggested transform that concatenates <paramref name="columnNames"/> into a
                /// single column named <paramref name="concatColumnName"/>.
                /// </summary>
                /// <param name="columnNames">Names of the source columns passed to the concatenating estimator.</param>
                /// <param name="concatColumnName">Name of the column produced by the concatenation.</param>
                /// <param name="transformType">Not read by this method; retained so existing callers keep compiling.</param>
                /// <param name="isNumeric">
                /// Flag recorded on every source and destination routing annotation. The caller decides it,
                /// because concatenation can apply to either numeric or text columns.
                /// </param>
                /// <returns>A <see cref="SuggestedTransform"/> holding the estimator and its routing structure.</returns>
                public static SuggestedTransform ConcatColumnsIntoOne(List<string> columnNames, string concatColumnName,
                                                                      Type transformType, bool isNumeric)
                {
                    // The estimator is bound to a freshly created context, as before.
                    var env   = new MLContext();
                    var input = new ColumnConcatenatingEstimator(env, concatColumnName, columnNames.ToArray());

                    // Every source column and the single destination column share the caller-supplied numeric flag.
                    ColumnRoutingStructure.AnnotatedName[] columnsSource = columnNames
                        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = isNumeric, Name = c })
                        .ToArray();
                    ColumnRoutingStructure.AnnotatedName[] columnsDest =
                    {
                        new ColumnRoutingStructure.AnnotatedName { IsNumeric = isNumeric, Name = concatColumnName }
                    };
                    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                    return new SuggestedTransform(input, routingStructure);
                }
예제 #2
0
 /// <summary>
 /// Creates a suggested transform from an estimator and optional metadata.
 /// </summary>
 /// <param name="estimator">Value stored in <c>Estimator</c>.</param>
 /// <param name="routingStructure">Value stored in <c>RoutingStructure</c>; defaults to <c>null</c>.</param>
 /// <param name="properties">Value stored in <c>Properties</c>; defaults to <c>null</c>.</param>
 public SuggestedTransform(
     IEstimator<ITransformer> estimator,
     ColumnRoutingStructure routingStructure = null,
     IDictionary<string, string> properties = null)
 {
     // Arguments are stored as given; no validation or copying is performed.
     this.Estimator        = estimator;
     this.RoutingStructure = routingStructure;
     this.Properties       = properties;
 }
예제 #3
0
                /// <summary>
                /// Wraps <paramref name="estimator"/> in a suggested transform whose routing maps one
                /// non-numeric source column to one numeric destination column.
                /// </summary>
                /// <param name="srcColumn">Name recorded for the source column (annotated as non-numeric).</param>
                /// <param name="dstColumn">Name recorded for the destination column (annotated as numeric).</param>
                /// <param name="estimator">Estimator carried by the returned transform.</param>
                /// <returns>The suggested transform with its single-column routing structure.</returns>
                public static SuggestedTransform TextTransform(string srcColumn, string dstColumn, IEstimator<ITransformer> estimator)
                {
                    var sourceAnnotation = new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = srcColumn };
                    var destAnnotation   = new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dstColumn };

                    var routing = new ColumnRoutingStructure(
                        new[] { sourceAnnotation },
                        new[] { destAnnotation });

                    return new SuggestedTransform(estimator, routing);
                }
예제 #4
0
                /// <summary>
                /// Suggests concatenating every non-ignored column into the default features column.
                /// </summary>
                /// <param name="columns">Candidate columns; those for which <c>IgnoreColumn</c> returns true are skipped.</param>
                /// <returns>
                /// At most one transform. Nothing is yielded when no column is selected, when the only
                /// selected column is already the numeric features column, or when the selected columns
                /// mix numeric and non-numeric purposes.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var selectedColumns = columns.Where(c => !IgnoreColumn(c.Purpose)).ToArray();
                    if (selectedColumns.Length == 0)
                    {
                        yield break;
                    }

                    // Column names are used directly; no intermediate copy is required.
                    string[] featureNames = selectedColumns.Select(c => c.ColumnName).ToArray();

                    // "Numeric" here means a numeric-feature purpose on a non-boolean item type.
                    bool allColumnsNumeric    = selectedColumns.All(c => c.Purpose == ColumnPurpose.NumericFeature && c.Type.ItemType() != BoolType.Instance);
                    bool allColumnsNonNumeric = selectedColumns.All(c => c.Purpose != ColumnPurpose.NumericFeature);

                    // A single column that is already named as the features column and is numeric needs no work.
                    if (featureNames.Length == 1 && featureNames[0] == DefaultColumnNames.Features && allColumnsNumeric)
                    {
                        yield break;
                    }

                    // Mixed numeric / non-numeric selections are not concatenated.
                    if (!allColumnsNumeric && !allColumnsNonNumeric)
                    {
                        yield break;
                    }

                    var input = new ColumnConcatenatingEstimator(Env, DefaultColumnNames.Features, featureNames);

                    ColumnRoutingStructure.AnnotatedName[] columnsSource = featureNames
                        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = allColumnsNumeric, Name = c })
                        .ToArray();
                    ColumnRoutingStructure.AnnotatedName[] columnsDest =
                    {
                        new ColumnRoutingStructure.AnnotatedName { IsNumeric = allColumnsNumeric, Name = DefaultColumnNames.Features }
                    };
                    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                    yield return new SuggestedTransform(input, routingStructure);
                }
예제 #5
0
                /// <summary>
                /// Suggests converting boolean numeric-feature columns to <c>DataKind.R4</c> in place
                /// (each column is converted under its own name).
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// Nothing when no column qualifies. Otherwise the type-converting transform, followed by
                /// the transform returned by <c>InferenceHelpers.GetRemainingFeatures</c> unless
                /// <c>ExcludeFeaturesConcatTransforms</c> is set.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var newColumns = new List<TypeConvertingTransformer.ColumnInfo>();

                    foreach (var column in columns)
                    {
                        if (!column.Type.ItemType().IsBool() || column.Purpose != ColumnPurpose.NumericFeature)
                        {
                            continue;
                        }

                        // Use this column's own name. The earlier implementation appended every name to one
                        // shared StringBuilder, so the second and later entries received concatenated names.
                        newColumns.Add(new TypeConvertingTransformer.ColumnInfo(column.ColumnName,
                                                                                column.ColumnName, DataKind.R4));
                    }

                    if (newColumns.Count == 0)
                    {
                        yield break;
                    }

                    var input = new TypeConvertingEstimator(Env, newColumns.ToArray());
                    ColumnRoutingStructure.AnnotatedName[] columnsSource = newColumns
                        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Input })
                        .ToArray();
                    ColumnRoutingStructure.AnnotatedName[] columnsDest = newColumns
                        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output })
                        .ToArray();
                    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
                    yield return new SuggestedTransform(input, routingStructure);

                    // Concat featurized columns into existing feature column, if transformed at least one column.
                    if (!ExcludeFeaturesConcatTransforms)
                    {
                        yield return InferenceHelpers.GetRemainingFeatures(newColumns.Select(c => c.Output).ToList(),
                                                                           columns, GetType(), IncludeFeaturesOverride);

                        // Set only after the concat transform has been handed to the consumer.
                        IncludeFeaturesOverride = true;
                    }
                }
예제 #6
0
                /// <summary>
                /// Suggests a text featurizer for every text column whose purpose is
                /// <c>ColumnPurpose.TextFeature</c>, writing to a new column named after the source
                /// with a "_tf" suffix.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// One transform per qualifying column, then (if any qualified and
                /// <c>ExcludeFeaturesConcatTransforms</c> is not set) the transform returned by
                /// <c>InferenceHelpers.GetRemainingFeatures</c>.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    const string destSuffix = "_tf";
                    var featurizedNames = new List<string>();

                    foreach (var candidate in columns)
                    {
                        bool isTextFeature = candidate.Type.ItemType().IsText()
                                             && candidate.Purpose == ColumnPurpose.TextFeature;
                        if (!isTextFeature)
                        {
                            continue;
                        }

                        string sourceName = candidate.ColumnName;
                        string destName   = sourceName + destSuffix;
                        featurizedNames.Add(destName);

                        var featurizer = new TextFeaturizingEstimator(Env, destName, sourceName);

                        // Text in, numeric features out.
                        var routing = new ColumnRoutingStructure(
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = sourceName } },
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = destName } });

                        yield return new SuggestedTransform(featurizer, routing);
                    }

                    // Fold the featurized outputs into the feature column only when something was featurized.
                    if (featurizedNames.Count > 0 && !ExcludeFeaturesConcatTransforms)
                    {
                        yield return InferenceHelpers.GetRemainingFeatures(featurizedNames, columns, GetType(), IncludeFeaturesOverride);

                        IncludeFeaturesOverride = true;
                    }
                }
예제 #7
0
                /// <summary>
                /// Suggests a missing-value indicator for each R4 numeric-feature column that reports
                /// missing values.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// One transform per qualifying column, each reading from and writing to that column's
                /// own name. Nothing is yielded when no column qualifies.
                /// </returns>
                /// <remarks>
                /// The earlier implementation appended all qualifying names into a single string and
                /// emitted one transform for that combined name, which does not correspond to any input
                /// column once two or more columns qualify. With exactly one qualifying column the output
                /// is unchanged.
                /// </remarks>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    foreach (var column in columns)
                    {
                        if (column.Type.ItemType() != NumberType.R4 || column.Purpose != ColumnPurpose.NumericFeature)
                        {
                            continue;
                        }
                        if (!column.HasMissing)
                        {
                            continue;
                        }

                        string name  = column.ColumnName;
                        var    input = new MissingValueIndicatorEstimator(Env, name, name);

                        // Source and destination are the same column and both are annotated numeric.
                        ColumnRoutingStructure.AnnotatedName[] columnsSource =
                        {
                            new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = name }
                        };
                        ColumnRoutingStructure.AnnotatedName[] columnsDest =
                        {
                            new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = name }
                        };
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                }
예제 #8
0
                /// <summary>
                /// Suggests how to produce the default name column from the columns whose purpose is
                /// <c>ColumnPurpose.Name</c>.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// With exactly one name column not already called <c>DefaultColumnNames.Name</c>: a
                /// column-copy transform. With several name columns, at least one of which has a text
                /// item type: a concatenation transform over all of them. Otherwise nothing.
                /// </returns>
                /// <remarks>
                /// The "no text columns" guard used to call <c>ToString()</c> on a
                /// <c>List&lt;string&gt;</c>, which yields the type name and is never blank, so the
                /// guard could not trigger. It now checks whether any text name column was seen.
                /// </remarks>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var  nameColumns      = new List<string>();
                    bool hasTextNameColumn = false;

                    foreach (var column in columns)
                    {
                        if (column.Purpose != ColumnPurpose.Name)
                        {
                            continue;
                        }

                        nameColumns.Add(column.ColumnName);
                        if (column.Type.ItemType().IsText())
                        {
                            hasTextNameColumn = true;
                        }
                    }

                    if (nameColumns.Count == 1)
                    {
                        string source = nameColumns[0];
                        if (source == DefaultColumnNames.Name)
                        {
                            // Already carries the default name; nothing to suggest.
                            yield break;
                        }

                        var input = new ColumnCopyingEstimator(Env, source, DefaultColumnNames.Name);
                        ColumnRoutingStructure.AnnotatedName[] columnsSource =
                        {
                            new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source }
                        };
                        ColumnRoutingStructure.AnnotatedName[] columnsDest =
                        {
                            new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = DefaultColumnNames.Name }
                        };
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
                        yield return new SuggestedTransform(input, routingStructure);
                    }
                    else if (nameColumns.Count > 1)
                    {
                        if (!hasTextNameColumn)
                        {
                            yield break;
                        }

                        // Suggest grouping all name columns into one vector.
                        var input = new ColumnConcatenatingEstimator(Env, DefaultColumnNames.Name, nameColumns.ToArray());

                        ColumnRoutingStructure.AnnotatedName[] columnsSource = nameColumns
                            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c })
                            .ToArray();
                        ColumnRoutingStructure.AnnotatedName[] columnsDest =
                        {
                            new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = DefaultColumnNames.Name }
                        };
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
                        yield return new SuggestedTransform(input, routingStructure);
                    }
                }
예제 #9
0
                /// <summary>
                /// Suggests one-hot or one-hot-hash encoding for text columns whose purpose is
                /// <c>ColumnPurpose.CategoricalFeature</c>, encoding each column in place.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// Up to three transforms, in this order: a one-hot encoder over the columns accepted by
                /// <c>IsDictionaryOk</c>, a one-hot-hash encoder over the rest, and (if any one-hot
                /// column exists and <c>ExcludeFeaturesConcatTransforms</c> is not set) the transform
                /// returned by <c>InferenceHelpers.GetRemainingFeatures</c>.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var catColumnsNew     = new List<OneHotEncodingEstimator.ColumnInfo>();
                    var catHashColumnsNew = new List<OneHotHashEncodingEstimator.ColumnInfo>();
                    var featureCols       = new List<string>();

                    foreach (var column in columns)
                    {
                        if (!column.Type.ItemType().IsText() || column.Purpose != ColumnPurpose.CategoricalFeature)
                        {
                            continue;
                        }

                        string name = column.ColumnName;

                        if (IsDictionaryOk(column, EstimatedSampleFraction))
                        {
                            catColumnsNew.Add(new OneHotEncodingEstimator.ColumnInfo(name, name));
                        }
                        else
                        {
                            catHashColumnsNew.Add(new OneHotHashEncodingEstimator.ColumnInfo(name, name));
                        }
                    }

                    if (catColumnsNew.Count > 0)
                    {
                        // Input and output names are identical, so Output is used for both annotations.
                        ColumnRoutingStructure.AnnotatedName[] columnsSource = catColumnsNew
                            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Output })
                            .ToArray();
                        ColumnRoutingStructure.AnnotatedName[] columnsDest = catColumnsNew
                            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output })
                            .ToArray();
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                        var input = new OneHotEncodingEstimator(Env, catColumnsNew.ToArray());
                        featureCols.AddRange(catColumnsNew.Select(c => c.Output));

                        yield return new SuggestedTransform(input, routingStructure);
                    }

                    if (catHashColumnsNew.Count > 0)
                    {
                        ColumnRoutingStructure.AnnotatedName[] columnsSource = catHashColumnsNew
                            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.HashInfo.Output })
                            .ToArray();
                        ColumnRoutingStructure.AnnotatedName[] columnsDest = catHashColumnsNew
                            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.HashInfo.Output })
                            .ToArray();
                        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

                        var input = new OneHotHashEncodingEstimator(Env, catHashColumnsNew.ToArray());

                        // NOTE(review): hash-encoded outputs are not added to featureCols, unlike the
                        // one-hot outputs above, so they never trigger or join the concat step below.
                        // Behavior is left unchanged here — confirm whether that is intentional.
                        yield return new SuggestedTransform(input, routingStructure);
                    }

                    if (!ExcludeFeaturesConcatTransforms && featureCols.Count > 0)
                    {
                        yield return InferenceHelpers.GetRemainingFeatures(featureCols, columns, GetType(), IncludeFeaturesOverride);

                        IncludeFeaturesOverride = true;
                    }
                }
예제 #10
0
                /// <summary>
                /// Suggests how to produce the default group-id column from the first column whose
                /// purpose is <c>ColumnPurpose.Group</c>.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// A one-hot-hash encoding transform when the group column is text; a column-copy
                /// transform when it is not text and is not already named
                /// <c>DefaultColumnNames.GroupId</c>; otherwise nothing.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.Group);

                    if (firstGroupColId < 0)
                    {
                        yield break;
                    }

                    var    col    = columns[firstGroupColId];
                    string source = col.ColumnName;
                    string dest   = DefaultColumnNames.GroupId;

                    if (col.Type.IsText())
                    {
                        // REVIEW: we could potentially apply HashJoin to vectors of text.
                        // NOTE(review): arguments are passed as (dest, source) here but as (source, dest)
                        // to ColumnCopyingEstimator below — confirm against the ColumnInfo constructor.
                        var input = new OneHotHashEncodingEstimator(Env, new OneHotHashEncodingEstimator.ColumnInfo(dest, source));

                        var routingStructure = new ColumnRoutingStructure(
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source } },
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                    else if (col.ColumnName != DefaultColumnNames.GroupId)
                    {
                        var input = new ColumnCopyingEstimator(Env, source, dest);

                        var routingStructure = new ColumnRoutingStructure(
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = source } },
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                }
예제 #11
0
                /// <summary>
                /// Suggests how to produce the default label column from the last column whose purpose
                /// is <c>ColumnPurpose.Label</c>.
                /// </summary>
                /// <param name="columns">Candidate columns to inspect.</param>
                /// <returns>
                /// A value-to-key mapping transform when the label column is text; a column-copy
                /// transform when it is not text and is not already named
                /// <c>DefaultColumnNames.Label</c>; otherwise nothing.
                /// </returns>
                public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
                {
                    var lastLabelColId = Array.FindLastIndex(columns, x => x.Purpose == ColumnPurpose.Label);

                    if (lastLabelColId < 0)
                    {
                        yield break;
                    }

                    var    col    = columns[lastLabelColId];
                    string source = col.ColumnName;
                    string dest   = DefaultColumnNames.Label;

                    if (col.Type.IsText())
                    {
                        // All outputs are discarded. The call is retained because it is not visible from
                        // here whether it has side effects — TODO confirm and remove if it is pure.
                        col.GetUniqueValueCounts<ReadOnlyMemory<char>>(out var _, out var _, out var _);

                        var input = new ValueToKeyMappingEstimator(Env, source, dest);

                        var routingStructure = new ColumnRoutingStructure(
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source } },
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                    else if (col.ColumnName != DefaultColumnNames.Label)
                    {
                        var input = new ColumnCopyingEstimator(Env, source, dest);

                        var routingStructure = new ColumnRoutingStructure(
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = source } },
                            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });

                        yield return new SuggestedTransform(input, routingStructure);
                    }
                }