/// <summary>
/// Builds a <see cref="SuggestedTransform"/> that concatenates <paramref name="columnNames"/>
/// into the single column <paramref name="concatColumnName"/>.
/// </summary>
/// <param name="columnNames">Names of the source columns to concatenate.</param>
/// <param name="concatColumnName">Name of the column that receives the concatenation.</param>
/// <param name="transformType">Not read by this method; retained so existing callers keep compiling.</param>
/// <param name="isNumeric">Flag copied onto every source and destination routing annotation.</param>
/// <returns>
/// A suggested transform wrapping a <see cref="ColumnConcatenatingEstimator"/> together with the
/// routing structure that maps the source columns onto the concatenated column.
/// </returns>
public static SuggestedTransform ConcatColumnsIntoOne(List<string> columnNames, string concatColumnName, Type transformType, bool isNumeric)
{
    string[] sourceNames = columnNames.ToArray();

    // A fresh context is created on every call, exactly as before.
    var env = new MLContext();
    var input = new ColumnConcatenatingEstimator(env, concatColumnName, sourceNames);

    // Concatenation can apply to numeric or text columns, so the caller decides
    // which kind is recorded by passing isNumeric.
    ColumnRoutingStructure.AnnotatedName[] columnsSource = sourceNames
        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = isNumeric, Name = c })
        .ToArray();
    ColumnRoutingStructure.AnnotatedName[] columnsDest =
    {
        new ColumnRoutingStructure.AnnotatedName { IsNumeric = isNumeric, Name = concatColumnName }
    };

    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
    return new SuggestedTransform(input, routingStructure);
}
/// <summary>
/// Creates a suggested transform from an estimator and optional metadata.
/// </summary>
/// <param name="estimator">Value stored in <c>Estimator</c>.</param>
/// <param name="routingStructure">Value stored in <c>RoutingStructure</c>; defaults to <c>null</c>.</param>
/// <param name="properties">Value stored in <c>Properties</c>; defaults to <c>null</c>.</param>
public SuggestedTransform(
    IEstimator<ITransformer> estimator,
    ColumnRoutingStructure routingStructure = null,
    IDictionary<string, string> properties = null)
{
    this.Estimator = estimator;
    this.RoutingStructure = routingStructure;
    this.Properties = properties;
}
/// <summary>
/// Wraps <paramref name="estimator"/> in a <see cref="SuggestedTransform"/> whose routing
/// maps a non-numeric source column to a numeric destination column.
/// </summary>
/// <param name="srcColumn">Name recorded as the (non-numeric) source column.</param>
/// <param name="dstColumn">Name recorded as the (numeric) destination column.</param>
/// <param name="estimator">Estimator carried by the resulting suggestion.</param>
/// <returns>The suggested transform with its routing structure attached.</returns>
public static SuggestedTransform TextTransform(string srcColumn, string dstColumn, IEstimator<ITransformer> estimator)
{
    var sourceAnnotation = new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = srcColumn };
    var destinationAnnotation = new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dstColumn };

    var routing = new ColumnRoutingStructure(
        new ColumnRoutingStructure.AnnotatedName[] { sourceAnnotation },
        new ColumnRoutingStructure.AnnotatedName[] { destinationAnnotation });

    return new SuggestedTransform(estimator, routing);
}
/// <summary>
/// Suggests concatenating every non-ignored column into <c>DefaultColumnNames.Features</c>.
/// </summary>
/// <param name="columns">Candidate columns; those for which <c>IgnoreColumn</c> returns true are skipped.</param>
/// <returns>
/// At most one suggested transform. Nothing is yielded when no column is selected, when the only
/// selected column is already the numeric features column, or when numeric and non-numeric
/// columns are mixed.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var selectedColumns = columns.Where(c => !IgnoreColumn(c.Purpose)).ToArray();
    if (selectedColumns.Length == 0)
    {
        yield break;
    }

    string[] columnNames = selectedColumns.Select(c => c.ColumnName).ToArray();

    // Bool-typed columns with a numeric purpose do not count as numeric here.
    bool allColumnsNumeric = selectedColumns.All(c => c.Purpose == ColumnPurpose.NumericFeature && c.Type.ItemType() != BoolType.Instance);
    bool allColumnsNonNumeric = selectedColumns.All(c => c.Purpose != ColumnPurpose.NumericFeature);

    // Nothing to do if the single column is already named features and is numeric.
    if (columnNames.Length == 1 && columnNames[0] == DefaultColumnNames.Features && allColumnsNumeric)
    {
        yield break;
    }

    // A mix of numeric and non-numeric columns is not concatenated.
    if (!allColumnsNumeric && !allColumnsNonNumeric)
    {
        yield break;
    }

    var input = new ColumnConcatenatingEstimator(Env, DefaultColumnNames.Features, columnNames);

    ColumnRoutingStructure.AnnotatedName[] columnsSource = columnNames
        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = allColumnsNumeric, Name = c })
        .ToArray();
    ColumnRoutingStructure.AnnotatedName[] columnsDest =
    {
        new ColumnRoutingStructure.AnnotatedName { IsNumeric = allColumnsNumeric, Name = DefaultColumnNames.Features }
    };

    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
    yield return new SuggestedTransform(input, routingStructure);
}
/// <summary>
/// Suggests converting bool-typed columns whose purpose is <c>ColumnPurpose.NumericFeature</c>
/// to <c>DataKind.R4</c>, in place (input and output column share the same name).
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// Nothing when no column qualifies. Otherwise one type-converting transform covering every
/// qualifying column, followed by a features-concat transform unless
/// <c>ExcludeFeaturesConcatTransforms</c> is set.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var newColumns = new List<TypeConvertingTransformer.ColumnInfo>();

    foreach (var column in columns)
    {
        if (!column.Type.ItemType().IsBool() || column.Purpose != ColumnPurpose.NumericFeature)
        {
            continue;
        }

        // Use this column's own name. The previous implementation appended every name to one
        // shared StringBuilder, so the second match was registered as "ab" rather than "b".
        string name = column.ColumnName;
        newColumns.Add(new TypeConvertingTransformer.ColumnInfo(name, name, DataKind.R4));
    }

    if (newColumns.Count == 0)
    {
        yield break;
    }

    var input = new TypeConvertingEstimator(Env, newColumns.ToArray());

    ColumnRoutingStructure.AnnotatedName[] columnsSource = newColumns
        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Input })
        .ToArray();
    ColumnRoutingStructure.AnnotatedName[] columnsDest = newColumns
        .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output })
        .ToArray();

    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
    yield return new SuggestedTransform(input, routingStructure);

    // Concat featurized columns into existing feature column, if transformed at least one column.
    if (!ExcludeFeaturesConcatTransforms)
    {
        yield return InferenceHelpers.GetRemainingFeatures(newColumns.Select(c => c.Output).ToList(), columns, GetType(), IncludeFeaturesOverride);

        // Runs only once the consumer resumes the iterator after the concat transform.
        IncludeFeaturesOverride = true;
    }
}
/// <summary>
/// Suggests a text featurizer for each text-typed column whose purpose is
/// <c>ColumnPurpose.TextFeature</c>, writing to a new column named after the source with a
/// "_tf" suffix.
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// One transform per qualifying column, followed by a features-concat transform when at least
/// one column qualified and <c>ExcludeFeaturesConcatTransforms</c> is not set.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var featurizedNames = new List<string>();

    foreach (var candidate in columns)
    {
        bool isTextFeature = candidate.Type.ItemType().IsText() && candidate.Purpose == ColumnPurpose.TextFeature;
        if (!isTextFeature)
        {
            continue;
        }

        string sourceName = candidate.ColumnName;
        string destinationName = sourceName + "_tf";
        featurizedNames.Add(destinationName);

        // NOTE(review): arguments are passed as (destination, source); confirm this matches
        // the estimator's parameter order.
        var featurizer = new TextFeaturizingEstimator(Env, destinationName, sourceName);

        var routing = new ColumnRoutingStructure(
            new ColumnRoutingStructure.AnnotatedName[]
            {
                new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = sourceName }
            },
            new ColumnRoutingStructure.AnnotatedName[]
            {
                new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = destinationName }
            });

        yield return new SuggestedTransform(featurizer, routing);
    }

    // Fold the featurized text columns into the features column, but only if something was featurized.
    if (ExcludeFeaturesConcatTransforms || featurizedNames.Count == 0)
    {
        yield break;
    }

    yield return InferenceHelpers.GetRemainingFeatures(featurizedNames, columns, GetType(), IncludeFeaturesOverride);
    IncludeFeaturesOverride = true;
}
/// <summary>
/// Suggests a missing-value indicator for each <c>NumberType.R4</c> column whose purpose is
/// <c>ColumnPurpose.NumericFeature</c> and whose <c>HasMissing</c> flag is set. The indicator
/// is written in place (input and output column share the same name).
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>One suggested transform per qualifying column; nothing when none qualifies.</returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    foreach (var column in columns)
    {
        if (column.Type.ItemType() != NumberType.R4 || column.Purpose != ColumnPurpose.NumericFeature)
        {
            continue;
        }

        if (!column.HasMissing)
        {
            continue;
        }

        // One transform per column. The previous implementation appended every matching name to
        // one StringBuilder and built a single estimator on the concatenation (e.g. "ab"),
        // which is not the name of any input column once two or more columns match.
        string name = column.ColumnName;
        var input = new MissingValueIndicatorEstimator(Env, name, name);

        ColumnRoutingStructure.AnnotatedName[] columnsSource =
        {
            new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = name }
        };
        ColumnRoutingStructure.AnnotatedName[] columnsDest =
        {
            new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = name }
        };

        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
        yield return new SuggestedTransform(input, routingStructure);
    }
}
/// <summary>
/// Suggests how to produce <c>DefaultColumnNames.Name</c> from the columns whose purpose is
/// <c>ColumnPurpose.Name</c>.
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// With exactly one name column not already called <c>DefaultColumnNames.Name</c>: a column-copy
/// transform. With several name columns of which at least one is text: a concatenation transform.
/// Otherwise nothing.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var nameColumns = new List<string>();
    bool anyTextColumn = false;

    foreach (var column in columns)
    {
        if (column.Purpose != ColumnPurpose.Name)
        {
            continue;
        }

        nameColumns.Add(column.ColumnName);
        anyTextColumn |= column.Type.ItemType().IsText();
    }

    if (nameColumns.Count == 1)
    {
        string source = nameColumns[0];
        if (source == DefaultColumnNames.Name)
        {
            // Already carries the default name; no copy needed.
            yield break;
        }

        var input = new ColumnCopyingEstimator(Env, source, DefaultColumnNames.Name);

        ColumnRoutingStructure.AnnotatedName[] columnsSource =
        {
            new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source }
        };
        ColumnRoutingStructure.AnnotatedName[] columnsDest =
        {
            new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = DefaultColumnNames.Name }
        };

        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
        yield return new SuggestedTransform(input, routingStructure);
    }
    else if (nameColumns.Count > 1)
    {
        // Skip when none of the name columns is text. The previous check called ToString() on a
        // List<string>, which yields the type name and so never evaluated as empty.
        if (!anyTextColumn)
        {
            yield break;
        }

        // Suggest grouping the name columns into one vector.
        var input = new ColumnConcatenatingEstimator(Env, DefaultColumnNames.Name, nameColumns.ToArray());

        ColumnRoutingStructure.AnnotatedName[] columnsSource = nameColumns
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c })
            .ToArray();
        ColumnRoutingStructure.AnnotatedName[] columnsDest =
        {
            new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = DefaultColumnNames.Name }
        };

        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);
        yield return new SuggestedTransform(input, routingStructure);
    }
}
/// <summary>
/// Suggests encodings for text-typed columns whose purpose is
/// <c>ColumnPurpose.CategoricalFeature</c>: one-hot encoding when <c>IsDictionaryOk</c> accepts
/// the column, one-hot hash encoding otherwise. Both encodings are written in place.
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// Up to one one-hot transform and one one-hot-hash transform, followed by a features-concat
/// transform when at least one column was encoded and <c>ExcludeFeaturesConcatTransforms</c>
/// is not set.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var catColumnsNew = new List<OneHotEncodingEstimator.ColumnInfo>();
    var catHashColumnsNew = new List<OneHotHashEncodingEstimator.ColumnInfo>();
    var featureCols = new List<string>();

    foreach (var column in columns)
    {
        if (!column.Type.ItemType().IsText() || column.Purpose != ColumnPurpose.CategoricalFeature)
        {
            continue;
        }

        string name = column.ColumnName;
        if (IsDictionaryOk(column, EstimatedSampleFraction))
        {
            catColumnsNew.Add(new OneHotEncodingEstimator.ColumnInfo(name, name));
        }
        else
        {
            catHashColumnsNew.Add(new OneHotHashEncodingEstimator.ColumnInfo(name, name));
        }
    }

    if (catColumnsNew.Count > 0)
    {
        ColumnRoutingStructure.AnnotatedName[] columnsSource = catColumnsNew
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Output })
            .ToArray();
        ColumnRoutingStructure.AnnotatedName[] columnsDest = catColumnsNew
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output })
            .ToArray();
        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

        var input = new OneHotEncodingEstimator(Env, catColumnsNew.ToArray());
        featureCols.AddRange(catColumnsNew.Select(c => c.Output));
        yield return new SuggestedTransform(input, routingStructure);
    }

    if (catHashColumnsNew.Count > 0)
    {
        ColumnRoutingStructure.AnnotatedName[] columnsSource = catHashColumnsNew
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.HashInfo.Output })
            .ToArray();
        ColumnRoutingStructure.AnnotatedName[] columnsDest = catHashColumnsNew
            .Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.HashInfo.Output })
            .ToArray();
        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

        var input = new OneHotHashEncodingEstimator(Env, catHashColumnsNew.ToArray());

        // Hash-encoded columns are featurized too, so they must take part in the concat below.
        // Previously they were left out, and no concat was emitted when only hashed columns existed.
        featureCols.AddRange(catHashColumnsNew.Select(c => c.HashInfo.Output));
        yield return new SuggestedTransform(input, routingStructure);
    }

    // Concat encoded columns into the features column, if at least one column was encoded.
    if (!ExcludeFeaturesConcatTransforms && featureCols.Count > 0)
    {
        yield return InferenceHelpers.GetRemainingFeatures(featureCols, columns, GetType(), IncludeFeaturesOverride);
        IncludeFeaturesOverride = true;
    }
}
/// <summary>
/// Suggests how to produce <c>DefaultColumnNames.GroupId</c> from the first column whose
/// purpose is <c>ColumnPurpose.Group</c>.
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// A one-hot hash encoding transform when the group column is text; a column-copy transform when
/// it is not text and is not already named <c>DefaultColumnNames.GroupId</c>; otherwise nothing.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var firstGroupColId = Array.FindIndex(columns, x => x.Purpose == ColumnPurpose.Group);
    if (firstGroupColId < 0)
    {
        yield break;
    }

    var col = columns[firstGroupColId];
    string source = col.ColumnName;
    string dest = DefaultColumnNames.GroupId;

    if (col.Type.IsText())
    {
        // REVIEW: we could potentially apply HashJoin to vectors of text.
        // NOTE(review): ColumnInfo receives (dest, source) here; confirm this matches its
        // parameter order.
        var input = new OneHotHashEncodingEstimator(Env, new OneHotHashEncodingEstimator.ColumnInfo(dest, source));
        var routingStructure = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });
        yield return new SuggestedTransform(input, routingStructure);
    }
    else if (col.ColumnName != DefaultColumnNames.GroupId)
    {
        var input = new ColumnCopyingEstimator(Env, source, dest);
        var routingStructure = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = source } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });
        yield return new SuggestedTransform(input, routingStructure);
    }
}
/// <summary>
/// Suggests how to produce <c>DefaultColumnNames.Label</c> from the last column whose purpose
/// is <c>ColumnPurpose.Label</c>.
/// </summary>
/// <param name="columns">Candidate columns to inspect.</param>
/// <returns>
/// A value-to-key mapping transform when the label column is text; a column-copy transform when
/// it is not text and is not already named <c>DefaultColumnNames.Label</c>; otherwise nothing.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var lastLabelColId = Array.FindLastIndex(columns, x => x.Purpose == ColumnPurpose.Label);
    if (lastLabelColId < 0)
    {
        yield break;
    }

    var col = columns[lastLabelColId];
    string source = col.ColumnName;
    string dest = DefaultColumnNames.Label;

    if (col.Type.IsText())
    {
        // NOTE(review): none of the outputs are read; the call is kept because its side
        // effects are not visible from here. Confirm whether it can be dropped.
        col.GetUniqueValueCounts<ReadOnlyMemory<char>>(out _, out _, out _);

        var input = new ValueToKeyMappingEstimator(Env, source, dest);
        var routingStructure = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = source } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });
        yield return new SuggestedTransform(input, routingStructure);
    }
    else if (col.ColumnName != DefaultColumnNames.Label)
    {
        var input = new ColumnCopyingEstimator(Env, source, dest);
        var routingStructure = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = source } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = dest } });
        yield return new SuggestedTransform(input, routingStructure);
    }
}