/// <summary>
/// Builds the transform that concatenates a set of newly produced columns into the
/// <c>DefaultColumnNames.Features</c> column.
/// </summary>
/// <param name="newCols">
/// Names of the columns to concatenate. NOTE: this list is modified in place; the features
/// column name is inserted at index 0 when the existing features column is to be carried over.
/// </param>
/// <param name="existingColumns">
/// Columns searched for a <c>ColumnPurpose.NumericFeature</c> column already named
/// <c>DefaultColumnNames.Features</c>.
/// </param>
/// <param name="currentType">Forwarded unchanged to <c>InferenceHelpers.ConcatColumnsIntoOne</c>.</param>
/// <param name="includeFeaturesOverride">
/// When true, the features column name is included even if no matching column is found in
/// <paramref name="existingColumns"/>.
/// </param>
/// <returns>The value returned by <c>InferenceHelpers.ConcatColumnsIntoOne</c>.</returns>
public static SuggestedTransform GetRemainingFeatures(List<string> newCols, IntermediateColumn[] existingColumns,
    Type currentType, bool includeFeaturesOverride)
{
    // Pick up the existing features column, if there is one. Any() stops at the first match
    // instead of counting every matching column.
    bool hasFeaturesColumn = existingColumns.Any(col =>
        col.Purpose == ColumnPurpose.NumericFeature &&
        col.ColumnName == DefaultColumnNames.Features);

    if (includeFeaturesOverride || hasFeaturesColumn)
    {
        newCols.Insert(0, DefaultColumnNames.Features);
    }

    return InferenceHelpers.ConcatColumnsIntoOne(newCols, DefaultColumnNames.Features, currentType, true);
}
/// <summary>
/// Suggests transforms that merge the text feature columns into a single column, run the
/// unigram + trichar text transform on it, and (unless feature concatenation is excluded)
/// concatenate the featurized output into the features column.
/// </summary>
/// <param name="columns">Columns to inspect.</param>
/// <returns>
/// A lazily evaluated sequence of suggested transforms. The sequence is empty when there is
/// no text feature column, or when the number of label columns is not exactly one.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    // Collect the names of all text-typed columns whose purpose is TextFeature.
    var textNames = new List<string>();
    foreach (var candidate in columns)
    {
        if (candidate.Type.ItemType().IsText() && candidate.Purpose == ColumnPurpose.TextFeature)
        {
            textNames.Add(candidate.ColumnName);
        }
    }

    if (textNames.Count == 0)
    {
        yield break;
    }

    // Exactly one label column is required.
    int labelCount = columns.Count(col => col.Purpose == ColumnPurpose.Label);
    if (labelCount != 1)
    {
        yield break;
    }

    // A single text column is used directly; several are first concatenated into a temp column.
    string mergedTextColumn;
    if (textNames.Count == 1)
    {
        mergedTextColumn = textNames[0];
    }
    else
    {
        mergedTextColumn = columns[0].GetTempColumnName("TextConcat");
        yield return InferenceHelpers.ConcatColumnsIntoOne(textNames, mergedTextColumn, GetType(), false);
    }

    // Unigram + trichar text transform over the (possibly concatenated) text column.
    string featurizedColumn = columns[0].GetTempColumnName("FeaturesText");
    yield return InferenceHelpers.TextTransformUnigramTriChar(Env, mergedTextColumn, featurizedColumn);

    // Inputs of the final concat: the featurized text, plus the features column when any
    // numeric or categorical feature column is present.
    var concatInputs = new List<string> { featurizedColumn };
    bool hasOtherFeatures = columns.Any(col =>
        col.Purpose == ColumnPurpose.NumericFeature ||
        col.Purpose == ColumnPurpose.CategoricalFeature);
    if (hasOtherFeatures)
    {
        concatInputs.Add(DefaultColumnNames.Features);
    }

    if (ExcludeFeaturesConcatTransforms)
    {
        yield break;
    }

    yield return InferenceHelpers.ConcatColumnsIntoOne(concatInputs, DefaultColumnNames.Features, GetType(), true);
}
/// <summary>
/// Suggests a type-conversion transform that converts every boolean column whose purpose is
/// <c>ColumnPurpose.NumericFeature</c> to <c>DataKind.R4</c> in place (input and output names
/// are the same), optionally followed by a concat of those columns into the features column.
/// </summary>
/// <param name="columns">Columns to inspect.</param>
/// <returns>
/// A lazily evaluated sequence of suggested transforms; empty when no column qualifies.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var newColumns = new List<TypeConvertingTransformer.ColumnInfo>();

    foreach (var column in columns)
    {
        if (!column.Type.ItemType().IsBool() || column.Purpose != ColumnPurpose.NumericFeature)
        {
            continue;
        }

        // Use the column's own name for both input and output. A StringBuilder shared across
        // iterations would accumulate names ("A", then "AB", ...) and point every column after
        // the first at a name that does not exist.
        newColumns.Add(new TypeConvertingTransformer.ColumnInfo(column.ColumnName, column.ColumnName, DataKind.R4));
    }

    // Nothing to convert: suggest nothing.
    if (newColumns.Count == 0)
    {
        yield break;
    }

    var input = new TypeConvertingEstimator(Env, newColumns.ToArray());

    // Routing: each source column is flagged non-numeric, each destination numeric.
    ColumnRoutingStructure.AnnotatedName[] columnsSource =
        newColumns.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Input }).ToArray();
    ColumnRoutingStructure.AnnotatedName[] columnsDest =
        newColumns.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output }).ToArray();
    var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

    yield return new SuggestedTransform(input, routingStructure);

    // Concat featurized columns into the existing feature column, since at least one column
    // was transformed.
    if (!ExcludeFeaturesConcatTransforms)
    {
        yield return InferenceHelpers.GetRemainingFeatures(newColumns.Select(c => c.Output).ToList(),
            columns, GetType(), IncludeFeaturesOverride);

        // Runs only if the caller keeps enumerating past the concat transform.
        IncludeFeaturesOverride = true;
    }
}
/// <summary>
/// Suggests one text featurizing transform per text-typed column whose purpose is
/// <c>ColumnPurpose.TextFeature</c>, writing to a column named after the source with a
/// "_tf" suffix, optionally followed by a concat of those outputs into the features column.
/// </summary>
/// <param name="columns">Columns to inspect.</param>
/// <returns>
/// A lazily evaluated sequence of suggested transforms; empty when no column qualifies.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    const string featurizedSuffix = "_tf";
    var featurizedNames = new List<string>();

    foreach (var column in columns)
    {
        bool isTextFeature = column.Type.ItemType().IsText() && column.Purpose == ColumnPurpose.TextFeature;
        if (!isTextFeature)
        {
            continue;
        }

        string sourceName = column.ColumnName;
        string destName = sourceName + featurizedSuffix;
        featurizedNames.Add(destName);

        // NOTE(review): arguments are passed as (destination, source) - confirm this matches
        // the parameter order of the TextFeaturizingEstimator constructor in use.
        var estimator = new TextFeaturizingEstimator(Env, destName, sourceName);

        // Routing: the source column is flagged non-numeric, the featurized output numeric.
        var routing = new ColumnRoutingStructure(
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = sourceName } },
            new[] { new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = destName } });

        yield return new SuggestedTransform(estimator, routing);
    }

    // Concat the featurized text columns into the feature column, but only when concat
    // transforms are enabled and at least one column was transformed.
    if (ExcludeFeaturesConcatTransforms || featurizedNames.Count == 0)
    {
        yield break;
    }

    yield return InferenceHelpers.GetRemainingFeatures(featurizedNames, columns, GetType(), IncludeFeaturesOverride);

    // Runs only if the caller keeps enumerating past the concat transform.
    IncludeFeaturesOverride = true;
}
/// <summary>
/// Suggests encoding transforms for text-typed columns whose purpose is
/// <c>ColumnPurpose.CategoricalFeature</c>. Columns for which <c>IsDictionaryOk</c> returns
/// true are one-hot encoded; the rest are one-hot hash encoded. Both encodings write back to
/// the source column name. Optionally followed by a concat of all encoded columns into the
/// features column.
/// </summary>
/// <param name="columns">Columns to inspect.</param>
/// <returns>
/// A lazily evaluated sequence of suggested transforms; empty when no column qualifies.
/// </returns>
public override IEnumerable<SuggestedTransform> Apply(IntermediateColumn[] columns)
{
    var catColumnsNew = new List<OneHotEncodingEstimator.ColumnInfo>();
    var catHashColumnsNew = new List<OneHotHashEncodingEstimator.ColumnInfo>();
    var featureCols = new List<string>();

    foreach (var column in columns)
    {
        if (!column.Type.ItemType().IsText() || column.Purpose != ColumnPurpose.CategoricalFeature)
        {
            continue;
        }

        string columnName = column.ColumnName;
        if (IsDictionaryOk(column, EstimatedSampleFraction))
        {
            catColumnsNew.Add(new OneHotEncodingEstimator.ColumnInfo(columnName, columnName));
        }
        else
        {
            catHashColumnsNew.Add(new OneHotHashEncodingEstimator.ColumnInfo(columnName, columnName));
        }
    }

    if (catColumnsNew.Count > 0)
    {
        // Input and output names are identical, so Output is used for both routing sides.
        ColumnRoutingStructure.AnnotatedName[] columnsSource =
            catColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.Output }).ToArray();
        ColumnRoutingStructure.AnnotatedName[] columnsDest =
            catColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.Output }).ToArray();
        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

        var input = new OneHotEncodingEstimator(Env, catColumnsNew.ToArray());
        featureCols.AddRange(catColumnsNew.Select(c => c.Output));

        yield return new SuggestedTransform(input, routingStructure);
    }

    if (catHashColumnsNew.Count > 0)
    {
        ColumnRoutingStructure.AnnotatedName[] columnsSource =
            catHashColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = false, Name = c.HashInfo.Output }).ToArray();
        ColumnRoutingStructure.AnnotatedName[] columnsDest =
            catHashColumnsNew.Select(c => new ColumnRoutingStructure.AnnotatedName { IsNumeric = true, Name = c.HashInfo.Output }).ToArray();
        var routingStructure = new ColumnRoutingStructure(columnsSource, columnsDest);

        var input = new OneHotHashEncodingEstimator(Env, catHashColumnsNew.ToArray());

        // Hash-encoded outputs are features too: include them in the concat below so they
        // are not left out of the features column (and so a concat is still emitted when
        // every categorical column took the hash path).
        featureCols.AddRange(catHashColumnsNew.Select(c => c.HashInfo.Output));

        yield return new SuggestedTransform(input, routingStructure);
    }

    // Concat the encoded columns into the feature column, if at least one was transformed.
    if (!ExcludeFeaturesConcatTransforms && featureCols.Count > 0)
    {
        yield return InferenceHelpers.GetRemainingFeatures(featureCols, columns, GetType(), IncludeFeaturesOverride);

        // Runs only if the caller keeps enumerating past the concat transform.
        IncludeFeaturesOverride = true;
    }
}