/// <summary>
/// Loads a sample of <paramref name="dataFile"/>, determines its column schema (from
/// <paramref name="schemaDefinitionFile"/> when it is usable, otherwise by inference),
/// infers transforms and a predictor category, and returns the suggested recipes.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="dataFile">Path of the text data file to sample.</param>
/// <param name="schemaDefinitionFile">
/// Optional path of a JSON file holding an array of grouping columns. When it is null/empty,
/// unreadable, blank, invalid JSON, or contains no columns, the schema is inferred from the data.
/// </param>
/// <param name="predictorType">
/// Set to the clustering trainer signature first, then replaced by the inferred predictor category.
/// </param>
/// <param name="settingsString">Set to the settings string generated for the final text loader arguments.</param>
/// <param name="inferenceResult">Set to the transform inference result.</param>
/// <param name="excludeFeaturesConcatTransforms">Forwarded to the transform inference arguments.</param>
/// <returns>The suggested recipes produced by recipe inference.</returns>
/// <remarks>
/// Throws through the channel when the separator cannot be detected or when no column
/// schema could be obtained.
/// </remarks>
public static SuggestedRecipe[] InferRecipesFromData(IHostEnvironment env, string dataFile, string schemaDefinitionFile, out Type predictorType, out string settingsString, out TransformInference.InferenceResult inferenceResult, bool excludeFeaturesConcatTransforms = false)
{
    Contracts.CheckValue(env, nameof(env));
    var h = env.Register("InferRecipesFromData", seed: 0, verbose: false);
    using (var ch = h.Start("InferRecipesFromData"))
    {
        // Read the schema file up front, if one was provided, so the user is
        // warned early when it was provided but is being skipped.
        string schemaJson = null;
        bool schemaFileRead = false;
        if (!string.IsNullOrEmpty(schemaDefinitionFile))
        {
            try
            {
                schemaJson = File.ReadAllText(schemaDefinitionFile);
                schemaFileRead = true;
            }
            catch (Exception ex)
            {
                ch.Warning($"Unable to read the schema file. Proceeding to infer the schema :{ex.Message}");
            }
        }

        ch.Info("Loading file sample into memory.");
        var sample = TextFileSample.CreateFromFullFile(h, dataFile);

        ch.Info("Detecting separator and columns");
        var splitResult = TextFileContents.TrySplitColumns(h, sample, TextFileContents.DefaultSeparators);

        // Default the out parameters before any failure path; clustering is the fallback category.
        predictorType = typeof(SignatureClusteringTrainer);
        settingsString = "";
        if (!splitResult.IsSuccess)
        {
            throw ch.ExceptDecode("Couldn't detect separator.");
        }

        ch.Info($"Separator detected as '{splitResult.Separator}', there's {splitResult.ColumnCount} columns.");

        // NOTE: hasHeader is only ever set by inference; it stays false when the schema file is used.
        ColumnGroupingInference.GroupingColumn[] columns = null;
        bool hasHeader = false;
        if (string.IsNullOrWhiteSpace(schemaJson))
        {
            // Only mention the schema file when one was actually read; a missing or
            // unreadable file has either nothing to report or was already reported above.
            if (schemaFileRead)
            {
                ch.Warning("Empty schema file. Proceeding to infer the schema.");
            }
        }
        else
        {
            bool validJson = true;
            try
            {
                columns = JsonConvert.DeserializeObject<ColumnGroupingInference.GroupingColumn[]>(schemaJson);
            }
            catch
            {
                validJson = false;
                columns = null;
            }

            if (!validJson)
            {
                ch.Warning("Invalid json in the schema file. Proceeding to infer the schema.");
            }
            else if (columns == null || columns.Length == 0)
            {
                // Valid JSON that yields no columns (e.g. a JSON null or an empty array)
                // cannot drive the loader, so treat it like an empty file.
                columns = null;
                ch.Warning("Empty schema file. Proceeding to infer the schema.");
            }
            else
            {
                ch.Info("Using the provided schema file.");
            }
        }

        if (columns == null)
        {
            columns = InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out hasHeader);
            if (columns == null)
            {
                // Fail with a clear message instead of a NullReferenceException further down.
                throw ch.ExceptDecode("Couldn't infer the column schema from the data.");
            }
        }

        var finalLoaderArgs = new TextLoader.Arguments
        {
            Column = ColumnGroupingInference.GenerateLoaderColumns(columns),
            HasHeader = hasHeader,
            Separator = splitResult.Separator,
            AllowSparse = splitResult.AllowSparse,
            AllowQuoting = splitResult.AllowQuote
        };

        settingsString = CommandLine.CmdParser.GetSettings(ch, finalLoaderArgs, new TextLoader.Arguments());
        ch.Info($"Loader options: {settingsString}");

        ch.Info("Inferring recipes");
        var finalData = TextLoader.ReadFile(h, finalLoaderArgs, sample);
        var cached = new CacheDataView(h, finalData, Enumerable.Range(0, finalLoaderArgs.Column.Length).ToArray());

        var purposeColumns = columns.Select((x, i) => new PurposeInference.Column(i, x.Purpose, x.ItemKind)).ToArray();

        // Fraction of the full file that the in-memory sample covers; 1.0 when the full size is unknown.
        var fraction = sample.FullFileSize == null ? 1.0 : (double)sample.SampleSize / sample.FullFileSize.Value;
        var transformInferenceResult = TransformInference.InferTransforms(h, cached, purposeColumns,
            new TransformInference.Arguments
            {
                EstimatedSampleFraction = fraction,
                ExcludeFeaturesConcatTransforms = excludeFeaturesConcatTransforms
            });

        predictorType = InferenceUtils.InferPredictorCategoryType(cached, purposeColumns);
        var recipeInferenceResult = InferRecipes(h, transformInferenceResult, predictorType);

        ch.Done();

        inferenceResult = transformInferenceResult;
        return recipeInferenceResult.SuggestedRecipes;
    }
}
/// <summary>
/// Runs column type inference, then purpose inference, then grouping/naming inference
/// over the given text file sample and returns the resulting grouping columns.
/// </summary>
/// <param name="ch">Channel that receives progress and error messages.</param>
/// <param name="env">Host passed to each inference step and to the text loader.</param>
/// <param name="sample">Text file sample to analyze.</param>
/// <param name="splitResult">Column split result supplying the column count, separator, sparse and quote settings.</param>
/// <param name="hasHeader">Set to the header flag reported by column type inference, even when that inference fails.</param>
/// <returns>The inferred grouping columns, or null when column types could not be detected.</returns>
public static ColumnGroupingInference.GroupingColumn[] InferColumnPurposes(IChannel ch, IHost env, TextFileSample sample, TextFileContents.ColumnSplitResult splitResult, out bool hasHeader)
{
    // Step 1: column types.
    ch.Info("Detecting column types");
    var typeArgs = new ColumnTypeInference.Arguments
    {
        ColumnCount = splitResult.ColumnCount,
        Separator = splitResult.Separator,
        AllowSparse = splitResult.AllowSparse,
        AllowQuote = splitResult.AllowQuote,
    };
    var inferredTypes = ColumnTypeInference.InferTextFileColumnTypes(env, sample, typeArgs);

    // The header flag is reported before the success check, so it is set on failure too.
    hasHeader = inferredTypes.HasHeader;
    if (!inferredTypes.IsSuccess)
    {
        ch.Error("Couldn't detect column types.");
        return null;
    }

    // Step 2: column purposes, using a loader typed according to step 1.
    ch.Info("Detecting column purposes");
    var loaderArgs = new TextLoader.Arguments
    {
        Column = ColumnTypeInference.GenerateLoaderColumns(inferredTypes.Columns),
        Separator = splitResult.Separator,
        AllowSparse = splitResult.AllowSparse,
        AllowQuoting = splitResult.AllowQuote,
        HasHeader = inferredTypes.HasHeader
    };
    var loader = new TextLoader(env, loaderArgs, sample);
    var columnIndices = Utils.GetIdentityPermutation(loaderArgs.Column.Length);
    var inferredPurposes = PurposeInference.InferPurposes(env, loader, columnIndices, new PurposeInference.Arguments());

    // Step 3: grouping and column names.
    ch.Info("Detecting column grouping and generating column names");
    var grouping = ColumnGroupingInference.InferGroupingAndNames(env, inferredTypes.HasHeader, inferredTypes.Columns, inferredPurposes.Columns);
    return grouping.Columns;
}