/// <summary>
/// Creates a column description from a name, a purpose, an optional data kind and a textual range specification.
/// </summary>
/// <param name="name">Column name; validated with <c>Contracts.CheckValue</c> before use.</param>
/// <param name="purpose">Purpose stored in <c>ColumnPurpose</c>.</param>
/// <param name="dataKind">Optional data kind stored in <c>Kind</c>; may be null.</param>
/// <param name="ranges">Range specification; validated with <c>Contracts.CheckValue</c> and then parsed by <c>ColumnGroupingInference.GetRange</c>.</param>
public Column(string name, ColumnPurpose purpose, DataKind? dataKind, string ranges)
            {
                // Validate the two reference-typed arguments first, in declaration order.
                Contracts.CheckValue(name, nameof(name));
                Contracts.CheckValue(ranges, nameof(ranges));

                // Plain copies of the supplied values.
                _name = name;
                ColumnPurpose = purpose;
                Kind = dataKind;

                // The textual range is converted by the grouping-inference helper.
                var parsedRange = ColumnGroupingInference.GetRange(ranges);
                Indices = parsedRange;
            }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs the three inference stages over a text file sample: column types, column purposes,
        /// and finally column grouping with generated names.
        /// </summary>
        /// <param name="ch">Channel used for progress and error messages.</param>
        /// <param name="env">Host passed to every inference stage and to the intermediate loader.</param>
        /// <param name="sample">Sample of the text file to analyze.</param>
        /// <param name="splitResult">Result of the earlier column-splitting step (separator, column count, sparse/quote flags).</param>
        /// <param name="hasHeader">Receives the header flag reported by type inference; set even when inference fails.</param>
        /// <returns>The inferred grouping columns, or null when column types could not be detected.</returns>
        public static ColumnGroupingInference.GroupingColumn[] InferColumnPurposes(IChannel ch, IHost env, TextFileSample sample, TextFileContents.ColumnSplitResult splitResult, out bool hasHeader)
        {
            // Stage 1: column types.
            ch.Info("Detecting column types");
            var typeArgs = new ColumnTypeInference.Arguments
            {
                ColumnCount = splitResult.ColumnCount,
                Separator = splitResult.Separator,
                AllowSparse = splitResult.AllowSparse,
                AllowQuote = splitResult.AllowQuote,
            };
            var types = ColumnTypeInference.InferTextFileColumnTypes(env, sample, typeArgs);

            // The header flag is reported to the caller regardless of success.
            hasHeader = types.HasHeader;
            if (!types.IsSuccess)
            {
                ch.Error("Couldn't detect column types.");
                return null;
            }

            // Stage 2: column purposes, computed over a loader built from the inferred types.
            ch.Info("Detecting column purposes");
            var loaderArgs = new TextLoader.Arguments
            {
                Column = ColumnTypeInference.GenerateLoaderColumns(types.Columns),
                Separator = splitResult.Separator,
                AllowSparse = splitResult.AllowSparse,
                AllowQuoting = splitResult.AllowQuote,
                HasHeader = types.HasHeader
            };
            var loader = new TextLoader(env, loaderArgs, sample);

            var columnOrder = Utils.GetIdentityPermutation(loaderArgs.Column.Length);
            var purposes = PurposeInference.InferPurposes(env, loader, columnOrder, new PurposeInference.Arguments());

            // Stage 3: grouping and naming.
            ch.Info("Detecting column grouping and generating column names");
            var grouping = ColumnGroupingInference.InferGroupingAndNames(
                env, types.HasHeader, types.Columns, purposes.Columns);

            return grouping.Columns;
        }
        /// <summary>
        /// Infers suggested recipes for a text data file: detects the separator, obtains the column schema
        /// (from an optional JSON schema file, otherwise by inference), builds the loader settings, infers
        /// transforms and the predictor category, and finally infers the recipes.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="dataFile">Path of the data file to sample.</param>
        /// <param name="schemaDefinitionFile">Optional path to a JSON file holding a serialized
        /// <c>ColumnGroupingInference.GroupingColumn[]</c>. When null, empty, unreadable, or invalid,
        /// the schema is inferred from the data instead.</param>
        /// <param name="predictorType">Receives the inferred predictor category type.</param>
        /// <param name="settingsString">Receives the loader settings string produced from the final loader arguments.</param>
        /// <param name="inferenceResult">Receives the transform inference result.</param>
        /// <param name="excludeFeaturesConcatTransforms">Forwarded to transform inference as <c>ExcludeFeaturesConcatTransforms</c>.</param>
        /// <returns>The suggested recipes.</returns>
        /// <remarks>
        /// Throws (via <c>ch.ExceptDecode</c>) when the separator cannot be detected or when no column
        /// schema could be obtained.
        /// </remarks>
        public static SuggestedRecipe[] InferRecipesFromData(IHostEnvironment env, string dataFile, string schemaDefinitionFile,
                                                             out Type predictorType, out string settingsString, out TransformInference.InferenceResult inferenceResult,
                                                             bool excludeFeaturesConcatTransforms = false)
        {
            Contracts.CheckValue(env, nameof(env));
            var h = env.Register("InferRecipesFromData", seed: 0, verbose: false);

            using (var ch = h.Start("InferRecipesFromData"))
            {
                // Validate the schema file has content if provided.
                // Warn the user early if it is provided but being skipped.
                string schemaJson = null;
                bool schemaFileWasRead = false;
                if (!string.IsNullOrEmpty(schemaDefinitionFile))
                {
                    try
                    {
                        schemaJson = File.ReadAllText(schemaDefinitionFile);
                        schemaFileWasRead = true;
                    }
                    catch (Exception ex)
                    {
                        ch.Warning($"Unable to read the schema file. Proceeding to infer the schema :{ex.Message}");
                    }
                }

                ch.Info("Loading file sample into memory.");
                var sample = TextFileSample.CreateFromFullFile(h, dataFile);

                ch.Info("Detecting separator and columns");
                var splitResult = TextFileContents.TrySplitColumns(h, sample, TextFileContents.DefaultSeparators);

                // initialize to clustering if we're not successful?
                predictorType = typeof(SignatureClusteringTrainer);
                settingsString = "";
                if (!splitResult.IsSuccess)
                {
                    throw ch.ExceptDecode("Couldn't detect separator.");
                }

                ch.Info($"Separator detected as '{splitResult.Separator}', there's {splitResult.ColumnCount} columns.");

                ColumnGroupingInference.GroupingColumn[] columns;
                bool hasHeader = false;
                if (string.IsNullOrEmpty(schemaJson))
                {
                    // Only report an empty schema file when one was actually read. When no file was
                    // supplied there is nothing to warn about, and a failed read was already reported above.
                    if (schemaFileWasRead)
                    {
                        ch.Warning("Empty schema file. Proceeding to infer the schema.");
                    }

                    columns = InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out hasHeader);
                }
                else
                {
                    try
                    {
                        columns = JsonConvert.DeserializeObject<ColumnGroupingInference.GroupingColumn[]>(schemaJson);
                        ch.Info("Using the provided schema file.");
                    }
                    catch
                    {
                        // Best effort: any failure to parse the schema falls back to inference.
                        ch.Warning("Invalid json in the schema file. Proceeding to infer the schema.");
                        columns = InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out hasHeader);
                    }
                }

                // Column inference returns null when column types cannot be detected, and a JSON
                // document consisting of 'null' deserializes to null. Fail with a clear error here
                // rather than with a null dereference further down.
                if (columns == null)
                {
                    throw ch.ExceptDecode("Couldn't determine the column schema.");
                }

                var finalLoaderArgs = new TextLoader.Arguments
                {
                    Column = ColumnGroupingInference.GenerateLoaderColumns(columns),
                    HasHeader = hasHeader,
                    Separator = splitResult.Separator,
                    AllowSparse = splitResult.AllowSparse,
                    AllowQuoting = splitResult.AllowQuote
                };

                settingsString = CommandLine.CmdParser.GetSettings(ch, finalLoaderArgs, new TextLoader.Arguments());
                ch.Info($"Loader options: {settingsString}");

                ch.Info("Inferring recipes");
                var finalData = TextLoader.ReadFile(h, finalLoaderArgs, sample);
                var cached = new CacheDataView(h, finalData,
                                               Enumerable.Range(0, finalLoaderArgs.Column.Length).ToArray());

                var purposeColumns = columns.Select((x, i) => new PurposeInference.Column(i, x.Purpose, x.ItemKind)).ToArray();

                // When the full file size is unknown, the sample is treated as the whole file.
                var fraction = sample.FullFileSize == null ? 1.0 : (double)sample.SampleSize / sample.FullFileSize.Value;
                var transformInferenceResult = TransformInference.InferTransforms(h, cached, purposeColumns,
                    new TransformInference.Arguments
                    {
                        EstimatedSampleFraction = fraction,
                        ExcludeFeaturesConcatTransforms = excludeFeaturesConcatTransforms
                    });
                predictorType = InferenceUtils.InferPredictorCategoryType(cached, purposeColumns);
                var recipeInferenceResult = InferRecipes(h, transformInferenceResult, predictorType);

                ch.Done();

                inferenceResult = transformInferenceResult;
                return recipeInferenceResult.SuggestedRecipes;
            }
        }