Пример #1
0
        /// <summary>
        /// Runs dataset feature inference over a fixed list of sample datasets and compares
        /// the result produced for each dataset against its baseline file.
        /// </summary>
        /// <remarks>
        /// For every dataset the test loads a sample, detects the separator and column count,
        /// infers column types and column purposes, calls
        /// <c>DatasetFeatureInference.InferDatasetFeatures</c>, writes the returned string to an
        /// output file and checks that file with <c>CheckEquality</c>.
        /// </remarks>
        public void DatasetInferenceTest()
        {
            var datasets = new[]
            {
                GetDataPath(@"..\UCI\adult.train"),
                GetDataPath(@"..\UCI\adult.test"),
                GetDataPath(@"..\UnitTest\breast-cancer.txt"),
            };

            IHostEnvironment env = new MLContext();
            var h = env.Register("InferDatasetFeatures", seed: 0, verbose: false);

            using (var ch = h.Start("InferDatasetFeatures"))
            {
                for (int i = 0; i < datasets.Length; i++)
                {
                    var sample      = TextFileSample.CreateFromFullFile(h, datasets[i]);
                    var splitResult = TextFileContents.TrySplitColumns(h, sample, TextFileContents.DefaultSeparators);
                    if (!splitResult.IsSuccess)
                    {
                        throw ch.ExceptDecode("Couldn't detect separator.");
                    }

                    var typeInferenceArgs = new ColumnTypeInference.Arguments
                    {
                        Separator   = splitResult.Separator,
                        AllowSparse = splitResult.AllowSparse,
                        AllowQuote  = splitResult.AllowQuote,
                        ColumnCount = splitResult.ColumnCount
                    };
                    var typeInfResult = ColumnTypeInference.InferTextFileColumnTypes(Env, sample, typeInferenceArgs);

                    if (!typeInfResult.IsSuccess)
                    {
                        // Fail loudly: a silent return here would skip the remaining datasets,
                        // the baseline comparison and Done(), hiding a type-inference regression.
                        throw ch.ExceptDecode("Couldn't infer column types.");
                    }

                    // hasHeader is required by the out parameter but is not used by this test.
                    bool hasHeader;
                    ColumnGroupingInference.GroupingColumn[] columns =
                        InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out hasHeader);

                    var featureColumns = columns
                        .Select(col => new DatasetFeatureInference.Column(
                            col.SuggestedName, col.Purpose, col.ItemKind, col.ColumnRangeSelector))
                        .ToArray();

                    // Fixed GUID so the same id is passed to the inference arguments on every run.
                    Guid id          = new Guid("60C77F4E-DB62-4351-8311-9B392A12968E");
                    var  commandArgs = new DatasetFeatureInference.Arguments(
                        typeInfResult.Data,
                        featureColumns,
                        sample.FullFileSize,
                        sample.ApproximateRowCount,
                        false, id, true);

                    string jsonString = DatasetFeatureInference.InferDatasetFeatures(env, commandArgs);
                    var    outFile    = string.Format("dataset-inference-result-{0:00}.txt", i);
                    string dataPath   = GetOutputPath(@"..\Common\Inference", outFile);
                    using (var sw = new StreamWriter(File.Create(dataPath)))
                    {
                        sw.WriteLine(jsonString);
                    }

                    CheckEquality(@"..\Common\Inference", outFile);
                }
            }
            Done();
        }
Пример #2
0
        /// <summary>
        /// Checks that column splitting succeeds on a dataset that contains a newline inside
        /// double quotes, expecting 4 columns separated by a comma.
        /// </summary>
        public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes()
        {
            MLContext mlContext = new MLContext();
            TextFileSample fileSample = TextFileSample.CreateFromFullFile(
                Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt"));

            var splitOutcome = TextFileContents.TrySplitColumns(
                mlContext,
                fileSample,
                TextFileContents.DefaultSeparators);

            // Assertions are kept in the original order so the first reported failure is unchanged.
            splitOutcome.ColumnCount.Should().Be(4);
            splitOutcome.Separator.Should().Be(',');
            splitOutcome.IsSuccess.Should().BeTrue();
        }
        /// <summary>
        /// Loads a sample of the data file, detects its separator and columns, infers the
        /// column purposes and prints the resulting grouping as indented JSON.
        /// </summary>
        /// <param name="ch">Channel used for progress and error messages; must not be null.</param>
        /// <remarks>
        /// Throws the exception produced by <c>ExceptDecode</c> when no separator can be detected.
        /// A serialization failure is logged, not thrown, and in that case nothing is printed.
        /// </remarks>
        public void RunCore(IChannel ch)
        {
            _host.AssertValue(ch);

            // Inner env is used to ignore verbose messages from the text loader.
            var envInner = _host.Register("inner host", seed: 0, verbose: false);

            ch.Info("Loading file sample into memory.");
            var sample = TextFileSample.CreateFromFullFile(envInner, _dataFile);

            ch.Info("Detecting separator and columns");
            var splitResult = TextFileContents.TrySplitColumns(envInner, sample, TextFileContents.DefaultSeparators);

            if (!splitResult.IsSuccess)
            {
                // Raise through the channel so the exception carries the channel's context.
                throw ch.ExceptDecode("Couldn't detect separator.");
            }

            ch.Info("Separator detected as '{0}', there are {1} columns.", splitResult.Separator, splitResult.ColumnCount);

            // hasHeader is required by the out parameter but is not used afterwards.
            bool hasHeader;

            ColumnGroupingInference.GroupingColumn[] groupingResult = InferenceUtils.InferColumnPurposes(ch, envInner, sample, splitResult, out hasHeader);

            string json = "";

            try
            {
                json = JsonConvert.SerializeObject(groupingResult, Formatting.Indented);
            }
            catch (System.Exception ex)
            {
                // Still best-effort (no rethrow), but the underlying cause is now reported
                // instead of being discarded.
                ch.Error("Error serializing the schema file. Check its content.");
                ch.Error("Serialization failure details: {0}", ex.Message);
            }

            // An empty string means serialization failed (or produced nothing), so nothing is printed.
            if (!string.IsNullOrEmpty(json))
            {
                if (_outFile != null)
                {
                    using (var sw = new StreamWriter(_outFile))
                    {
                        PrintSchema(json, sw, ch);
                    }
                }
                else
                {
                    // No output file configured: PrintSchema receives a null writer.
                    PrintSchema(json, null, ch);
                }
            }
        }