public void CanParseLargeRandomStream()
{
    // Builds an in-memory "text file" of random bytes and verifies the
    // sampler can ingest it and report a positive file size.
    const int numRows = 100000;
    const int rowSize = 100;
    var newline = Encoding.UTF8.GetBytes("\r\n");

    using (var stream = new MemoryStream())
    {
        for (var rowIndex = 0; rowIndex < numRows; rowIndex++)
        {
            var buffer = new byte[rowSize];
            AutoMlUtils.Random.Value.NextBytes(buffer);

            // Replace any zero bytes: the text file sampler would otherwise
            // interpret the data as UTF-16/UTF-32 without a BOM.
            for (var j = 0; j < buffer.Length; j++)
            {
                if (buffer[j] == 0)
                {
                    buffer[j] = 1;
                }
            }

            stream.Write(buffer, 0, rowSize);
            stream.Write(newline, 0, newline.Length);
        }

        // Rewind so the sampler reads from the start of the generated data.
        stream.Seek(0, SeekOrigin.Begin);

        var sample = TextFileSample.CreateFromFullStream(stream);
        Assert.NotNull(sample);
        Assert.True(sample.FullFileSize > 0);
    }
}
public void DatasetInferenceTest()
{
    // End-to-end dataset feature inference over several known datasets:
    // sample the file, detect separator/columns, infer column types and
    // purposes, then compare the serialized result against a baseline.
    var datasets = new[]
    {
        GetDataPath(@"..\UCI\adult.train"),
        GetDataPath(@"..\UCI\adult.test"),
        GetDataPath(@"..\UnitTest\breast-cancer.txt"),
    };

    IHostEnvironment env = new MLContext();
    var h = env.Register("InferDatasetFeatures", seed: 0, verbose: false);
    using (var ch = h.Start("InferDatasetFeatures"))
    {
        for (int i = 0; i < datasets.Length; i++)
        {
            var sample = TextFileSample.CreateFromFullFile(h, datasets[i]);
            var splitResult = TextFileContents.TrySplitColumns(h, sample, TextFileContents.DefaultSeparators);
            if (!splitResult.IsSuccess)
            {
                throw ch.ExceptDecode("Couldn't detect separator.");
            }

            // NOTE(review): `Env` (the test-base environment) is used here while the
            // locals `env`/`h` are used everywhere else in this method — looks
            // unintentional; confirm before unifying.
            var typeInfResult = ColumnTypeInference.InferTextFileColumnTypes(Env, sample,
                new ColumnTypeInference.Arguments
                {
                    Separator = splitResult.Separator,
                    AllowSparse = splitResult.AllowSparse,
                    AllowQuote = splitResult.AllowQuote,
                    ColumnCount = splitResult.ColumnCount
                });

            // NOTE(review): an early return here silently passes the test without
            // reaching Done() — consider failing instead; kept as-is for now.
            if (!typeInfResult.IsSuccess)
            {
                return;
            }

            // Declare directly at first use instead of pre-assigning null/false
            // placeholders that were immediately overwritten.
            ColumnGroupingInference.GroupingColumn[] columns =
                InferenceUtils.InferColumnPurposes(ch, h, sample, splitResult, out bool hasHeader);

            Guid id = new Guid("60C77F4E-DB62-4351-8311-9B392A12968E");
            var commandArgs = new DatasetFeatureInference.Arguments(
                typeInfResult.Data,
                columns.Select(col => new DatasetFeatureInference.Column(
                    col.SuggestedName, col.Purpose, col.ItemKind, col.ColumnRangeSelector)).ToArray(),
                sample.FullFileSize, sample.ApproximateRowCount, false, id, true);

            string jsonString = DatasetFeatureInference.InferDatasetFeatures(env, commandArgs);

            // Write the inference result and diff it against the checked-in baseline.
            var outFile = string.Format("dataset-inference-result-{0:00}.txt", i);
            string dataPath = GetOutputPath(@"..\Common\Inference", outFile);
            using (var sw = new StreamWriter(File.Create(dataPath)))
                sw.WriteLine(jsonString);
            CheckEquality(@"..\Common\Inference", outFile);
        }
    }
    Done();
}
public void TrySplitColumns_should_split_on_dataset_with_newline_between_double_quotes()
{
    // Arrange: a dataset containing a newline embedded inside a quoted field.
    var context = new MLContext();
    var dataset = Path.Combine("TestData", "DatasetWithNewlineBetweenQuotes.txt");
    var sample = TextFileSample.CreateFromFullFile(dataset);

    // Act
    var result = TextFileContents.TrySplitColumns(context, sample, TextFileContents.DefaultSeparators);

    // Assert: check overall success first so a failed split reports the root
    // cause instead of a confusing column-count or separator mismatch.
    result.IsSuccess.Should().BeTrue();
    result.ColumnCount.Should().Be(4);
    result.Separator.Should().Be(',');
}
public void RunCore(IChannel ch)
{
    // Infers the column schema of _dataFile and prints it as JSON, either to
    // _outFile (when set) or to the default output via PrintSchema.
    _host.AssertValue(ch);

    // Inner env is used to ignore verbose messages from the text loader.
    var envInner = _host.Register("inner host", seed: 0, verbose: false);

    ch.Info("Loading file sample into memory.");
    var sample = TextFileSample.CreateFromFullFile(envInner, _dataFile);

    ch.Info("Detecting separator and columns");
    var splitResult = TextFileContents.TrySplitColumns(envInner, sample, TextFileContents.DefaultSeparators);
    if (!splitResult.IsSuccess)
    {
        throw Contracts.ExceptDecode("Couldn't detect separator.");
    }
    ch.Info("Separator detected as '{0}', there are {1} columns.", splitResult.Separator, splitResult.ColumnCount);

    ColumnGroupingInference.GroupingColumn[] groupingResult =
        InferenceUtils.InferColumnPurposes(ch, envInner, sample, splitResult, out bool hasHeader);

    // Serialization is best-effort: on failure, log and emit nothing.
    string json = "";
    try
    {
        json = JsonConvert.SerializeObject(groupingResult, Formatting.Indented);
    }
    catch
    {
        ch.Error("Error serializing the schema file. Check its content.");
    }

    if (string.IsNullOrEmpty(json))
    {
        return;
    }

    if (_outFile != null)
    {
        using (var sw = new StreamWriter(_outFile))
            PrintSchema(json, sw, ch);
    }
    else
    {
        PrintSchema(json, null, ch);
    }
}