public IExtractedDataset<T> Extract<T>(IDataToImport dataToImport) where T : new()
{
    var extractedDataset = new ExtractedDataset<T>(_thresholdLevel);

    var validatedDataToImportResult = ValidateDataToImport(dataToImport);
    extractedDataset.AddParsingResults(validatedDataToImportResult.Item2);
    if (!validatedDataToImportResult.Item1)
    {
        // Format not valid, return early.
        return extractedDataset;
    }

    var results = new List<IResult>();
    var model = new T();
    IDataSourceLocation currentLocation = null;

    foreach (var configuration in _extractConfigurations)
    {
        // Only SimpleXMLExtractConfiguration entries carry an XML location and extract
        // logic; guarding the cast avoids an InvalidCastException for any other type.
        if (configuration is SimpleXMLExtractConfiguration simpleXmlConfiguration)
        {
            currentLocation = new XMLDataSourceLocation(simpleXmlConfiguration.ElementName, simpleXmlConfiguration.AttributeName);
            results.AddRange(simpleXmlConfiguration.ExtractData(model, dataToImport, currentLocation));
        }
    }

    var parsingResult = new ParsingResult(ResultLevel.DEBUG, "Extract data from single row success", model, null);
    results.Add(parsingResult);
    extractedDataset.AddParsingResults(results);

    return extractedDataset;
}
public void TestUnifiedIteratorYield()
{
    string filename = ".unittestfile" + nameof(TestUnifiedIteratorYield);
    CreateCsvTempFile(filename);
    SigmaEnvironment.Clear();

    FileSource source = new FileSource(filename, Path.GetTempPath());
    CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(
        new CsvRecordExtractor(new Dictionary<string, int[][]> { ["inputs"] = new[] { new[] { 0 } } }));
    ExtractedDataset dataset = new ExtractedDataset("test", 2, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestUnifiedIteratorYield)), true, extractor);
    UnifiedIterator iterator = new UnifiedIterator(dataset);

    SigmaEnvironment sigma = SigmaEnvironment.Create("test");
    IComputationHandler handler = new CpuFloat32Handler();

    foreach (var block in iterator.Yield(handler, sigma))
    {
        Assert.AreEqual(new[] { 5.1f, 4.9f, 4.7f }, block["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 3));
    }

    dataset.Dispose();
    DeleteTempFile(filename);
}
public void TestDatasetBlockwiseSliceCreate()
{
    RedirectGlobalsToTempPath();
    string filename = nameof(TestDatasetBlockwiseSliceCreate) + "test.dat";
    CreateCsvTempFile(filename);

    CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename)).Extractor("inputs", 1, 2, "targets", 3);
    ExtractedDataset dataset = new ExtractedDataset("name", ExtractedDataset.BlockSizeAuto, extractor);

    Assert.Throws<ArgumentNullException>(() => new DatasetBlockwiseSlice(null, 0, 0, 1));
    Assert.Throws<ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 0, 0, 0));
    Assert.Throws<ArgumentException>(() => new DatasetBlockwiseSlice(dataset, -1, 0, 1));
    Assert.Throws<ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 0, -1, 1));
    Assert.Throws<ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 1, 0, 1));
    Assert.Throws<ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 0, 2, 2));

    DatasetBlockwiseSlice slice = new DatasetBlockwiseSlice(dataset, 0, 1, 3);

    Assert.AreSame(dataset, slice.UnderlyingDataset);
    Assert.AreEqual(0, slice.SplitBeginIndex);
    Assert.AreEqual(1, slice.SplitEndIndex);
    Assert.AreEqual(2, slice.SplitSize);
    Assert.AreEqual(3, slice.SplitInterval);
    Assert.AreEqual(dataset.Name, slice.Name);
    Assert.AreEqual(dataset.TargetBlockSizeRecords, slice.TargetBlockSizeRecords);
    Assert.AreEqual(dataset.SectionNames, slice.SectionNames);

    DeleteTempFile(filename);
}
public IExtractedDataset<T> Extract<T>(IDataToImport dataToImport) where T : new()
{
    var extractedDataset = new ExtractedDataset<T>(_thresholdLevel);

    var validatedDataToImportResult = ValidateDataToImport(dataToImport);
    extractedDataset.AddParsingResults(validatedDataToImportResult.Item2);
    if (!validatedDataToImportResult.Item1)
    {
        // Format not valid, return early.
        return extractedDataset;
    }

    var xmlDataSource = dataToImport as XMLDataToImport;
    var rawData = xmlDataSource.Data as XDocument;
    XNamespace rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";

    // Treat each element matching the configured root node as a standalone document
    // and extract it separately.
    var parentNodes = rawData.Descendants().Where(x => x.Name.LocalName == _rootNode);
    foreach (XElement element in parentNodes)
    {
        var extractResultsForNode = ExtractDataForSingleNode<T>(_extractConfigurations, new XMLDataToImport("", new XDocument(element)));
        extractedDataset.AddParsingResults(extractResultsForNode);
    }

    return extractedDataset;
}
private static void SampleHutter()
{
    const long timeWindowSize = 10L;

    SigmaEnvironment sigma = SigmaEnvironment.Create("recurrent");

    IDataSource source = new MultiSource(new FileSource("enwik8"), new CompressedSource(new MultiSource(new FileSource("enwik8.zip"), new UrlSource("http://mattmahoney.net/dc/enwik8.zip"))));
    IRecordExtractor extractor = new CharacterRecordReader(source, (int) (timeWindowSize + 1), Encoding.ASCII)
        .Extractor(new ArrayRecordExtractor<short>(ArrayRecordExtractor<short>
            .ParseExtractorParameters("inputs", new[] { 0L }, new[] { timeWindowSize }, "targets", new[] { 0L }, new[] { timeWindowSize }))
            .Offset("targets", 1L))
        .Preprocess(new PermutePreprocessor(0, 2, 1))
        .Preprocess(new OneHotPreprocessor(0, 255));
    IDataset dataset = new ExtractedDataset("hutter", ExtractedDataset.BlockSizeAuto, false, extractor);

    ITrainer trainer = sigma.CreateTrainer("hutter");

    trainer.Network.Architecture = InputLayer.Construct(256)
        + RecurrentLayer.Construct(256)
        + OutputLayer.Construct(256)
        + SoftMaxCrossEntropyCostLayer.Construct();
    trainer.TrainingDataIterator = new MinibatchIterator(32, dataset);
    trainer.AddNamedDataIterator("validation", new MinibatchIterator(100, dataset));
    trainer.Optimiser = new AdagradOptimiser(baseLearningRate: 0.07);
    trainer.Operator = new CudaSinglethreadedOperator();

    trainer.AddInitialiser("*.*", new GaussianInitialiser(standardDeviation: 0.05));

    trainer.AddLocalHook(new AccumulatedValueReporter("optimiser.cost_total", TimeStep.Every(1, TimeScale.Iteration), averageValues: true));
    trainer.AddLocalHook(new RunningTimeReporter(TimeStep.Every(10, TimeScale.Iteration)));

    sigma.PrepareAndRun();
}
private static void SampleLoadExtractIterate()
{
    SigmaEnvironment sigma = SigmaEnvironment.Create("test");
    sigma.Prepare();

    //var irisReader = new CsvRecordReader(new MultiSource(new FileSource("iris.data"), new UrlSource("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")));
    //IRecordExtractor irisExtractor = irisReader.Extractor("inputs2", new[] { 0, 3 }, "targets2", 4).AddValueMapping(4, "Iris-setosa", "Iris-versicolor", "Iris-virginica");
    //irisExtractor = irisExtractor.Preprocess(new OneHotPreprocessor(sectionName: "targets2", minValue: 0, maxValue: 2), new NormalisingPreprocessor(sectionNames: "inputs2", minInputValue: 0, maxInputValue: 6));

    ByteRecordReader mnistImageReader = new ByteRecordReader(headerLengthBytes: 16, recordSizeBytes: 28 * 28, source: new CompressedSource(new MultiSource(new FileSource("train-images-idx3-ubyte.gz"), new UrlSource("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))));
    IRecordExtractor mnistImageExtractor = mnistImageReader.Extractor("inputs", new[] { 0L, 0L }, new[] { 28L, 28L }).Preprocess(new NormalisingPreprocessor(0, 255));

    ByteRecordReader mnistTargetReader = new ByteRecordReader(headerLengthBytes: 8, recordSizeBytes: 1, source: new CompressedSource(new MultiSource(new FileSource("train-labels-idx1-ubyte.gz"), new UrlSource("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))));
    IRecordExtractor mnistTargetExtractor = mnistTargetReader.Extractor("targets", new[] { 0L }, new[] { 1L }).Preprocess(new OneHotPreprocessor(minValue: 0, maxValue: 9));

    IComputationHandler handler = new CpuFloat32Handler();

    ExtractedDataset dataset = new ExtractedDataset("mnist-training", ExtractedDataset.BlockSizeAuto, mnistImageExtractor, mnistTargetExtractor);
    IDataset[] slices = dataset.SplitRecordwise(0.8, 0.2);
    IDataset trainingData = slices[0];
    IDataset validationData = slices[1];

    MinibatchIterator trainingIterator = new MinibatchIterator(1, trainingData);
    MinibatchIterator validationIterator = new MinibatchIterator(1, validationData);

    while (true)
    {
        foreach (var block in trainingIterator.Yield(handler, sigma))
        {
            Thread.Sleep(100);
            PrintFormattedBlock(block, PrintUtils.AsciiGreyscalePalette);
            Thread.Sleep(1000);
        }
    }

    //Random random = new Random();
    //INDArray array = new ADNDArray<float>(3, 1, 2, 2);
    //new GaussianInitialiser(0.05, 0.05).Initialise(array, Handler, random);
    //Console.WriteLine(array);
    //new ConstantValueInitialiser(1).Initialise(array, Handler, random);
    //Console.WriteLine(array);
    //dataset.InvalidateAndClearCaches();
}
public void AssertExtractedFailDataset()
{
    var dataSet = new ExtractedDataset(ResultLevel.FATAL);
    var baseResult = new BaseResult(ResultLevel.FATAL, "Base result message");
    var parsingResult = new ParsingResult(ResultLevel.ERROR, "Parsing result", 123);

    dataSet.AddParsingResult(baseResult);
    dataSet.AddParsingResult(parsingResult);

    Assert.False(dataSet.IsExtractedSuccess);
    Assert.AreEqual(ResultLevel.FATAL, dataSet.ThresholdLevel);

    var entities = dataSet.ExtractedEntities;
    Assert.Null(entities);
}
public void TestUndividedIteratorCreate()
{
    string filename = ".unittestfile" + nameof(TestUndividedIteratorCreate);
    CreateCsvTempFile(filename);

    FileSource source = new FileSource(filename, Path.GetTempPath());
    CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(
        new CsvRecordExtractor(new Dictionary<string, int[][]> { ["inputs"] = new[] { new[] { 0 } } }));
    ExtractedDataset dataset = new ExtractedDataset("test", 1, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestUndividedIteratorCreate)), true, extractor);

    Assert.Throws<ArgumentNullException>(() => new UndividedIterator(null));

    dataset.Dispose();
    DeleteTempFile(filename);
}
public void AssertExtractedSuccessDataset()
{
    var dataSet = new ExtractedDataset(ResultLevel.ERROR);
    var baseResult = new BaseResult(ResultLevel.INFO, "Base result message");
    var parsingResult = new ParsingResult(ResultLevel.INFO, "Parsing result", 123);

    dataSet.AddParsingResult(baseResult);
    dataSet.AddParsingResult(parsingResult);

    Assert.True(dataSet.IsExtractedSuccess);
    Assert.AreEqual(ResultLevel.ERROR, dataSet.ThresholdLevel);

    var entities = dataSet.ExtractedEntities;
    Assert.AreEqual(1, entities.Count());
    Assert.AreEqual(123, (int) entities.First());
}
public void TestDatasetBlockwiseSliceFetch()
{
    RedirectGlobalsToTempPath();
    string filename = nameof(TestDatasetBlockwiseSliceFetch) + "test.dat";
    CreateCsvTempFile(filename);

    CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename)).Extractor("inputs", 0, "targets", 3);
    ExtractedDataset dataset = new ExtractedDataset("name", 1, extractor);
    DatasetBlockwiseSlice slice = new DatasetBlockwiseSlice(dataset, 1, 2, 3);

    Assert.AreEqual(new float[] { 4.9f }, slice.FetchBlock(0, new CpuFloat32Handler())["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 1));

    extractor.Reader?.Dispose();
    dataset.Dispose();
    DeleteTempFile(filename);
}
public void AssertExtractedFailDataset()
{
    var mockDataSourceLocation = new Mock<IDataSourceLocation>();
    var dataSet = new ExtractedDataset<int>(ResultLevel.FATAL);
    var baseResult = new BaseResult(ResultLevel.FATAL, "Base result message");
    var parsingResult = new ParsingResult(ResultLevel.ERROR, "Parsing result", 123, mockDataSourceLocation.Object);

    dataSet.AddParsingResult(baseResult);
    dataSet.AddParsingResult(parsingResult);

    Assert.False(dataSet.IsExtractedSuccess);
    Assert.AreEqual(ResultLevel.FATAL, dataSet.ThresholdLevel);

    var entities = dataSet.ExtractedEntities;
    Assert.Null(entities);
}
public void AssertExtractedSuccessDataset()
{
    var mockDataSourceLocation = new Mock<IDataSourceLocation>();
    var dataSet = new ExtractedDataset<int>(ResultLevel.ERROR);
    var baseResult = new BaseResult(ResultLevel.INFO, "Base result message");
    var parsingResult = new ParsingResult(ResultLevel.INFO, "Parsing result", 123, mockDataSourceLocation.Object);

    dataSet.AddParsingResult(baseResult);
    dataSet.AddParsingResult(parsingResult);

    Assert.True(dataSet.IsExtractedSuccess);
    Assert.AreEqual(ResultLevel.ERROR, dataSet.ThresholdLevel);

    var entities = dataSet.ExtractedEntities;
    Assert.AreEqual(1, entities.Count());
    Assert.AreEqual(123, (int) entities.First());
}
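// Usage sketch (an assumption-laden illustration, not part of the codebase): the two
// tests above imply that ExtractedDataset<T> only exposes entities while no result at
// or above the threshold level has been recorded. Only members exercised by those
// tests are used here; the method name is hypothetical.
public void SketchThresholdGating()
{
    var dataSet = new ExtractedDataset<int>(ResultLevel.ERROR);
    dataSet.AddParsingResult(new BaseResult(ResultLevel.INFO, "Informational only"));

    // IsExtractedSuccess stays true until a result at or above ERROR is added;
    // once it flips, ExtractedEntities is null (see the failure test above).
    if (dataSet.IsExtractedSuccess)
    {
        foreach (var entity in dataSet.ExtractedEntities)
        {
            Console.WriteLine(entity);
        }
    }
}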
public void TestUndividedIteratorYield()
{
    string filename = ".unittestfile" + nameof(TestUndividedIteratorYield);
    CreateCsvTempFile(filename);
    SigmaEnvironment.Clear();

    FileSource source = new FileSource(filename, Path.GetTempPath());
    CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(
        new CsvRecordExtractor(new Dictionary<string, int[][]> { ["inputs"] = new[] { new[] { 0 } } }));
    ExtractedDataset dataset = new ExtractedDataset("test", 2, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestUndividedIteratorYield)), true, extractor);
    UndividedIterator iterator = new UndividedIterator(dataset);

    SigmaEnvironment sigma = SigmaEnvironment.Create("test");
    IComputationHandler handler = new CpuFloat32Handler();

    int index = 0;
    foreach (var block in iterator.Yield(handler, sigma))
    {
        if (index == 0)
        {
            Assert.AreEqual(new float[] { 5.1f, 4.9f }, block["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));
        }
        else if (index == 1)
        {
            Assert.AreEqual(new float[] { 4.7f }, block["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 1));
        }
        else
        {
            Assert.Fail("There can be a maximum of two iterations, but this is yield iteration 3 (index 2).");
        }

        index++;
    }

    dataset.Dispose();
    DeleteTempFile(filename);
}
private static void SampleCachedFastIteration()
{
    SigmaEnvironment sigma = SigmaEnvironment.Create("test");

    IDataSource dataSource = new CompressedSource(new MultiSource(new FileSource("train-images-idx3-ubyte.gz"), new UrlSource("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")));

    ByteRecordReader mnistImageReader = new ByteRecordReader(headerLengthBytes: 16, recordSizeBytes: 28 * 28, source: dataSource);
    IRecordExtractor mnistImageExtractor = mnistImageReader.Extractor("inputs", new[] { 0L, 0L }, new[] { 28L, 28L }).Preprocess(new NormalisingPreprocessor(0, 255));

    IDataset dataset = new ExtractedDataset("mnist-training", ExtractedDataset.BlockSizeAuto, mnistImageExtractor);
    IDataset[] slices = dataset.SplitRecordwise(0.8, 0.2);
    IDataset trainingData = slices[0];

    Stopwatch stopwatch = Stopwatch.StartNew();

    IDataIterator iterator = new MinibatchIterator(10, trainingData);
    foreach (var block in iterator.Yield(new CpuFloat32Handler(), sigma))
    {
        //PrintFormattedBlock(block, PrintUtils.AsciiGreyscalePalette);
    }

    Console.Write("\nFirst iteration took " + stopwatch.Elapsed + "\n+=+ Iterating over dataset again +=+ Dramatic pause...");

    ArrayUtils.Range(1, 10).ToList().ForEach(i =>
    {
        Thread.Sleep(500);
        Console.Write(".");
    });

    stopwatch.Restart();

    foreach (var block in iterator.Yield(new CpuFloat32Handler(), sigma))
    {
        //PrintFormattedBlock(block, PrintUtils.AsciiGreyscalePalette);
    }

    Console.WriteLine("Second iteration took " + stopwatch.Elapsed);
}
public void TestDatasetFetchBlockSequential()
{
    RedirectGlobalsToTempPath();
    string filename = $"test{nameof(TestDatasetFetchBlockSequential)}.dat";
    CreateCsvTempFile(filename);

    CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename, Path.GetTempPath())).Extractor("inputs", 1, 2, "targets", 3);
    ExtractedDataset dataset = new ExtractedDataset(name: "name", blockSizeRecords: 1, recordExtractors: extractor);
    CpuFloat32Handler handler = new CpuFloat32Handler();

    IDictionary<string, INDArray> namedArrays = dataset.FetchBlock(0, handler, false);
    Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    // Fetch the same block twice to check that the same block is returned.
    namedArrays = dataset.FetchBlock(0, handler, false);
    Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    // Skip the second block (index 1) and fetch it afterwards.
    namedArrays = dataset.FetchBlock(2, handler, false);
    Assert.AreEqual(new[] { 3.2f, 1.3f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    namedArrays = dataset.FetchBlock(1, handler, false);
    Assert.AreEqual(new[] { 3.0f, 1.4f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    // There is no block at index 3; fetching past the end yields null.
    namedArrays = dataset.FetchBlock(3, handler, false);
    Assert.IsNull(namedArrays);

    dataset.Dispose();
    DeleteTempFile(filename);
}
public void TestDatasetFreeBlockSequential()
{
    RedirectGlobalsToTempPath();
    string filename = $"test{nameof(TestDatasetFreeBlockSequential)}.dat";
    CreateCsvTempFile(filename);

    CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename, Path.GetTempPath())).Extractor("inputs", 1, 2, "targets", 3);
    ExtractedDataset dataset = new ExtractedDataset(name: "name", blockSizeRecords: 1, recordExtractors: extractor);
    CpuFloat32Handler handler = new CpuFloat32Handler();

    dataset.FetchBlock(0, handler, false);
    dataset.FetchBlock(1, handler, false);
    dataset.FetchBlock(2, handler, false);

    Assert.AreEqual(3, dataset.ActiveBlockRegionCount);

    dataset.FreeBlock(1, handler);
    dataset.FreeBlock(2, handler);

    Assert.AreEqual(1, dataset.ActiveBlockRegionCount);

    // Freed blocks can be fetched again and still contain the right records.
    var namedArrays = dataset.FetchBlock(0, handler, false);
    Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    namedArrays = dataset.FetchBlock(1, handler, false);
    Assert.AreEqual(new[] { 3.0f, 1.4f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    namedArrays = dataset.FetchBlock(2, handler, false);
    Assert.AreEqual(new[] { 3.2f, 1.3f }, namedArrays["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    dataset.Dispose();
    DeleteTempFile(filename);
}
public void TestMinibatchIteratorYield(int minibatchSize)
{
    string filename = ".unittestfile" + nameof(TestMinibatchIteratorYield);
    CreateCsvTempFile(filename);
    SigmaEnvironment.Clear();

    FileSource source = new FileSource(filename, Path.GetTempPath());
    CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(
        new CsvRecordExtractor(new Dictionary<string, int[][]> { ["inputs"] = new[] { new[] { 0 } } }));
    ExtractedDataset dataset = new ExtractedDataset("test", 1, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestMinibatchIteratorYield)), true, extractor);
    MinibatchIterator iterator = new MinibatchIterator(minibatchSize, dataset);
    IComputationHandler handler = new CpuFloat32Handler();
    SigmaEnvironment sigma = SigmaEnvironment.Create("test");

    Assert.Throws<ArgumentNullException>(() => iterator.Yield(null, null).GetEnumerator().MoveNext());
    Assert.Throws<ArgumentNullException>(() => iterator.Yield(handler, null).GetEnumerator().MoveNext());
    Assert.Throws<ArgumentNullException>(() => iterator.Yield(null, sigma).GetEnumerator().MoveNext());

    int index = 0;
    foreach (var block in iterator.Yield(handler, sigma))
    {
        // Pass through all records more than five times to ensure consistency.
        if (index++ > 20)
        {
            break;
        }

        Assert.Contains(block["inputs"].GetValue<float>(0, 0, 0), new float[] { 5.1f, 4.9f, 4.7f });
    }

    dataset.Dispose();
    DeleteTempFile(filename);
}
public void TestDatasetRecordwiseSliceCreate()
{
    RedirectGlobalsToTempPath();
    string filename = nameof(TestDatasetRecordwiseSliceCreate) + "test.dat";
    CreateCsvTempFile(filename);

    CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename)).Extractor("inputs", 1, 2, "targets", 3);
    ExtractedDataset dataset = new ExtractedDataset("name", ExtractedDataset.BlockSizeAuto, extractor);

    Assert.Throws<ArgumentNullException>(() => new DatasetRecordwiseSlice(null, 0.0, 1.0));
    Assert.Throws<ArgumentException>(() => new DatasetRecordwiseSlice(dataset, -0.2, 1.0));
    Assert.Throws<ArgumentException>(() => new DatasetRecordwiseSlice(dataset, 0.7, -1.0));
    Assert.Throws<ArgumentException>(() => new DatasetRecordwiseSlice(dataset, 0.7, 0.6));

    DatasetRecordwiseSlice slice = new DatasetRecordwiseSlice(dataset, 0.1, 0.6);

    Assert.AreSame(dataset, slice.UnderlyingDataset);
    Assert.AreEqual(0.1, slice.ShareOffset);
    Assert.AreEqual(0.6, slice.Share);

    DeleteTempFile(filename);
}
public override IExtractedDataset<T> Extract<T>(IDataToImport dataToImport)
{
    var extractedDataset = new ExtractedDataset<T>(_thresholdLevel);

    var validatedDataToImportResult = ValidateDataToImport(dataToImport);
    extractedDataset.AddParsingResults(validatedDataToImportResult.Item2);
    if (!validatedDataToImportResult.Item1)
    {
        // Format not valid, return early.
        return extractedDataset;
    }

    var csvDataSource = dataToImport as CSVDataToImport;
    var rawData = csvDataSource.Data as string[][];

    for (var i = _startRow; i < rawData.Length; i++)
    {
        var extractResultsForRow = ExtractDataForSingleRow<T>(_extractConfigurations, dataToImport, i);
        extractedDataset.AddParsingResults(extractResultsForRow);
    }

    return extractedDataset;
}
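// Usage sketch for the row-wise CSV extraction above. The _csvImporter field and the
// WaterSample model type are hypothetical placeholders; only Extract<T>,
// IsExtractedSuccess and ExtractedEntities are taken from the code in this section.
public void SketchCsvImport(CSVDataToImport csvDataToImport)
{
    IExtractedDataset<WaterSample> result = _csvImporter.Extract<WaterSample>(csvDataToImport);

    if (result.IsExtractedSuccess)
    {
        foreach (WaterSample sample in result.ExtractedEntities)
        {
            // Each entity is one CSV row mapped onto the model by the extract configurations.
        }
    }
}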
public async Task TestDatasetFetchAsync()
{
    RedirectGlobalsToTempPath();
    string filename = $"test{nameof(TestDatasetFetchAsync)}.dat";
    CreateCsvTempFile(filename);

    CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename, Path.GetTempPath())).Extractor("inputs", 1, 2, "targets", 3);
    ExtractedDataset dataset = new ExtractedDataset(name: "name", blockSizeRecords: 1, recordExtractors: extractor);
    CpuFloat32Handler handler = new CpuFloat32Handler();

    var block0 = dataset.FetchBlockAsync(0, handler);
    var block2 = dataset.FetchBlockAsync(2, handler);
    var block1 = dataset.FetchBlockAsync(1, handler);

    // Issue a free block request mid-fetch to stress the dataset controller.
    dataset.FreeBlock(1, handler);

    IDictionary<string, INDArray> namedArrays0 = await block0;
    IDictionary<string, INDArray> namedArrays1 = await block1;
    IDictionary<string, INDArray> namedArrays2 = await block2;

    Assert.IsNotNull(namedArrays1);
    Assert.AreEqual(new[] { 3.0f, 1.4f }, namedArrays1["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));
    Assert.IsNotNull(namedArrays2);
    Assert.AreEqual(new[] { 3.2f, 1.3f }, namedArrays2["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));
    Assert.IsNotNull(namedArrays0);
    Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays0["inputs"].GetDataAs<float>().GetValuesArrayAs<float>(0, 2));

    dataset.Dispose();
    DeleteTempFile(filename);
}
/// <summary>
/// Separate input columns from the output column (from the ExtractedDataset
/// table) and convert strings into numeric values.
/// </summary>
/// <param name="attributeToPredict">Name of the output column.</param>
/// <param name="codeBook">Codebook to be used (null by default).</param>
/// <returns>Bool value indicating whether the dataset was
/// processed correctly or not.</returns>
public bool ProcessDataset(string attributeToPredict, Codification codeBook = null)
{
    // ProcessedDataset will have the same structure as ExtractedDataset.
    ProcessedDataset = ExtractedDataset.Clone();

    // Inputs and outputs must preserve ExtractedDataset's dimensions.
    InputData = new double[ExtractedDataset.Rows.Count][];
    OutputData = new int[ExtractedDataset.Rows.Count];

    // Except for the output column, column types are changed to
    // double (classifiers work with numbers, not with strings).
    foreach (DataColumn column in ExtractedDataset.Columns)
    {
        if (column.ColumnName != attributeToPredict)
        {
            InputColumnNames.Add(column.ColumnName);
            ProcessedDataset.Columns[column.Ordinal].DataType = typeof(double);
        }
        else
        {
            OutputColumnName = column.ColumnName;
        }
    }

    try
    {
        // Temporary variables.
        double tempValue = 0;
        DataRow processedRow = null;
        List<double> tempInput = null;

        for (int row = 0; row < ExtractedDataset.Rows.Count; ++row)
        {
            // Process one row at a time.
            processedRow = ProcessedDataset.NewRow();
            tempInput = new List<double>();

            foreach (DataColumn column in ExtractedDataset.Columns)
            {
                if (column.ColumnName != attributeToPredict)
                {
                    Double.TryParse(
                        ExtractedDataset.Rows[row][column.Ordinal] as string,
                        System.Globalization.NumberStyles.Any,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out tempValue);

                    // Create a row of numeric values to be
                    // added to ProcessedDataset and InputData.
                    processedRow[column.Ordinal] = tempValue;
                    tempInput.Add(tempValue);
                }
                else
                {
                    // Don't convert the output column to a number yet, just copy the string
                    // value (conversion to a number will be done later by the codebook).
                    processedRow[column.Ordinal] = ExtractedDataset.Rows[row][column.Ordinal];
                }
            }

            // Add/fill a row in ProcessedDataset and InputData
            // before going to the next row.
            ProcessedDataset.Rows.Add(processedRow);
            InputData[row] = tempInput.ToArray();
        }

        if (codeBook != null)
        {
            // Use the given codebook (a codebook should be given only when dealing
            // with testing datasets, in order to have the same codebook both
            // for training and for testing data).
            this.CodeBook = codeBook;
        }
        else
        {
            // If no codebook is given, create one for the output column.
            CodeBook = new Codification(ExtractedDataset, attributeToPredict);
        }

        // Apply the codebook to ProcessedDataset.
        ProcessedDataset = CodeBook.Apply(ProcessedDataset);

        // InputData is already set; OutputData is set to the output
        // column codified with the codebook.
        OutputData = ProcessedDataset.ToArray<int>(attributeToPredict);

        // Number of input columns.
        InputAttributeNumber = ExtractedDataset.Columns.Count - 1;

        // Number of possible values the output column may assume.
        OutputPossibleValues = CodeBook[attributeToPredict].Symbols;
    }
    catch
    {
        return false;
    }

    return true;
}
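// Usage sketch for ProcessDataset (hypothetical names: trainingSet/testSet stand for
// instances of whatever class hosts ProcessDataset, and "Species" for the output
// column). The codebook-reuse pattern follows the doc comment above: build the
// codebook on the training data, then pass it in when processing the test data so
// both share the same string-to-number mapping.
bool trainingOk = trainingSet.ProcessDataset("Species");
bool testOk = testSet.ProcessDataset("Species", trainingSet.CodeBook);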
public override IExtractedDataset<T> Extract<T>(IDataToImport dataToImport)
{
    var extractedDataset = new ExtractedDataset<T>(_thresholdLevel);
    ChemistryFileChildObjectExtractConfiguration chemistryDataExtractConfiguration = null;
    SampleFileChildObjectExtractConfiguration sampleDataExtractConfiguration = null;

    var castedDataToImport = dataToImport as ESDATDataToImport;
    if (castedDataToImport == null)
    {
        extractedDataset.AddParsingResult(new BaseResult(ResultLevel.FATAL, "Data to Import needs to be ESDATDataToImport"));
        // Without an ESDATDataToImport there is nothing to extract from; continuing
        // would dereference a null reference below.
        return extractedDataset;
    }

    try
    {
        chemistryDataExtractConfiguration = _extractConfigurations.Where(x => x is ChemistryFileChildObjectExtractConfiguration)
            .Cast<ChemistryFileChildObjectExtractConfiguration>()
            .SingleOrDefault();
    }
    catch (InvalidOperationException)
    {
        // SingleOrDefault throws when more than one matching configuration exists.
        chemistryDataExtractConfiguration = null;
        extractedDataset.AddParsingResult(new BaseResult(ResultLevel.FATAL, "ESDAT data importer needs to have one and only one Chemistry file extract configuration"));
    }

    try
    {
        sampleDataExtractConfiguration = _extractConfigurations.Where(x => x is SampleFileChildObjectExtractConfiguration)
            .Cast<SampleFileChildObjectExtractConfiguration>()
            .SingleOrDefault();
    }
    catch (InvalidOperationException)
    {
        sampleDataExtractConfiguration = null;
        extractedDataset.AddParsingResult(new BaseResult(ResultLevel.FATAL, "ESDAT data importer needs to have one and only one Sample file extract configuration"));
    }

    var model = new T();

    if (castedDataToImport.HeaderFileToImport == null)
    {
        var castedModel = model as ESDATModel;
        castedModel.LabName = _wqDefaultValueProvider.OrganizationNameSampleCollection;
        extractedDataset.AddParsingResults(new List<IResult>
        {
            new BaseResult(ResultLevel.WARN, "Header file is null, use the default organization name in the default value provider")
        });
    }
    else
    {
        var headerFileExtractResults = ExtractHeaderFile(model, _extractConfigurations, castedDataToImport.HeaderFileToImport);
        extractedDataset.AddParsingResults(headerFileExtractResults);
    }

    if (chemistryDataExtractConfiguration != null && sampleDataExtractConfiguration != null)
    {
        var chemistryFileExtractResults = ExtractChemistryFileData(model, chemistryDataExtractConfiguration, castedDataToImport.ChemistryFileToImport);
        extractedDataset.AddParsingResults(chemistryFileExtractResults);

        var sampleFileExtractResults = ExtractSampleFileData(model, sampleDataExtractConfiguration, castedDataToImport.SampleFileToImport);
        extractedDataset.AddParsingResults(sampleFileExtractResults);
    }

    extractedDataset.AddParsingResults(new List<IResult> { new ParsingResult(ResultLevel.DEBUG, "Extract data into ESDAT model", model, null) });

    return extractedDataset;
}