Example #1
        public IExtractedDataset <T> Extract <T>(IDataToImport dataToImport) where T : new()
        {
            var extractedDataset            = new ExtractedDataset <T>(_thresholdLevel);
            var validatedDataToImportResult = ValidateDataToImport(dataToImport);

            extractedDataset.AddParsingResults(validatedDataToImportResult.Item2);

            if (!validatedDataToImportResult.Item1)
            {
                //format not valid, return early
                return extractedDataset;
            }
            var results = new List <IResult>();
            var model   = new T();
            IDataSourceLocation currentLocation = null;

            foreach (var configuration in _extractConfigurations)
            {
                // only simple XML configurations provide a data source location and extract data
                if (configuration is SimpleXMLExtractConfiguration simpleConfiguration)
                {
                    currentLocation = new XMLDataSourceLocation(simpleConfiguration.ElementName, simpleConfiguration.AttributeName);
                    results.AddRange(simpleConfiguration.ExtractData(model, dataToImport, currentLocation));
                }
            }

            var parsingResult = new ParsingResult(ResultLevel.DEBUG, "Extract data from single row success", model, null);

            results.Add(parsingResult);

            extractedDataset.AddParsingResults(results);

            return extractedDataset;
        }
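
The extractor above relies on ValidateDataToImport returning a tuple whose Item1 flags whether the imported data has a valid format and whose Item2 carries the validation results. The helper itself is not shown; a minimal sketch of the assumed contract (the body here is hypothetical, only the shape is implied by the callers):

        private Tuple <bool, List <IResult>> ValidateDataToImport(IDataToImport dataToImport)
        {
            var results = new List <IResult>();

            // hypothetical check; the real validation rules are not part of this example
            if (dataToImport == null)
            {
                results.Add(new BaseResult(ResultLevel.FATAL, "No data to import"));
                return Tuple.Create(false, results);
            }

            return Tuple.Create(true, results);
        }

Example #2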
        public void TestUnifiedIteratorYield()
        {
            string filename = ".unittestfile" + nameof(TestUnifiedIteratorYield);

            CreateCsvTempFile(filename);
            SigmaEnvironment.Clear();

            FileSource         source    = new FileSource(filename, Path.GetTempPath());
            CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(new CsvRecordExtractor(new Dictionary <string, int[][]> {
                ["inputs"] = new[] { new[] { 0 } }
            }));
            ExtractedDataset    dataset  = new ExtractedDataset("test", 2, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestUnifiedIteratorYield)), true, extractor);
            UnifiedIterator     iterator = new UnifiedIterator(dataset);
            SigmaEnvironment    sigma    = SigmaEnvironment.Create("test");
            IComputationHandler handler  = new CpuFloat32Handler();

            foreach (var block in iterator.Yield(handler, sigma))
            {
                Assert.AreEqual(new[] { 5.1f, 4.9f, 4.7f }, block["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 3));
            }

            dataset.Dispose();

            DeleteTempFile(filename);
        }
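
The file-based tests in this listing call CreateCsvTempFile and DeleteTempFile helpers that are not shown. A minimal sketch consistent with the values the assertions expect (the first three rows of the classic iris data); the exact bodies are assumptions:

        private static void CreateCsvTempFile(string filename)
        {
            // rows chosen to match the assertions: column 0 yields 5.1, 4.9, 4.7,
            // columns 1-2 yield 3.5/1.4, 3.0/1.4, 3.2/1.3, column 3 is the target
            File.WriteAllLines(Path.Combine(Path.GetTempPath(), filename), new[]
            {
                "5.1,3.5,1.4,0.2",
                "4.9,3.0,1.4,0.2",
                "4.7,3.2,1.3,0.2"
            });
        }

        private static void DeleteTempFile(string filename)
        {
            File.Delete(Path.Combine(Path.GetTempPath(), filename));
        }

Example #3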
        public void TestDatasetBlockwiseSliceCreate()
        {
            RedirectGlobalsToTempPath();

            string filename = nameof(TestDatasetBlockwiseSliceCreate) + "test.dat";

            CreateCsvTempFile(filename);

            CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename)).Extractor("inputs", 1, 2, "targets", 3);
            ExtractedDataset   dataset   = new ExtractedDataset("name", ExtractedDataset.BlockSizeAuto, extractor);

            Assert.Throws <ArgumentNullException>(() => new DatasetBlockwiseSlice(null, 0, 0, 1));
            Assert.Throws <ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 0, 0, 0));
            Assert.Throws <ArgumentException>(() => new DatasetBlockwiseSlice(dataset, -1, 0, 1));
            Assert.Throws <ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 0, -1, 1));
            Assert.Throws <ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 1, 0, 1));
            Assert.Throws <ArgumentException>(() => new DatasetBlockwiseSlice(dataset, 0, 2, 2));

            DatasetBlockwiseSlice slice = new DatasetBlockwiseSlice(dataset, 0, 1, 3);

            Assert.AreSame(dataset, slice.UnderlyingDataset);
            Assert.AreEqual(0, slice.SplitBeginIndex);
            Assert.AreEqual(1, slice.SplitEndIndex);
            Assert.AreEqual(2, slice.SplitSize);
            Assert.AreEqual(3, slice.SplitInterval);

            Assert.AreEqual(dataset.Name, slice.Name);
            Assert.AreEqual(dataset.TargetBlockSizeRecords, slice.TargetBlockSizeRecords);
            Assert.AreEqual(dataset.SectionNames, slice.SectionNames);

            DeleteTempFile(filename);
        }
Example #4
        public IExtractedDataset <T> Extract <T>(IDataToImport dataToImport) where T : new()
        {
            var extractedDataset            = new ExtractedDataset <T>(_thresholdLevel);
            var validatedDataToImportResult = ValidateDataToImport(dataToImport);

            extractedDataset.AddParsingResults(validatedDataToImportResult.Item2);

            if (!validatedDataToImportResult.Item1)
            {
                //format not valid, return early
                return extractedDataset;
            }

            var        xmlDataSource = dataToImport as XMLDataToImport;
            var        rawData       = xmlDataSource.Data as XDocument;
            XNamespace rdf           = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
            var        parentNodes   = rawData.Descendants().Where(x => x.Name.LocalName == _rootNode);

            foreach (XElement element in parentNodes)
            {
                var extractResultsForNode = ExtractDataForSingleNode <T>(_extractConfigurations, new XMLDataToImport("", new XDocument(element)));
                extractedDataset.AddParsingResults(extractResultsForNode);
            }
            return extractedDataset;
        }
Example #5
        private static void SampleHutter()
        {
            const long timeWindowSize = 10L;

            SigmaEnvironment sigma = SigmaEnvironment.Create("recurrent");

            IDataSource      source    = new MultiSource(new FileSource("enwik8"), new CompressedSource(new MultiSource(new FileSource("enwik8.zip"), new UrlSource("http://mattmahoney.net/dc/enwik8.zip"))));
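            // each record spans timeWindowSize + 1 characters so that the "targets"
            // section can be the "inputs" window offset by one character below
            // (next-character prediction)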
            IRecordExtractor extractor = new CharacterRecordReader(source, (int)(timeWindowSize + 1), Encoding.ASCII)
                                         .Extractor(new ArrayRecordExtractor <short>(ArrayRecordExtractor <short>
                                                                                     .ParseExtractorParameters("inputs", new[] { 0L }, new[] { timeWindowSize }, "targets", new[] { 0L }, new[] { timeWindowSize }))
                                                    .Offset("targets", 1L))
                                         .Preprocess(new PermutePreprocessor(0, 2, 1))
                                         .Preprocess(new OneHotPreprocessor(0, 255));
            IDataset dataset = new ExtractedDataset("hutter", ExtractedDataset.BlockSizeAuto, false, extractor);

            ITrainer trainer = sigma.CreateTrainer("hutter");

            trainer.Network.Architecture = InputLayer.Construct(256) + RecurrentLayer.Construct(256) + OutputLayer.Construct(256) + SoftMaxCrossEntropyCostLayer.Construct();
            trainer.TrainingDataIterator = new MinibatchIterator(32, dataset);
            trainer.AddNamedDataIterator("validation", new MinibatchIterator(100, dataset));
            trainer.Optimiser = new AdagradOptimiser(baseLearningRate: 0.07);
            trainer.Operator  = new CudaSinglethreadedOperator();

            trainer.AddInitialiser("*.*", new GaussianInitialiser(standardDeviation: 0.05));

            trainer.AddLocalHook(new AccumulatedValueReporter("optimiser.cost_total", TimeStep.Every(1, TimeScale.Iteration), averageValues: true));
            trainer.AddLocalHook(new RunningTimeReporter(TimeStep.Every(10, TimeScale.Iteration)));

            sigma.PrepareAndRun();
        }
Example #6
        private static void SampleLoadExtractIterate()
        {
            SigmaEnvironment sigma = SigmaEnvironment.Create("test");

            sigma.Prepare();

            //var irisReader = new CsvRecordReader(new MultiSource(new FileSource("iris.data"), new UrlSource("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")));
            //IRecordExtractor irisExtractor = irisReader.Extractor("inputs2", new[] { 0, 3 }, "targets2", 4).AddValueMapping(4, "Iris-setosa", "Iris-versicolor", "Iris-virginica");
            //irisExtractor = irisExtractor.Preprocess(new OneHotPreprocessor(sectionName: "targets2", minValue: 0, maxValue: 2), new NormalisingPreprocessor(sectionNames: "inputs2", minInputValue: 0, maxInputValue: 6));

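            // MNIST IDX image format: a 16-byte header followed by one 28x28 = 784-byte
            // record per image; the label file read below has an 8-byte header and one
            // byte per label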
            ByteRecordReader mnistImageReader    = new ByteRecordReader(headerLengthBytes: 16, recordSizeBytes: 28 * 28, source: new CompressedSource(new MultiSource(new FileSource("train-images-idx3-ubyte.gz"), new UrlSource("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz"))));
            IRecordExtractor mnistImageExtractor = mnistImageReader.Extractor("inputs", new[] { 0L, 0L }, new[] { 28L, 28L }).Preprocess(new NormalisingPreprocessor(0, 255));

            ByteRecordReader mnistTargetReader    = new ByteRecordReader(headerLengthBytes: 8, recordSizeBytes: 1, source: new CompressedSource(new MultiSource(new FileSource("train-labels-idx1-ubyte.gz"), new UrlSource("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz"))));
            IRecordExtractor mnistTargetExtractor = mnistTargetReader.Extractor("targets", new[] { 0L }, new[] { 1L }).Preprocess(new OneHotPreprocessor(minValue: 0, maxValue: 9));

            IComputationHandler handler = new CpuFloat32Handler();

            ExtractedDataset dataset = new ExtractedDataset("mnist-training", ExtractedDataset.BlockSizeAuto, mnistImageExtractor, mnistTargetExtractor);

            IDataset[] slices         = dataset.SplitRecordwise(0.8, 0.2);
            IDataset   trainingData   = slices[0];
            IDataset   validationData = slices[1];

            MinibatchIterator trainingIterator   = new MinibatchIterator(1, trainingData);
            MinibatchIterator validationIterator = new MinibatchIterator(1, validationData);

            while (true)
            {
                foreach (var block in trainingIterator.Yield(handler, sigma))
                {
                    Thread.Sleep(100);

                    PrintFormattedBlock(block, PrintUtils.AsciiGreyscalePalette);

                    Thread.Sleep(1000);
                }
            }

            //Random random = new Random();
            //INDArray array = new ADNDArray<float>(3, 1, 2, 2);

            //new GaussianInitialiser(0.05, 0.05).Initialise(array, Handler, random);

            //Console.WriteLine(array);

            //new ConstantValueInitialiser(1).Initialise(array, Handler, random);

            //Console.WriteLine(array);

            //dataset.InvalidateAndClearCaches();
        }
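Example #7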
        public void AssertExtractedFailDataset()
        {
            var dataSet = new ExtractedDataset(ResultLevel.FATAL);

            var baseResult    = new BaseResult(ResultLevel.FATAL, "Base result message");
            var parsingResult = new ParsingResult(ResultLevel.ERROR, "Parsing result", 123);

            dataSet.AddParsingResult(baseResult);
            dataSet.AddParsingResult(parsingResult);

            Assert.False(dataSet.IsExtractedSuccess);
            Assert.AreEqual(ResultLevel.FATAL, dataSet.ThresholdLevel);

            var entities = dataSet.ExtractedEntities;

            Assert.Null(entities);
        }
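
Together with the success case further below, these assertions suggest that IsExtractedSuccess turns false once any added result reaches the threshold level, and that ExtractedEntities is null for a failed extraction. A hypothetical sketch of that check (the member names and the severity ordering of ResultLevel are assumptions):

        public bool IsExtractedSuccess
        {
            // assumed: ResultLevel values are ordered by increasing severity and added
            // results are kept in a _parsingResults collection
            get { return !_parsingResults.Any(result => result.Level >= ThresholdLevel); }
        }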
Example #8
        public void TestUndividedIteratorCreate()
        {
            string filename = ".unittestfile" + nameof(TestUndividedIteratorCreate);

            CreateCsvTempFile(filename);

            FileSource         source    = new FileSource(filename, Path.GetTempPath());
            CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(new CsvRecordExtractor(new Dictionary <string, int[][]> {
                ["inputs"] = new[] { new[] { 0 } }
            }));
            ExtractedDataset dataset = new ExtractedDataset("test", 1, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestUndividedIteratorCreate)), true, extractor);

            Assert.Throws <ArgumentNullException>(() => new UndividedIterator(null));

            dataset.Dispose();

            DeleteTempFile(filename);
        }
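Example #9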
        public void AssertExtractedSuccessDataset()
        {
            var dataSet = new ExtractedDataset(ResultLevel.ERROR);

            var baseResult    = new BaseResult(ResultLevel.INFO, "Base result message");
            var parsingResult = new ParsingResult(ResultLevel.INFO, "Parsing result", 123);

            dataSet.AddParsingResult(baseResult);
            dataSet.AddParsingResult(parsingResult);

            Assert.True(dataSet.IsExtractedSuccess);
            Assert.AreEqual(ResultLevel.ERROR, dataSet.ThresholdLevel);

            var entities = dataSet.ExtractedEntities;

            Assert.AreEqual(1, entities.Count());
            Assert.AreEqual(123, (int)entities.First());
        }
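Example #10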
        public void TestDatasetBlockwiseSliceFetch()
        {
            RedirectGlobalsToTempPath();

            string filename = nameof(TestDatasetBlockwiseSliceFetch) + "test.dat";

            CreateCsvTempFile(filename);

            CsvRecordExtractor    extractor = new CsvRecordReader(new FileSource(filename)).Extractor("inputs", 0, "targets", 3);
            ExtractedDataset      dataset   = new ExtractedDataset("name", 1, extractor);
            DatasetBlockwiseSlice slice     = new DatasetBlockwiseSlice(dataset, 1, 2, 3);

            Assert.AreEqual(new float[] { 4.9f }, slice.FetchBlock(0, new CpuFloat32Handler())["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 1));

            extractor.Reader?.Dispose();
            dataset.Dispose();

            DeleteTempFile(filename);
        }
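Example #11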
        public void AssertExtractedFailDataset()
        {
            var mockDataSourceLocation = new Mock<IDataSourceLocation>();
            var dataSet = new ExtractedDataset<int>(ResultLevel.FATAL);

            var baseResult = new BaseResult(ResultLevel.FATAL, "Base result message");
            var parsingResult = new ParsingResult(ResultLevel.ERROR, "Parsing result", 123, mockDataSourceLocation.Object);

            dataSet.AddParsingResult(baseResult);
            dataSet.AddParsingResult(parsingResult);

            Assert.False(dataSet.IsExtractedSuccess);
            Assert.AreEqual(ResultLevel.FATAL, dataSet.ThresholdLevel);

            var entities = dataSet.ExtractedEntities;

            Assert.Null(entities);
        }
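Example #12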
        public void AssertExtractedSuccessDataset()
        {
            var mockDataSourceLocation = new Mock<IDataSourceLocation>();
            var dataSet = new ExtractedDataset<int>(ResultLevel.ERROR);

            var baseResult = new BaseResult(ResultLevel.INFO, "Base result message");
            var parsingResult = new ParsingResult(ResultLevel.INFO, "Parsing result", 123, mockDataSourceLocation.Object);

            dataSet.AddParsingResult(baseResult);
            dataSet.AddParsingResult(parsingResult);

            Assert.True(dataSet.IsExtractedSuccess);
            Assert.AreEqual(ResultLevel.ERROR, dataSet.ThresholdLevel);

            var entities = dataSet.ExtractedEntities;

            Assert.AreEqual(1, entities.Count());
            Assert.AreEqual(123, (int)entities.First());
        }
Example #13
        public void TestUndividedIteratorYield()
        {
            string filename = ".unittestfile" + nameof(TestUndividedIteratorCreate);

            CreateCsvTempFile(filename);

            SigmaEnvironment.Clear();

            FileSource         source    = new FileSource(filename, Path.GetTempPath());
            CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(new CsvRecordExtractor(new Dictionary <string, int[][]> {
                ["inputs"] = new[] { new[] { 0 } }
            }));
            ExtractedDataset    dataset  = new ExtractedDataset("test", 2, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestUndividedIteratorYield)), true, extractor);
            UndividedIterator   iterator = new UndividedIterator(dataset);
            SigmaEnvironment    sigma    = SigmaEnvironment.Create("test");
            IComputationHandler handler  = new CpuFloat32Handler();

            int index = 0;

            foreach (var block in iterator.Yield(handler, sigma))
            {
                if (index == 0)
                {
                    Assert.AreEqual(new float[] { 5.1f, 4.9f }, block["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));
                }
                else if (index == 1)
                {
                    Assert.AreEqual(new float[] { 4.7f }, block["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 1));
                }
                else
                {
                    Assert.Fail("There can be a maximum of two iterations, but this is yield iteration 3 (index 2).");
                }

                index++;
            }

            dataset.Dispose();

            DeleteTempFile(filename);
        }
Example #14
        private static void SampleCachedFastIteration()
        {
            SigmaEnvironment sigma = SigmaEnvironment.Create("test");

            IDataSource dataSource = new CompressedSource(new MultiSource(new FileSource("train-images-idx3-ubyte.gz"), new UrlSource("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")));

            ByteRecordReader mnistImageReader    = new ByteRecordReader(headerLengthBytes: 16, recordSizeBytes: 28 * 28, source: dataSource);
            IRecordExtractor mnistImageExtractor = mnistImageReader.Extractor("inputs", new[] { 0L, 0L }, new[] { 28L, 28L }).Preprocess(new NormalisingPreprocessor(0, 255));

            IDataset dataset = new ExtractedDataset("mnist-training", ExtractedDataset.BlockSizeAuto, mnistImageExtractor);

            IDataset[] slices       = dataset.SplitRecordwise(0.8, 0.2);
            IDataset   trainingData = slices[0];

            Stopwatch stopwatch = Stopwatch.StartNew();

            IDataIterator iterator = new MinibatchIterator(10, trainingData);

            foreach (var block in iterator.Yield(new CpuFloat32Handler(), sigma))
            {
                //PrintFormattedBlock(block, PrintUtils.AsciiGreyscalePalette);
            }

            Console.Write("\nFirst iteration took " + stopwatch.Elapsed + "\n+=+ Iterating over dataset again +=+ Dramatic pause...");

            ArrayUtils.Range(1, 10).ToList().ForEach(i =>
            {
                Thread.Sleep(500);
                Console.Write(".");
            });

            stopwatch.Restart();

            foreach (var block in iterator.Yield(new CpuFloat32Handler(), sigma))
            {
                //PrintFormattedBlock(block, PrintUtils.AsciiGreyscalePalette);
            }

            Console.WriteLine("Second iteration took " + stopwatch.Elapsed);
        }
Example #15
        public void TestDatasetFetchBlockSequential()
        {
            RedirectGlobalsToTempPath();

            string filename = $"test{nameof(TestDatasetFetchBlockSequential)}.dat";

            CreateCsvTempFile(filename);

            CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename, Path.GetTempPath())).Extractor("inputs", 1, 2, "targets", 3);
            ExtractedDataset   dataset   = new ExtractedDataset(name: "name", blockSizeRecords: 1, recordExtractors: extractor);
            CpuFloat32Handler  handler   = new CpuFloat32Handler();

            IDictionary <string, INDArray> namedArrays = dataset.FetchBlock(0, handler, false);

            Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            //fetch the same block twice to check that the same cached block is returned
            namedArrays = dataset.FetchBlock(0, handler, false);

            Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            //skipping second block (index 1)

            namedArrays = dataset.FetchBlock(2, handler, false);

            Assert.AreEqual(new[] { 3.2f, 1.3f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            namedArrays = dataset.FetchBlock(1, handler, false);

            Assert.AreEqual(new[] { 3.0f, 1.4f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            namedArrays = dataset.FetchBlock(3, handler, false);

            Assert.IsNull(namedArrays);

            dataset.Dispose();

            DeleteTempFile(filename);
        }
Example #16
        public void TestDatasetFreeBlockSequential()
        {
            RedirectGlobalsToTempPath();

            string filename = $"test{nameof(TestDatasetFetchBlockSequential)}.dat";

            CreateCsvTempFile(filename);

            CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename, Path.GetTempPath())).Extractor("inputs", 1, 2, "targets", 3);
            ExtractedDataset   dataset   = new ExtractedDataset(name: "name", blockSizeRecords: 1, recordExtractors: extractor);
            CpuFloat32Handler  handler   = new CpuFloat32Handler();

            dataset.FetchBlock(0, handler, false);
            dataset.FetchBlock(1, handler, false);
            dataset.FetchBlock(2, handler, false);

            Assert.AreEqual(3, dataset.ActiveBlockRegionCount);

            dataset.FreeBlock(1, handler);
            dataset.FreeBlock(2, handler);

            Assert.AreEqual(1, dataset.ActiveBlockRegionCount);

            var namedArrays = dataset.FetchBlock(0, handler, false);

            Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            namedArrays = dataset.FetchBlock(1, handler, false);
            Assert.AreEqual(new[] { 3.0f, 1.4f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            namedArrays = dataset.FetchBlock(2, handler, false);
            Assert.AreEqual(new[] { 3.2f, 1.3f }, namedArrays["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            dataset.Dispose();

            DeleteTempFile(filename);
        }
Example #17
        public void TestMinibatchIteratorYield(int minibatchSize)
        {
            string filename = ".unittestfile" + nameof(TestMinibatchIteratorYield);

            CreateCsvTempFile(filename);
            SigmaEnvironment.Clear();

            FileSource         source    = new FileSource(filename, Path.GetTempPath());
            CsvRecordExtractor extractor = (CsvRecordExtractor) new CsvRecordReader(source).Extractor(new CsvRecordExtractor(new Dictionary <string, int[][]> {
                ["inputs"] = new[] { new[] { 0 } }
            }));
            ExtractedDataset    dataset  = new ExtractedDataset("test", 1, new DiskCacheProvider(Path.GetTempPath() + "/" + nameof(TestMinibatchIteratorYield)), true, extractor);
            MinibatchIterator   iterator = new MinibatchIterator(minibatchSize, dataset);
            IComputationHandler handler  = new CpuFloat32Handler();
            SigmaEnvironment    sigma    = SigmaEnvironment.Create("test");

            Assert.Throws <ArgumentNullException>(() => iterator.Yield(null, null).GetEnumerator().MoveNext());
            Assert.Throws <ArgumentNullException>(() => iterator.Yield(handler, null).GetEnumerator().MoveNext());
            Assert.Throws <ArgumentNullException>(() => iterator.Yield(null, sigma).GetEnumerator().MoveNext());

            int index = 0;

            foreach (var block in iterator.Yield(handler, sigma))
            {
                //pass through each record more than 5 times to ensure consistent yields
                if (index++ > 20)
                {
                    break;
                }

                Assert.Contains(block["inputs"].GetValue <float>(0, 0, 0), new float[] { 5.1f, 4.9f, 4.7f });
            }

            dataset.Dispose();

            DeleteTempFile(filename);
        }
Example #18
        public void TestDatasetRecordwiseSliceCreate()
        {
            RedirectGlobalsToTempPath();

            string filename = nameof(TestDatasetRecordwiseSliceCreate) + "test.dat";

            CreateCsvTempFile(filename);

            CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename)).Extractor("inputs", 1, 2, "targets", 3);
            ExtractedDataset   dataset   = new ExtractedDataset("name", ExtractedDataset.BlockSizeAuto, extractor);

            Assert.Throws <ArgumentNullException>(() => new DatasetRecordwiseSlice(null, 0.0, 1.0));
            Assert.Throws <ArgumentException>(() => new DatasetRecordwiseSlice(dataset, -0.2, 1.0));
            Assert.Throws <ArgumentException>(() => new DatasetRecordwiseSlice(dataset, 0.7, -1.0));
            Assert.Throws <ArgumentException>(() => new DatasetRecordwiseSlice(dataset, 0.7, 0.6));

            DatasetRecordwiseSlice slice = new DatasetRecordwiseSlice(dataset, 0.1, 0.6);

            Assert.AreSame(dataset, slice.UnderlyingDataset);
            Assert.AreEqual(0.1, slice.ShareOffset);
            Assert.AreEqual(0.6, slice.Share);

            DeleteTempFile(filename);
        }
Example #19
        public override IExtractedDataset <T> Extract <T>(IDataToImport dataToImport)
        {
            var extractedDataset = new ExtractedDataset <T>(_thresholdLevel);

            var validatedDataToImportResult = ValidateDataToImport(dataToImport);

            extractedDataset.AddParsingResults(validatedDataToImportResult.Item2);
            if (!validatedDataToImportResult.Item1)
            {
                //format not valid, return early
                return extractedDataset;
            }

            var csvDataSource = dataToImport as CSVDataToImport;
            var rawData       = csvDataSource.Data as string[][];

            for (var i = _startRow; i < rawData.Length; i++)
            {
                var extractResultsForRow = ExtractDataForSingleRow <T>(_extractConfigurations, dataToImport, i);
                extractedDataset.AddParsingResults(extractResultsForRow);
            }

            return extractedDataset;
        }
Example #20
        public async Task TestDatasetFetchAsync()
        {
            RedirectGlobalsToTempPath();

            string filename = $"test{nameof(TestDatasetFetchAsync)}.dat";

            CreateCsvTempFile(filename);

            CsvRecordExtractor extractor = new CsvRecordReader(new FileSource(filename, Path.GetTempPath())).Extractor("inputs", 1, 2, "targets", 3);
            ExtractedDataset   dataset   = new ExtractedDataset(name: "name", blockSizeRecords: 1, recordExtractors: extractor);
            CpuFloat32Handler  handler   = new CpuFloat32Handler();

            var block0 = dataset.FetchBlockAsync(0, handler);
            var block2 = dataset.FetchBlockAsync(2, handler);
            var block1 = dataset.FetchBlockAsync(1, handler);

            //mock a free block request to freak out the dataset controller
            dataset.FreeBlock(1, handler);

            IDictionary <string, INDArray> namedArrays0 = await block0;
            IDictionary <string, INDArray> namedArrays1 = await block1;
            IDictionary <string, INDArray> namedArrays2 = await block2;

            Assert.IsNotNull(namedArrays1);
            Assert.AreEqual(new[] { 3.0f, 1.4f }, namedArrays1["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            Assert.IsNotNull(namedArrays2);
            Assert.AreEqual(new[] { 3.2f, 1.3f }, namedArrays2["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            Assert.IsNotNull(namedArrays0);
            Assert.AreEqual(new[] { 3.5f, 1.4f }, namedArrays0["inputs"].GetDataAs <float>().GetValuesArrayAs <float>(0, 2));

            dataset.Dispose();

            DeleteTempFile(filename);
        }
Example #21
        /// <summary>
        /// Separate inputs columns from output column (from ExtractedDataset
        /// table) and convert strings into numeric values.
        /// </summary>
        /// <param name="attributeToPredict">Name of the output column.</param>
        /// <param name="codeBook">Codebook to be used (by default null).</param>
        /// <returns>Bool value indicating whether the dataset was
        /// processed correctly or not.</returns>
        public bool ProcessDataset(string attributeToPredict, Codification codeBook = null)
        {
            // ProcessedDataset will have the same structure as ExtractedDataset.
            ProcessedDataset = ExtractedDataset.Clone();

            // Inputs and outputs must preserve ExtractedDataset's dimensions.
            InputData  = new double[ExtractedDataset.Rows.Count][];
            OutputData = new int[ExtractedDataset.Rows.Count];

            // Except for the output column, column types are changed to
            // double (classifiers work with numbers, not with strings).
            foreach (DataColumn column in ExtractedDataset.Columns)
            {
                if (column.ColumnName != attributeToPredict)
                {
                    InputColumnNames.Add(column.ColumnName);
                    ProcessedDataset.Columns[column.Ordinal].DataType = typeof(double);
                }
                else
                {
                    OutputColumnName = column.ColumnName;
                }
            }

            try
            {
                // Temporary variables.
                double        tempValue    = 0;
                DataRow       processedRow = null;
                List <double> tempInput    = null;

                for (int row = 0; row < ExtractedDataset.Rows.Count; ++row)
                {
                    // Process one row at a time.
                    processedRow = ProcessedDataset.NewRow();
                    tempInput    = new List <double>();
                    foreach (DataColumn column in ExtractedDataset.Columns)
                    {
                        if (column.ColumnName != attributeToPredict)
                        {
                            Double.TryParse(
                                ExtractedDataset.Rows[row][column.Ordinal] as string,
                                System.Globalization.NumberStyles.Any,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out tempValue);
                            // Create a row of numeric values to be
                            // added to ProcessedDataset and InputData.
                            processedRow[column.Ordinal] = tempValue;
                            tempInput.Add(tempValue);
                        }
                        else
                        {
                            // Don't convert the output column to a number yet, just copy the string
                            // value (conversion to number will be done later by CodeBook).
                            processedRow[column.Ordinal] = ExtractedDataset.Rows[row][column.Ordinal];
                        }
                    }
                    // Add/fill a row in ProcessedDataset and InputData
                    // before going to next row.
                    ProcessedDataset.Rows.Add(processedRow);
                    InputData[row] = tempInput.ToArray();
                }

                if (codeBook != null)
                {
                    // Use given codebook (codebook should be given only when dealing
                    // with testing datasets, in order to have the same codebook both
                    // for training and for testing data).
                    this.CodeBook = codeBook;
                }
                else
                {
                    // If no codebook is given, create one for the output column.
                    CodeBook = new Codification(ExtractedDataset, attributeToPredict);
                }

                // Apply codebook to the ProcessedDataset.
                ProcessedDataset = CodeBook.Apply(ProcessedDataset);

                // InputData is already set, OutputData is set to be the output
                // column codified with the codebook.
                OutputData = ProcessedDataset.ToArray <int>(attributeToPredict);

                // Number of input columns.
                InputAttributeNumber = ExtractedDataset.Columns.Count - 1;

                // Number of possible values the output column may assume.
                OutputPossibleValues = CodeBook[attributeToPredict].Symbols;
            }
            catch
            {
                // any parsing or codification failure is reported via the return value
                return false;
            }
            return true;
        }
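
A short usage sketch for ProcessDataset: "host" stands in for whatever instance exposes the ExtractedDataset table and the properties filled above, and the column name is hypothetical.

            // hypothetical usage of the dataset processing shown above
            host.ExtractedDataset = LoadTrainingTable(); // assumed DataTable source

            if (host.ProcessDataset("class"))            // "class" = assumed output column
            {
                double[][] inputs = host.InputData;      // numeric feature rows
                int[]      labels = host.OutputData;     // output column codified via the codebook
            }

Example #22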
        public override IExtractedDataset <T> Extract <T>(IDataToImport dataToImport)
        {
            var extractedDataset = new ExtractedDataset <T>(_thresholdLevel);

            ChemistryFileChildObjectExtractConfiguration chemistryDataExtractConfiguration = null;
            SampleFileChildObjectExtractConfiguration    sampleDataExtractConfiguration    = null;

            var castedDataToImport = dataToImport as ESDATDataToImport;

            if (castedDataToImport == null)
            {
                extractedDataset.AddParsingResult(new BaseResult(ResultLevel.FATAL, "Data to Import needs to be ESDATDataToImport"));
                return extractedDataset;
            }

            try
            {
                chemistryDataExtractConfiguration = _extractConfigurations.Where(x => x is ChemistryFileChildObjectExtractConfiguration)
                                                    .Cast <ChemistryFileChildObjectExtractConfiguration>()
                                                    .SingleOrDefault();
            }
            catch (Exception)
            {
                chemistryDataExtractConfiguration = null;
                extractedDataset.AddParsingResult(new BaseResult(ResultLevel.FATAL, "ESDAT data importer needs to have one and only one Chemistry file extract configuration"));
            }

            try
            {
                sampleDataExtractConfiguration = _extractConfigurations.Where(x => x is SampleFileChildObjectExtractConfiguration)
                                                 .Cast <SampleFileChildObjectExtractConfiguration>()
                                                 .SingleOrDefault();
            }
            catch (Exception)
            {
                sampleDataExtractConfiguration = null;
                extractedDataset.AddParsingResult(new BaseResult(ResultLevel.FATAL, "ESDAT data importer needs to have one and only one Sample file extract configuration"));
            }

            var model = new T();

            if (castedDataToImport.HeaderFileToImport == null)
            {
                var castedModel = model as ESDATModel;
                castedModel.LabName = _wqDefaultValueProvider.OrganizationNameSampleCollection;
                extractedDataset.AddParsingResults(new List <IResult> {
                    new BaseResult(ResultLevel.WARN, "Header file is null, use the default organization name in the default value provider")
                });
            }
            else
            {
                var headerFileExtractResults = ExtractHeaderFile(model, _extractConfigurations, castedDataToImport.HeaderFileToImport);
                extractedDataset.AddParsingResults(headerFileExtractResults);
            }

            if (chemistryDataExtractConfiguration != null && sampleDataExtractConfiguration != null)
            {
                var chemistryFileExtractResults = ExtractChemistryFileData(model, chemistryDataExtractConfiguration, castedDataToImport.ChemistryFileToImport);
                extractedDataset.AddParsingResults(chemistryFileExtractResults);

                var sampleFileExtractResults = ExtractSampleFileData(model, sampleDataExtractConfiguration, castedDataToImport.SampleFileToImport);
                extractedDataset.AddParsingResults(sampleFileExtractResults);
            }

            extractedDataset.AddParsingResults(new List <IResult> {
                new ParsingResult(ResultLevel.DEBUG, "Extract data into ESDAT model", model, null)
            });

            return extractedDataset;
        }