/// <summary>
/// Loads the regression problem data that the given descriptor refers to from the embedded instance archive.
/// </summary>
/// <param name="id">Descriptor of the instance to load; it is cast to <see cref="ResourceRegressionDataDescriptor"/>.</param>
/// <returns>The problem data that the descriptor's <c>GenerateRegressionData</c> produces for the parsed dataset.</returns>
/// <exception cref="ArgumentNullException"><paramref name="id"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">
/// The archive has no entry named like the descriptor's resource, or the descriptor rejects the parsed variable names.
/// </exception>
public override IRegressionProblemData LoadData(IDataDescriptor id)
        {
            if (id == null)
            {
                throw new ArgumentNullException("id");
            }

            var descriptor = (ResourceRegressionDataDescriptor)id;

            // NOTE(review): the escaped dot suggests GetResourceName treats its argument as a regex pattern - confirm.
            var instanceArchiveName = GetResourceName(FileName + @"\.zip");

            using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read))
            {
                // GetEntry returns null for an unknown name; fail with a message that names the
                // missing resource instead of a NullReferenceException on entry.Open().
                var entry = instancesZipFile.GetEntry(descriptor.ResourceName);
                if (entry == null)
                {
                    throw new ArgumentException("The instance archive does not contain an entry named '" + descriptor.ResourceName + "'.", "id");
                }

                // First pass over the entry: only detect number format, date format and separator.
                NumberFormatInfo   numberFormat;
                DateTimeFormatInfo dateFormat;
                char separator;
                using (Stream stream = entry.Open())
                {
                    TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator);
                }

                // Second pass: the entry is reopened so that parsing starts at the beginning of the data.
                TableFileParser csvFileParser = new TableFileParser();
                using (Stream stream = entry.Open())
                {
                    csvFileParser.Parse(stream, numberFormat, dateFormat, separator, true);
                }

                // Validate before building the dataset so that a mismatching file fails early.
                if (!descriptor.CheckVariableNames(csvFileParser.VariableNames))
                {
                    throw new ArgumentException("Parsed file contains variables which are not in the descriptor.");
                }

                Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
                return descriptor.GenerateRegressionData(dataset);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Enumerates one data descriptor for every file entry of the embedded instance archive.
        /// </summary>
        /// <remarks>
        /// The archive is read lazily while the returned sequence is enumerated. For each entry the
        /// first line supplies the variable names (the last column is used as the target), and the
        /// number of remaining lines is divided into a training and a test range according to
        /// <c>trainTestSplit</c>.
        /// </remarks>
        /// <returns>A lazily evaluated sequence of descriptors, one per non-directory archive entry.</returns>
        /// <exception cref="InvalidDataException">An archive entry is empty and therefore has no header line.</exception>
        public override IEnumerable <IDataDescriptor> GetDataDescriptors()
        {
            // NOTE(review): the escaped dot suggests GetResourceName treats its argument as a regex pattern - confirm.
            var instanceArchiveName = GetResourceName(FileName + @"\.zip");

            using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read))
            {
                foreach (var entry in instancesZipFile.Entries)
                {
                    // Directory entries carry an empty Name and no data; there is nothing to describe.
                    if (string.IsNullOrEmpty(entry.Name))
                    {
                        continue;
                    }

                    // Only the separator is needed below; the two format objects are required out arguments.
                    NumberFormatInfo   numberFormat;
                    DateTimeFormatInfo dateFormat;
                    char separator;
                    using (var stream = entry.Open())
                    {
                        // The original author notes that DetermineFileFormat disposes the stream;
                        // the using block keeps disposal guaranteed regardless.
                        TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator);
                    }

                    // The entry is reopened so that reading starts at the header line again.
                    using (var stream = entry.Open())
                    using (var reader = new StreamReader(stream))
                    {
                        var header = reader.ReadLine(); // read the first line
                        if (header == null)
                        {
                            // ReadLine returns null at end of stream, i.e. the entry is empty.
                            throw new InvalidDataException("The archive entry '" + entry.Name + "' is empty and has no header line.");
                        }

                        // by convention each dataset from the PennML collection reserves the last column for the target
                        var variableNames         = header.Split(separator);
                        var allowedInputVariables = variableNames.Take(variableNames.Length - 1);
                        var target = variableNames.Last();

                        // count the lines that follow the header
                        int lines = 0;
                        while (reader.ReadLine() != null)
                        {
                            lines++;
                        }

                        // Training rows come first, the remaining rows form the test partition.
                        var trainEnd   = (int)Math.Round(lines * trainTestSplit);
                        var trainRange = new IntRange(0, trainEnd);
                        var testRange  = new IntRange(trainEnd, lines);

                        yield return new PennMLRegressionDataDescriptor(entry.Name, variableNames, allowedInputVariables, target, trainRange, testRange);
                    }
                }
            }
        }