/// <summary>
/// Loads the regression problem data described by <paramref name="id"/> from the zip archive
/// that is embedded as a manifest resource in this type's assembly.
/// </summary>
/// <param name="id">Descriptor of the instance to load; it is cast to <c>ResourceRegressionDataDescriptor</c>.</param>
/// <returns>The result of the descriptor's <c>GenerateRegressionData</c> for the parsed dataset.</returns>
/// <exception cref="ArgumentException">
/// Thrown when the archive has no entry named after the descriptor's resource name, or when the
/// descriptor's <c>CheckVariableNames</c> rejects the variable names found in the parsed file.
/// </exception>
public override IRegressionProblemData LoadData(IDataDescriptor id) {
  var descriptor = (ResourceRegressionDataDescriptor)id;
  var instanceArchiveName = GetResourceName(FileName + @"\.zip");

  using (var instancesZipFile = new ZipArchive(GetType().Assembly.GetManifestResourceStream(instanceArchiveName), ZipArchiveMode.Read)) {
    var entry = instancesZipFile.GetEntry(descriptor.ResourceName);
    if (entry == null) {
      // GetEntry returns null for unknown names; fail with a descriptive error instead of a NullReferenceException
      throw new ArgumentException("The instance archive does not contain an entry named '" + descriptor.ResourceName + "'.");
    }

    NumberFormatInfo numberFormat;
    DateTimeFormatInfo dateFormat;
    char separator;

    // first pass over the entry: detect number format, date format and column separator
    using (Stream stream = entry.Open()) {
      TableFileParser.DetermineFileFormat(stream, out numberFormat, out dateFormat, out separator);
    }

    // second pass over a freshly opened stream: parse the table with the detected formats
    TableFileParser csvFileParser = new TableFileParser();
    using (Stream stream = entry.Open()) {
      // NOTE(review): the trailing 'true' presumably marks the first line as column names - confirm against TableFileParser.Parse
      csvFileParser.Parse(stream, numberFormat, dateFormat, separator, true);
    }

    Dataset dataset = new Dataset(csvFileParser.VariableNames, csvFileParser.Values);
    if (!descriptor.CheckVariableNames(csvFileParser.VariableNames)) {
      throw new ArgumentException("Parsed file contains variables which are not in the descriptor.");
    }

    return descriptor.GenerateRegressionData(dataset);
  }
}
/// <summary>
/// Enumerates one data descriptor per entry of the zip archive that is embedded as a manifest
/// resource in this type's assembly.
/// </summary>
/// <remarks>
/// The sequence is produced lazily with <c>yield return</c>; the archive and the current entry's
/// reader remain open while a descriptor is handed to the consumer.
/// </remarks>
/// <returns>A <c>PennMLRegressionDataDescriptor</c> for each archive entry.</returns>
public override IEnumerable<IDataDescriptor> GetDataDescriptors() {
  var archiveResource = GetResourceName(FileName + @"\.zip");
  var resourceStream = GetType().Assembly.GetManifestResourceStream(archiveResource);

  using (var archive = new ZipArchive(resourceStream, ZipArchiveMode.Read)) {
    foreach (var zipEntry in archive.Entries) {
      // only the separator is used below; the two format infos are required by the out-parameter signature
      NumberFormatInfo detectedNumberFormat;
      DateTimeFormatInfo detectedDateFormat;
      char columnSeparator;

      using (var probeStream = entry_Open(zipEntry)) {
        // format detection disposes the stream it is given, so the entry is reopened for reading afterwards
        TableFileParser.DetermineFileFormat(probeStream, out detectedNumberFormat, out detectedDateFormat, out columnSeparator);
      }

      using (var dataStream = entry_Open(zipEntry))
      using (var lineReader = new StreamReader(dataStream)) {
        // the first line holds the column names
        var headerLine = lineReader.ReadLine();
        var columns = headerLine.Split(columnSeparator);

        // by convention each dataset from the PennML collection reserves the last column for the target
        var targetVariable = columns.Last();
        var inputVariables = columns.Take(columns.Length - 1);

        // every remaining line counts as one data row
        int rowCount = 0;
        for (var line = lineReader.ReadLine(); line != null; line = lineReader.ReadLine()) {
          rowCount++;
        }

        // rows [0, splitIndex) form the training partition, rows [splitIndex, rowCount) the test partition
        var splitIndex = (int)Math.Round(rowCount * trainTestSplit);
        var trainingPartition = new IntRange(0, splitIndex);
        var testPartition = new IntRange(splitIndex, rowCount);

        yield return new PennMLRegressionDataDescriptor(zipEntry.Name, columns, inputVariables, targetVariable, trainingPartition, testPartition);
      }
    }
  }
}

// Opens a read stream on the given archive entry.
private static Stream entry_Open(ZipArchiveEntry zipEntry) {
  return zipEntry.Open();
}