/// <summary>
/// Verifies that rows written in two batches, each followed by a Flush(), with an identical
/// schema end up in exactly one CSV file whose columns are in alphabetical order.
/// </summary>
public void TestMultipleFlushes()
{
    const string INPUT =
        "TYPE,FEATURE,MODEL,TIMESTAMP,VALUE,UNITS\n" +
        "Diag,Feature1,3231,t0,DDD,N/A\n" +
        "Diag,Feature1,3231,t1,DDD,N/A\n" +
        "Diag,Feature2,3231,t1,123,ms\n" +
        "Clinical,Feature2,2110,t4,456,ms\n" +
        "Clinical,Feature3,3231-40,t5,456.78,s\n" +
        "Clinical,Feature3,3231-40,t6,456.78,s\n";

    // Same as the input, but with the columns in alphabetical order
    const string EXPECTED_OUTPUT =
        "FEATURE,MODEL,TIMESTAMP,TYPE,UNITS,VALUE\n" +
        "Feature1,3231,t0,Diag,N/A,DDD\n" +
        "Feature1,3231,t1,Diag,N/A,DDD\n" +
        "Feature2,3231,t1,Diag,ms,123\n" +
        "Feature2,2110,t4,Clinical,ms,456\n" +
        "Feature3,3231-40,t5,Clinical,s,456.78\n" +
        "Feature3,3231-40,t6,Clinical,s,456.78\n";

    var partitionColumns = new string[0];
    var keyColumns = new string[0];
    var tableWriter = new PartitionedTableCSVWriter(_rootPath, "Table", partitionColumns, keyColumns);

    // Element 0 is the header row; the remaining elements are the data rows.
    var rowsToWrite = INPUT.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
                           .Select(row => row.Split(','))
                           .ToArray();
    var columnNames = rowsToWrite[0];

    // Write the first two data rows to the table writer, then flush it
    int i = 1;
    for (; i < 3; ++i)
    {
        var rowValues = columnNames.Zip(rowsToWrite[i], (name, val) => new { name, val })
                                   .ToDictionary(item => item.name, item => item.val);
        tableWriter.AddRow(rowValues);
    }
    tableWriter.Flush();

    // Write the remaining rows to the table writer, then flush it again
    for (; i < rowsToWrite.Length; ++i)
    {
        var rowValues = columnNames.Zip(rowsToWrite[i], (name, val) => new { name, val })
                                   .ToDictionary(item => item.name, item => item.val);
        tableWriter.AddRow(rowValues);
    }
    tableWriter.Flush();

    // Both flushes must have landed in the same CSV file; read it back and compare.
    string[] files = Directory.GetFiles(_rootPath, "*.csv", SearchOption.AllDirectories);
    // AreEqual (rather than IsTrue on a comparison) reports the actual file count on failure.
    Assert.AreEqual(1, files.Length, "Only one CSV file should exist across multiple writes");

    string csv = File.ReadAllText(files[0]);
    Assert.AreEqual(EXPECTED_OUTPUT, csv, "CSV table writer output matches the input.");
}
/// <summary>
/// Verifies the column order of an unpartitioned table that declares key columns: the key
/// columns come first in the order given, followed by the other columns alphabetically.
/// </summary>
public void TestSinglePartitionWithKeys()
{
    const string SourceCsv =
        "TYPE,FEATURE,MODEL,TIMESTAMP,VALUE,UNITS\n" +
        "Diag,Feature1,3231,t0,DDD,N/A\n" +
        "Diag,Feature1,3231,t1,DDD,N/A\n" +
        "Diag,Feature2,3231,t1,123,ms\n" +
        "Clinical,Feature2,2110,t4,456,ms\n" +
        "Clinical,Feature3,3231-40,t5,456.78,s\n" +
        "Clinical,Feature3,3231-40,t6,456.78,s\n";

    // Key columns (TIMESTAMP, MODEL) lead; everything else follows alphabetically.
    const string ExpectedCsv =
        "TIMESTAMP,MODEL,FEATURE,TYPE,UNITS,VALUE\n" +
        "t0,3231,Feature1,Diag,N/A,DDD\n" +
        "t1,3231,Feature1,Diag,N/A,DDD\n" +
        "t1,3231,Feature2,Diag,ms,123\n" +
        "t4,2110,Feature2,Clinical,ms,456\n" +
        "t5,3231-40,Feature3,Clinical,s,456.78\n" +
        "t6,3231-40,Feature3,Clinical,s,456.78\n";

    var noPartitions = new string[0];
    var keys = new string[] { "TIMESTAMP", "MODEL" };
    var writer = new PartitionedTableCSVWriter(_rootPath, "Table", noPartitions, keys);

    var parsedLines = SourceCsv
        .Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
        .Select(line => line.Split(','))
        .ToArray();
    var header = parsedLines[0];

    // Feed every data row (everything after the header) to the writer, then flush once.
    foreach (var cells in parsedLines.Skip(1))
    {
        var row = header
            .Zip(cells, (column, cell) => new { column, cell })
            .ToDictionary(pair => pair.column, pair => pair.cell);
        writer.AddRow(row);
    }
    writer.Flush();

    // Read the first CSV file found in the table directory and compare it to the expectation.
    string tableDir = Path.Combine(_rootPath, "Table.table");
    string[] csvFiles = Directory.GetFiles(tableDir, "*.csv");
    string actualCsv = File.ReadAllText(csvFiles[0]);
    Assert.AreEqual(ExpectedCsv, actualCsv, "CSV table writer output matches the input.");
}
/// <summary>
/// Verifies that rows are split into TYPE=.../FEATURE=... subdirectories of the table
/// directory, that exactly the expected directories exist, and that each leaf directory
/// holds one CSV file with the key columns first and without the partition columns.
/// </summary>
public void TestMultiplePartitions()
{
    var tablePath = Path.Combine(_rootPath, "Table.table");

    const string INPUT =
        "TYPE,FEATURE,MODEL,TIMESTAMP,VALUE,UNITS\n" +
        "Diag,Feature1,3231,t0,DDD,N/A\n" +
        "Diag,Feature1,3231,t1,DDD,N/A\n" +
        "Diag,Feature2,3231,t1,123,ms\n" +
        "Clinical,Feature2,2110,t4,456,ms\n" +
        "Clinical,Feature3,3231-40,t5,456.78,s\n" +
        "Clinical,Feature3,3231-40,t6,456.78,s\n";

    // The above input should be partitioned into subdirectories per the following (directory, output):
    var expectedOutput = new Dictionary<string, string>()
    {
        {
            Path.Combine(tablePath, "TYPE=Diag", "FEATURE=Feature1"),
            "TIMESTAMP,MODEL,UNITS,VALUE\n" +
            "t0,3231,N/A,DDD\n" +
            "t1,3231,N/A,DDD\n"
        },
        {
            Path.Combine(tablePath, "TYPE=Diag", "FEATURE=Feature2"),
            "TIMESTAMP,MODEL,UNITS,VALUE\n" +
            "t1,3231,ms,123\n"
        },
        {
            Path.Combine(tablePath, "TYPE=Clinical", "FEATURE=Feature2"),
            "TIMESTAMP,MODEL,UNITS,VALUE\n" +
            "t4,2110,ms,456\n"
        },
        {
            Path.Combine(tablePath, "TYPE=Clinical", "FEATURE=Feature3"),
            "TIMESTAMP,MODEL,UNITS,VALUE\n" +
            "t5,3231-40,s,456.78\n" +
            "t6,3231-40,s,456.78\n"
        },
    };

    var partitionColumns = new string[] { "TYPE", "FEATURE" };
    var keyColumns = new string[] { "TIMESTAMP", "MODEL" };
    var tableWriter = new PartitionedTableCSVWriter(_rootPath, "Table", partitionColumns, keyColumns);

    // Element 0 is the header row; the remaining elements are the data rows.
    var rowsToWrite = INPUT.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
                           .Select(row => row.Split(','))
                           .ToArray();
    var columnNames = rowsToWrite[0];

    // Write all of the rows (excluding the header) to the table writer then flush it
    for (int i = 1; i < rowsToWrite.Length; ++i)
    {
        var rowValues = columnNames.Zip(rowsToWrite[i], (name, val) => new { name, val })
                                   .ToDictionary(item => item.name, item => item.val);
        tableWriter.AddRow(rowValues);
    }
    tableWriter.Flush();

    // Make sure that the partitioning directories have been created exactly as expected
    HashSet<string> expectedPartitionDirs = new HashSet<string>(
        new string[]
        {
            Path.Combine(tablePath, "TYPE=Diag"),
            Path.Combine(tablePath, "TYPE=Clinical"),
            Path.Combine(tablePath, "TYPE=Diag", "FEATURE=Feature1"),
            Path.Combine(tablePath, "TYPE=Diag", "FEATURE=Feature2"),
            Path.Combine(tablePath, "TYPE=Clinical", "FEATURE=Feature2"),
            Path.Combine(tablePath, "TYPE=Clinical", "FEATURE=Feature3"),
        });
    HashSet<string> actualPartitionDirs = new HashSet<string>(
        Directory.GetDirectories(tablePath, "*", SearchOption.AllDirectories));
    Assert.IsTrue(expectedPartitionDirs.SetEquals(actualPartitionDirs),
        "The partitioning directories were created as expected");

    // The keys of expectedOutput are exactly the leaf (FEATURE=...) directories, and the set
    // comparison above has already established that they exist. Iterating them avoids picking
    // leaf directories by a substring match on the full path, which would misfire if the root
    // path itself happened to contain "FEATURE".
    foreach (var kvp in expectedOutput)
    {
        // Make sure there is exactly one CSV file in the leaf directory (neither none nor several)
        string[] csvFiles = Directory.GetFiles(kvp.Key, "*.csv");
        Assert.AreEqual(1, csvFiles.Length, kvp.Key + " should contain exactly one CSV file.");

        // Make sure the file's contents are correct
        string fileContents = File.ReadAllText(csvFiles[0]);
        Assert.AreEqual(kvp.Value, fileContents, "CSV table writer output matches the input.");
    }
}
/// <summary>
/// Verifies that two flushes with different column sets produce two separate CSV files:
/// the first without the VALUE column, the second with the full set of columns.
/// </summary>
public void TestMultipleFlushesOfDifferentSchemas()
{
    const string SourceCsv =
        "TYPE,FEATURE,MODEL,TIMESTAMP,VALUE,UNITS\n" +
        "Diag,Feature1,3231,t0,DDD,N/A\n" +
        "Diag,Feature1,3231,t1,DDD,N/A\n" +
        "Diag,Feature2,3231,t1,123,ms\n" +
        "Clinical,Feature2,2110,t4,456,ms\n" +
        "Clinical,Feature3,3231-40,t5,456.78,s\n" +
        "Clinical,Feature3,3231-40,t6,456.78,s\n";

    // First flush: three rows, VALUE column absent.
    const string ExpectedFirstFile =
        "FEATURE,MODEL,TIMESTAMP,TYPE,UNITS\n" +
        "Feature1,3231,t0,Diag,N/A\n" +
        "Feature1,3231,t1,Diag,N/A\n" +
        "Feature2,3231,t1,Diag,ms\n";

    // Second flush: the remaining rows with every column present.
    const string ExpectedSecondFile =
        "FEATURE,MODEL,TIMESTAMP,TYPE,UNITS,VALUE\n" +
        "Feature2,2110,t4,Clinical,ms,456\n" +
        "Feature3,3231-40,t5,Clinical,s,456.78\n" +
        "Feature3,3231-40,t6,Clinical,s,456.78\n";

    const int FirstBatchSize = 3;

    var noPartitions = new string[0];
    var noKeys = new string[0];
    var writer = new PartitionedTableCSVWriter(_rootPath, "Table", noPartitions, noKeys);

    var parsedLines = SourceCsv
        .Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
        .Select(line => line.Split(','))
        .ToArray();
    var header = parsedLines[0];

    // Batch 1: the first three data rows, each stripped of its VALUE entry before being added.
    foreach (var cells in parsedLines.Skip(1).Take(FirstBatchSize))
    {
        var row = header
            .Zip(cells, (column, cell) => new { column, cell })
            .ToDictionary(pair => pair.column, pair => pair.cell);
        row.Remove("VALUE");
        writer.AddRow(row);
    }
    writer.Flush();

    // Exactly one CSV file must exist after the first flush; check its contents.
    string[] filesAfterFirstFlush = Directory.GetFiles(_rootPath, "*.csv", SearchOption.AllDirectories);
    string file1 = filesAfterFirstFlush.Single();
    string firstContents = File.ReadAllText(file1);
    Assert.AreEqual(ExpectedFirstFile, firstContents, "File1 was written correctly.");

    // Batch 2: all remaining data rows, unmodified.
    foreach (var cells in parsedLines.Skip(1 + FirstBatchSize))
    {
        var row = header
            .Zip(cells, (column, cell) => new { column, cell })
            .ToDictionary(pair => pair.column, pair => pair.cell);
        writer.AddRow(row);
    }
    writer.Flush();

    // Exactly one CSV file other than the first must now exist; check its contents.
    string[] filesAfterSecondFlush = Directory.GetFiles(_rootPath, "*.csv", SearchOption.AllDirectories);
    string file2 = filesAfterSecondFlush.Single(path => path != file1);
    string secondContents = File.ReadAllText(file2);
    Assert.AreEqual(ExpectedSecondFile, secondContents, "File2 was written correctly.");
}
/// <summary>
/// Verifies that rows supplying different subsets of columns are written to a single CSV
/// file whose header contains every column seen, with an empty cell wherever a row did not
/// supply that column.
/// </summary>
public void TestSinglePartitionWithDynamicColumns()
{
    const string SourceCsv =
        "TYPE,FEATURE,MODEL,TIMESTAMP,VALUE,UNITS\n" +
        "Diag,Feature1,3231,t0,DDD,N/A\n" +
        "Diag,Feature1,3231,t1,DDD,N/A\n" +
        "Diag,Feature2,3231,t1,123,ms\n" +
        "Clinical,Feature2,2110,t4,456,ms\n" +
        "Clinical,Feature3,3231-40,t5,456.78,s\n" +
        "Clinical,Feature3,3231-40,t6,456.78,s\n";

    // Key columns (TIMESTAMP, MODEL) lead; the remaining columns follow alphabetically.
    const string ExpectedCsv =
        "TIMESTAMP,MODEL,FEATURE,TYPE,UNITS,VALUE\n" +
        // Rows 1-2: VALUE is empty because those rows were added without it
        "t0,3231,Feature1,Diag,N/A,\n" +
        "t1,3231,Feature1,Diag,N/A,\n" +
        // Row 3 is complete
        "t1,3231,Feature2,Diag,ms,123\n" +
        // Rows 4-5: FEATURE is empty because those rows were added without it
        "t4,2110,,Clinical,ms,456\n" +
        "t5,3231-40,,Clinical,s,456.78\n" +
        // Row 6: only the key columns were supplied
        "t6,3231-40,,,,\n";

    var noPartitions = new string[0];
    var keys = new string[] { "TIMESTAMP", "MODEL" };
    var writer = new PartitionedTableCSVWriter(_rootPath, "Table", noPartitions, keys);

    var parsedLines = SourceCsv
        .Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries)
        .Select(line => line.Split(','))
        .ToArray();
    var header = parsedLines[0];

    // Add every data row (1-based numbering, header excluded), dropping columns per row number.
    int rowNumber = 0;
    foreach (var cells in parsedLines.Skip(1))
    {
        ++rowNumber;
        var row = header
            .Zip(cells, (column, cell) => new { column, cell })
            .ToDictionary(pair => pair.column, pair => pair.cell);

        if (rowNumber == 1 || rowNumber == 2)
        {
            // Rows 1 and 2 go in without VALUE
            row.Remove("VALUE");
        }
        else if (rowNumber == 4 || rowNumber == 5)
        {
            // Rows 4 and 5 go in without FEATURE
            row.Remove("FEATURE");
        }
        else if (rowNumber == 6)
        {
            // The last row keeps only its key columns
            foreach (var column in new string[] { "FEATURE", "TYPE", "UNITS", "VALUE" })
            {
                row.Remove(column);
            }
        }

        writer.AddRow(row);
    }
    writer.Flush();

    // Read the first CSV file found in the table directory and compare it to the expectation.
    string tableDir = Path.Combine(_rootPath, "Table.table");
    string[] csvFiles = Directory.GetFiles(tableDir, "*.csv");
    string actualCsv = File.ReadAllText(csvFiles[0]);
    Assert.AreEqual(ExpectedCsv, actualCsv, "CSV table writer output matches the input.");
}