/// <summary>
///  Copy rows from the input file to the output file, keeping only the rows in which
///  the value column does not start with the content of the name column.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to read</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="valueColumnIdentifier">Identifier of the column whose value is tested</param>
/// <param name="nameColumnIdentifier">Identifier of the column holding the prefix to look for</param>
private static void NotStartsWith(string inputFilePath, string outputFilePath, string valueColumnIdentifier, string nameColumnIdentifier)
{
    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        int valueIndex = source.ColumnIndex(valueColumnIdentifier);
        int nameIndex = source.ColumnIndex(nameColumnIdentifier);

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
        {
            // The output has exactly the same columns as the input
            destination.SetColumns(source.Columns);

            while (source.NextRow())
            {
                String8 prefix = source.Current(nameIndex).ToString8();
                String8 candidate = source.Current(valueIndex).ToString8();

                // Rows whose value begins with the name are filtered out
                if (candidate.StartsWith(prefix))
                {
                    continue;
                }

                for (int column = 0; column < source.CurrentRowColumns; ++column)
                {
                    destination.Write(source.Current(column).ToString8());
                }

                destination.NextRow();
            }

            WriteSizeSummary(source, destination);
        }
    }
}
/// <summary>
///  Runs the 'sanitize' command end to end against the embedded sample CSV and spec:
///  checks the exit codes, validates the columns and values of the sanitized output,
///  and confirms that a different key produces different output.
/// </summary>
public void Sanitize_EndToEnd()
{
    Assembly xsvTest = Assembly.GetExecutingAssembly();
    Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.csv", "SanitizeSampleSource.csv", xsvTest);
    Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.sanispec", "SanitizeSampleSource.sanispec", xsvTest);

    // Verify UsageException if no key is passed
    Assert.AreEqual(-2, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec" }));

    // Verify success for base sanitize
    File.Delete("SanitizeOutput.csv");
    Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec", "Key1" }));

    // Validate the result
    using (ITabularReader r = TabularFactory.BuildReader("SanitizeOutput.csv"))
    {
        Assert.IsTrue(r.Columns.Contains("ID"), "ID column is kept (no spec line)");
        Assert.IsTrue(r.Columns.Contains("Path"), "Path column is kept (mapped)");
        Assert.IsTrue(r.Columns.Contains("IsEmptyPath"), "IsEmptyPath is kept (Keep line)");
        Assert.IsFalse(r.Columns.Contains("IsUnderXsv"), "IsUnderXsv column is dropped (Drop line)");

        int idColumnIndex = r.ColumnIndex("ID");
        int pathColumnIndex = r.ColumnIndex("Path");
        int isEmptyPathColumnIndex = r.ColumnIndex("IsEmptyPath");

        while (r.NextRow())
        {
            int id = r.Current(idColumnIndex).ToInteger();
            string path = r.Current(pathColumnIndex).ToString();

            Assert.AreEqual(r.Current(isEmptyPathColumnIndex).ToBoolean(), String.IsNullOrEmpty(path), "IsEmptyPath condition matches whether mapped path is empty");

            if (id == 5)
            {
                Assert.AreEqual("Elfie", path, "'Elfie' is echoed (Echo in spec)");
            }
            else if (!String.IsNullOrEmpty(path))
            {
                // Ordinal comparison: this is an exact token check, not a linguistic one
                Assert.IsTrue(path.StartsWith("WarmBeggedTruth\\", StringComparison.Ordinal), "Verify path is mapped in parts, and 'Elfie' is consistently mapped.");
            }
        }

        Assert.IsTrue(r.RowCountRead < 7, "Verify sample excluded at least one row.");
    }

    // Run with another key
    Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput2.csv", @"SanitizeSampleSource.sanispec", "Key2" }));

    // Verify mappings are different
    Assert.AreNotEqual(File.ReadAllText("SanitizeOutput2.csv"), File.ReadAllText("SanitizeOutput.csv"));
}
/// <summary>
///  Compare the distinct values of one column across two files, and write the values
///  found in only one of them. Each output row has an "In" marker ("-" for values only
///  in the old file, "+" for values only in the new file) followed by the value.
/// </summary>
/// <param name="oldFilePath">Path of the old tabular file</param>
/// <param name="newFilePath">Path of the new tabular file</param>
/// <param name="outputFilePath">Path of the tabular file to write differences to</param>
/// <param name="columnIdentifier">Identifier of the column to compare, looked up in both files</param>
private static void Compare(string oldFilePath, string newFilePath, string outputFilePath, string columnIdentifier)
{
    // One block holds the copies of every value kept in the sets below
    String8Block block = new String8Block();
    HashSet<String8> oldValues = new HashSet<String8>();
    HashSet<String8> newValues = new HashSet<String8>();

    // Gather the distinct values from the old file
    using (ITabularReader oldReader = TabularFactory.BuildReader(oldFilePath))
    {
        int oldColumnIndex = oldReader.ColumnIndex(columnIdentifier);

        while (oldReader.NextRow())
        {
            String8 kept = block.GetCopy(oldReader.Current(oldColumnIndex));
            oldValues.Add(kept);
        }

        Trace.WriteLine(String.Format("Old: {0:n0} values for \"{1}\" in {2:n0} rows.", oldValues.Count, columnIdentifier, oldReader.RowCountRead));
    }

    // Gather the distinct values from the new file
    using (ITabularReader newReader = TabularFactory.BuildReader(newFilePath))
    {
        int newColumnIndex = newReader.ColumnIndex(columnIdentifier);

        while (newReader.NextRow())
        {
            String8 kept = block.GetCopy(newReader.Current(newColumnIndex));
            newValues.Add(kept);
        }

        Trace.WriteLine(String.Format("New: {0:n0} values for \"{1}\" in {2:n0} rows.", newValues.Count, columnIdentifier, newReader.RowCountRead));
    }

    // Compute each one-sided difference on a copy, leaving the full sets intact
    HashSet<String8> oldOnly = new HashSet<String8>(oldValues);
    oldOnly.ExceptWith(newValues);

    HashSet<String8> newOnly = new HashSet<String8>(newValues);
    newOnly.ExceptWith(oldValues);

    Trace.WriteLine(String.Format("{0:n0} values were only in \"{1}\".\r\n{2:n0} values were only in \"{3}\".", oldOnly.Count, oldFilePath, newOnly.Count, newFilePath));

    String8 removedMarker = String8.Convert("-", new byte[1]);
    String8 addedMarker = String8.Convert("+", new byte[1]);

    using (ITabularWriter output = TabularFactory.BuildWriter(outputFilePath))
    {
        output.SetColumns(new string[] { "In", columnIdentifier });

        // Old-only values first, then new-only values
        foreach (String8 removed in oldOnly)
        {
            output.Write(removedMarker);
            output.Write(removed);
            output.NextRow();
        }

        foreach (String8 added in newOnly)
        {
            output.Write(addedMarker);
            output.Write(added);
            output.NextRow();
        }
    }
}
/// <summary>
///  Write each distinct value of one column of the input file to a single-column
///  output file, in order of first appearance.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to read</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="columnIdentifier">Identifier of the column to find distinct values in</param>
private static void Distinct(string inputFilePath, string outputFilePath, string columnIdentifier)
{
    String8Block block = new String8Block();
    HashSet<String8> seen = new HashSet<String8>();

    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        int sourceColumn = source.ColumnIndex(columnIdentifier);

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
        {
            // The output column is named after the resolved input column
            destination.SetColumns(new string[] { source.Columns[sourceColumn] });

            while (source.NextRow())
            {
                String8 current = source.Current(sourceColumn).ToString8();

                if (seen.Contains(current))
                {
                    continue;
                }

                // Only new values are copied into the block before being remembered
                // (presumably because the reader's value does not outlive the row - confirm)
                seen.Add(block.GetCopy(current));

                destination.Write(current);
                destination.NextRow();
            }

            WriteSizeSummary(source, destination);
        }
    }
}
/// <summary>
///  Copy the input file to the output file, passing the values of one column through
///  WriteHtmlEscaped and writing every other column unchanged.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to read</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="columnIdentifier">Identifier of the column to pass through WriteHtmlEscaped</param>
private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnIdentifier)
{
    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        int escapedColumn = source.ColumnIndex(columnIdentifier);

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
        {
            destination.SetColumns(source.Columns);

            while (source.NextRow())
            {
                for (int column = 0; column < source.CurrentRowColumns; ++column)
                {
                    if (column != escapedColumn)
                    {
                        // Every other column is copied as-is
                        destination.Write(source.Current(column).ToString8());
                    }
                    else
                    {
                        WriteHtmlEscaped(source.Current(column).ToString8(), destination);
                    }
                }

                destination.NextRow();
            }

            WriteSizeSummary(source, destination);
        }
    }
}
/// <summary>
///  Copy the named columns, in the order given, from the input file to the output file.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to read</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="columnsDelimited">Comma-delimited list of columns to copy; names are trimmed</param>
private static void Copy(string inputFilePath, string outputFilePath, string columnsDelimited)
{
    // Split the requested column list and trim whitespace around each name
    string[] requested = columnsDelimited.Split(',');
    List<string> columns = new List<string>();
    for (int n = 0; n < requested.Length; ++n)
    {
        columns.Add(requested[n].Trim());
    }

    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        // Resolve each requested column to its position in the input
        int[] sourceIndices = new int[columns.Count];
        int slot = 0;
        while (slot < sourceIndices.Length)
        {
            sourceIndices[slot] = source.ColumnIndex(columns[slot]);
            slot++;
        }

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
        {
            // Output columns use the names as requested
            destination.SetColumns(columns);

            while (source.NextRow())
            {
                foreach (int sourceIndex in sourceIndices)
                {
                    destination.Write(source.Current(sourceIndex).ToString8());
                }

                destination.NextRow();
            }

            WriteSizeSummary(source, destination);
        }
    }
}
/// <summary>
///  Build a TableMetadata for the table under the given root path by reading its schema
///  file (Name and Type per column), its metadata file (Name, Context, Value rows), and
///  the text of its config query.
/// </summary>
/// <param name="streamProvider">Provider used to open and read the table's files</param>
/// <param name="tableRootPath">Root path of the table; file names are combined with it</param>
/// <returns>TableMetadata with Schema, RowCount and Query populated</returns>
/// <exception cref="NotImplementedException">A metadata row has a Name other than 'RowCount'</exception>
private static TableMetadata Build(IStreamProvider streamProvider, string tableRootPath)
{
    TableMetadata result = new TableMetadata();

    // Schema: one ColumnDetails per row
    using (ITabularReader schemaReader = TabularFactory.BuildReader(streamProvider.OpenRead(Path.Combine(tableRootPath, SchemaFileName)), SchemaFileName))
    {
        int nameColumn = schemaReader.ColumnIndex("Name");
        int typeColumn = schemaReader.ColumnIndex("Type");

        while (schemaReader.NextRow())
        {
            string columnName = schemaReader.Current(nameColumn).ToString();
            string typeName = schemaReader.Current(typeColumn).ToString();
            result.Schema.Add(new ColumnDetails(columnName, TypeProviderFactory.Get(typeName).Type));
        }
    }

    // Metadata: name/context/value rows; only 'RowCount' is understood
    string metadataFilePath = Path.Combine(tableRootPath, MetadataFileName);
    using (ITabularReader metadataReader = TabularFactory.BuildReader(streamProvider.OpenRead(metadataFilePath), MetadataFileName))
    {
        int nameColumn = metadataReader.ColumnIndex("Name");
        int contextColumn = metadataReader.ColumnIndex("Context");
        int valueColumn = metadataReader.ColumnIndex("Value");

        while (metadataReader.NextRow())
        {
            String8 name = metadataReader.Current(nameColumn).ToString8();

            // The context is read but not currently used by any known metadata entry
            String8 context = metadataReader.Current(contextColumn).ToString8();
            ITabularValue value = metadataReader.Current(valueColumn);

            if (!name.Equals("RowCount"))
            {
                throw new NotImplementedException($"TableMetadataSerializer.Read doesn't know how to read Metadata '{name}'");
            }

            result.RowCount = value.ToInteger();
        }
    }

    result.Query = streamProvider.ReadAllText(Path.Combine(tableRootPath, ConfigQueryPath));
    return result;
}
/// <summary>
///  Read rows from the reader in batches of up to BatchSize, yielding a DataBlock of the
///  requested columns each time a batch fills and once more for any remaining rows.
///  The same DataBlock instance and backing arrays are reused for every batch.
/// </summary>
/// <param name="reader">Reader to pull rows from</param>
/// <param name="columnNames">Columns to read, in the order they appear in the DataBlock</param>
/// <returns>The shared DataBlock, yielded once per batch</returns>
private static IEnumerable<DataBlock> ReadAsDataBlockBatch(ITabularReader reader, IList<string> columnNames)
{
    int columnCount = columnNames.Count;

    // Allocate one reusable DataBlock with a pre-created Value in every cell
    DataBlock batch = new DataBlock(columnNames, BatchSize);
    Value[][] cells = new Value[columnCount][];
    for (int column = 0; column < columnCount; ++column)
    {
        Value[] columnCells = new Value[BatchSize];
        for (int row = 0; row < BatchSize; ++row)
        {
            columnCells[row] = Value.Create(null);
        }

        cells[column] = columnCells;
        batch.SetColumn(column, columnCells);
    }

    // Resolve each requested column to its position in the reader
    int[] readerIndices = new int[columnCount];
    for (int column = 0; column < columnCount; ++column)
    {
        readerIndices[column] = reader.ColumnIndex(columnNames[column]);
    }

    // Fill the batch row by row, yielding it whenever it is full
    int rowsInBatch = 0;
    String8Block batchText = new String8Block();

    while (reader.NextRow())
    {
        for (int column = 0; column < columnCount; ++column)
        {
            // Copy the cell into the batch's own block, then point the Value at those bytes
            String8 text = batchText.GetCopy(reader.Current(readerIndices[column]).ToString8());
            ByteBlock bytes = new ByteBlock(text.Array, text.Index, text.Length);
            cells[column][rowsInBatch].Assign(bytes);
        }

        rowsInBatch++;

        if (rowsInBatch < BatchSize)
        {
            continue;
        }

        yield return batch;

        // Start the next batch, releasing the text copied for the previous one
        rowsInBatch = 0;
        batchText.Clear();
    }

    // NOTE(review): a partial final batch is yielded in the same BatchSize-row block, and the
    // rows past rowsInBatch are not reset here - confirm consumers know the real row count.
    if (rowsInBatch > 0)
    {
        yield return batch;
    }
}
/// <summary>
///  Performance check shared by reader tests: reads the sample file 100 times, touching
///  the LineNumber, Count and Description values of each row, and verifies throughput
///  through Verify.PerformanceByBytes.
/// </summary>
/// <param name="sampleFilePath">Path of the sample file to read</param>
/// <param name="buildReader">Factory building a reader from a file path and a 'has header row' flag</param>
public void Reader_Performance(string sampleFilePath, Func<string, bool, ITabularReader> buildReader)
{
    long rowCountRead = 0;
    long xsvLengthBytes = new FileInfo(sampleFilePath).Length;

    // Goal: 100MB/sec [Surface Book i7]
    Verify.PerformanceByBytes(50 * LongExtensions.Megabyte, () =>
    {
        int iterations = 100;

        for (int pass = 0; pass < iterations; ++pass)
        {
            using (ITabularReader reader = buildReader(sampleFilePath, true))
            {
                int lineNumberColumn = reader.ColumnIndex("LineNumber");
                int countColumn = reader.ColumnIndex("Count");
                int descriptionColumn = reader.ColumnIndex("Description");

                while (reader.NextRow())
                {
                    rowCountRead++;

                    // Rows with fewer than two columns are counted but not converted
                    if (reader.CurrentRowColumns >= 2)
                    {
                        int lineNumber;
                        reader.Current(lineNumberColumn).TryToInteger(out lineNumber);

                        int count;
                        reader.Current(countColumn).TryToInteger(out count);

                        String8 description = reader.Current(descriptionColumn).ToString8();
                    }
                }
            }
        }

        // Report the total number of bytes read across all passes
        return iterations * xsvLengthBytes;
    });
}
/// <summary>
///  Copy rows from the input file to the output file, keeping only rows whose value in
///  the identified column also appears in that column of the 'only in' file.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to filter</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="onlyInInputFilePath">Path of the tabular file providing the set of allowed values</param>
/// <param name="onlyInColumnIdentifier">Identifier of the column, looked up in both files</param>
private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
{
    String8Block block = new String8Block();
    HashSet<String8> allowed = new HashSet<String8>();

    // Pass 1: collect the allowed values from 'onlyInInputFilePath'
    using (ITabularReader allowedReader = TabularFactory.BuildReader(onlyInInputFilePath))
    {
        int allowedColumn = allowedReader.ColumnIndex(onlyInColumnIdentifier);

        while (allowedReader.NextRow())
        {
            String8 kept = block.GetCopy(allowedReader.Current(allowedColumn));
            allowed.Add(kept);
        }
    }

    // Pass 2: copy the input rows whose column value is one of the allowed values
    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        int filterColumn = source.ColumnIndex(onlyInColumnIdentifier);

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
        {
            destination.SetColumns(source.Columns);

            while (source.NextRow())
            {
                if (!allowed.Contains(source.Current(filterColumn).ToString8()))
                {
                    continue;
                }

                for (int column = 0; column < source.CurrentRowColumns; ++column)
                {
                    destination.Write(source.Current(column).ToString8());
                }

                destination.NextRow();
            }

            WriteSizeSummary(source, destination);
        }
    }
}
/// <summary>
///  Run a 'where' match over the reader: resolve the column, parse the operator, convert
///  the value to its best type, and dispatch to the matcher for that type or operator.
/// </summary>
/// <param name="reader">Reader providing the rows to match</param>
/// <param name="columnIdentifier">Identifier of the column to compare</param>
/// <param name="operatorString">Operator text, parsed by OperatorExtensions.Parse</param>
/// <param name="valueString">Value to compare against, converted by ConvertToBestType</param>
/// <param name="writer">Writer handed to the matcher along with the reader</param>
/// <returns>WhereResult describing the column, operator and value, with RowCount set from the reader</returns>
public static WhereResult Where(ITabularReader reader, string columnIdentifier, string operatorString, string valueString, ITabularWriter writer)
{
    int columnIndex = reader.ColumnIndex(columnIdentifier);

    WhereResult result = new WhereResult();
    result.ColumnIndex = columnIndex;
    result.ColumnName = reader.Columns[columnIndex];
    result.Op = OperatorExtensions.Parse(operatorString);
    result.Value = ConvertToBestType(valueString);

    Type valueType = result.Value.GetType();

    if (valueType == typeof(bool))
    {
        MatchBoolCompare(reader, writer, result);
    }
    else if (valueType == typeof(int))
    {
        MatchIntCompare(reader, writer, result);
    }
    else if (valueType == typeof(DateTime))
    {
        MatchDateTimeCompare(reader, writer, result);
    }
    else
    {
        // Any other value type: the operator decides which matcher runs
        if (result.Op == Operator.Contains)
        {
            MatchContains(reader, writer, result);
        }
        else if (result.Op == Operator.StartsWith)
        {
            MatchStartsWith(reader, writer, result);
        }
        else
        {
            MatchStringCompare(reader, writer, result);
        }
    }

    result.RowCount = reader.RowCountRead;
    return result;
}
/// <summary>
///  Find rows in the input file and optionally write them, prefixed with a "RowIndex"
///  column. When a value is passed, rows are matched by comparing the identified column
///  to it; when value is null, columnIndentifier is parsed as the single row number to
///  find. The match and row counts are written to the console.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to search</param>
/// <param name="columnIndentifier">Column identifier when value is passed; otherwise the row number to match</param>
/// <param name="value">Value to match in the column, or null to match by row number</param>
/// <param name="writer">Writer to copy matching rows to, or null to only count matches</param>
private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
{
    int matchCount = 0;
    int rowCount = 0;

    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        // -1 means "not filtering on this"
        int targetRow = -1;
        int targetColumn = -1;

        if (value != null)
        {
            targetColumn = source.ColumnIndex(columnIndentifier);
        }
        else
        {
            targetRow = int.Parse(columnIndentifier);
        }

        while (source.NextRow())
        {
            // Row-number mode: skip everything except the requested row
            if (targetRow != -1 && source.RowCountRead != targetRow)
            {
                continue;
            }

            // Value mode: skip rows too short to have the column, or whose value differs
            if (targetColumn != -1)
            {
                if (source.CurrentRowColumns <= targetColumn)
                {
                    continue;
                }

                bool sameValue = (source.Current(targetColumn).ToString8().CompareTo(value, true) == 0);
                if (!sameValue)
                {
                    continue;
                }
            }

            matchCount++;

            if (writer != null)
            {
                // The column list is set just before the first row is written
                if (writer.RowCountWritten == 0)
                {
                    List<string> header = new List<string>();
                    header.Add("RowIndex");
                    header.AddRange(source.Columns);
                    writer.SetColumns(header);
                }

                writer.Write(source.RowCountRead);

                for (int column = 0; column < source.CurrentRowColumns; ++column)
                {
                    writer.Write(source.Current(column).ToString8());
                }

                writer.NextRow();
            }

            // Row-number mode matches a single row, so stop after it
            if (targetRow != -1)
            {
                break;
            }
        }

        rowCount = source.RowCountRead;
    }

    Console.WriteLine($"Done. {matchCount:n0} out of {rowCount:n0} rows matched.");
}
/// <summary>
///  Basic behavior checks shared by reader tests: missing and empty files, reading with
///  and without a header row, column lookup, empty rows, a very long value, and row
///  numbering against the sample file.
/// </summary>
/// <param name="sampleFilePath">Path of the sample file to read</param>
/// <param name="buildReader">Factory building a reader from a file path and a 'has header row' flag</param>
public void Reader_Basics(string sampleFilePath, Func<string, bool, ITabularReader> buildReader)
{
    // A missing file must fail with FileNotFoundException
    Verify.Exception<FileNotFoundException>(() => buildReader("NonExistantFile.xsv", false));

    // Create a zero-length file
    File.WriteAllText("Empty.xsv", "");

    // With header reading on, construction over the empty file must throw
    Verify.Exception<IOException>(() => buildReader("Empty.xsv", true));

    // With header reading off, the empty file simply has no rows
    using (ITabularReader emptyReader = buildReader("Empty.xsv", false))
    {
        Assert.IsFalse(emptyReader.NextRow());
    }

    // With header reading off, the header line comes back as the first row
    using (ITabularReader headerless = buildReader(sampleFilePath, false))
    {
        Assert.IsTrue(headerless.NextRow());
        Assert.AreEqual("LineNumber", headerless.Current(0).ToString());

        // Column lookup by name must fail when no header row was read
        Verify.Exception<ColumnNotFoundException>(() => headerless.ColumnIndex("Missing"));
    }

    // Read the sample the usual way, with its header row
    using (ITabularReader r = buildReader(sampleFilePath, true))
    {
        // Lookup by exact name
        int lineNumberColumnIndex = r.ColumnIndex("LineNumber");
        Assert.AreEqual(0, lineNumberColumnIndex, "LineNumber column not expected");

        // Lookup by name in a different case
        int descriptionColumnIndex = r.ColumnIndex("deSCRiption");
        Assert.AreEqual(2, descriptionColumnIndex, "Description column not expected");

        // Lookup of a name which isn't in the file
        Verify.Exception<ColumnNotFoundException>(() => r.ColumnIndex("UnknownColumn"));

        while (r.NextRow())
        {
            int rowNumber = r.RowCountRead;

            if (rowNumber % 100 == 99)
            {
                // These rows are empty in the sample: no columns, and value access throws
                Assert.AreEqual(0, r.CurrentRowColumns, "Expected column count 0 in empty rows");
                Verify.Exception<ArgumentOutOfRangeException>(() => { var v = r.Current(lineNumberColumnIndex); });
                continue;
            }

            if (rowNumber == 5000)
            {
                // This row holds a value over 64k [block resizing logic, row values look right]
                String8 longDescription = r.Current(descriptionColumnIndex).ToString8();
                Assert.AreEqual(100000, longDescription.Length);
                continue;
            }

            // Ordinary row: the LineNumber value must parse and equal the row number
            String8 lineNumber8 = r.Current(lineNumberColumnIndex).ToString8();
            int lineNumber = 0;

            if (!lineNumber8.TryToInteger(out lineNumber))
            {
                Assert.Fail(String.Format("\"{0}\" was not converted to an integer.", lineNumber8));
            }
            else
            {
                Assert.AreEqual(rowNumber, lineNumber, "Expected line number to equal row number");
            }

            // Reading values must not change the count of rows read
            Assert.AreEqual(rowNumber, r.RowCountRead, "Expected lines read to equal row number");
        }
    }
}
/// <summary>
///  Copy the input file to the output file, replacing two columns with a single column
///  containing the first value, the separator, and the second value. The new column
///  appears where the first of the two source columns was; the other is dropped.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to read</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="columnName1">Identifier of the column written before the separator</param>
/// <param name="separator">Text written between the two values</param>
/// <param name="columnName2">Identifier of the column written after the separator</param>
/// <param name="outputColumnName">Name of the concatenated output column</param>
private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
{
    String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Find the columns to concatenate
        int columnIndex1 = reader.ColumnIndex(columnName1);
        int columnIndex2 = reader.ColumnIndex(columnName2);

        // Build an output column list and a parallel mapping from output position to input index,
        // with '-1' for the concatenated value. The mapping grows with the output columns rather
        // than being pre-sized to 'Columns.Count - 1', which overflowed when both identifiers
        // resolved to the same column and left a stale entry when extra header names matched.
        List<string> outputColumns = new List<string>();
        List<int> indexMapping = new List<int>();
        bool hasConcatenatedColumn = false;

        for (int i = 0; i < reader.Columns.Count; ++i)
        {
            string columnName = reader.Columns[i];

            bool isConcatenatedSource =
                columnName.Equals(reader.Columns[columnIndex1], StringComparison.OrdinalIgnoreCase)
                || columnName.Equals(reader.Columns[columnIndex2], StringComparison.OrdinalIgnoreCase);

            if (isConcatenatedSource)
            {
                // The output column appears at the position of the first source column only
                if (!hasConcatenatedColumn)
                {
                    hasConcatenatedColumn = true;
                    indexMapping.Add(-1);
                    outputColumns.Add(outputColumnName);
                }
            }
            else
            {
                // Otherwise, copy this column through
                indexMapping.Add(i);
                outputColumns.Add(columnName);
            }
        }

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(outputColumns);

            while (reader.NextRow())
            {
                // Write columns in mapped order
                for (int i = 0; i < indexMapping.Count; ++i)
                {
                    int sourceColumnIndex = indexMapping[i];

                    if (sourceColumnIndex == -1)
                    {
                        // Write the concatenated column as one value built from three parts
                        writer.WriteValueStart();
                        writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                        writer.WriteValuePart(separator8);
                        writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                        writer.WriteValueEnd();
                    }
                    else
                    {
                        writer.Write(reader.Current(sourceColumnIndex).ToString8());
                    }
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
/// <summary>
///  Merge every file in a folder into one output file containing only the latest row for
///  each ID, where "latest" is the last row seen for that ID while walking the files in
///  the order Directory.GetFiles returns them. The output uses the columns of the last
///  file scanned; columns a given input file or row lacks are written as empty values.
/// </summary>
/// <param name="inputFolderPath">Folder whose files are all read as input</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="idColumnIdentifier">Identifier of the ID column, looked up in every input file</param>
private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
{
    String8Block block = new String8Block();
    Dictionary<String8, Tuple<string, int>> latestFileAndRowByID = new Dictionary<String8, Tuple<string, int>>();
    IReadOnlyList<string> writerColumns = null;

    // Walk the input files to figure out the latest copy of each ID
    Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");

    // List the folder once, so both passes see the same files even if the folder changes
    // (for example, because the output file is being written into it).
    string[] inputFilePaths = Directory.GetFiles(inputFolderPath);

    int rowCountRead = 0;
    foreach (string inputFilePath in inputFilePaths)
    {
        using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
        {
            int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

            while (reader.NextRow())
            {
                rowCountRead++;

                // The return value is unused; presumably this uppercases the value in place - confirm
                String8 id = reader.Current(idColumnIndex).ToString8();
                id.ToUpperInvariant();

                // Record the file and row containing this ID, overwriting previous entries
                latestFileAndRowByID[block.GetCopy(id)] = new Tuple<string, int>(inputFilePath, reader.RowCountRead);
            }

            // Capture the columns from the last CSV to write
            writerColumns = reader.Columns;
        }
    }

    Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(writerColumns);

        int[] writerColumnIndexInReader = new int[writerColumns.Count];

        foreach (string inputFilePath in inputFilePaths)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Look up each output column's position in the input file
                for (int i = 0; i < writerColumns.Count; ++i)
                {
                    reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]);
                }

                int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                while (reader.NextRow())
                {
                    String8 id = reader.Current(idColumnIndex).ToString8();
                    id.ToUpperInvariant();

                    // Copy this row to the output file, *if* it's the latest for this ID
                    Tuple<string, int> latestForID = latestFileAndRowByID[id];
                    if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                    {
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            int readerColumnIndex = writerColumnIndexInReader[i];

                            if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                            {
                                // Read from the mapped input position, not the output position
                                writer.Write(reader.Current(readerColumnIndex).ToString8());
                            }
                            else
                            {
                                // Column missing from this file or row: write an empty value
                                writer.Write(String8.Empty);
                            }
                        }

                        writer.NextRow();
                    }
                }
            }
        }

        WriteSizeSummary(null, writer);
    }
}
/// <summary>
///  Sanitize an input file into a given output file using this Sanitizer's configuration.
///  Each input column is passed through its handler, if it has one; when a sample column
///  is configured, rows are included or skipped based on a hash of that column's value.
/// </summary>
/// <param name="inputFile">File Path to input file</param>
/// <param name="outputFile">File Path to output file</param>
public void Sanitize(string inputFile, string outputFile)
{
    using (ITabularReader source = TabularFactory.BuildReader(inputFile))
    {
        // Work out the handler for each input column, and the columns which will be written
        List<string> columnsToOutput;
        IColumnHandler[] handlers = GetHandlersByColumnIndex(source.Columns, out columnsToOutput);

        // Resolve the sample column, if one is configured (-1 means no sampling)
        int sampleColumnIndex = -1;
        if (!String.IsNullOrEmpty(this.SampleColumnName))
        {
            sampleColumnIndex = source.ColumnIndex(this.SampleColumnName);
        }

        // Rows whose sample hash is above this cutoff are excluded
        uint sampleInclusionCutoff = (uint)(uint.MaxValue * this.SampleProbability);

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFile))
        {
            destination.SetColumns(columnsToOutput);

            while (source.NextRow())
            {
                if (sampleColumnIndex > -1)
                {
                    // Hash *without* the hashkey, so the same rows are consistently included or excluded
                    String8 sampleValue = source.Current(sampleColumnIndex).ToString8();
                    uint sampleValueHash = Hashing.Hash(sampleValue, 0);

                    if (sampleValueHash > sampleInclusionCutoff)
                    {
                        continue;
                    }
                }

                // Write the sanitized form of every column which has a handler
                for (int column = 0; column < source.CurrentRowColumns; ++column)
                {
                    IColumnHandler handler = handlers[column];
                    if (handler == null)
                    {
                        continue;
                    }

                    String8 original = source.Current(column).ToString8();
                    destination.Write(handler.Sanitize(original));
                }

                destination.NextRow();
            }
        }
    }
}
/// <summary>
///  Copy the input file to the output file, passing the values of the listed columns
///  through WriteHtmlEscaped and writing every other column unchanged.
/// </summary>
/// <param name="inputFilePath">Path of the tabular file to read</param>
/// <param name="outputFilePath">Path of the tabular file to write</param>
/// <param name="columnsDelimited">Comma-delimited list of columns to pass through WriteHtmlEscaped; names are trimmed</param>
private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnsDelimited)
{
    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    {
        // Resolve every listed column up front, in the order listed
        List<int> escapedColumns = new List<int>();
        foreach (string listedColumn in columnsDelimited.Split(','))
        {
            escapedColumns.Add(source.ColumnIndex(listedColumn.Trim()));
        }

        using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
        {
            destination.SetColumns(source.Columns);

            while (source.NextRow())
            {
                for (int column = 0; column < source.CurrentRowColumns; ++column)
                {
                    String8 cell = source.Current(column).ToString8();

                    if (!escapedColumns.Contains(column))
                    {
                        // Every other column is copied as-is
                        destination.Write(cell);
                    }
                    else
                    {
                        WriteHtmlEscaped(cell, destination);
                    }
                }

                destination.NextRow();
            }

            WriteSizeSummary(source, destination);
        }
    }
}