// Verifies the reader handles all three row terminators - "\r\n", bare "\n", and end-of-file
// with no terminator - without clipping the last column or leaking terminator bytes into it.
public void Reader_NewlineVariations(Func <Stream, ITabularWriter> buildWriter, Func <string, bool, ITabularReader> buildReader)
{
    string xsvPath = "NewlineVariations.xsv";

    // The stream is kept accessible so the test can rewrite row terminators directly.
    // NOTE(review): assumes the writer flushes each row so stream.Position is accurate,
    // and that disposing the writer disposes this stream - confirm against the writer contract.
    Stream stream = new FileStream(xsvPath, FileMode.Create, FileAccess.ReadWrite);
    using (ITabularWriter w = buildWriter(stream))
    {
        w.SetColumns(new string[] { "One", "Two", "Three" });

        for (int row = 0; row < 3; ++row)
        {
            // Cell values are 1..9, three per row
            w.Write(3 * row + 1);
            w.Write(3 * row + 2);
            w.Write(3 * row + 3);

            // Write the end of row but then override it
            long position = stream.Position;
            w.NextRow();

            if (row == 0)
            {
                // Row 0 - newline only: overwrite the first terminator byte with '\n';
                // the stream position is then position+1, so the next row's bytes
                // overwrite the leftover second terminator byte.
                stream.Seek(position, SeekOrigin.Begin);
                stream.WriteByte(UTF8.Newline);
            }
            else if (row == 2)
            {
                // Row 2 - no end of line (truncate the terminator entirely)
                stream.SetLength(position);
            }
        }
    }

    using (ITabularReader r = buildReader(xsvPath, true))
    {
        // Verify column heading not clipped even though no '\r'
        Assert.AreEqual("Three", r.Columns[2]);

        Assert.IsTrue(r.NextRow());
        Assert.AreEqual(3, r.CurrentRowColumns);

        // Verify last column intact on the first data row
        // NOTE(review): row 0 was rewritten to '\n'-only above and row 1 kept "\r\n",
        // so the original comments here ("\r\n" for "3", "\n" for "6") appear swapped - confirm.
        Assert.AreEqual("3", r.Current(2).ToString());

        Assert.IsTrue(r.NextRow());
        Assert.AreEqual(3, r.CurrentRowColumns);

        // Verify last column intact on the second data row
        Assert.AreEqual("6", r.Current(2).ToString());

        Assert.IsTrue(r.NextRow());
        Assert.AreEqual(3, r.CurrentRowColumns);

        // Verify last column not clipped when unterminated [EOF]
        Assert.AreEqual("9", r.Current(2).ToString());

        Assert.IsFalse(r.NextRow(), "Reader didn't stop after last line without newline");
    }
}
// Round-trips a sample file (reader -> writer) and verifies the output is byte-identical:
// every value is unescaped on read and re-escaped on write.
public void Reader_Roundtrip(Func <string, bool, ITabularReader> buildReader, Func <Stream, ITabularWriter> buildWriter)
{
    string filePath = "ValidSample.xsv";
    string copyPath = filePath + ".new";

    // Write a valid file with some values which require CSV escaping
    WriteValidSample(new FileStream(filePath, FileMode.Create, FileAccess.ReadWrite), buildWriter);

    // Direct-copy every cell from the reader to a second file
    using (ITabularReader reader = buildReader(filePath, true))
    using (ITabularWriter writer = buildWriter(new FileStream(copyPath, FileMode.Create, FileAccess.ReadWrite)))
    {
        writer.SetColumns(reader.Columns);

        while (reader.NextRow())
        {
            int cellCount = reader.CurrentRowColumns;
            for (int cell = 0; cell < cellCount; ++cell)
            {
                writer.Write(reader.Current(cell).ToString8());
            }

            writer.NextRow();
        }
    }

    // Verify files are identical
    string fileBefore = File.ReadAllText(filePath);
    string fileAfter = File.ReadAllText(copyPath);
    Assert.AreEqual(fileBefore, fileAfter);
}
// Copy only the requested columns (comma-delimited names) from input to output, in the requested order.
private static void Copy(string inputFilePath, string outputFilePath, string columnsDelimited)
{
    // Parse the column list, trimming whitespace around each name
    string[] requestedNames = columnsDelimited.Split(',');
    List <string> columns = new List <string>();
    for (int n = 0; n < requestedNames.Length; ++n)
    {
        columns.Add(requestedNames[n].Trim());
    }

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Resolve each requested column to its index in the input file
        int[] columnIndices = new int[columns.Count];
        for (int i = 0; i < columnIndices.Length; ++i)
        {
            columnIndices[i] = reader.ColumnIndex(columns[i]);
        }

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(columns);

            while (reader.NextRow())
            {
                foreach (int sourceIndex in columnIndices)
                {
                    writer.Write(reader.Current(sourceIndex).ToString8());
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Copy input to output, HTML-escaping the values of the named columns (comma-delimited list).
private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnsDelimited)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Resolve the requested column names to indices once, up front.
        // Use a HashSet so the per-cell membership test in the row loop below is O(1);
        // the previous List<int>.Contains scanned the list for every cell of every row.
        HashSet <int> columnIndicesToEscape = new HashSet <int>(columnsDelimited.Split(',').Select((col) => reader.ColumnIndex(col.Trim())));

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            while (reader.NextRow())
            {
                for (int i = 0; i < reader.CurrentRowColumns; ++i)
                {
                    if (columnIndicesToEscape.Contains(i))
                    {
                        // Escape the configured columns
                        WriteHtmlEscaped(reader.Current(i).ToString8(), writer);
                    }
                    else
                    {
                        // Pass every other column through unchanged
                        writer.Write(reader.Current(i).ToString8());
                    }
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Echo every row where the target column's value contains result.Value anywhere,
// counting matches into result.MatchCount.
private static void MatchContains(ITabularReader reader, ITabularWriter writer, WhereResult result)
{
    // Convert the search term to String8 once, up front
    string searchString = (string)result.Value;
    String8 searchTerm = String8.Convert(searchString, new byte[String8.GetLength(searchString)]);

    while (reader.NextRow())
    {
        // Only rows long enough to contain the target column can match
        if (reader.CurrentRowColumns > result.ColumnIndex)
        {
            // Match rows where the column value contains the term anywhere
            if (reader.Current(result.ColumnIndex).ToString8().IndexOf(searchTerm) != -1)
            {
                result.MatchCount++;

                // Write the matching row
                EchoRow(reader, writer);
            }
        }
    }
}
// Copy rows from input to output, excluding rows where the value column starts with the name column.
private static void NotStartsWith(string inputFilePath, string outputFilePath, string valueColumnIdentifier, string nameColumnIdentifier)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        int valueColumnIndex = reader.ColumnIndex(valueColumnIdentifier);
        int nameColumnIndex = reader.ColumnIndex(nameColumnIdentifier);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            while (reader.NextRow())
            {
                String8 name = reader.Current(nameColumnIndex).ToString8();
                String8 value = reader.Current(valueColumnIndex).ToString8();

                // Skip rows where the value starts with the name
                if (value.StartsWith(name)) { continue; }

                // Copy every cell of the surviving row through
                int cellCount = reader.CurrentRowColumns;
                for (int cell = 0; cell < cellCount; ++cell)
                {
                    writer.Write(reader.Current(cell).ToString8());
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Compare the distinct values of one column between an old and a new file, writing the
// values found in only one of the two (marked "-" for old-only, "+" for new-only).
private static void Compare(string oldFilePath, string newFilePath, string outputFilePath, string columnIdentifier)
{
    String8Block block = new String8Block();
    HashSet <String8> oldValues = new HashSet <String8>();
    HashSet <String8> newValues = new HashSet <String8>();

    // Collect the distinct values from the old file
    using (ITabularReader oldReader = TabularFactory.BuildReader(oldFilePath))
    {
        int oldColumnIndex = oldReader.ColumnIndex(columnIdentifier);
        while (oldReader.NextRow())
        {
            oldValues.Add(block.GetCopy(oldReader.Current(oldColumnIndex)));
        }

        Trace.WriteLine(String.Format("Old: {0:n0} values for \"{1}\" in {2:n0} rows.", oldValues.Count, columnIdentifier, oldReader.RowCountRead));
    }

    // Collect the distinct values from the new file
    using (ITabularReader newReader = TabularFactory.BuildReader(newFilePath))
    {
        int newColumnIndex = newReader.ColumnIndex(columnIdentifier);
        while (newReader.NextRow())
        {
            newValues.Add(block.GetCopy(newReader.Current(newColumnIndex)));
        }

        Trace.WriteLine(String.Format("New: {0:n0} values for \"{1}\" in {2:n0} rows.", newValues.Count, columnIdentifier, newReader.RowCountRead));
    }

    // Compute the two one-sided differences
    HashSet <String8> oldOnly = new HashSet <String8>(oldValues);
    oldOnly.ExceptWith(newValues);

    HashSet <String8> newOnly = new HashSet <String8>(newValues);
    newOnly.ExceptWith(oldValues);

    Trace.WriteLine(String.Format("{0:n0} values were only in \"{1}\".\r\n{2:n0} values were only in \"{3}\".", oldOnly.Count, oldFilePath, newOnly.Count, newFilePath));

    // Write each one-sided value with a marker indicating which file it came from
    String8 leftMarker = String8.Convert("-", new byte[1]);
    String8 rightMarker = String8.Convert("+", new byte[1]);
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(new string[] { "In", columnIdentifier });

        foreach (String8 value in oldOnly)
        {
            writer.Write(leftMarker);
            writer.Write(value);
            writer.NextRow();
        }

        foreach (String8 value in newOnly)
        {
            writer.Write(rightMarker);
            writer.Write(value);
            writer.NextRow();
        }
    }
}
// Copy input to output, HTML-escaping the values of one named column.
private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnIdentifier)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        int columnIndexToEscape = reader.ColumnIndex(columnIdentifier);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            while (reader.NextRow())
            {
                int cellCount = reader.CurrentRowColumns;
                for (int cell = 0; cell < cellCount; ++cell)
                {
                    if (cell != columnIndexToEscape)
                    {
                        // Pass other columns through unchanged
                        writer.Write(reader.Current(cell).ToString8());
                    }
                    else
                    {
                        // Escape the configured column
                        WriteHtmlEscaped(reader.Current(cell).ToString8(), writer);
                    }
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Echo every row whose target column parses as a boolean and compares to result.Value
// in the way result.Op requires, counting matches into result.MatchCount.
private static void MatchBoolCompare(ITabularReader reader, ITabularWriter writer, WhereResult result)
{
    bool expected = (bool)result.Value;

    while (reader.NextRow())
    {
        // Only rows long enough to contain the target column can match
        if (reader.CurrentRowColumns > result.ColumnIndex)
        {
            // Only rows whose value parses as a boolean can match
            bool columnValue;
            if (reader.Current(result.ColumnIndex).ToString8().TryToBoolean(out columnValue))
            {
                // Apply the requested comparison operator
                if (result.Op.Matches(columnValue.CompareTo(expected)))
                {
                    result.MatchCount++;

                    // Write the matching row
                    EchoRow(reader, writer);
                }
            }
        }
    }
}
// Copy input to output, prepending an "ID" column with a sequential integer per row,
// starting at firstId.
private static void RowId(string inputFilePath, string outputFilePath, int firstId = 1)
{
    int nextId = firstId;

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        // Output schema is "ID" followed by every input column
        List <string> outputColumns = new List <string>();
        outputColumns.Add("ID");
        outputColumns.AddRange(reader.Columns);
        writer.SetColumns(outputColumns);

        while (reader.NextRow())
        {
            // Assign the next sequential ID to this row
            writer.Write(nextId++);

            int cellCount = reader.CurrentRowColumns;
            for (int cell = 0; cell < cellCount; ++cell)
            {
                writer.Write(reader.Current(cell).ToString8());
            }

            writer.NextRow();
        }

        WriteSizeSummary(reader, writer);
    }
}
// Write each distinct value of one column to the output file, in first-seen order.
private static void Distinct(string inputFilePath, string outputFilePath, string columnIdentifier)
{
    String8Block block = new String8Block();
    HashSet <String8> distinctValues = new HashSet <String8>();

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        int columnIndex = reader.ColumnIndex(columnIdentifier);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            // Single output column, named as in the input file
            writer.SetColumns(new string[] { reader.Columns[columnIndex] });

            while (reader.NextRow())
            {
                String8 value = reader.Current(columnIndex).ToString8();

                // Skip values already seen; copy into the block only when keeping,
                // so duplicates don't grow the block
                if (distinctValues.Contains(value)) { continue; }

                distinctValues.Add(block.GetCopy(value));
                writer.Write(value);
                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Copy input to output in full, stopping after rowLimit rows if a non-negative limit is given.
private static void Copy(string inputFilePath, string outputFilePath, int rowLimit = -1)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(reader.Columns);

        while (reader.NextRow())
        {
            int cellCount = reader.CurrentRowColumns;
            for (int cell = 0; cell < cellCount; ++cell)
            {
                writer.Write(reader.Current(cell).ToString8());
            }

            writer.NextRow();

            // Stop once the requested number of rows has been written
            // (rowLimit of -1 means unlimited, since RowCountWritten never equals -1)
            if (writer.RowCountWritten == rowLimit) { break; }
        }

        WriteSizeSummary(reader, writer);
    }
}
// Group consecutive rows sharing the same first-column ID, concatenating each other column's
// non-duplicate values with the given delimiter, and write one combined row per ID.
// Assumes the input is sorted (or at least grouped) by the first column.
private static void Concatenate(string inputFilePath, string outputFilePath, String8 delimiter)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            String8Block block = new String8Block();
            // NOTE(review): CurrentRowColumns is read before any NextRow() call; this assumes
            // the reader reports the header column count at this point - confirm against the
            // reader implementation.
            String8[] lastValues = new String8[reader.CurrentRowColumns];
            String8[] combinedValues = new String8[reader.CurrentRowColumns];

            while (reader.NextRow())
            {
                String8 firstColumn = reader.Current(0).ToString8();

                // RowCountRead == 2 identifies the first data row (the header was row 1)
                if (reader.RowCountRead == 2)
                {
                    // First Row - Get the first ID only
                    combinedValues[0] = block.GetCopy(firstColumn);
                }
                else if (firstColumn.CompareTo(combinedValues[0], true) != 0)
                {
                    // If we have a new ID (and not first row)
                    // Write concatenated values for previous ID
                    WriteCombinedRow(writer, combinedValues);

                    // Reset for this ID (block.Clear() releases the previous ID's copies,
                    // so combinedValues[0] must be re-copied afterwards)
                    block.Clear();
                    combinedValues[0] = block.GetCopy(firstColumn);
                    for (int i = 1; i < combinedValues.Length; ++i)
                    {
                        combinedValues[i] = String8.Empty;
                    }
                }

                // Concatenate non-duplicate values to "row in progress"
                // (lastValues suppresses only *consecutive* duplicates per column)
                for (int i = 1; i < reader.CurrentRowColumns; ++i)
                {
                    String8 value = reader.Current(i).ToString8();
                    if (lastValues[i] != value)
                    {
                        lastValues[i] = value;
                        combinedValues[i] = block.Concatenate(combinedValues[i], delimiter, value);
                    }
                }
            }

            // After last row, write out values so far
            WriteCombinedRow(writer, combinedValues);

            WriteSizeSummary(reader, writer);
        }
    }
}
// Stream the requested columns of the reader as DataBlock batches of BatchSize rows each.
// The same DataBlock instance is reused and re-yielded for every batch, so callers must
// consume each batch before advancing the iterator.
private static IEnumerable <DataBlock> ReadAsDataBlockBatch(ITabularReader reader, IList <string> columnNames)
{
    // Build a DataBlock to hold a batch of rows
    int columnCount = columnNames.Count;
    DataBlock result = new DataBlock(columnNames, BatchSize);
    Value[][] columnArrays = new Value[columnCount][];

    for (int i = 0; i < columnCount; ++i)
    {
        columnArrays[i] = new Value[BatchSize];
        for (int j = 0; j < BatchSize; ++j)
        {
            columnArrays[i][j] = Value.Create(null);
        }

        result.SetColumn(i, columnArrays[i]);
    }

    // Look up indices of the columns
    int[] columnIndices = new int[columnCount];
    for (int i = 0; i < columnCount; ++i)
    {
        columnIndices[i] = reader.ColumnIndex(columnNames[i]);
    }

    // Fill blocks with rows as we go
    int currentRowCount = 0;
    String8Block block = new String8Block();
    while (reader.NextRow())
    {
        for (int i = 0; i < columnCount; ++i)
        {
            // Copy each cell into the block so the bytes stay valid after NextRow()
            String8 cell = block.GetCopy(reader.Current(columnIndices[i]).ToString8());
            columnArrays[i][currentRowCount].Assign(new ByteBlock(cell.Array, cell.Index, cell.Length));
            //columnArrays[i][currentRowCount].Assign(cell.ToString());
        }

        currentRowCount++;

        // Yield each full batch, then reset and reuse the buffers
        if (currentRowCount == BatchSize)
        {
            yield return(result);
            currentRowCount = 0;
            block.Clear();
        }
    }

    // Yield the final partial batch, if any rows remain
    // NOTE(review): the final batch is the same full-size DataBlock; rows past
    // currentRowCount still hold the previous batch's values. Confirm consumers
    // are told the valid row count (e.g. via DataBlock) or only read filled rows.
    if (currentRowCount > 0)
    {
        yield return(result);
    }
}
// Copy every remaining row from the reader to the writer, cell by cell.
// Assumes the writer's columns have already been set by the caller.
private static void CopyRows(ITabularReader reader, ITabularWriter writer)
{
    while (reader.NextRow())
    {
        int cellCount = reader.CurrentRowColumns;
        for (int cell = 0; cell < cellCount; ++cell)
        {
            writer.Write(reader.Current(cell).ToString8());
        }

        writer.NextRow();
    }
}
// End-to-end test of the 'sanitize' command: extracts the sample CSV and sanispec from
// embedded resources, runs the command with and without a key, and validates the output
// against the spec's Keep/Drop/Echo/mapping rules.
public void Sanitize_EndToEnd()
{
    // Extract the sample input and spec from this test assembly's embedded resources
    Assembly xsvTest = Assembly.GetExecutingAssembly();
    Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.csv", "SanitizeSampleSource.csv", xsvTest);
    Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.sanispec", "SanitizeSampleSource.sanispec", xsvTest);

    // Verify UsageException if no key is passed (-2 is the usage-error exit code)
    Assert.AreEqual(-2, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec" }));

    // Verify success for base sanitize
    File.Delete("SanitizeOutput.csv");
    Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec", "Key1" }));

    // Validate the result
    using (ITabularReader r = TabularFactory.BuildReader("SanitizeOutput.csv"))
    {
        // Column-level spec behavior: unlisted and Keep columns survive, Drop columns don't
        Assert.IsTrue(r.Columns.Contains("ID"), "ID column is kept (no spec line)");
        Assert.IsTrue(r.Columns.Contains("Path"), "Path column is kept (mapped)");
        Assert.IsTrue(r.Columns.Contains("IsEmptyPath"), "IsEmptyPath is kept (Keep line)");
        Assert.IsFalse(r.Columns.Contains("IsUnderXsv"), "IxUnderXsv column is dropped (Drop line)");

        int idColumnIndex = r.ColumnIndex("ID");
        int pathColumnIndex = r.ColumnIndex("Path");
        int isEmptyPathColumnIndex = r.ColumnIndex("IsEmptyPath");

        while (r.NextRow())
        {
            int id = r.Current(idColumnIndex).ToInteger();
            string path = r.Current(pathColumnIndex).ToString();

            // Mapping must preserve emptiness: empty paths map to empty
            Assert.AreEqual(r.Current(isEmptyPathColumnIndex).ToBoolean(), String.IsNullOrEmpty(path), "IsEmptyPath condition matches whether mapped path is empty");

            if (id == 5)
            {
                // Echo'd values pass through the sanitizer unmapped
                Assert.AreEqual("Elfie", path, "'Elfie' is echoed (Echo in spec)");
            }
            else if (!String.IsNullOrEmpty(path))
            {
                Assert.IsTrue(path.StartsWith("WarmBeggedTruth\\"), "Verify path is mapped in parts, and 'Elfie' is consistently mapped.");
            }
        }

        // Row sampling: the spec's sample rule must have excluded at least one of the source rows
        Assert.IsTrue(r.RowCountRead < 7, "Verify sample excluded at least one row.");
    }

    // Run with another key
    Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput2.csv", @"SanitizeSampleSource.sanispec", "Key2" }));

    // Verify mappings are different (the key seeds the value mapping)
    Assert.AreNotEqual(File.ReadAllText("SanitizeOutput2.csv"), File.ReadAllText("SanitizeOutput.csv"));
}
/// <summary>
///  Sanitize an input file into a given output file using this Sanitizer's configuration.
/// </summary>
/// <param name="inputFile">File Path to input file</param>
/// <param name="outputFile">File Path to output file</param>
public void Sanitize(string inputFile, string outputFile)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
    {
        // Build an array of what we'll do with each input column, and the list of columns we'll actually write
        // (a null handler means the column is dropped entirely)
        List <string> columnsToOutput;
        IColumnHandler[] handlers = GetHandlersByColumnIndex(reader.Columns, out columnsToOutput);

        // Find the sample column index, if any, and calculate a hash cutoff for including rows
        int sampleColumnIndex = (String.IsNullOrEmpty(this.SampleColumnName) ? -1 : reader.ColumnIndex(this.SampleColumnName));
        uint sampleInclusionCutoff = (uint)(uint.MaxValue * this.SampleProbability);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
        {
            writer.SetColumns(columnsToOutput);

            while (reader.NextRow())
            {
                // If there's a sample column, decide whether to include this row
                if (sampleColumnIndex > -1)
                {
                    // Sample *without* the hashkey, so the same rows are consistently included or excluded.
                    uint sampleValueHash = Hashing.Hash(reader.Current(sampleColumnIndex).ToString8(), 0);
                    if (sampleValueHash > sampleInclusionCutoff)
                    {
                        continue;
                    }
                }

                // Run the handler for every input column, writing the output if there is one
                // (columns with a null handler produce no output cell, matching columnsToOutput)
                for (int i = 0; i < reader.CurrentRowColumns; ++i)
                {
                    IColumnHandler handler = handlers[i];
                    if (handler != null)
                    {
                        String8 value = reader.Current(i).ToString8();
                        String8 replacement = handler.Sanitize(value);
                        writer.Write(replacement);
                    }
                }

                writer.NextRow();
            }
        }
    }
}
// Load a table's metadata from its root folder: the column schema from the schema file
// and name/value metadata pairs (currently only "RowCount") from the metadata file,
// plus the table's config query text.
private static TableMetadata Build(IStreamProvider streamProvider, string tableRootPath)
{
    TableMetadata metadata = new TableMetadata();

    // Read the schema file: one (Name, Type) row per column
    string schemaFilePath = Path.Combine(tableRootPath, SchemaFileName);
    using (ITabularReader sr = TabularFactory.BuildReader(streamProvider.OpenRead(schemaFilePath), SchemaFileName))
    {
        int nameIndex = sr.ColumnIndex("Name");
        int typeIndex = sr.ColumnIndex("Type");

        while (sr.NextRow())
        {
            metadata.Schema.Add(new ColumnDetails(sr.Current(nameIndex).ToString(), TypeProviderFactory.Get(sr.Current(typeIndex).ToString()).Type));
        }
    }

    // Read the metadata file: (Name, Context, Value) rows
    using (ITabularReader mr = TabularFactory.BuildReader(streamProvider.OpenRead(Path.Combine(tableRootPath, MetadataFileName)), MetadataFileName))
    {
        int nameIndex = mr.ColumnIndex("Name");
        int contextIndex = mr.ColumnIndex("Context");
        int valueIndex = mr.ColumnIndex("Value");

        while (mr.NextRow())
        {
            String8 name = mr.Current(nameIndex).ToString8();
            // Context is read but not currently used by any known metadata entry
            String8 context = mr.Current(contextIndex).ToString8();
            ITabularValue value = mr.Current(valueIndex);

            if (name.Equals("RowCount"))
            {
                metadata.RowCount = value.ToInteger();
            }
            else
            {
                // Fail loudly on unknown metadata so new entries aren't silently dropped
                throw new NotImplementedException($"TableMetadataSerializer.Read doesn't know how to read Metadata '{name}'");
            }
        }
    }

    // The query that produced this table
    metadata.Query = streamProvider.ReadAllText(Path.Combine(tableRootPath, ConfigQueryPath));

    return(metadata);
}
// Read up to desiredCount rows from the underlying reader into the cached cell buffers
// and publish them to the columns. Returns the number of rows actually read
// (0 on cancellation or end of input).
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // Stop reading on cancellation
    if (cancellationToken.IsCancellationRequested) { return(0); }

    // Grow the per-column cell buffers if this request is larger than any previous one
    // (buffers are reused across calls; only the first _cells entry is checked because
    // all columns are always allocated to the same size)
    if (_cells[0] == null || _cells[0].Length < desiredCount)
    {
        for (int i = 0; i < _cells.Length; ++i)
        {
            Allocator.AllocateToSize(ref _cells[i], desiredCount);
        }
    }

    //return _reader.NextRow();

    // Reset the string storage; copies from the previous batch become invalid here
    _block.Clear();
    CurrentRowCount = 0;

    while (_reader.NextRow())
    {
        // Copy every cell into _block so the bytes stay valid after the reader advances
        for (int i = 0; i < _cells.Length; ++i)
        {
            _cells[i][CurrentRowCount] = _block.GetCopy(_reader.Current(i).ToString8());
        }

        CurrentRowCount++;
        if (CurrentRowCount == desiredCount) { break; }
    }

    // Publish the filled buffers to the output columns
    for (int i = 0; i < _columns.Length; ++i)
    {
        _columns[i].SetValues(_cells[i]);
    }

    return(CurrentRowCount);
}
// Copy rows from input to output, keeping only rows whose value in the named column
// also appears in that column of the 'onlyIn' file.
private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
{
    String8Block block = new String8Block();
    HashSet <String8> values = new HashSet <String8>();

    // Read values in 'onlyInInputFilePath'
    using (ITabularReader reader = TabularFactory.BuildReader(onlyInInputFilePath))
    {
        int onlyInColumnIndex = reader.ColumnIndex(onlyInColumnIdentifier);
        while (reader.NextRow())
        {
            values.Add(block.GetCopy(reader.Current(onlyInColumnIndex)));
        }
    }

    // Copy from input to output where the column value is in the "only in" set
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        int valueColumnIndex = reader.ColumnIndex(onlyInColumnIdentifier);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            while (reader.NextRow())
            {
                // Skip rows whose value was not in the filter set
                if (!values.Contains(reader.Current(valueColumnIndex).ToString8())) { continue; }

                int cellCount = reader.CurrentRowColumns;
                for (int cell = 0; cell < cellCount; ++cell)
                {
                    writer.Write(reader.Current(cell).ToString8());
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Performance test: repeatedly reads the sample file, converting typical columns,
// and asserts the reader sustains the required throughput.
public void Reader_Performance(string sampleFilePath, Func <string, bool, ITabularReader> buildReader)
{
    long rowCountRead = 0;
    long xsvLengthBytes = new FileInfo(sampleFilePath).Length;

    // Goal: 100MB/sec [Surface Book i7]
    // (threshold set at 50MB to allow headroom on slower machines)
    Verify.PerformanceByBytes(50 * LongExtensions.Megabyte, () =>
    {
        int iterations = 100;
        for (int iteration = 0; iteration < iterations; ++iteration)
        {
            using (ITabularReader r = buildReader(sampleFilePath, true))
            {
                int lineNumberIndex = r.ColumnIndex("LineNumber");
                int countIndex = r.ColumnIndex("Count");
                int descriptionIndex = r.ColumnIndex("Description");

                while (r.NextRow())
                {
                    rowCountRead++;

                    // Skip short (empty) rows rather than converting missing cells
                    if (r.CurrentRowColumns < 2) { continue; }

                    // Exercise the typical conversions; results are intentionally unused
                    int lineNumber;
                    r.Current(lineNumberIndex).TryToInteger(out lineNumber);

                    int count;
                    r.Current(countIndex).TryToInteger(out count);

                    String8 description = r.Current(descriptionIndex).ToString8();
                }
            }
        }

        // Total bytes processed across all iterations, for the throughput calculation
        return(iterations * xsvLengthBytes);
    });
}
// Merge all files in a folder into one output, keeping only the latest row for each
// distinct (case-insensitive) ID, where "latest" is the last occurrence across files
// in Directory.GetFiles order. Output columns come from the last file's schema.
private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
{
    String8Block block = new String8Block();
    Dictionary <String8, Tuple <string, int> > latestFileAndRowByID = new Dictionary <String8, Tuple <string, int> >();
    IReadOnlyList <string> writerColumns = null;

    // Walk the input files to figure out the latest copy of each ID
    Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");
    int rowCountRead = 0;
    foreach (string inputFilePath in Directory.GetFiles(inputFolderPath))
    {
        using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
        {
            int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

            while (reader.NextRow())
            {
                rowCountRead++;
                String8 id = reader.Current(idColumnIndex).ToString8();
                // NOTE(review): the result is not assigned, so this relies on
                // String8.ToUpperInvariant mutating the value in place - confirm.
                id.ToUpperInvariant();

                // Record the file and row containing this ID, overwriting previous entries
                latestFileAndRowByID[block.GetCopy(id)] = new Tuple <string, int>(inputFilePath, reader.RowCountRead);
            }

            // Capture the columns from the last CSV to write
            writerColumns = reader.Columns;
        }
    }

    Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(writerColumns);
        int[] writerColumnIndexInReader = new int[writerColumns.Count];

        foreach (string inputFilePath in Directory.GetFiles(inputFolderPath))
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Look up each output column's position in the input file
                for (int i = 0; i < writerColumns.Count; ++i)
                {
                    reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]);
                }

                int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                while (reader.NextRow())
                {
                    String8 id = reader.Current(idColumnIndex).ToString8();
                    id.ToUpperInvariant();

                    // Copy this row to the output file, *if* it's the latest for this ID
                    Tuple <string, int> latestForID = latestFileAndRowByID[id];
                    if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                    {
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            int readerColumnIndex = writerColumnIndexInReader[i];
                            if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                            {
                                // BUG FIX: read from the mapped input column index, not the
                                // output position 'i'. The previous code wrote Current(i),
                                // which emitted the wrong cell whenever this file's column
                                // order differed from the output schema.
                                writer.Write(reader.Current(readerColumnIndex).ToString8());
                            }
                            else
                            {
                                // Column missing (or row too short) in this file - write empty
                                writer.Write(String8.Empty);
                            }
                        }

                        writer.NextRow();
                    }
                }
            }
        }

        WriteSizeSummary(null, writer);
    }
}
// Benchmark comparing TSV parsing strategies over a generated 1M-row sample:
// raw stream read (baseline), StreamReader.ReadLine + String.Split, the Elfie
// TsvReader, and the native SplitTsv accelerator (serial and parallel).
public void TsvSplit()
{
    // Generate the sample in memory (swap to the FileStream line to inspect it on disk)
    Stream tsvStream = new MemoryStream();
    //Stream tsvStream = new FileStream("Sample.tsv", FileMode.Create);
    int rowCount = 1000 * 1000;
    WriteSampleTsv(tsvStream, 5, 1000 * 1000);

    // Buffers for the chunked native-split variant
    byte[] content = new byte[64 * 1024];
    BitVector cells = new BitVector(content.Length);
    BitVector rows = new BitVector(content.Length);
    int[] rowEnds = new int[1024];

    // Full-file copy and bit vectors for the parallel native-split variant
    byte[] allContent = new byte[tsvStream.Length];
    tsvStream.Seek(0, SeekOrigin.Begin);
    tsvStream.Read(allContent, 0, allContent.Length);
    BitVector allCells = new BitVector(allContent.Length);
    BitVector allRows = new BitVector(allContent.Length);

    using (Benchmarker b = new Benchmarker($"Tsv Parse [{rowCount:n0}] | count", DefaultMeasureMilliseconds))
    {
        // Baseline: raw stream read with no parsing at all
        b.Measure("Read only", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            while (true)
            {
                int lengthRead = tsvStream.Read(content, 0, content.Length);
                if (lengthRead == 0) { break; }
            }
            return(rowCount);
        });

        // Managed baseline: line-by-line read with String.Split per line
        b.Measure("ReadLine | Split", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            int count = 0;
            // Intentionally not disposed: disposing the StreamReader would close tsvStream
            StreamReader reader = new StreamReader(tsvStream);
            {
                // Header row
                reader.ReadLine();

                while (!reader.EndOfStream)
                {
                    string line = reader.ReadLine();
                    string[] cellSet = line.Split('\t');
                    count++;
                }
            }
            return(count);
        });

        // Elfie's tabular reader over the same stream
        b.Measure("Elfie TsvReader", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            int count = 0;
            ITabularReader reader = TabularFactory.BuildReader(tsvStream, "Unused.tsv");
            {
                while (reader.NextRow()) { count++; }
            }
            return(count);
        });

        // Native accelerated split, processed in 64KB chunks
        Func <byte[], int, int, ulong[], ulong[], int> splitTsvN = NativeAccelerator.GetMethod <Func <byte[], int, int, ulong[], ulong[], int> >("XForm.Native.String8N", "SplitTsv");
        b.Measure("XForm Native Split", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            // Start at -1 so counting the header row nets out of the total
            int count = -1;
            while (true)
            {
                int lengthRead = tsvStream.Read(content, 0, content.Length);
                if (lengthRead == 0) { break; }
                // Zero the buffer tail so stale bytes from the prior chunk aren't parsed
                if (lengthRead < content.Length) { Array.Clear(content, lengthRead, content.Length - lengthRead); }

                int lineCount = splitTsvN(content, 0, lengthRead, cells.Array, rows.Array);
                count += lineCount;

                int fromRow = 0;
                int countCopy = cells.Page(rowEnds, ref fromRow);
            }
            return(count);
        });

        // Native split across parallel slices of the full in-memory copy
        b.MeasureParallel("XForm Native Split Parallel", (int)tsvStream.Length, (index, length) =>
        {
            return(splitTsvN(allContent, index, length, allCells.Array, allRows.Array) - 1);
        });
    }
}
// Copy input to output, replacing two named columns with a single output column whose
// value is "column1 + separator + column2". The combined column appears at the position
// of whichever source column came first; all other columns pass through unchanged.
private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
{
    String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Find the columns to concatenate
        int columnIndex1 = reader.ColumnIndex(columnName1);
        int columnIndex2 = reader.ColumnIndex(columnName2);

        // Build an output column list and mapping from output order to input index, with '-1' for the concatenated value
        // NOTE(review): the mapping array is sized Columns.Count - 1, which assumes the two
        // source columns are distinct (two columns collapse into one output) - confirm callers
        // never pass the same column twice.
        List <string> outputColumns = new List <string>();
        int[] indexMapping = new int[reader.Columns.Count - 1];

        bool hasConcatenatedColumn = false;
        for (int i = 0; i < reader.Columns.Count; ++i)
        {
            string columnName = reader.Columns[i];

            // If this is a column to concatenate...
            if (columnName.Equals(reader.Columns[columnIndex1], StringComparison.OrdinalIgnoreCase)
                || columnName.Equals(reader.Columns[columnIndex2], StringComparison.OrdinalIgnoreCase))
            {
                // ... if it's the first one, the output column will appear at this position
                if (!hasConcatenatedColumn)
                {
                    hasConcatenatedColumn = true;
                    indexMapping[outputColumns.Count] = -1;
                    outputColumns.Add(outputColumnName);
                }
            }
            else
            {
                // Otherwise, copy this column through
                indexMapping[outputColumns.Count] = i;
                outputColumns.Add(columnName);
            }
        }

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(outputColumns);

            while (reader.NextRow())
            {
                // Write columns in mapped order
                for (int i = 0; i < indexMapping.Length; ++i)
                {
                    int sourceColumnIndex = indexMapping[i];

                    if (sourceColumnIndex == -1)
                    {
                        // Write concatenated column, streamed in parts to avoid building
                        // an intermediate combined value
                        writer.WriteValueStart();
                        writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                        writer.WriteValuePart(separator8);
                        writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                        writer.WriteValueEnd();
                    }
                    else
                    {
                        writer.Write(reader.Current(sourceColumnIndex).ToString8());
                    }
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Core reader contract tests: missing file, empty file with/without headers, header
// handling, column lookup (including case-insensitivity and misses), and per-row value
// access across a sample file with known contents (empty rows every 100th, a >64KB
// value at row 5000, and LineNumber == row index elsewhere).
public void Reader_Basics(string sampleFilePath, Func <string, bool, ITabularReader> buildReader)
{
    // File Not Found
    Verify.Exception <FileNotFoundException>(() => buildReader("NonExistantFile.xsv", false));

    // Empty File
    File.WriteAllText("Empty.xsv", "");

    // Verify Reader throws on construction if trying to read headers
    Verify.Exception <IOException>(() => buildReader("Empty.xsv", true));

    // Verify Reader returns false immediately if not reading headers
    using (ITabularReader r = buildReader("Empty.xsv", false))
    {
        Assert.IsFalse(r.NextRow());
    }

    // Verify Reader doesn't consume header row if asked not to
    using (ITabularReader r = buildReader(sampleFilePath, false))
    {
        Assert.IsTrue(r.NextRow());
        Assert.AreEqual("LineNumber", r.Current(0).ToString());

        // Get column name (no header row read)
        Verify.Exception <ColumnNotFoundException>(() => r.ColumnIndex("Missing"));
    }

    // Open the sample Tsv the 'expected' way
    using (ITabularReader r = buildReader(sampleFilePath, true))
    {
        // Get column name (valid)
        int lineNumberColumnIndex = r.ColumnIndex("LineNumber");
        Assert.AreEqual(0, lineNumberColumnIndex, "LineNumber column not expected");

        // Get column name (different case, but valid)
        int descriptionColumnIndex = r.ColumnIndex("deSCRiption");
        Assert.AreEqual(2, descriptionColumnIndex, "Description column not expected");

        // Get column name (unknown)
        Verify.Exception <ColumnNotFoundException>(() => r.ColumnIndex("UnknownColumn"));

        while (r.NextRow())
        {
            int rowIndex = r.RowCountRead;

            if (rowIndex % 100 == 99)
            {
                // Verify empty rows return no columns, have empty row text, throw on value access
                Assert.AreEqual(0, r.CurrentRowColumns, "Expected column count 0 in empty rows");
                Verify.Exception <ArgumentOutOfRangeException>(() => { var v = r.Current(lineNumberColumnIndex); });
            }
            else if (rowIndex == 5000)
            {
                // Read row over 64k [block resizing logic, row values look right]
                String8 longDescription = r.Current(descriptionColumnIndex).ToString8();
                Assert.AreEqual(100000, longDescription.Length);
            }
            else
            {
                // Get value (valid)
                String8 lineNumber8 = r.Current(lineNumberColumnIndex).ToString8();
                int lineNumber = 0;
                if (lineNumber8.TryToInteger(out lineNumber))
                {
                    Assert.AreEqual(rowIndex, lineNumber, "Expected line number to equal row number");
                }
                else
                {
                    Assert.Fail(String.Format("\"{0}\" was not converted to an integer.", lineNumber8));
                }

                // Get line number
                Assert.AreEqual(rowIndex, r.RowCountRead, "Expected lines read to equal row number");
            }
        }
    }
}
// Filter rows by either a row index (when value is null, columnIndentifier is parsed as
// the index) or a column-value match (case-insensitive). Matching rows are echoed to the
// writer, if one is given, prefixed with their RowIndex.
private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
{
    int matchCount = 0;
    int rowCount = 0;

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Exactly one mode is active: row-index match (value == null) or column-value match.
        // The inactive mode's index is -1.
        int rowIndex = (value != null ? -1 : int.Parse(columnIndentifier));
        int colIndex = (value != null ? reader.ColumnIndex(columnIndentifier) : -1);

        while (reader.NextRow())
        {
            // Match the row index if no value was passed
            if (rowIndex != -1 && reader.RowCountRead != rowIndex) { continue; }

            // Match the column value if passed
            if (colIndex != -1)
            {
                // Rows too short to contain the column can't match
                if (reader.CurrentRowColumns <= colIndex) { continue; }
                // Case-insensitive comparison against the requested value
                if (reader.Current(colIndex).ToString8().CompareTo(value, true) != 0) { continue; }
            }

            matchCount++;

            // If this is the matching row, write it
            if (writer != null)
            {
                // Set the output schema lazily, on the first match
                if (writer.RowCountWritten == 0)
                {
                    List <string> columns = new List <string>();
                    columns.Add("RowIndex");
                    columns.AddRange(reader.Columns);
                    writer.SetColumns(columns);
                }

                writer.Write(reader.RowCountRead);

                for (int i = 0; i < reader.CurrentRowColumns; ++i)
                {
                    writer.Write(reader.Current(i).ToString8());
                }

                writer.NextRow();
            }

            // If we matched row index, we're done
            if (rowIndex != -1) { break; }
        }

        rowCount = reader.RowCountRead;
    }

    Console.WriteLine($"Done. {matchCount:n0} out of {rowCount:n0} rows matched.");
}