/// <summary>
///  Copy rows from 'inputFilePath' to 'outputFilePath', keeping only the rows
///  whose value in the identified column also appears in that same column
///  of 'onlyInInputFilePath'.
/// </summary>
/// <param name="inputFilePath">File whose rows are filtered</param>
/// <param name="outputFilePath">File to which the surviving rows are written</param>
/// <param name="onlyInInputFilePath">File providing the set of allowed values</param>
/// <param name="onlyInColumnIdentifier">Identifier of the column to compare; looked up in both files</param>
private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
{
    String8Block copies = new String8Block();
    HashSet<String8> allowedValues = new HashSet<String8>();

    // Pass one: gather every value found in the column of the "only in" file.
    using (ITabularReader filterReader = TabularFactory.BuildReader(onlyInInputFilePath))
    {
        int filterColumnIndex = filterReader.ColumnIndex(onlyInColumnIdentifier);

        while (filterReader.NextRow())
        {
            // Keep a copy in the block so the value remains usable after the reader moves on
            // (presumably the reader reuses its buffer between rows - confirm).
            String8 current = filterReader.Current(filterColumnIndex);
            allowedValues.Add(copies.GetCopy(current));
        }
    }

    // Pass two: stream the input and write through only the rows with an allowed value.
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        int valueColumnIndex = reader.ColumnIndex(onlyInColumnIdentifier);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            while (reader.NextRow())
            {
                String8 candidate = reader.Current(valueColumnIndex).ToString8();
                if (!allowedValues.Contains(candidate))
                {
                    continue;
                }

                for (int column = 0; column < reader.CurrentRowColumns; column++)
                {
                    writer.Write(reader.Current(column).ToString8());
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
/// <summary>
///  Round-trip check for readers opened without a header row: write a sample file,
///  copy it through the reader and writer (using the first row read as the output
///  column list), and verify the copy has the same text as the original.
/// </summary>
/// <param name="buildReader">Factory taking a file path and a bool (passed as false here) and returning the reader under test</param>
/// <param name="buildWriter">Factory taking an output stream and returning the writer under test</param>
public void Reader_Roundtrip_NoHeader(Func<string, bool, ITabularReader> buildReader, Func<Stream, ITabularWriter> buildWriter)
{
    string filePath = "ValidSample.xsv";
    string copyPath = filePath + ".new";

    // Produce the sample file, which includes values requiring escaping
    WriteValidSample(new FileStream(filePath, FileMode.Create, FileAccess.ReadWrite), buildWriter);

    // Copy reader -> writer, so every value is unescaped on read and escaped again on write
    using (ITabularReader reader = buildReader(filePath, false))
    using (ITabularWriter writer = buildWriter(new FileStream(copyPath, FileMode.Create, FileAccess.ReadWrite)))
    {
        // The first row read becomes the column list of the copy
        reader.NextRow();

        List<string> headerValues = new List<string>();
        for (int column = 0; column < reader.CurrentRowColumns; column++)
        {
            headerValues.Add(reader.Current(column).ToString());
        }

        writer.SetColumns(headerValues);

        // All remaining rows are copied value by value
        while (reader.NextRow())
        {
            for (int column = 0; column < reader.CurrentRowColumns; column++)
            {
                writer.Write(reader.Current(column).ToString8());
            }

            writer.NextRow();
        }
    }

    // The copy must match the original text exactly
    Assert.AreEqual(File.ReadAllText(filePath), File.ReadAllText(copyPath));
}
/// <summary>
///  Copy every row of 'inputFilePath' to 'outputFilePath', using the input's
///  column list as the output columns.
/// </summary>
/// <param name="inputFilePath">File to read</param>
/// <param name="outputFilePath">File to write</param>
private static void Copy(string inputFilePath, string outputFilePath)
{
    using (ITabularReader source = TabularFactory.BuildReader(inputFilePath))
    using (ITabularWriter destination = TabularFactory.BuildWriter(outputFilePath))
    {
        destination.SetColumns(source.Columns);

        while (source.NextRow())
        {
            // Write however many values this particular row has
            for (int column = 0; column < source.CurrentRowColumns; column++)
            {
                String8 cell = source.Current(column).ToString8();
                destination.Write(cell);
            }

            destination.NextRow();
        }

        WriteSizeSummary(source, destination);
    }
}
/// <summary>
///  Write this request as one row to 'writer'. When the writer has not written
///  any rows yet, the column list for the requested mode is set first.
/// </summary>
/// <param name="writer">Writer receiving the row (and the column list, if no rows have been written)</param>
/// <param name="block">Scratch block used for String8 copies of string values; cleared on every call</param>
/// <param name="id">Value written to the "ID" column</param>
/// <param name="mode">
///  Selects the column set: Minimal writes only the base columns; any other mode adds
///  ClientIP, UriStem and UserID; modes other than UserIdentityOnly also add the
///  remaining user columns.
/// </param>
public void WriteTo(ITabularWriter writer, String8Block block, int id, WebRequestWriteMode mode)
{
    bool includeRequestDetails = (mode != WebRequestWriteMode.Minimal);
    bool includeUserDetails = includeRequestDetails && (mode != WebRequestWriteMode.UserIdentityOnly);

    // First row: declare the columns, in the same order the values are written below
    if (writer.RowCountWritten == 0)
    {
        List<string> header = new List<string>
        {
            "ID",
            "EventTime",
            "DataCenter",
            "ServerName",
            "ServerPort",
            "HttpMethod",
            "HttpStatus",
            "RequestBytes",
            "ResponseBytes",
            "TimeTakenMs",
            "Protocol",
            "WasEncrypted",
            "WasCachedResponse",
            "ClientRegion",
            "ClientBrowser",
            "ClientOs"
        };

        if (includeRequestDetails)
        {
            header.Add("ClientIP");
            header.Add("UriStem");
            header.Add("UserID");
        }

        if (includeUserDetails)
        {
            header.Add("UserEmailAddress");
            header.Add("UserGuid");
            header.Add("IsPremiumUser");
            header.Add("JoinDate");
        }

        writer.SetColumns(header);
    }

    // Copies made for the previous row are no longer needed
    block.Clear();

    // Base columns, written in every mode
    writer.Write(id);
    writer.Write(this.EventTime);
    writer.Write(block.GetCopy(this.DataCenter));
    writer.Write(block.GetCopy(this.ServerName));
    writer.Write(this.ServerPort);
    writer.Write(block.GetCopy(this.HttpMethod));
    writer.Write(this.HttpStatus);

    // A missing RequestBytes value is written as an empty cell
    if (!this.RequestBytes.HasValue)
    {
        writer.Write(String8.Empty);
    }
    else
    {
        writer.Write(this.RequestBytes.Value);
    }

    writer.Write(this.ResponseBytes);
    writer.Write((int)this.TimeTakenMs);
    writer.Write(block.GetCopy(this.Protocol));
    writer.Write(this.WasEncrypted);
    writer.Write(this.WasCachedResponse);
    writer.Write(block.GetCopy(this.User.Region));
    writer.Write(block.GetCopy(this.User.Browser));
    writer.Write(block.GetCopy(this.User.OS));

    if (includeRequestDetails)
    {
        // Anonymous requests get empty cells for every user-specific column
        bool isAnonymous = this.IsAnonymous;

        writer.Write(this.ClientIP);
        writer.Write(block.GetCopy(this.UriStem));

        if (isAnonymous)
        {
            writer.Write(String8.Empty);
        }
        else
        {
            writer.Write(this.User.ID);
        }

        if (includeUserDetails)
        {
            if (isAnonymous)
            {
                writer.Write(String8.Empty);
                writer.Write(String8.Empty);
                writer.Write(String8.Empty);
                writer.Write(String8.Empty);
            }
            else
            {
                writer.Write(block.GetCopy(this.User.EmailAddress));
                writer.Write(block.GetCopy(this.User.Guid.ToString()));
                writer.Write(this.User.IsPremiumUser);
                writer.Write(this.User.JoinDate);
            }
        }
    }

    writer.NextRow();
}
/// <summary>
///  Find rows in 'inputFilePath' and optionally write them to 'writer', then print
///  a match summary to the console. Two modes, selected by 'value':
///   - 'value' is non-null: match rows whose value in the identified column
///     compares equal to 'value' (via String8.CompareTo(value, true)).
///   - 'value' is null: 'columnIndentifier' is parsed as a row number and only
///     that row is matched; reading stops once it has been handled.
///  Matched rows are written with a leading "RowIndex" column.
/// </summary>
/// <param name="inputFilePath">File to search</param>
/// <param name="columnIndentifier">Column identifier, or the row number when 'value' is null</param>
/// <param name="value">Value to look for, or null to match by row number</param>
/// <param name="writer">Writer for matching rows; may be null to only count matches</param>
private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
{
    int matchCount = 0;
    int rowCount = 0;

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // -1 means "not matching on this criterion"
        int targetRow = -1;
        int targetColumn = -1;

        if (value == null)
        {
            targetRow = int.Parse(columnIndentifier);
        }
        else
        {
            targetColumn = reader.ColumnIndex(columnIndentifier);
        }

        bool matchByRow = (targetRow != -1);
        bool matchByValue = (targetColumn != -1);

        while (reader.NextRow())
        {
            // Row mode: skip everything except the requested row
            if (matchByRow && reader.RowCountRead != targetRow)
            {
                continue;
            }

            // Value mode: skip rows which are too short or hold a different value
            if (matchByValue)
            {
                bool columnMissing = (reader.CurrentRowColumns <= targetColumn);
                if (columnMissing || reader.Current(targetColumn).ToString8().CompareTo(value, true) != 0)
                {
                    continue;
                }
            }

            matchCount++;

            if (writer != null)
            {
                // Set the columns just before the first matching row is written
                if (writer.RowCountWritten == 0)
                {
                    List<string> columns = new List<string> { "RowIndex" };
                    columns.AddRange(reader.Columns);
                    writer.SetColumns(columns);
                }

                writer.Write(reader.RowCountRead);

                for (int column = 0; column < reader.CurrentRowColumns; column++)
                {
                    writer.Write(reader.Current(column).ToString8());
                }

                writer.NextRow();
            }

            // Only one row can match a row number, so stop reading
            if (matchByRow)
            {
                break;
            }
        }

        rowCount = reader.RowCountRead;
    }

    Console.WriteLine($"Done. {matchCount:n0} out of {rowCount:n0} rows matched.");
}
/// <summary>
///  Copy 'inputFilePath' to 'outputFilePath', replacing two columns with a single
///  column containing "value1 + separator + value2". The new column appears at the
///  position of whichever source column comes first; all other columns are copied
///  through in their original order.
/// </summary>
/// <param name="inputFilePath">File to read</param>
/// <param name="outputFilePath">File to write</param>
/// <param name="columnName1">Identifier of the column providing the first part of the value</param>
/// <param name="separator">Text written between the two parts</param>
/// <param name="columnName2">Identifier of the column providing the second part of the value</param>
/// <param name="outputColumnName">Name of the concatenated output column</param>
private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
{
    // Convert the separator once, up front, rather than per row
    String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Find the columns to concatenate
        int columnIndex1 = reader.ColumnIndex(columnName1);
        int columnIndex2 = reader.ColumnIndex(columnName2);

        // Build the output column list and a parallel mapping from output position to
        // input column index, with '-1' marking the concatenated value.
        //
        // The mapping grows with the output column list instead of being pre-sized to
        // 'Columns.Count - 1'. That size was only right when exactly two columns were
        // replaced; passing the same column for both parts overflowed the array.
        List<string> outputColumns = new List<string>();
        List<int> indexMapping = new List<int>();
        bool hasConcatenatedColumn = false;

        for (int i = 0; i < reader.Columns.Count; ++i)
        {
            // Select by index (not by name) so exactly the resolved columns are replaced
            if (i == columnIndex1 || i == columnIndex2)
            {
                // The output column appears where the first of the two source columns was
                if (!hasConcatenatedColumn)
                {
                    hasConcatenatedColumn = true;
                    indexMapping.Add(-1);
                    outputColumns.Add(outputColumnName);
                }
            }
            else
            {
                // Otherwise, copy this column through
                indexMapping.Add(i);
                outputColumns.Add(reader.Columns[i]);
            }
        }

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(outputColumns);

            while (reader.NextRow())
            {
                // Write columns in mapped order
                for (int i = 0; i < indexMapping.Count; ++i)
                {
                    int sourceColumnIndex = indexMapping[i];

                    if (sourceColumnIndex == -1)
                    {
                        // Write the concatenated value in parts, always as first + separator + second
                        writer.WriteValueStart();
                        writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                        writer.WriteValuePart(separator8);
                        writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                        writer.WriteValueEnd();
                    }
                    else
                    {
                        writer.Write(reader.Current(sourceColumnIndex).ToString8());
                    }
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
/// <summary>
///  Merge all files in 'inputFolderPath' into 'outputFilePath', keeping only the last
///  row seen for each distinct ID (IDs are upper-cased before comparison). Files are
///  processed in the order returned by Directory.GetFiles, so later files and later
///  rows win. The output uses the column list of the last file scanned; columns a
///  given input file or row lacks are written as empty values.
/// </summary>
/// <param name="inputFolderPath">Folder containing the files to merge</param>
/// <param name="outputFilePath">File to write the latest row for each ID to</param>
/// <param name="idColumnIdentifier">Identifier of the ID column; looked up in every input file</param>
private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
{
    String8Block block = new String8Block();
    Dictionary<String8, Tuple<string, int>> latestFileAndRowByID = new Dictionary<String8, Tuple<string, int>>();
    IReadOnlyList<string> writerColumns = null;

    // Walk the input files to figure out the latest copy of each ID
    Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");

    // Enumerate the folder once and reuse the list for both passes. Listing it again
    // after the writer is created would pick up the output file itself when it lives
    // in the input folder, and both passes must agree on the set of files.
    string[] inputFilePaths = Directory.GetFiles(inputFolderPath);

    int rowCountRead = 0;
    foreach (string inputFilePath in inputFilePaths)
    {
        using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
        {
            int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

            while (reader.NextRow())
            {
                rowCountRead++;

                String8 id = reader.Current(idColumnIndex).ToString8();
                id.ToUpperInvariant();

                // Record the file and row containing this ID, overwriting previous entries
                latestFileAndRowByID[block.GetCopy(id)] = new Tuple<string, int>(inputFilePath, reader.RowCountRead);
            }

            // Capture the columns from the last file to use as the output columns
            writerColumns = reader.Columns;
        }
    }

    Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

    // NOTE(review): 'writerColumns' is still null here if the folder contained no files.
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(writerColumns);
        int[] writerColumnIndexInReader = new int[writerColumns.Count];

        foreach (string inputFilePath in inputFilePaths)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Look up each output column's position in this input file
                for (int i = 0; i < writerColumns.Count; ++i)
                {
                    reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]);
                }

                int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                while (reader.NextRow())
                {
                    String8 id = reader.Current(idColumnIndex).ToString8();
                    id.ToUpperInvariant();

                    // Copy this row to the output file, *if* it's the latest for this ID
                    Tuple<string, int> latestForID = latestFileAndRowByID[id];
                    if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                    {
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            int readerColumnIndex = writerColumnIndexInReader[i];

                            if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                            {
                                // Read from the mapped position in *this* file; the output position 'i'
                                // is only correct when the file's column order matches the output's.
                                writer.Write(reader.Current(readerColumnIndex).ToString8());
                            }
                            else
                            {
                                // Column missing from this file (or this row is too short)
                                writer.Write(String8.Empty);
                            }
                        }

                        writer.NextRow();
                    }
                }
            }
        }

        WriteSizeSummary(null, writer);
    }
}