private static void Compare(string oldFilePath, string newFilePath, string outputFilePath, string columnIdentifier)
{
    // Diff the distinct values of one column between two files.
    // Output file gets one row per difference: "-" for values only in the old
    // file, "+" for values only in the new file.
    String8Block block = new String8Block();
    HashSet<String8> valuesInOld = new HashSet<String8>();
    HashSet<String8> valuesInNew = new HashSet<String8>();

    // Collect distinct values from the old file
    using (ITabularReader reader = TabularFactory.BuildReader(oldFilePath))
    {
        int columnIndex = reader.ColumnIndex(columnIdentifier);
        while (reader.NextRow())
        {
            valuesInOld.Add(block.GetCopy(reader.Current(columnIndex)));
        }

        Trace.WriteLine(String.Format("Old: {0:n0} values for \"{1}\" in {2:n0} rows.", valuesInOld.Count, columnIdentifier, reader.RowCountRead));
    }

    // Collect distinct values from the new file
    using (ITabularReader reader = TabularFactory.BuildReader(newFilePath))
    {
        int columnIndex = reader.ColumnIndex(columnIdentifier);
        while (reader.NextRow())
        {
            valuesInNew.Add(block.GetCopy(reader.Current(columnIndex)));
        }

        Trace.WriteLine(String.Format("New: {0:n0} values for \"{1}\" in {2:n0} rows.", valuesInNew.Count, columnIdentifier, reader.RowCountRead));
    }

    // Compute the two one-sided differences
    HashSet<String8> removedValues = new HashSet<String8>(valuesInOld);
    removedValues.ExceptWith(valuesInNew);

    HashSet<String8> addedValues = new HashSet<String8>(valuesInNew);
    addedValues.ExceptWith(valuesInOld);

    Trace.WriteLine(String.Format("{0:n0} values were only in \"{1}\".\r\n{2:n0} values were only in \"{3}\".", removedValues.Count, oldFilePath, addedValues.Count, newFilePath));

    String8 removedMarker = String8.Convert("-", new byte[1]);
    String8 addedMarker = String8.Convert("+", new byte[1]);

    // Write the differences, old-only rows first
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(new string[] { "In", columnIdentifier });

        foreach (String8 value in removedValues)
        {
            writer.Write(removedMarker);
            writer.Write(value);
            writer.NextRow();
        }

        foreach (String8 value in addedValues)
        {
            writer.Write(addedMarker);
            writer.Write(value);
            writer.NextRow();
        }
    }
}
private static void Append(string inputFileOrFolderPath, string outputFilePath, string inputFileNamePattern = null)
{
    // Append every matching input file onto one output file, verifying all
    // inputs share the same column set as the first one.
    string[] inputFilePaths;
    if (Directory.Exists(inputFileOrFolderPath))
    {
        if (String.IsNullOrEmpty(inputFileNamePattern)) { inputFileNamePattern = "*.*"; }
        inputFilePaths = Directory.GetFiles(inputFileOrFolderPath, inputFileNamePattern);
    }
    else
    {
        // Not a folder: treat the argument as a single input file
        inputFilePaths = new string[] { inputFileOrFolderPath };
    }

    // Writer is created lazily from the first file's columns, so try/finally
    // (rather than 'using') manages its lifetime.
    ITabularWriter writer = null;
    string writerColumns = null;
    try
    {
        foreach (string inputFilePath in inputFilePaths)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Build the writer, if this is the first file
                if (writer == null)
                {
                    writer = TabularFactory.AppendWriter(outputFilePath, reader.Columns);
                    writerColumns = String.Join(", ", reader.Columns);
                }

                // Validate columns match.
                // FIX: use ordinal case-insensitive equality. The previous
                // string.Compare(a, b, true) was culture-sensitive and could
                // mis-compare identifier-like column names under some locales
                // (e.g. the Turkish 'I' casing rules).
                string sourceColumns = String.Join(", ", reader.Columns);
                if (!String.Equals(writerColumns, sourceColumns, StringComparison.OrdinalIgnoreCase))
                {
                    throw new InvalidOperationException(string.Format("Can't append to \"{0}\" because the column names don't match.\r\nExpect: {1}\r\nActual: {2}", outputFilePath, writerColumns, sourceColumns));
                }

                // Copy the rows
                CopyRows(reader, writer);

                // Write a summary for this input file
                Trace.WriteLine($"  {inputFilePath}, {reader.RowCountRead:n0} rows; {reader.BytesRead.SizeString()}");
            }
        }

        // Write a summary for the output file
        WriteSizeSummary(null, writer);
    }
    finally
    {
        if (writer != null)
        {
            writer.Dispose();
            writer = null;
        }
    }
}
private static void RowId(string inputFilePath, string outputFilePath, int firstId = 1)
{
    // Copy the input, prepending an "ID" column with sequential integers
    // starting at firstId.
    int nextId = firstId;

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        // Output schema: ID first, then all input columns
        List<string> outputColumns = new List<string>();
        outputColumns.Add("ID");
        outputColumns.AddRange(reader.Columns);
        writer.SetColumns(outputColumns);

        while (reader.NextRow())
        {
            writer.Write(nextId++);

            for (int columnIndex = 0; columnIndex < reader.CurrentRowColumns; ++columnIndex)
            {
                writer.Write(reader.Current(columnIndex).ToString8());
            }

            writer.NextRow();
        }

        WriteSizeSummary(reader, writer);
    }
}
private static void Copy(string inputFilePath, string outputFilePath, string columnsDelimited)
{
    // Copy only the requested columns (comma-delimited list) from input to output.
    List<string> requestedColumns = new List<string>();
    foreach (string columnName in columnsDelimited.Split(','))
    {
        requestedColumns.Add(columnName.Trim());
    }

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Resolve each requested column name to its index in the input
        int[] sourceIndices = new int[requestedColumns.Count];
        for (int i = 0; i < sourceIndices.Length; ++i)
        {
            sourceIndices[i] = reader.ColumnIndex(requestedColumns[i]);
        }

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(requestedColumns);

            while (reader.NextRow())
            {
                foreach (int sourceIndex in sourceIndices)
                {
                    writer.Write(reader.Current(sourceIndex).ToString8());
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
private static void Copy(string inputFilePath, string outputFilePath, int rowLimit = -1)
{
    // Copy all columns from input to output, stopping after rowLimit rows
    // (-1 means copy everything).
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(reader.Columns);

        while (reader.NextRow())
        {
            int cellCount = reader.CurrentRowColumns;
            for (int columnIndex = 0; columnIndex < cellCount; ++columnIndex)
            {
                writer.Write(reader.Current(columnIndex).ToString8());
            }

            writer.NextRow();

            // Stop once the requested number of rows has been written
            if (writer.RowCountWritten == rowLimit) { break; }
        }

        WriteSizeSummary(reader, writer);
    }
}
public void Initialize()
{
    // Write a deterministic 1,000 row sample file with one column per
    // supported value type (int, bool, int, DateTime, String8).
    String8Block block = new String8Block();
    String8[] names = new String8[] { block.GetCopy("Scott"), block.GetCopy("Mike"), block.GetCopy("Jeff"), block.GetCopy("Sophie") };

    using (ITabularWriter sample = TabularFactory.BuildWriter(s_sampleFilePath))
    {
        sample.SetColumns(new string[] { "ID", "IsEven", "Count", "WhenAdded", "Name" });

        DateTime baseline = new DateTime(2017, 05, 23);
        int runningSum = 0;

        for (int i = 0; i < 1000; ++i)
        {
            runningSum += i;

            sample.Write(i);
            sample.Write((i & 0x1) == 0);           // IsEven
            sample.Write(runningSum);                // Count: sum of 0..i
            sample.Write(baseline.AddMinutes(i));    // WhenAdded: one minute apart
            sample.Write(names[i % names.Length]);   // Name cycles through four values
            sample.NextRow();
        }
    }
}
private static int WhereMatchCount(string inputPath, string columnIdentifier, string op, string value)
{
    // Run the Where filter with no output writer (null) and report only
    // how many rows matched.
    using (ITabularReader reader = TabularFactory.BuildReader(inputPath))
    {
        var result = WhereMatcher.Where(reader, columnIdentifier, op, value, null);
        return result.MatchCount;
    }
}
// Generates a family of related sample data sets rooted at basePath:
// a Users file dated a week ago, matching WebRequest batches for that date
// and for each of numberOfDays recent days (one with an injected outage),
// plus one large joinable batch and one huge minimal batch, each with a
// different injected "Issue" for demo/diagnostic scenarios.
// NOTE(review): all output is deterministic for a given randomSeed except
// the DateTime.UtcNow.Date baseline.
private static void Generate_WebRequestSample(string basePath, int randomSeed, int userCount, int eventCount, int numberOfDays)
{
    Random r = new Random(randomSeed);
    DateTime asOfDate = DateTime.UtcNow.Date;
    String8Block block = new String8Block();
    WebRequestGenerator generator;
    string path;

    // Generate a set of users and write them out [for a week ago]
    asOfDate = asOfDate.AddDays(-8);
    path = Path.Combine(basePath, $"Users.{asOfDate:yyyyMMdd}.r{randomSeed}.{userCount}.csv");
    Console.WriteLine($"Writing {path}...");
    UserGenerator userGenerator = new UserGenerator(r, asOfDate);
    List<User> users = userGenerator.Next(userCount);
    using (ITabularWriter writer = TabularFactory.BuildWriter(path))
    {
        foreach (User user in users)
        {
            user.WriteTo(writer, block);
        }
    }
    // Stamp the file with its logical data date
    File.SetLastWriteTimeUtc(path, asOfDate);

    // Generate WebRequest Data [for a week ago]
    // (events-per-second scales with requested volume: 10 for small runs, 100 otherwise)
    // NOTE(review): BuildWebRequests is called here with 4 arguments (no file
    // name prefix) — presumably an overload of the 5-argument version; confirm.
    generator = new WebRequestGenerator(users, r, asOfDate, (eventCount < 1001 ? 10 : 100));
    BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.All);
    asOfDate = asOfDate.AddDays(8);

    // Generate batches of WebRequest sample data [current],
    // walking backward one day per batch; day 0 gets an injected outage
    for (int day = 0; day < numberOfDays; ++day)
    {
        generator = new WebRequestGenerator(users, r, asOfDate, (eventCount < 1001 ? 10 : 100));
        if (day == 0)
        {
            generator.Issue = new PremiumUserOutage(asOfDate.AddMinutes(18), asOfDate.AddMinutes(104), r);
        }

        BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.UserIdentityOnly);
        asOfDate = asOfDate.AddDays(-1);
    }

    // Generate one big joinable batch (10M events, blocked port range issue)
    eventCount = 10 * 1000 * 1000;
    generator = new WebRequestGenerator(users, r, asOfDate, 1000);
    generator.Issue = new PortRangeBlocked(asOfDate.AddMinutes(1), asOfDate.AddMinutes(180), 11450, 11480);
    BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.UserIdentityOnly);
    asOfDate = asOfDate.AddDays(-1);

    // Generate one huge minimal batch (100M events, uncached slowness issue)
    eventCount = 100 * 1000 * 1000;
    generator = new WebRequestGenerator(users, r, asOfDate, 1000);
    generator.Issue = new UncachedSlowness(asOfDate.AddMinutes(4), asOfDate.AddMinutes(36), r);
    BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.Minimal);

    Console.WriteLine("Done.");
}
// Collapses consecutive rows which share the same first-column value into a
// single output row, concatenating each other column's changing values with
// 'delimiter'.
// NOTE(review): this only groups *adjacent* rows, so it assumes the input is
// sorted/grouped by the first column — confirm with callers.
private static void Concatenate(string inputFilePath, string outputFilePath, String8 delimiter)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(reader.Columns);

            String8Block block = new String8Block();
            // NOTE(review): CurrentRowColumns is read before the first NextRow();
            // this assumes it already reports the column count here — confirm.
            String8[] lastValues = new String8[reader.CurrentRowColumns];
            String8[] combinedValues = new String8[reader.CurrentRowColumns];

            while (reader.NextRow())
            {
                String8 firstColumn = reader.Current(0).ToString8();

                // NOTE(review): '== 2' appears to mean "first data row",
                // implying the header counted as row 1 — confirm RowCountRead semantics.
                if (reader.RowCountRead == 2)
                {
                    // First Row - Get the first ID only
                    combinedValues[0] = block.GetCopy(firstColumn);
                }
                else if (firstColumn.CompareTo(combinedValues[0], true) != 0)
                {
                    // If we have a new ID (and not first row)
                    // Write concatenated values for previous ID
                    WriteCombinedRow(writer, combinedValues);

                    // Reset for this ID (block.Clear releases the previous group's copies)
                    block.Clear();
                    combinedValues[0] = block.GetCopy(firstColumn);
                    for (int i = 1; i < combinedValues.Length; ++i)
                    {
                        combinedValues[i] = String8.Empty;
                    }
                }

                // Concatenate non-duplicate values to "row in progress"
                // (skips a value identical to the one seen in the previous row)
                for (int i = 1; i < reader.CurrentRowColumns; ++i)
                {
                    String8 value = reader.Current(i).ToString8();
                    if (lastValues[i] != value)
                    {
                        lastValues[i] = value;
                        combinedValues[i] = block.Concatenate(combinedValues[i], delimiter, value);
                    }
                }
            }

            // After last row, write out values so far
            WriteCombinedRow(writer, combinedValues);
            WriteSizeSummary(reader, writer);
        }
    }
}
public Logger(IStreamProvider streamProvider, string outputFilePath)
{
    // The log is written as "Log.csv" inside the output folder
    string logFilePath = Path.Combine(outputFilePath, "Log.csv");

    _writer = TabularFactory.BuildWriter(streamProvider.OpenWrite(logFilePath), logFilePath);
    _writer.SetColumns(new string[] { "WhenUtc", "MessageType", "SourceComponent", "Message" });

    _block = new String8Block();
}
public void Reset()
{
    // Re-open the file and rebuild the per-column wrappers and cell caches
    _reader = TabularFactory.BuildReader(_streamProvider.OpenRead(_filePath), _filePath);

    int columnCount = _reader.Columns.Count;
    _columns = new TabularColumn[columnCount];
    _cells = new String8[columnCount][];

    for (int columnIndex = 0; columnIndex < columnCount; ++columnIndex)
    {
        _columns[columnIndex] = new TabularColumn(this, _reader.Columns[columnIndex]);
    }
}
// End-to-end test of the 'sanitize' command line mode: extracts the sample
// source and spec from test resources, runs the sanitizer twice with
// different keys, and validates the output column set, value mappings,
// sampling, and that different keys produce different outputs.
public void Sanitize_EndToEnd()
{
    // Extract the sample input and spec file from embedded resources
    Assembly xsvTest = Assembly.GetExecutingAssembly();
    Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.csv", "SanitizeSampleSource.csv", xsvTest);
    Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.sanispec", "SanitizeSampleSource.sanispec", xsvTest);

    // Verify UsageException if no key is passed
    Assert.AreEqual(-2, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec" }));

    // Verify success for base sanitize
    File.Delete("SanitizeOutput.csv");
    Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec", "Key1" }));

    // Validate the result
    using (ITabularReader r = TabularFactory.BuildReader("SanitizeOutput.csv"))
    {
        Assert.IsTrue(r.Columns.Contains("ID"), "ID column is kept (no spec line)");
        Assert.IsTrue(r.Columns.Contains("Path"), "Path column is kept (mapped)");
        Assert.IsTrue(r.Columns.Contains("IsEmptyPath"), "IsEmptyPath is kept (Keep line)");
        Assert.IsFalse(r.Columns.Contains("IsUnderXsv"), "IxUnderXsv column is dropped (Drop line)");

        int idColumnIndex = r.ColumnIndex("ID");
        int pathColumnIndex = r.ColumnIndex("Path");
        int isEmptyPathColumnIndex = r.ColumnIndex("IsEmptyPath");

        while (r.NextRow())
        {
            int id = r.Current(idColumnIndex).ToInteger();
            string path = r.Current(pathColumnIndex).ToString();

            // Columns derived from each other must stay consistent after mapping
            Assert.AreEqual(r.Current(isEmptyPathColumnIndex).ToBoolean(), String.IsNullOrEmpty(path), "IsEmptyPath condition matches whether mapped path is empty");

            if (id == 5)
            {
                Assert.AreEqual("Elfie", path, "'Elfie' is echoed (Echo in spec)");
            }
            else if (!String.IsNullOrEmpty(path))
            {
                Assert.IsTrue(path.StartsWith("WarmBeggedTruth\\"), "Verify path is mapped in parts, and 'Elfie' is consistently mapped.");
            }
        }

        Assert.IsTrue(r.RowCountRead < 7, "Verify sample excluded at least one row.");
    }

    // Run with another key
    Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput2.csv", @"SanitizeSampleSource.sanispec", "Key2" }));

    // Verify mappings are different
    Assert.AreNotEqual(File.ReadAllText("SanitizeOutput2.csv"), File.ReadAllText("SanitizeOutput.csv"));
}
// Pulls the next batch of up to desiredCount rows from _source and writes
// them through the (lazily created) tabular writer. Returns the number of
// rows written; 0 signals the source is exhausted.
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // Build the writer only when we start getting rows
    if (_writer == null)
    {
        // A null path means this instance wraps a caller-owned writer and can't rebuild one
        if (_outputFilePath == null) { throw new InvalidOperationException("TabularFileWriter can't reset when passed an ITabularWriter instance"); }

        // "cout" is a special path meaning "write to the console"
        if (_outputFilePath.Equals("cout", StringComparison.OrdinalIgnoreCase))
        {
            _writer = new ConsoleTabularWriter();
        }
        else
        {
            _writer = TabularFactory.BuildWriter(_streamProvider.OpenWrite(_outputFilePath), _outputFilePath);
        }

        _writer.SetColumns(_source.Columns.Select((col) => col.ColumnDetails.Name));
    }

    // Or smaller batch?
    int rowCount = _source.Next(desiredCount, cancellationToken);
    if (rowCount == 0) { return(0); }

    // Materialize each column's String8 array for this batch
    XArray[] arrays = new XArray[_stringColumnGetters.Length];
    for (int i = 0; i < _stringColumnGetters.Length; ++i)
    {
        arrays[i] = _stringColumnGetters[i]();
    }

    // Write rows in row-major order, mapping logical row index through each
    // XArray's Index() indirection
    for (int rowIndex = 0; rowIndex < rowCount; ++rowIndex)
    {
        for (int colIndex = 0; colIndex < _stringColumnGetters.Length; ++colIndex)
        {
            String8 value = ((String8[])arrays[colIndex].Array)[arrays[colIndex].Index(rowIndex)];
            _writer.Write(value);
        }

        _writer.NextRow();
    }

    return(rowCount);
}
/// <summary>
///  Sanitize an input file into a given output file using this Sanitizer's configuration.
/// </summary>
/// <param name="inputFile">File Path to input file</param>
/// <param name="outputFile">File Path to output file</param>
public void Sanitize(string inputFile, string outputFile)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
    {
        // Build an array of what we'll do with each input column, and the list of columns we'll actually write
        // (a null handler means the column is dropped entirely)
        List<string> columnsToOutput;
        IColumnHandler[] handlers = GetHandlersByColumnIndex(reader.Columns, out columnsToOutput);

        // Find the sample column index, if any, and calculate a hash cutoff for including rows.
        // Rows whose sample-value hash falls at or below the cutoff are kept, so roughly
        // SampleProbability of rows survive.
        int sampleColumnIndex = (String.IsNullOrEmpty(this.SampleColumnName) ? -1 : reader.ColumnIndex(this.SampleColumnName));
        uint sampleInclusionCutoff = (uint)(uint.MaxValue * this.SampleProbability);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
        {
            writer.SetColumns(columnsToOutput);

            while (reader.NextRow())
            {
                // If there's a sample column, decide whether to include this row
                if (sampleColumnIndex > -1)
                {
                    // Sample *without* the hashkey, so the same rows are consistently included or excluded.
                    uint sampleValueHash = Hashing.Hash(reader.Current(sampleColumnIndex).ToString8(), 0);
                    if (sampleValueHash > sampleInclusionCutoff) { continue; }
                }

                // Run the handler for every input column, writing the output if there is one
                for (int i = 0; i < reader.CurrentRowColumns; ++i)
                {
                    IColumnHandler handler = handlers[i];
                    if (handler != null)
                    {
                        String8 value = reader.Current(i).ToString8();
                        String8 replacement = handler.Sanitize(value);
                        writer.Write(replacement);
                    }
                }

                writer.NextRow();
            }
        }
    }
}
// Writes rowCount rows of deterministic (seeded) sample data to 'stream':
// a random 5-digit Zip, a LastScan within the prior 180 days, and ~50% IsArchived.
private static void WriteSampleTsv(Stream stream, int seed, int rowCount)
{
    DateTime start = new DateTime(2018, 01, 01, 0, 0, 0, DateTimeKind.Utc);
    Random r = new Random(seed);

    // NOTE(review): the writer is intentionally(?) not in a 'using' — a bare
    // block follows instead. Disposing would likely close the caller's stream,
    // which callers reuse afterward — but confirm the writer flushes its last
    // buffer without Dispose, or trailing rows could be lost.
    ITabularWriter writer = TabularFactory.BuildWriter(stream, "Unused.tsv");
    {
        writer.SetColumns(new string[] { "Zip", "LastScan", "IsArchived" });

        for (int i = 0; i < rowCount; ++i)
        {
            writer.Write(r.Next(10000, 99999));                   // Zip: random 5-digit value
            writer.Write(start.AddDays(-180.0 * r.NextDouble())); // LastScan: within prior 180 days
            writer.Write(r.Next(100) < 50);                       // IsArchived: ~50%
            writer.NextRow();
        }
    }
}
// Exercises WhereMatcher across column addressing (name and index), every
// typed comparison (int, bool, DateTime, string), type-conversion failures,
// error cases, and write-through to an output file.
public void WhereMatcher_Basics()
{
    // Column by index, then typed comparisons
    Assert.AreEqual(1000, WhereMatchCount(s_sampleFilePath, "0", ">=", "0"), "Should match all rows (by column index)");
    Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "ID", ">=", "500"), "Should match half of rows (int)");
    Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "IsEven", "==", "true"), "Should match half of rows (boolean)");
    Assert.AreEqual(90, WhereMatchCount(s_sampleFilePath, "WhenAdded", "<", "2017-05-23 01:30:00 AM"), "Should match 90 rows (DateTime)");

    // String operators
    Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", "=", "Sophie"), "Should match 250 rows (string)");
    // NOTE(review): message says "250" but the expected count is 750 — message-only typo
    Assert.AreEqual(750, WhereMatchCount(s_sampleFilePath, "Name", "!=", "Sophie"), "Should match 250 rows (string !=)");
    Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", "|>", "Sop"), "Should match 250 rows (string StartsWith)");
    Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", ":", "ophie"), "Should match 250 rows (string Contains)");
    Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", ">", "Scott"), "Should match 250 rows (string >)");
    Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "Name", ">=", "Scott"), "Should match 500 rows (string >=)");
    Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "Name", "<", "Scott"), "Should match 500 rows (string <)");
    Assert.AreEqual(750, WhereMatchCount(s_sampleFilePath, "Name", "<=", "Scott"), "Should match 750 rows (string <=)");

    // Values which can't convert to the column's type match nothing
    Assert.AreEqual(0, WhereMatchCount(s_sampleFilePath, "Name", "!=", "false"), "Should match 0 rows (bool, can't convert type)");
    Assert.AreEqual(0, WhereMatchCount(s_sampleFilePath, "Name", "!=", "0"), "Should match 0 rows (int, can't convert type)");
    Assert.AreEqual(0, WhereMatchCount(s_sampleFilePath, "Name", "!=", "2017-01-01"), "Should match 0 rows (DateTime, can't convert type)");

    // Column name doesn't exist
    Verify.Exception<ColumnNotFoundException>(() => WhereMatchCount(s_sampleFilePath, "MissingColumn", "==", "Jeff"));

    // Column index out of range
    Verify.Exception<ColumnNotFoundException>(() => WhereMatchCount(s_sampleFilePath, "-1", "==", "Jeff"));

    // Unknown operator
    Verify.Exception<UsageException>(() => WhereMatchCount(s_sampleFilePath, "Name", "->", "Jeff"));

    // Try with output enabled
    using (ITabularReader reader = TabularFactory.BuildReader(s_sampleFilePath))
    {
        using (ITabularWriter writer = TabularFactory.BuildWriter("Sample.Under2.csv"))
        {
            WhereMatcher.Where(reader, "ID", "<", "2", writer);
            Assert.AreEqual(2, writer.RowCountWritten);
        }

        // Only IDs 0 and 1 should have been written
        string content = File.ReadAllText("Sample.Under2.csv");
        Assert.IsTrue(content.Contains("\"0\""));
        Assert.IsTrue(content.Contains("\"1\""));
        Assert.IsFalse(content.Contains("\"2\""));
    }
}
public static void Write(IStreamProvider streamProvider, string tableRootPath, TableMetadata metadata)
{
    // Persist table metadata as several small files under the table root,
    // then remember the result in the in-memory cache.
    String8Block block = new String8Block();

    // Schema file: one (Name, Type) row per column
    using (ITabularWriter schemaWriter = TabularFactory.BuildWriter(streamProvider.OpenWrite(Path.Combine(tableRootPath, SchemaFileName)), SchemaFileName))
    {
        schemaWriter.SetColumns(new string[] { "Name", "Type" });

        foreach (ColumnDetails column in metadata.Schema)
        {
            schemaWriter.Write(block.GetCopy(column.Name));
            schemaWriter.Write(block.GetCopy(column.Type.Name.ToString()));
            schemaWriter.NextRow();
        }
    }

    // Metadata file: (Name, Context, Value) rows; only RowCount is written today
    using (ITabularWriter metadataWriter = TabularFactory.BuildWriter(streamProvider.OpenWrite(Path.Combine(tableRootPath, MetadataFileName)), MetadataFileName))
    {
        metadataWriter.SetColumns(new string[] { "Name", "Context", "Value" });

        metadataWriter.Write(block.GetCopy("RowCount"));
        metadataWriter.Write(String8.Empty);
        metadataWriter.Write(metadata.RowCount);
        metadataWriter.NextRow();
    }

    // The source query is stored as plain text
    streamProvider.WriteAllText(Path.Combine(tableRootPath, ConfigQueryPath), metadata.Query);

    // Partition list is only written when partitions exist
    if (metadata.Partitions.Count > 0)
    {
        using (ITabularWriter partitionWriter = TabularFactory.BuildWriter(streamProvider.OpenWrite(Path.Combine(tableRootPath, PartitionsFileName)), PartitionsFileName))
        {
            partitionWriter.SetColumns(new string[] { "Name" });

            foreach (string partition in metadata.Partitions)
            {
                partitionWriter.Write(block.GetCopy(partition));
                partitionWriter.NextRow();
            }
        }
    }

    s_Cache.Add($"{streamProvider}|{tableRootPath}", metadata);
}
// Reads table metadata back from the files written by Write(): the schema
// (column names and types), the metadata name/value pairs, and the source query.
private static TableMetadata Build(IStreamProvider streamProvider, string tableRootPath)
{
    TableMetadata metadata = new TableMetadata();

    // Schema file: one (Name, Type) row per column
    string schemaFilePath = Path.Combine(tableRootPath, SchemaFileName);
    using (ITabularReader sr = TabularFactory.BuildReader(streamProvider.OpenRead(schemaFilePath), SchemaFileName))
    {
        int nameIndex = sr.ColumnIndex("Name");
        int typeIndex = sr.ColumnIndex("Type");

        while (sr.NextRow())
        {
            metadata.Schema.Add(new ColumnDetails(sr.Current(nameIndex).ToString(), TypeProviderFactory.Get(sr.Current(typeIndex).ToString()).Type));
        }
    }

    // Metadata file: (Name, Context, Value) rows; only 'RowCount' is understood today
    using (ITabularReader mr = TabularFactory.BuildReader(streamProvider.OpenRead(Path.Combine(tableRootPath, MetadataFileName)), MetadataFileName))
    {
        int nameIndex = mr.ColumnIndex("Name");
        int contextIndex = mr.ColumnIndex("Context");
        int valueIndex = mr.ColumnIndex("Value");

        while (mr.NextRow())
        {
            String8 name = mr.Current(nameIndex).ToString8();
            // NOTE(review): 'context' is read but never used — reserved for future metadata rows?
            String8 context = mr.Current(contextIndex).ToString8();
            ITabularValue value = mr.Current(valueIndex);

            if (name.Equals("RowCount"))
            {
                metadata.RowCount = value.ToInteger();
            }
            else
            {
                // Fail loudly on unrecognized metadata rather than silently dropping it
                throw new NotImplementedException($"TableMetadataSerializer.Read doesn't know how to read Metadata '{name}'");
            }
        }
    }

    // The source query is stored as plain text
    metadata.Query = streamProvider.ReadAllText(Path.Combine(tableRootPath, ConfigQueryPath));

    return(metadata);
}
private static void BuildWebRequests(string basePath, string fileNamePrefix, WebRequestGenerator generator, int eventCount, WebRequestWriteMode mode)
{
    // Write eventCount generated WebRequests to a date-stamped CSV
    DateTime asOfDate = generator.Current;
    String8Block block = new String8Block();

    string path = Path.Combine(basePath, $"{fileNamePrefix}.{asOfDate:yyyyMMdd}.r5.{eventCount}.csv");
    Console.WriteLine($"Writing {path}...");

    using (ITabularWriter writer = TabularFactory.BuildWriter(path))
    {
        for (int requestIndex = 0; requestIndex < eventCount; ++requestIndex)
        {
            WebRequest request = generator.Next();
            request.WriteTo(writer, block, requestIndex, mode);
        }
    }

    // Stamp the file with its logical data date
    File.SetLastWriteTimeUtc(path, asOfDate);
}
private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
{
    // Keep only the input rows whose column value also appears in
    // 'onlyInInputFilePath' (a semi-join on one column).
    String8Block block = new String8Block();
    HashSet<String8> allowedValues = new HashSet<String8>();

    // Collect the set of values present in 'onlyInInputFilePath'
    using (ITabularReader onlyInReader = TabularFactory.BuildReader(onlyInInputFilePath))
    {
        int onlyInColumnIndex = onlyInReader.ColumnIndex(onlyInColumnIdentifier);
        while (onlyInReader.NextRow())
        {
            allowedValues.Add(block.GetCopy(onlyInReader.Current(onlyInColumnIndex)));
        }
    }

    // Copy rows from input to output when the column value appears in the set
    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        int valueColumnIndex = reader.ColumnIndex(onlyInColumnIdentifier);
        writer.SetColumns(reader.Columns);

        while (reader.NextRow())
        {
            if (!allowedValues.Contains(reader.Current(valueColumnIndex).ToString8())) { continue; }

            for (int i = 0; i < reader.CurrentRowColumns; ++i)
            {
                writer.Write(reader.Current(i).ToString8());
            }

            writer.NextRow();
        }

        WriteSizeSummary(reader, writer);
    }
}
private static void Where(string inputFilePath, string columnIdentifier, string op, string value, string outputFilePath)
{
    // Filter rows by a single-column predicate; count only when no output
    // path was provided, otherwise write the matching rows out.
    WhereResult result;

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        bool countOnly = String.IsNullOrEmpty(outputFilePath);

        using (ITabularWriter writer = (countOnly ? null : TabularFactory.BuildWriter(outputFilePath)))
        {
            if (writer != null)
            {
                Console.WriteLine($"Writing from '{inputFilePath}' where {columnIdentifier} {op} {value} into '{outputFilePath ?? ""}'...");
            }
            else
            {
                Console.WriteLine($"Counting from '{inputFilePath}' where {columnIdentifier} {op} {value}...");
            }

            result = WhereMatcher.Where(reader, columnIdentifier, op, value, writer);
        }
    }

    Console.WriteLine($"Done. {result.MatchCount:n0} out of {result.RowCount:n0} rows matched.");
}
// Benchmarks several ways of splitting a 1M-row TSV held in memory:
// raw stream reads (baseline), StreamReader.ReadLine + string.Split,
// the Elfie TsvReader, and the native-accelerated SplitTsv (serial and parallel).
public void TsvSplit()
{
    Stream tsvStream = new MemoryStream();
    //Stream tsvStream = new FileStream("Sample.tsv", FileMode.Create);

    // NOTE(review): rowCount is duplicated as a literal in the call below —
    // keep them in sync if the size changes.
    int rowCount = 1000 * 1000;
    WriteSampleTsv(tsvStream, 5, 1000 * 1000);

    // Scratch buffers reused by the block-at-a-time native variants
    byte[] content = new byte[64 * 1024];
    BitVector cells = new BitVector(content.Length);
    BitVector rows = new BitVector(content.Length);
    int[] rowEnds = new int[1024];

    // A full in-memory copy for the parallel variant
    byte[] allContent = new byte[tsvStream.Length];
    tsvStream.Seek(0, SeekOrigin.Begin);
    tsvStream.Read(allContent, 0, allContent.Length);
    BitVector allCells = new BitVector(allContent.Length);
    BitVector allRows = new BitVector(allContent.Length);

    using (Benchmarker b = new Benchmarker($"Tsv Parse [{rowCount:n0}] | count", DefaultMeasureMilliseconds))
    {
        // Baseline: just read the bytes, no parsing
        b.Measure("Read only", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            while (true)
            {
                int lengthRead = tsvStream.Read(content, 0, content.Length);
                if (lengthRead == 0) { break; }
            }
            return(rowCount);
        });

        // Managed baseline: allocate a string and string[] per row
        // NOTE(review): the readers below are deliberately(?) not disposed —
        // disposing a StreamReader would close the shared tsvStream that
        // later measurements reuse. Confirm that's the intent.
        b.Measure("ReadLine | Split", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            int count = 0;
            StreamReader reader = new StreamReader(tsvStream);
            {
                // Header row
                reader.ReadLine();

                while (!reader.EndOfStream)
                {
                    string line = reader.ReadLine();
                    string[] cellSet = line.Split('\t');
                    count++;
                }
            }
            return(count);
        });

        // Elfie reader: allocation-free row enumeration
        b.Measure("Elfie TsvReader", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            int count = 0;
            ITabularReader reader = TabularFactory.BuildReader(tsvStream, "Unused.tsv");
            {
                while (reader.NextRow()) { count++; }
            }
            return(count);
        });

        // Native SIMD splitter, block at a time
        Func<byte[], int, int, ulong[], ulong[], int> splitTsvN = NativeAccelerator.GetMethod<Func<byte[], int, int, ulong[], ulong[], int>>("XForm.Native.String8N", "SplitTsv");
        b.Measure("XForm Native Split", (int)tsvStream.Length, () =>
        {
            tsvStream.Seek(0, SeekOrigin.Begin);
            int count = -1;   // starts at -1 so the header row isn't counted
            while (true)
            {
                int lengthRead = tsvStream.Read(content, 0, content.Length);
                if (lengthRead == 0) { break; }

                // Zero the buffer tail so stale bytes aren't parsed as data
                if (lengthRead < content.Length) { Array.Clear(content, lengthRead, content.Length - lengthRead); }

                int lineCount = splitTsvN(content, 0, lengthRead, cells.Array, rows.Array);
                count += lineCount;

                int fromRow = 0;
                int countCopy = cells.Page(rowEnds, ref fromRow);
            }
            return(count);
        });

        // Native splitter over the pre-read buffer, partitioned across threads
        b.MeasureParallel("XForm Native Split Parallel", (int)tsvStream.Length, (index, length) =>
        {
            return(splitTsvN(allContent, index, length, allCells.Array, allRows.Array) - 1);
        });
    }
}
// Merges all files in a folder, keeping only the *latest* row for each ID
// (the last occurrence across the folder's file enumeration order).
// Two passes: first find where each ID last occurs, then copy just those rows.
private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
{
    String8Block block = new String8Block();

    // Map: uppercased ID -> (file path, row number) of its last occurrence
    Dictionary<String8, Tuple<string, int>> latestFileAndRowByID = new Dictionary<String8, Tuple<string, int>>();
    IReadOnlyList<string> writerColumns = null;

    // Pass 1: walk the input files to figure out the latest copy of each ID
    Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");

    int rowCountRead = 0;
    foreach (string inputFilePath in Directory.GetFiles(inputFolderPath))
    {
        using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
        {
            int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

            while (reader.NextRow())
            {
                rowCountRead++;

                String8 id = reader.Current(idColumnIndex).ToString8();
                id.ToUpperInvariant();   // NOTE(review): assumes this uppercases the buffer in place — confirm

                // Record the file and row containing this ID, overwriting previous entries
                latestFileAndRowByID[block.GetCopy(id)] = new Tuple<string, int>(inputFilePath, reader.RowCountRead);
            }

            // Capture the columns from the last CSV to write
            writerColumns = reader.Columns;
        }
    }

    Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

    // Pass 2: re-read every file, copying only each ID's latest row
    using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
    {
        writer.SetColumns(writerColumns);

        int[] writerColumnIndexInReader = new int[writerColumns.Count];

        foreach (string inputFilePath in Directory.GetFiles(inputFolderPath))
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Look up each output column's position in the input file
                // (files may have differing column orders or missing columns)
                for (int i = 0; i < writerColumns.Count; ++i)
                {
                    reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]);
                }

                int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                while (reader.NextRow())
                {
                    String8 id = reader.Current(idColumnIndex).ToString8();
                    id.ToUpperInvariant();

                    // Copy this row to the output file, *if* it's the latest for this ID
                    Tuple<string, int> latestForID = latestFileAndRowByID[id];
                    if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                    {
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            int readerColumnIndex = writerColumnIndexInReader[i];
                            if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                            {
                                // BUG FIX: previously wrote reader.Current(i) — ignoring the
                                // column mapping just computed — so inputs whose column order
                                // differs from the writer's schema copied the wrong cells.
                                // Read from the mapped index instead.
                                writer.Write(reader.Current(readerColumnIndex).ToString8());
                            }
                            else
                            {
                                // Column missing from this input file: write an empty cell
                                writer.Write(String8.Empty);
                            }
                        }

                        writer.NextRow();
                    }
                }
            }
        }

        WriteSizeSummary(null, writer);
    }
}
// Copies the input, replacing the two named columns with a single output
// column containing "value1 + separator + value2". The combined column
// appears at the position of whichever source column came first.
private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
{
    // Convert the separator once to a String8 for allocation-free writes
    String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
    {
        // Find the columns to concatenate
        int columnIndex1 = reader.ColumnIndex(columnName1);
        int columnIndex2 = reader.ColumnIndex(columnName2);

        // Build an output column list and mapping from output order to input index, with '-1' for the concatenated value
        List<string> outputColumns = new List<string>();
        int[] indexMapping = new int[reader.Columns.Count - 1];
        bool hasConcatenatedColumn = false;

        for (int i = 0; i < reader.Columns.Count; ++i)
        {
            string columnName = reader.Columns[i];

            // If this is a column to concatenate...
            if (columnName.Equals(reader.Columns[columnIndex1], StringComparison.OrdinalIgnoreCase) || columnName.Equals(reader.Columns[columnIndex2], StringComparison.OrdinalIgnoreCase))
            {
                // ..if it's the first one, the output column will appear at this position
                if (!hasConcatenatedColumn)
                {
                    hasConcatenatedColumn = true;
                    indexMapping[outputColumns.Count] = -1;
                    outputColumns.Add(outputColumnName);
                }
            }
            else
            {
                // Otherwise, copy this column through
                indexMapping[outputColumns.Count] = i;
                outputColumns.Add(columnName);
            }
        }

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
        {
            writer.SetColumns(outputColumns);

            while (reader.NextRow())
            {
                // Write columns in mapped order
                for (int i = 0; i < indexMapping.Length; ++i)
                {
                    int sourceColumnIndex = indexMapping[i];

                    if (sourceColumnIndex == -1)
                    {
                        // Write concatenated column (streamed in parts to avoid a combined copy)
                        writer.WriteValueStart();
                        writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                        writer.WriteValuePart(separator8);
                        writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                        writer.WriteValueEnd();
                    }
                    else
                    {
                        writer.Write(reader.Current(sourceColumnIndex).ToString8());
                    }
                }

                writer.NextRow();
            }

            WriteSizeSummary(reader, writer);
        }
    }
}
// Builds (or loads and updates) an Arriba table from a CSV. 'columns' is an
// optional delimited subset of columns to import; 'settingsJsonPath'
// optionally applies table settings before the import.
private static void Build(AddMode mode, string tableName, string csvFilePath, int maximumCount, string columns, string settingsJsonPath = null)
{
    Stopwatch w = Stopwatch.StartNew();
    Console.WriteLine("{0} Arriba table '{1}' from '{2}'...", mode, tableName, csvFilePath);

    // Optional explicit column subset; null means "all columns in the CSV"
    IList<string> columnNames = null;
    if (!String.IsNullOrEmpty(columns)) { columnNames = SplitAndTrim(columns); }

    // Build or load table
    Table table;
    if (mode == AddMode.Build)
    {
        table = new Table(tableName, maximumCount);
    }
    else
    {
        table = new Table();
        table.Load(tableName);
    }

    // Configure table
    if (!String.IsNullOrEmpty(settingsJsonPath)) { SetSettings(table, LoadSettings(settingsJsonPath)); }

    // Always add missing columns. Add rows only when not in 'decorate' mode
    AddOrUpdateOptions options = new AddOrUpdateOptions();
    options.AddMissingColumns = true;
    options.Mode = (mode == AddMode.Decorate ? AddOrUpdateMode.UpdateAndIgnoreAdds : AddOrUpdateMode.AddOrUpdate);

    using (ITabularReader reader = TabularFactory.BuildReader(csvFilePath))
    {
        long rowsImported = 0;

        if (columnNames == null) { columnNames = new List<string>(reader.Columns); }

        // Import in batches, with a '.' progress marker per block
        foreach (DataBlock block in ReadAsDataBlockBatch(reader, columnNames))
        {
            table.AddOrUpdate(block, options);
            rowsImported += block.RowCount;
            Console.Write(".");
        }

        Console.WriteLine();
        Console.WriteLine("Imported {0:n0} rows; table has {1:n0} rows. Saving...", rowsImported, table.Count);
    }

    table.Save();
    w.Stop();
    Console.WriteLine("Done in {0}.", w.Elapsed.ToFriendlyString());
}
/// <summary>
///  XSV command-line entry point. Dispatches on the mode in args[0] to the
///  matching operation; remaining args are mode-specific.
/// </summary>
/// <param name="args">[mode, input, output, ...mode-specific arguments]</param>
/// <returns>0 on success, -1 on error or missing arguments, -2 on usage error</returns>
public static int Main(string[] args)
{
    Trace.Listeners.Add(new ConsoleTraceListener());

    // Every mode needs at least a mode name and two more arguments
    if (args == null || args.Length < 3)
    {
        Trace.WriteLine(Usage);
        return -1;
    }

    string mode = args[0].ToLowerInvariant();

    try
    {
        // TraceWatch logs total elapsed time when disposed
        using (new TraceWatch(String.Empty))
        {
            switch (mode)
            {
                case "copy":
                    Trace.WriteLine(String.Format("Copy \"{0}\" to \"{1}\"...", args[1], args[2]));

                    if (args.Length < 4)
                    {
                        Copy(args[1], args[2]);
                    }
                    else
                    {
                        Copy(args[1], args[2], args[3]);
                    }

                    break;

                case "concat":
                    Trace.WriteLine(String.Format("Concatenating \"{0}\" values on first column into \"{1}\"...", args[1], args[2]));
                    Concatenate(args[1], args[2], String8.Convert("; ", new byte[2]));
                    break;

                case "notstartswith":
                    if (args.Length < 5) { throw new UsageException("notStartsWith requires a value and name column to be passed."); }
                    Trace.WriteLine(String.Format("Writing \"{0}\" values into \"{1}\" where !row[{2}].StartsWith(row[{3}])", args[1], args[2], args[3], args[4]));
                    NotStartsWith(args[1], args[2], args[3], args[4]);
                    break;

                case "compare":
                    if (args.Length < 5) { throw new UsageException("compare requires two input files, an output file, and a column identifier to compare."); }
                    // BUGFIX: the column compared is args[4] and the files are args[1]/args[2];
                    // the previous message passed four arguments to a three-placeholder format
                    // and reported the file paths as the compared column.
                    Trace.WriteLine(String.Format("Comparing \"{0}\" values between \"{1}\" and \"{2}\"...", args[4], args[1], args[2]));
                    Compare(args[1], args[2], args[3], args[4]);
                    break;

                case "onlyin":
                    if (args.Length < 5) { throw new UsageException("onlyIn requires a second input file and column identifier"); }
                    Trace.WriteLine(String.Format("Writing \"{0}\" values into \"{1}\" where \"{2}\" also had the same \"{3}\"...", args[1], args[2], args[3], args[4]));
                    OnlyIn(args[1], args[2], args[3], args[4]);
                    break;

                case "sanitize":
                    if (args.Length < 5) { throw new UsageException("sanitize requires input, output, specFile, hashKey"); }
                    Trace.WriteLine(String.Format("Sanitizing \"{0}\" into \"{1}\" using \"{2}\"...", args[1], args[2], args[3]));
                    Xsv.Sanitize.Sanitizer s = new Xsv.Sanitize.Sanitizer(args[3], args[4]);
                    s.Sanitize(args[1], args[2]);
                    break;

                case "sanitizevalue":
                    if (args.Length < 5) { throw new UsageException("sanitize requires value, columnName, specFile, hashKey"); }
                    Trace.WriteLine(String.Format("Sanitizing \"{0}\" from column \"{1}\" using \"{2}\"...", args[1], args[2], args[3]));
                    Trace.WriteLine(new Xsv.Sanitize.Sanitizer(args[3], args[4]).Translate(args[1], args[2]));
                    break;

                case "where":
                    // BUGFIX: error message previously named the wrong mode ("row")
                    if (args.Length < 3) { throw new UsageException("where requires input and a rowIndex or column identifier"); }
                    Where(args[1], args[2], (args.Length > 3 ? args[3] : null), (args.Length > 4 ? TabularFactory.BuildWriter(args[4]) : null));
                    break;

                default:
                    throw new NotSupportedException(String.Format("XSV mode \"{0}\" is unknown. Run without arguments to see valid modes.", mode));
            }
        }

        return 0;
    }
    catch (UsageException ex)
    {
        // Bad arguments: show the specific problem and full usage text
        Trace.WriteLine(ex.Message);
        Trace.WriteLine(Usage);
        return -2;
    }
    catch (Exception ex) when (!Debugger.IsAttached)
    {
        // Filter keeps the debugger breaking at the throw site when attached
        Trace.WriteLine("ERROR: " + ex.ToString());
        return -1;
    }
}
/// <summary>
///  Write rows from the input file which match either a specific row index
///  (when value is null; columnIndentifier is parsed as the index) or a
///  case-insensitive column value (when value is non-null; columnIndentifier
///  names the column). Matches are written to the optional writer with a
///  leading "RowIndex" column; a match summary is printed either way.
/// </summary>
/// <param name="inputFilePath">File to search</param>
/// <param name="columnIndentifier">Row index (value == null) or column name/index to match (value != null)</param>
/// <param name="value">Value to match, or null to select by row index</param>
/// <param name="writer">Optional writer for matching rows; ownership is taken and it is disposed here</param>
private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
{
    int matchCount = 0;
    int rowCount = 0;

    try
    {
        using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
        {
            // No value => columnIndentifier is a 1-based row index to find.
            // Value passed => columnIndentifier identifies the column to compare.
            int rowIndex = (value != null ? -1 : int.Parse(columnIndentifier));
            int colIndex = (value != null ? reader.ColumnIndex(columnIndentifier) : -1);

            while (reader.NextRow())
            {
                // Match the row index if no value was passed
                if (rowIndex != -1 && reader.RowCountRead != rowIndex) { continue; }

                // Match the column value if passed
                if (colIndex != -1)
                {
                    // Skip short rows which don't have the column at all
                    if (reader.CurrentRowColumns <= colIndex) { continue; }
                    // Case-insensitive comparison
                    if (reader.Current(colIndex).ToString8().CompareTo(value, true) != 0) { continue; }
                }

                matchCount++;

                // If this is a matching row and output was requested, write it
                if (writer != null)
                {
                    // Emit the header (RowIndex + source columns) before the first row
                    if (writer.RowCountWritten == 0)
                    {
                        List<string> columns = new List<string>();
                        columns.Add("RowIndex");
                        columns.AddRange(reader.Columns);
                        writer.SetColumns(columns);
                    }

                    writer.Write(reader.RowCountRead);

                    for (int i = 0; i < reader.CurrentRowColumns; ++i)
                    {
                        writer.Write(reader.Current(i).ToString8());
                    }

                    writer.NextRow();
                }

                // If we matched row index, we're done
                if (rowIndex != -1) { break; }
            }

            rowCount = reader.RowCountRead;
        }
    }
    finally
    {
        // BUGFIX: the writer created by the caller was never disposed, leaking
        // the handle and potentially leaving the output unflushed.
        writer?.Dispose();
    }

    Console.WriteLine($"Done. {matchCount:n0} out of {rowCount:n0} rows matched.");
}