Пример #1
0
        /// <summary>
        ///  Compare the distinct values of one column across two tabular files and write every
        ///  value found in only one of them to an output file, marked "-" (old file only) or
        ///  "+" (new file only).
        /// </summary>
        /// <param name="oldFilePath">Path of the old file to read</param>
        /// <param name="newFilePath">Path of the new file to read</param>
        /// <param name="outputFilePath">Path of the difference file to write</param>
        /// <param name="columnIdentifier">Identifier of the column to compare; resolved with ColumnIndex in each file</param>
        private static void Compare(string oldFilePath, string newFilePath, string outputFilePath, string columnIdentifier)
        {
            // A single block backs the copies of every value kept from either file
            String8Block copies = new String8Block();
            HashSet<String8> valuesInOld = new HashSet<String8>();
            HashSet<String8> valuesInNew = new HashSet<String8>();

            // Pass 1: distinct values from the old file
            using (ITabularReader source = TabularFactory.BuildReader(oldFilePath))
            {
                int columnIndex = source.ColumnIndex(columnIdentifier);
                while (source.NextRow())
                {
                    valuesInOld.Add(copies.GetCopy(source.Current(columnIndex)));
                }

                Trace.WriteLine(String.Format("Old: {0:n0} values for \"{1}\" in {2:n0} rows.", valuesInOld.Count, columnIdentifier, source.RowCountRead));
            }

            // Pass 2: distinct values from the new file
            using (ITabularReader source = TabularFactory.BuildReader(newFilePath))
            {
                int columnIndex = source.ColumnIndex(columnIdentifier);
                while (source.NextRow())
                {
                    valuesInNew.Add(copies.GetCopy(source.Current(columnIndex)));
                }

                Trace.WriteLine(String.Format("New: {0:n0} values for \"{1}\" in {2:n0} rows.", valuesInNew.Count, columnIdentifier, source.RowCountRead));
            }

            // Set differences in both directions
            HashSet<String8> removed = new HashSet<String8>(valuesInOld);
            removed.ExceptWith(valuesInNew);

            HashSet<String8> added = new HashSet<String8>(valuesInNew);
            added.ExceptWith(valuesInOld);

            Trace.WriteLine(String.Format("{0:n0} values were only in \"{1}\".\r\n{2:n0} values were only in \"{3}\".", removed.Count, oldFilePath, added.Count, newFilePath));

            String8 removedMarker = String8.Convert("-", new byte[1]);
            String8 addedMarker = String8.Convert("+", new byte[1]);

            // Output: all old-only values first, then all new-only values
            using (ITabularWriter output = TabularFactory.BuildWriter(outputFilePath))
            {
                output.SetColumns(new string[] { "In", columnIdentifier });

                foreach (String8 item in removed)
                {
                    output.Write(removedMarker);
                    output.Write(item);
                    output.NextRow();
                }

                foreach (String8 item in added)
                {
                    output.Write(addedMarker);
                    output.Write(item);
                    output.NextRow();
                }
            }
        }
Пример #2
0
        /// <summary>
        ///  Append the rows of one input file, or of every matching file in a folder, to a single output file.
        /// </summary>
        /// <param name="inputFileOrFolderPath">Path of a single input file, or of a folder whose files are appended</param>
        /// <param name="outputFilePath">Path of the file to append to</param>
        /// <param name="inputFileNamePattern">Search pattern used when the input is a folder; "*.*" is used when null or empty</param>
        /// <exception cref="InvalidOperationException">An input file's column names differ from those of the first input file</exception>
        private static void Append(string inputFileOrFolderPath, string outputFilePath, string inputFileNamePattern = null)
        {
            string[] inputFilePaths;

            if (Directory.Exists(inputFileOrFolderPath))
            {
                if (String.IsNullOrEmpty(inputFileNamePattern))
                {
                    inputFileNamePattern = "*.*";
                }
                inputFilePaths = Directory.GetFiles(inputFileOrFolderPath, inputFileNamePattern);
            }
            else
            {
                inputFilePaths = new string[] { inputFileOrFolderPath };
            }

            // The writer is created lazily from the first input file's columns,
            // so it cannot live in a 'using' statement; the finally block disposes it.
            ITabularWriter writer        = null;
            string         writerColumns = null;

            try
            {
                foreach (string inputFilePath in inputFilePaths)
                {
                    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                    {
                        string sourceColumns = String.Join(", ", reader.Columns);

                        // Build the writer, if this is the first file
                        if (writer == null)
                        {
                            writer        = TabularFactory.AppendWriter(outputFilePath, reader.Columns);
                            writerColumns = sourceColumns;
                        }

                        // Validate columns match. Column names are identifiers, so compare them
                        // ordinally (ignoring case) rather than with culture-specific rules.
                        if (!String.Equals(writerColumns, sourceColumns, StringComparison.OrdinalIgnoreCase))
                        {
                            throw new InvalidOperationException(string.Format("Can't append to \"{0}\" because the column names don't match.\r\nExpect: {1}\r\nActual: {2}", outputFilePath, writerColumns, sourceColumns));
                        }

                        // Copy the rows
                        CopyRows(reader, writer);

                        // Write a summary for this input file
                        Trace.WriteLine($" {inputFilePath}, {reader.RowCountRead:n0} rows; {reader.BytesRead.SizeString()}");
                    }
                }

                // Write a summary for the output file
                WriteSizeSummary(null, writer);
            }
            finally
            {
                if (writer != null)
                {
                    writer.Dispose();
                }
            }
        }
Пример #3
0
        /// <summary>
        ///  Copy a tabular file, adding a leading "ID" column holding a sequential integer per row.
        /// </summary>
        /// <param name="inputFilePath">Path of the file to read</param>
        /// <param name="outputFilePath">Path of the file to write</param>
        /// <param name="firstId">ID written for the first row; each later row gets the next integer</param>
        private static void RowId(string inputFilePath, string outputFilePath, int firstId = 1)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                // Output columns are "ID" followed by every input column
                List<string> outputColumns = new List<string> { "ID" };
                outputColumns.AddRange(reader.Columns);
                writer.SetColumns(outputColumns);

                int nextId = firstId;
                while (reader.NextRow())
                {
                    writer.Write(nextId);
                    nextId++;

                    for (int column = 0; column < reader.CurrentRowColumns; ++column)
                    {
                        writer.Write(reader.Current(column).ToString8());
                    }

                    writer.NextRow();
                }

                WriteSizeSummary(reader, writer);
            }
        }
Пример #4
0
        /// <summary>
        ///  Copy only the named columns of a tabular file to an output file, in the order requested.
        /// </summary>
        /// <param name="inputFilePath">Path of the file to read</param>
        /// <param name="outputFilePath">Path of the file to write</param>
        /// <param name="columnsDelimited">Comma-separated column names; whitespace around each name is trimmed</param>
        private static void Copy(string inputFilePath, string outputFilePath, string columnsDelimited)
        {
            string[] rawNames = columnsDelimited.Split(',');

            List<string> columns = new List<string>(rawNames.Length);
            for (int n = 0; n < rawNames.Length; ++n)
            {
                columns.Add(rawNames[n].Trim());
            }

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Resolve every requested column before the output file is created
                int[] sourceIndices = new int[columns.Count];
                for (int c = 0; c < sourceIndices.Length; ++c)
                {
                    sourceIndices[c] = reader.ColumnIndex(columns[c]);
                }

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(columns);

                    while (reader.NextRow())
                    {
                        foreach (int sourceIndex in sourceIndices)
                        {
                            writer.Write(reader.Current(sourceIndex).ToString8());
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #5
0
        /// <summary>
        ///  Copy a tabular file to an output file, optionally stopping after a number of rows.
        /// </summary>
        /// <param name="inputFilePath">Path of the file to read</param>
        /// <param name="outputFilePath">Path of the file to write</param>
        /// <param name="rowLimit">Copying stops once the writer's RowCountWritten equals this value; the default (-1) never matches</param>
        private static void Copy(string inputFilePath, string outputFilePath, int rowLimit = -1)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(reader.Columns);

                // Checked first in the loop condition so no further row is read once the limit is hit
                bool limitReached = false;

                while (!limitReached && reader.NextRow())
                {
                    for (int column = 0; column < reader.CurrentRowColumns; ++column)
                    {
                        writer.Write(reader.Current(column).ToString8());
                    }

                    writer.NextRow();
                    limitReached = (writer.RowCountWritten == rowLimit);
                }

                WriteSizeSummary(reader, writer);
            }
        }
Пример #6
0
        /// <summary>
        ///  Write the 1,000-row sample file (ID, IsEven, Count, WhenAdded, Name) to s_sampleFilePath.
        /// </summary>
        public void Initialize()
        {
            String8Block nameBlock = new String8Block();

            // Names are assigned to rows round-robin, in this order
            String8[] names = new String8[]
            {
                nameBlock.GetCopy("Scott"),
                nameBlock.GetCopy("Mike"),
                nameBlock.GetCopy("Jeff"),
                nameBlock.GetCopy("Sophie")
            };

            DateTime firstWhenAdded = new DateTime(2017, 05, 23);

            using (ITabularWriter sample = TabularFactory.BuildWriter(s_sampleFilePath))
            {
                sample.SetColumns(new string[] { "ID", "IsEven", "Count", "WhenAdded", "Name" });

                // "Count" is the running total of all IDs up to and including this row
                int runningTotal = 0;
                for (int id = 0; id < 1000; ++id)
                {
                    runningTotal += id;
                    bool isEven = (id % 2 == 0);

                    sample.Write(id);
                    sample.Write(isEven);
                    sample.Write(runningTotal);
                    sample.Write(firstWhenAdded.AddMinutes(id));
                    sample.Write(names[id % names.Length]);

                    sample.NextRow();
                }
            }
        }
Пример #7
0
 /// <summary>
 ///  Run WhereMatcher.Where over a file with no output writer and return the number of matches.
 /// </summary>
 private static int WhereMatchCount(string inputPath, string columnIdentifier, string op, string value)
 {
     int matchCount;

     using (ITabularReader reader = TabularFactory.BuildReader(inputPath))
     {
         // Passing null for the writer: only the count is wanted
         matchCount = WhereMatcher.Where(reader, columnIdentifier, op, value, null).MatchCount;
     }

     return matchCount;
 }
Пример #8
0
        /// <summary>
        ///  Generate a set of sample CSV files: one Users file written directly here, followed by
        ///  several batches of WebRequest data written through BuildWebRequests.
        /// </summary>
        /// <remarks>
        ///  A single Random instance is shared by every generator created here, so the order of the
        ///  statements below determines the generated data.
        /// </remarks>
        /// <param name="basePath">Folder combined with the Users file name and passed on to BuildWebRequests</param>
        /// <param name="randomSeed">Seed for the shared Random instance; also embedded in the Users file name</param>
        /// <param name="userCount">Number of users requested from the UserGenerator</param>
        /// <param name="eventCount">Event count for the first batch and each daily batch; overwritten for the final two batches</param>
        /// <param name="numberOfDays">Number of daily batches to generate</param>
        private static void Generate_WebRequestSample(string basePath, int randomSeed, int userCount, int eventCount, int numberOfDays)
        {
            Random              r        = new Random(randomSeed);
            DateTime            asOfDate = DateTime.UtcNow.Date;
            String8Block        block    = new String8Block();
            WebRequestGenerator generator;

            string path;

            // Generate a set of users and write them out, dated eight days before today (UTC)
            asOfDate = asOfDate.AddDays(-8);
            path     = Path.Combine(basePath, $"Users.{asOfDate:yyyyMMdd}.r{randomSeed}.{userCount}.csv");
            Console.WriteLine($"Writing {path}...");
            UserGenerator userGenerator = new UserGenerator(r, asOfDate);
            List <User>   users         = userGenerator.Next(userCount);

            using (ITabularWriter writer = TabularFactory.BuildWriter(path))
            {
                foreach (User user in users)
                {
                    user.WriteTo(writer, block);
                }
            }

            // Make the file's last-write time match the date the data represents
            File.SetLastWriteTimeUtc(path, asOfDate);

            // Generate WebRequest data for the same date (eight days before today)
            // NOTE(review): the last constructor argument is 10 for up to 1,000 events and 100 otherwise; its meaning is not visible here
            generator = new WebRequestGenerator(users, r, asOfDate, (eventCount < 1001 ? 10 : 100));
            BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.All);

            // Move back to today (UTC)
            asOfDate = asOfDate.AddDays(8);

            // Generate batches of WebRequest sample data, starting today and stepping one day back per batch
            for (int day = 0; day < numberOfDays; ++day)
            {
                generator = new WebRequestGenerator(users, r, asOfDate, (eventCount < 1001 ? 10 : 100));
                if (day == 0)
                {
                    // Only the first (today's) batch gets an issue: minutes 18 through 104 of that day
                    generator.Issue = new PremiumUserOutage(asOfDate.AddMinutes(18), asOfDate.AddMinutes(104), r);
                }
                BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.UserIdentityOnly);
                asOfDate = asOfDate.AddDays(-1);
            }

            // Generate one big joinable batch (10 million events), dated the day before the last daily batch
            eventCount      = 10 * 1000 * 1000;
            generator       = new WebRequestGenerator(users, r, asOfDate, 1000);
            generator.Issue = new PortRangeBlocked(asOfDate.AddMinutes(1), asOfDate.AddMinutes(180), 11450, 11480);
            BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.UserIdentityOnly);
            asOfDate = asOfDate.AddDays(-1);

            // Generate one huge minimal batch (100 million events), one more day back
            eventCount      = 100 * 1000 * 1000;
            generator       = new WebRequestGenerator(users, r, asOfDate, 1000);
            generator.Issue = new UncachedSlowness(asOfDate.AddMinutes(4), asOfDate.AddMinutes(36), r);
            BuildWebRequests(basePath, generator, eventCount, WebRequestWriteMode.Minimal);

            Console.WriteLine("Done.");
        }
Пример #9
0
        /// <summary>
        ///  Combine consecutive rows that share the same first-column value into a single output row,
        ///  concatenating the values of the other columns with a delimiter.
        /// </summary>
        /// <remarks>
        ///  Only adjacent rows are grouped; a first-column value that reappears later starts a new output row.
        /// </remarks>
        /// <param name="inputFilePath">Path of the file to read</param>
        /// <param name="outputFilePath">Path of the file to write</param>
        /// <param name="delimiter">Delimiter passed to String8Block.Concatenate between combined values</param>
        private static void Concatenate(string inputFilePath, string outputFilePath, String8 delimiter)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    // NOTE(review): CurrentRowColumns is read before the first NextRow();
                    // presumably it reflects the header row's column count at this point - confirm.
                    String8Block block          = new String8Block();
                    String8[]    lastValues     = new String8[reader.CurrentRowColumns];
                    String8[]    combinedValues = new String8[reader.CurrentRowColumns];

                    while (reader.NextRow())
                    {
                        String8 firstColumn = reader.Current(0).ToString8();

                        // NOTE(review): RowCountRead == 2 is treated as the first data row,
                        // presumably because the header row counts as row 1 - confirm.
                        if (reader.RowCountRead == 2)
                        {
                            // First Row - Get the first ID only
                            combinedValues[0] = block.GetCopy(firstColumn);
                        }
                        else if (firstColumn.CompareTo(combinedValues[0], true) != 0)
                        {
                            // If we have a new ID (and not first row)
                            // (the 'true' argument is presumably ignoreCase - confirm)

                            // Write concatenated values for previous ID
                            WriteCombinedRow(writer, combinedValues);

                            // Reset for this ID
                            block.Clear();
                            combinedValues[0] = block.GetCopy(firstColumn);

                            for (int i = 1; i < combinedValues.Length; ++i)
                            {
                                combinedValues[i] = String8.Empty;
                            }
                        }

                        // Concatenate non-duplicate values to "row in progress"
                        // A value is skipped only when it equals the value in the same column of the previous row.
                        // NOTE(review): lastValues is not reset when the ID changes, and it stores the value
                        // returned by the reader without copying it into 'block' - verify both are intended.
                        for (int i = 1; i < reader.CurrentRowColumns; ++i)
                        {
                            String8 value = reader.Current(i).ToString8();

                            if (lastValues[i] != value)
                            {
                                lastValues[i]     = value;
                                combinedValues[i] = block.Concatenate(combinedValues[i], delimiter, value);
                            }
                        }
                    }

                    // After last row, write out values so far
                    // (this call is unconditional, so it also runs when no rows were read)
                    WriteCombinedRow(writer, combinedValues);
                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #10
0
        /// <summary>
        ///  Create a Logger which writes to "Log.csv" under the given output path.
        /// </summary>
        /// <param name="streamProvider">Provider used to open the log file for writing</param>
        /// <param name="outputFilePath">Folder path combined with "Log.csv" to form the log file path</param>
        public Logger(IStreamProvider streamProvider, string outputFilePath)
        {
            _block = new String8Block();

            string logFilePath = Path.Combine(outputFilePath, "Log.csv");
            var logStream = streamProvider.OpenWrite(logFilePath);

            _writer = TabularFactory.BuildWriter(logStream, logFilePath);
            _writer.SetColumns(new string[] { "WhenUtc", "MessageType", "SourceComponent", "Message" });
        }
Пример #11
0
        /// <summary>
        ///  Build a fresh reader over the file and recreate the per-column state for it.
        /// </summary>
        public void Reset()
        {
            // Release the reader (and the stream it holds) from any previous pass;
            // otherwise each repeated Reset would leave one more stream open.
            if (_reader != null)
            {
                _reader.Dispose();
                _reader = null;
            }

            _reader = TabularFactory.BuildReader(_streamProvider.OpenRead(_filePath), _filePath);

            int columnCount = _reader.Columns.Count;

            _columns = new TabularColumn[columnCount];
            _cells   = new String8[columnCount][];

            // Only the column wrappers are created here; the _cells entries are left null
            for (int i = 0; i < columnCount; ++i)
            {
                _columns[i] = new TabularColumn(this, _reader.Columns[i]);
            }
        }
Пример #12
0
        /// <summary>
        ///  End-to-end test of the "sanitize" command: usage error without a key, a successful run
        ///  whose output is validated row by row, and a second run with a different key whose
        ///  output must differ from the first.
        /// </summary>
        public void Sanitize_EndToEnd()
        {
            Assembly testAssembly = Assembly.GetExecutingAssembly();

            // Extract the embedded sample source and spec to the working folder
            Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.csv", "SanitizeSampleSource.csv", testAssembly);
            Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.sanispec", "SanitizeSampleSource.sanispec", testAssembly);

            // Without a key, the command must report a usage error
            string[] argsWithoutKey = new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec" };
            Assert.AreEqual(-2, Program.Main(argsWithoutKey));

            // With a key, the base sanitize must succeed
            File.Delete("SanitizeOutput.csv");
            string[] argsWithKey1 = new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec", "Key1" };
            Assert.AreEqual(0, Program.Main(argsWithKey1));

            // Validate the sanitized output
            using (ITabularReader output = TabularFactory.BuildReader("SanitizeOutput.csv"))
            {
                Assert.IsTrue(output.Columns.Contains("ID"), "ID column is kept (no spec line)");
                Assert.IsTrue(output.Columns.Contains("Path"), "Path column is kept (mapped)");
                Assert.IsTrue(output.Columns.Contains("IsEmptyPath"), "IsEmptyPath is kept (Keep line)");
                Assert.IsFalse(output.Columns.Contains("IsUnderXsv"), "IxUnderXsv column is dropped (Drop line)");

                int idIndex = output.ColumnIndex("ID");
                int pathIndex = output.ColumnIndex("Path");
                int isEmptyPathIndex = output.ColumnIndex("IsEmptyPath");

                while (output.NextRow())
                {
                    int id = output.Current(idIndex).ToInteger();
                    string path = output.Current(pathIndex).ToString();
                    bool pathIsEmpty = String.IsNullOrEmpty(path);

                    Assert.AreEqual(output.Current(isEmptyPathIndex).ToBoolean(), pathIsEmpty, "IsEmptyPath condition matches whether mapped path is empty");

                    if (id == 5)
                    {
                        Assert.AreEqual("Elfie", path, "'Elfie' is echoed (Echo in spec)");
                    }
                    else if (!pathIsEmpty)
                    {
                        Assert.IsTrue(path.StartsWith("WarmBeggedTruth\\"), "Verify path is mapped in parts, and 'Elfie' is consistently mapped.");
                    }
                }

                Assert.IsTrue(output.RowCountRead < 7, "Verify sample excluded at least one row.");
            }

            // A different key must succeed as well...
            string[] argsWithKey2 = new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput2.csv", @"SanitizeSampleSource.sanispec", "Key2" };
            Assert.AreEqual(0, Program.Main(argsWithKey2));

            // ...and must produce different mappings
            string secondOutput = File.ReadAllText("SanitizeOutput2.csv");
            string firstOutput = File.ReadAllText("SanitizeOutput.csv");
            Assert.AreNotEqual(secondOutput, firstOutput);
        }
Пример #13
0
        /// <summary>
        ///  Request the next batch of rows from the source, write them to the output, and return the batch size.
        /// </summary>
        /// <param name="desiredCount">Row count passed through to the source</param>
        /// <param name="cancellationToken">Token passed through to the source</param>
        /// <returns>Number of rows the source returned and this call wrote; 0 when the source returned none</returns>
        /// <exception cref="InvalidOperationException">No writer exists and there is no output file path to build one from</exception>
        public int Next(int desiredCount, CancellationToken cancellationToken)
        {
            // The writer is created lazily, on the first request for rows
            if (_writer == null)
            {
                if (_outputFilePath == null)
                {
                    throw new InvalidOperationException("TabularFileWriter can't reset when passed an ITabularWriter instance");
                }

                // "cout" (any casing) routes output to the console instead of a file
                bool writeToFile = !_outputFilePath.Equals("cout", StringComparison.OrdinalIgnoreCase);
                if (writeToFile)
                {
                    _writer = TabularFactory.BuildWriter(_streamProvider.OpenWrite(_outputFilePath), _outputFilePath);
                }
                else
                {
                    _writer = new ConsoleTabularWriter();
                }

                _writer.SetColumns(_source.Columns.Select((column) => column.ColumnDetails.Name));
            }

            // The source decides how many rows it actually returns
            int rowCount = _source.Next(desiredCount, cancellationToken);
            if (rowCount == 0)
            {
                return 0;
            }

            // Fetch the current batch for each column, as strings
            int columnCount = _stringColumnGetters.Length;
            XArray[] batches = new XArray[columnCount];
            for (int c = 0; c < columnCount; ++c)
            {
                batches[c] = _stringColumnGetters[c]();
            }

            // Write the batch row by row
            for (int row = 0; row < rowCount; ++row)
            {
                for (int c = 0; c < columnCount; ++c)
                {
                    String8[] values = (String8[])batches[c].Array;
                    int valueIndex = batches[c].Index(row);
                    _writer.Write(values[valueIndex]);
                }

                _writer.NextRow();
            }

            return rowCount;
        }
Пример #14
0
        /// <summary>
        ///  Read an input file, run each column through its configured handler, and write the
        ///  resulting values to an output file. When a sample column is configured, only rows
        ///  whose sample value hashes at or below the inclusion cutoff are written.
        /// </summary>
        /// <param name="inputFile">File Path to input file</param>
        /// <param name="outputFile">File Path to output file</param>
        public void Sanitize(string inputFile, string outputFile)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
            {
                // One handler slot per input column (null = column is not written), plus the output column list
                List<string> outputColumns;
                IColumnHandler[] handlerByColumn = GetHandlersByColumnIndex(reader.Columns, out outputColumns);

                // Resolve the optional sample column and the hash cutoff for including rows
                int sampleColumnIndex = -1;
                if (!String.IsNullOrEmpty(this.SampleColumnName))
                {
                    sampleColumnIndex = reader.ColumnIndex(this.SampleColumnName);
                }

                uint sampleInclusionCutoff = (uint)(uint.MaxValue * this.SampleProbability);
                bool isSampling = (sampleColumnIndex >= 0);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
                {
                    writer.SetColumns(outputColumns);

                    while (reader.NextRow())
                    {
                        if (isSampling)
                        {
                            // Hash *without* the hashkey, so the same rows are consistently included or excluded
                            uint sampleHash = Hashing.Hash(reader.Current(sampleColumnIndex).ToString8(), 0);
                            if (sampleHash > sampleInclusionCutoff)
                            {
                                continue;
                            }
                        }

                        // Sanitize and write every column which has a handler
                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            IColumnHandler handler = handlerByColumn[column];
                            if (handler == null)
                            {
                                continue;
                            }

                            String8 original = reader.Current(column).ToString8();
                            writer.Write(handler.Sanitize(original));
                        }

                        writer.NextRow();
                    }
                }
            }
        }
Пример #15
0
        /// <summary>
        ///  Write a pseudo-random sample table (Zip, LastScan, IsArchived) to a stream in TSV format.
        /// </summary>
        /// <remarks>
        ///  The writer is disposed before returning so that any rows it has buffered reach the stream.
        ///  NOTE(review): disposing the writer presumably also closes <paramref name="stream"/> - confirm callers do not write to it afterwards.
        /// </remarks>
        /// <param name="stream">Stream to write the table to</param>
        /// <param name="seed">Seed for the random generator, so the same seed produces the same rows</param>
        /// <param name="rowCount">Number of rows to write</param>
        private static void WriteSampleTsv(Stream stream, int seed, int rowCount)
        {
            DateTime start = new DateTime(2018, 01, 01, 0, 0, 0, DateTimeKind.Utc);
            Random   r     = new Random(seed);

            // The file name is only passed to the factory alongside the stream; nothing is written under that name here
            using (ITabularWriter writer = TabularFactory.BuildWriter(stream, "Unused.tsv"))
            {
                writer.SetColumns(new string[] { "Zip", "LastScan", "IsArchived" });

                for (int i = 0; i < rowCount; ++i)
                {
                    // The three random draws happen in column order for each row
                    writer.Write(r.Next(10000, 99999));
                    writer.Write(start.AddDays(-180.0 * r.NextDouble()));
                    writer.Write(r.Next(100) < 50);
                    writer.NextRow();
                }
            }
        }
Пример #16
0
        /// <summary>
        ///  Verify WhereMatcher match counts over the 1,000-row sample file for each supported
        ///  value type and operator, the exceptions thrown for bad columns and operators,
        ///  and the rows written when an output writer is supplied.
        /// </summary>
        public void WhereMatcher_Basics()
        {
            Assert.AreEqual(1000, WhereMatchCount(s_sampleFilePath, "0", ">=", "0"), "Should match all rows (by column index)");

            // One comparison per value type
            Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "ID", ">=", "500"), "Should match half of rows (int)");
            Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "IsEven", "==", "true"), "Should match half of rows (boolean)");
            Assert.AreEqual(90, WhereMatchCount(s_sampleFilePath, "WhenAdded", "<", "2017-05-23 01:30:00 AM"), "Should match 90 rows (DateTime)");
            Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", "=", "Sophie"), "Should match 250 rows (string)");

            // String operators
            Assert.AreEqual(750, WhereMatchCount(s_sampleFilePath, "Name", "!=", "Sophie"), "Should match 750 rows (string !=)");
            Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", "|>", "Sop"), "Should match 250 rows (string StartsWith)");
            Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", ":", "ophie"), "Should match 250 rows (string Contains)");
            Assert.AreEqual(250, WhereMatchCount(s_sampleFilePath, "Name", ">", "Scott"), "Should match 250 rows (string >)");
            Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "Name", ">=", "Scott"), "Should match 500 rows (string >=)");
            Assert.AreEqual(500, WhereMatchCount(s_sampleFilePath, "Name", "<", "Scott"), "Should match 500 rows (string <)");
            Assert.AreEqual(750, WhereMatchCount(s_sampleFilePath, "Name", "<=", "Scott"), "Should match 750 rows (string <=)");

            // Values which parse as another type, compared against a string column
            Assert.AreEqual(0, WhereMatchCount(s_sampleFilePath, "Name", "!=", "false"), "Should match 0 rows (bool, can't convert type)");
            Assert.AreEqual(0, WhereMatchCount(s_sampleFilePath, "Name", "!=", "0"), "Should match 0 rows (int, can't convert type)");
            Assert.AreEqual(0, WhereMatchCount(s_sampleFilePath, "Name", "!=", "2017-01-01"), "Should match 0 rows (DateTime, can't convert type)");

            // Column name doesn't exist
            Verify.Exception <ColumnNotFoundException>(() => WhereMatchCount(s_sampleFilePath, "MissingColumn", "==", "Jeff"));

            // Column index out of range
            Verify.Exception <ColumnNotFoundException>(() => WhereMatchCount(s_sampleFilePath, "-1", "==", "Jeff"));

            // Unknown operator
            Verify.Exception <UsageException>(() => WhereMatchCount(s_sampleFilePath, "Name", "->", "Jeff"));

            // Try with output enabled
            using (ITabularReader reader = TabularFactory.BuildReader(s_sampleFilePath))
            {
                using (ITabularWriter writer = TabularFactory.BuildWriter("Sample.Under2.csv"))
                {
                    WhereMatcher.Where(reader, "ID", "<", "2", writer);
                    Assert.AreEqual(2, writer.RowCountWritten);
                }

                // The writer is disposed above, so the output file is complete before it is read back
                string content = File.ReadAllText("Sample.Under2.csv");
                Assert.IsTrue(content.Contains("\"0\""));
                Assert.IsTrue(content.Contains("\"1\""));
                Assert.IsFalse(content.Contains("\"2\""));
            }
        }
Пример #17
0
        /// <summary>
        ///  Write a table's metadata under its root path: the schema file, the metadata file, the
        ///  query text, and (only when there are partitions) the partitions file. The metadata is
        ///  then added to the cache.
        /// </summary>
        /// <param name="streamProvider">Provider used to open each file for writing</param>
        /// <param name="tableRootPath">Root path under which the files are written</param>
        /// <param name="metadata">Metadata to write</param>
        public static void Write(IStreamProvider streamProvider, string tableRootPath, TableMetadata metadata)
        {
            String8Block copies = new String8Block();

            // Schema: one row per column, with its name and type name
            string schemaPath = Path.Combine(tableRootPath, SchemaFileName);
            using (ITabularWriter schemaWriter = TabularFactory.BuildWriter(streamProvider.OpenWrite(schemaPath), SchemaFileName))
            {
                schemaWriter.SetColumns(new string[] { "Name", "Type" });

                foreach (ColumnDetails details in metadata.Schema)
                {
                    schemaWriter.Write(copies.GetCopy(details.Name));
                    schemaWriter.Write(copies.GetCopy(details.Type.Name.ToString()));
                    schemaWriter.NextRow();
                }
            }

            // Metadata: name, context, value rows; currently only the row count
            string metadataPath = Path.Combine(tableRootPath, MetadataFileName);
            using (ITabularWriter metadataWriter = TabularFactory.BuildWriter(streamProvider.OpenWrite(metadataPath), MetadataFileName))
            {
                metadataWriter.SetColumns(new string[] { "Name", "Context", "Value" });

                metadataWriter.Write(copies.GetCopy("RowCount"));
                metadataWriter.Write(String8.Empty);
                metadataWriter.Write(metadata.RowCount);
                metadataWriter.NextRow();
            }

            // Query text
            streamProvider.WriteAllText(Path.Combine(tableRootPath, ConfigQueryPath), metadata.Query);

            // Partitions: the file is written only when there is at least one
            if (metadata.Partitions.Count > 0)
            {
                string partitionsPath = Path.Combine(tableRootPath, PartitionsFileName);
                using (ITabularWriter partitionWriter = TabularFactory.BuildWriter(streamProvider.OpenWrite(partitionsPath), PartitionsFileName))
                {
                    partitionWriter.SetColumns(new string[] { "Name" });

                    foreach (string partitionName in metadata.Partitions)
                    {
                        partitionWriter.Write(copies.GetCopy(partitionName));
                        partitionWriter.NextRow();
                    }
                }
            }

            string cacheKey = $"{streamProvider}|{tableRootPath}";
            s_Cache.Add(cacheKey, metadata);
        }
Пример #18
0
        /// <summary>
        ///  Read a table's metadata from under its root path: the schema file, the metadata file, and the query text.
        /// </summary>
        /// <param name="streamProvider">Provider used to open each file for reading</param>
        /// <param name="tableRootPath">Root path under which the files are found</param>
        /// <returns>A new TableMetadata with Schema, RowCount, and Query populated</returns>
        /// <exception cref="NotImplementedException">The metadata file has a row whose name is not "RowCount"</exception>
        private static TableMetadata Build(IStreamProvider streamProvider, string tableRootPath)
        {
            TableMetadata result = new TableMetadata();

            // Schema: each row is a column name and a type name resolved through TypeProviderFactory
            string schemaFilePath = Path.Combine(tableRootPath, SchemaFileName);
            using (ITabularReader schemaReader = TabularFactory.BuildReader(streamProvider.OpenRead(schemaFilePath), SchemaFileName))
            {
                int nameIndex = schemaReader.ColumnIndex("Name");
                int typeIndex = schemaReader.ColumnIndex("Type");

                while (schemaReader.NextRow())
                {
                    string columnName = schemaReader.Current(nameIndex).ToString();
                    string typeName = schemaReader.Current(typeIndex).ToString();

                    result.Schema.Add(new ColumnDetails(columnName, TypeProviderFactory.Get(typeName).Type));
                }
            }

            // Metadata: name, context, value rows; only "RowCount" is understood
            string metadataFilePath = Path.Combine(tableRootPath, MetadataFileName);
            using (ITabularReader metadataReader = TabularFactory.BuildReader(streamProvider.OpenRead(metadataFilePath), MetadataFileName))
            {
                int nameIndex = metadataReader.ColumnIndex("Name");
                int contextIndex = metadataReader.ColumnIndex("Context");
                int valueIndex = metadataReader.ColumnIndex("Value");

                while (metadataReader.NextRow())
                {
                    String8 name = metadataReader.Current(nameIndex).ToString8();
                    String8 context = metadataReader.Current(contextIndex).ToString8();
                    ITabularValue value = metadataReader.Current(valueIndex);

                    if (!name.Equals("RowCount"))
                    {
                        throw new NotImplementedException($"TableMetadataSerializer.Read doesn't know how to read Metadata '{name}'");
                    }

                    result.RowCount = value.ToInteger();
                }
            }

            result.Query = streamProvider.ReadAllText(Path.Combine(tableRootPath, ConfigQueryPath));

            return result;
        }
Пример #19
0
        /// <summary>
        ///  Generate a number of web requests and write them to a CSV file named from the prefix,
        ///  the generator's starting date, and the event count.
        /// </summary>
        /// <param name="basePath">Folder the file is written to</param>
        /// <param name="fileNamePrefix">First part of the file name</param>
        /// <param name="generator">Generator the requests are drawn from</param>
        /// <param name="eventCount">Number of requests to generate and write</param>
        /// <param name="mode">Write mode passed to each request's WriteTo</param>
        private static void BuildWebRequests(string basePath, string fileNamePrefix, WebRequestGenerator generator, int eventCount, WebRequestWriteMode mode)
        {
            // Captured before any request is generated; used for the file name and the file timestamp
            DateTime fileDate = generator.Current;

            string fileName = $"{fileNamePrefix}.{fileDate:yyyyMMdd}.r5.{eventCount}.csv";
            string path = Path.Combine(basePath, fileName);

            Console.WriteLine($"Writing {path}...");

            String8Block block = new String8Block();
            using (ITabularWriter writer = TabularFactory.BuildWriter(path))
            {
                int written = 0;
                while (written < eventCount)
                {
                    generator.Next().WriteTo(writer, block, written, mode);
                    ++written;
                }
            }

            File.SetLastWriteTimeUtc(path, fileDate);
        }
Пример #20
0
        /// <summary>
        ///  Copy the rows of the input file whose value in a given column also appears in that
        ///  column of a second ("only in") file.
        /// </summary>
        /// <param name="inputFilePath">Path of the file whose rows are filtered</param>
        /// <param name="outputFilePath">Path of the file the matching rows are written to</param>
        /// <param name="onlyInInputFilePath">Path of the file providing the set of accepted values</param>
        /// <param name="onlyInColumnIdentifier">Identifier of the column to match on; resolved with ColumnIndex in each file</param>
        private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
        {
            String8Block copies = new String8Block();
            HashSet<String8> acceptedValues = new HashSet<String8>();

            // Pass 1: collect the accepted values from 'onlyInInputFilePath'
            using (ITabularReader filterReader = TabularFactory.BuildReader(onlyInInputFilePath))
            {
                int filterColumnIndex = filterReader.ColumnIndex(onlyInColumnIdentifier);
                while (filterReader.NextRow())
                {
                    acceptedValues.Add(copies.GetCopy(filterReader.Current(filterColumnIndex)));
                }
            }

            // Pass 2: copy the input rows whose column value is in the accepted set
            using (ITabularReader inputReader = TabularFactory.BuildReader(inputFilePath))
            {
                int matchColumnIndex = inputReader.ColumnIndex(onlyInColumnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(inputReader.Columns);

                    while (inputReader.NextRow())
                    {
                        String8 candidate = inputReader.Current(matchColumnIndex).ToString8();
                        if (!acceptedValues.Contains(candidate))
                        {
                            continue;
                        }

                        for (int column = 0; column < inputReader.CurrentRowColumns; ++column)
                        {
                            writer.Write(inputReader.Current(column).ToString8());
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(inputReader, writer);
                }
            }
        }
Пример #21
0
        /// <summary>
        ///  Run WhereMatcher over 'inputFilePath' for the condition "column op value".
        ///  Matches are passed a writer for 'outputFilePath' when one is given; with a null or
        ///  empty output path the matcher receives a null writer. The match and row counts are
        ///  reported to the console afterward.
        /// </summary>
        /// <param name="inputFilePath">File to read rows from</param>
        /// <param name="columnIdentifier">Column the condition applies to</param>
        /// <param name="op">Comparison operator, passed through to WhereMatcher</param>
        /// <param name="value">Value to compare against, passed through to WhereMatcher</param>
        /// <param name="outputFilePath">Output file path, or null/empty for no output file</param>
        private static void Where(string inputFilePath, string columnIdentifier, string op, string value, string outputFilePath)
        {
            WhereResult result;

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            using (ITabularWriter writer = (String.IsNullOrEmpty(outputFilePath) ? null : TabularFactory.BuildWriter(outputFilePath)))
            {
                // A 'using' over a null resource is legal; nothing is disposed in that case
                string announcement = (writer == null
                    ? $"Counting from '{inputFilePath}' where {columnIdentifier} {op} {value}..."
                    : $"Writing from '{inputFilePath}' where {columnIdentifier} {op} {value} into '{outputFilePath ?? ""}'...");

                Console.WriteLine(announcement);

                result = WhereMatcher.Where(reader, columnIdentifier, op, value, writer);
            }

            Console.WriteLine($"Done. {result.MatchCount:n0} out of {result.RowCount:n0} rows matched.");
        }
Пример #22
0
        /// <summary>
        ///  Benchmark comparing several ways of reading and splitting the same in-memory sample TSV:
        ///  raw stream reads, StreamReader.ReadLine + Split, the TabularFactory reader, and the
        ///  native "SplitTsv" method (single-threaded in 64KB chunks, then via MeasureParallel over
        ///  the whole content). Each measured delegate returns a count for the Benchmarker.
        /// </summary>
        public void TsvSplit()
        {
            Stream tsvStream = new MemoryStream();
            //Stream tsvStream = new FileStream("Sample.tsv", FileMode.Create);
            int rowCount = 1000 * 1000;

            // NOTE(review): the last argument repeats the literal 1000 * 1000 rather than using 'rowCount';
            // presumably the arguments are column count and row count - confirm against WriteSampleTsv.
            WriteSampleTsv(tsvStream, 5, 1000 * 1000);

            // Scratch buffer and bit vectors (one bit per buffer byte) reused by the chunked measurements
            byte[]    content = new byte[64 * 1024];
            BitVector cells   = new BitVector(content.Length);
            BitVector rows    = new BitVector(content.Length);

            int[] rowEnds = new int[1024];

            // Full copy of the stream content, with bit vectors sized to match, for the parallel measurement
            byte[] allContent = new byte[tsvStream.Length];
            tsvStream.Seek(0, SeekOrigin.Begin);
            tsvStream.Read(allContent, 0, allContent.Length);
            BitVector allCells = new BitVector(allContent.Length);
            BitVector allRows  = new BitVector(allContent.Length);

            using (Benchmarker b = new Benchmarker($"Tsv Parse [{rowCount:n0}] | count", DefaultMeasureMilliseconds))
            {
                // Baseline: read the stream in buffer-sized chunks without parsing; reports 'rowCount' unchanged
                b.Measure("Read only", (int)tsvStream.Length, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);
                    while (true)
                    {
                        int lengthRead = tsvStream.Read(content, 0, content.Length);
                        if (lengthRead == 0)
                        {
                            break;
                        }
                    }

                    return(rowCount);
                });

                // Line-at-a-time reading with String.Split; counts the lines after the header
                b.Measure("ReadLine | Split", (int)tsvStream.Length, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);
                    int count           = 0;
                    // NOTE(review): the reader is not disposed; presumably deliberate, since disposing a
                    // StreamReader closes the underlying stream, which later measurements still use.
                    StreamReader reader = new StreamReader(tsvStream);
                    {
                        // Header row
                        reader.ReadLine();

                        while (!reader.EndOfStream)
                        {
                            string line      = reader.ReadLine();
                            // The split result is discarded; it is computed only so the work is measured
                            string[] cellSet = line.Split('\t');
                            count++;
                        }
                    }
                    return(count);
                });

                // The tabular reader over the same stream; counts rows returned by NextRow
                b.Measure("Elfie TsvReader", (int)tsvStream.Length, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);
                    int count             = 0;
                    // NOTE(review): this reader is not disposed either - presumably to keep the stream open; confirm
                    ITabularReader reader = TabularFactory.BuildReader(tsvStream, "Unused.tsv");
                    {
                        while (reader.NextRow())
                        {
                            count++;
                        }
                    }
                    return(count);
                });


                // Look up the native split method by type and method name
                Func <byte[], int, int, ulong[], ulong[], int> splitTsvN = NativeAccelerator.GetMethod <Func <byte[], int, int, ulong[], ulong[], int> >("XForm.Native.String8N", "SplitTsv");
                b.Measure("XForm Native Split", (int)tsvStream.Length, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);

                    // NOTE(review): starts at -1, presumably so the header line is excluded from the total - confirm
                    int count = -1;
                    while (true)
                    {
                        int lengthRead = tsvStream.Read(content, 0, content.Length);
                        if (lengthRead == 0)
                        {
                            break;
                        }
                        if (lengthRead < content.Length)
                        {
                            // Short read: zero the tail so bytes left over from the previous chunk are not in the buffer
                            Array.Clear(content, lengthRead, content.Length - lengthRead);
                        }

                        int lineCount = splitTsvN(content, 0, lengthRead, cells.Array, rows.Array);
                        count        += lineCount;

                        // The paging result is discarded; the call is made only so its cost is measured
                        int fromRow   = 0;
                        int countCopy = cells.Page(rowEnds, ref fromRow);
                    }

                    return(count);
                });

                // Same native split, run by MeasureParallel over (index, length) ranges of the full content.
                // NOTE(review): 1 is subtracted from every range's result, not once overall - confirm this is intended.
                b.MeasureParallel("XForm Native Split Parallel", (int)tsvStream.Length, (index, length) =>
                {
                    return(splitTsvN(allContent, index, length, allCells.Array, allRows.Array) - 1);
                });
            }
        }
Пример #23
0
        /// <summary>
        ///  Write, to a single output file, only the latest row for each ID found across all
        ///  files in 'inputFolderPath'. "Latest" is the last occurrence seen when walking the
        ///  files in Directory.GetFiles order and the rows of each file in order. The output
        ///  uses the columns of the last file scanned; columns a given input file does not
        ///  have are written as empty values.
        /// </summary>
        /// <param name="inputFolderPath">Folder containing the input files</param>
        /// <param name="outputFilePath">File to write the latest rows to</param>
        /// <param name="idColumnIdentifier">Column holding the ID, looked up in every input file</param>
        /// <exception cref="InvalidOperationException">The input folder contains no files</exception>
        private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
        {
            String8Block block = new String8Block();
            Dictionary <String8, Tuple <string, int> > latestFileAndRowByID = new Dictionary <String8, Tuple <string, int> >();
            IReadOnlyList <string> writerColumns = null;

            // Enumerate the folder once so both passes see the same files in the same order,
            // and so an output file newly created inside the input folder is not read back as input.
            string[] inputFilePaths = Directory.GetFiles(inputFolderPath);

            // Walk the input files to figure out the latest copy of each ID
            Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");
            int rowCountRead = 0;

            foreach (string inputFilePath in inputFilePaths)
            {
                using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                {
                    int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                    while (reader.NextRow())
                    {
                        rowCountRead++;
                        String8 id = reader.Current(idColumnIndex).ToString8();

                        // NOTE(review): the result is not assigned, so this presumably upper-cases 'id' in place - confirm
                        id.ToUpperInvariant();

                        // Record the file and row containing this ID, overwriting previous entries
                        latestFileAndRowByID[block.GetCopy(id)] = new Tuple <string, int>(inputFilePath, reader.RowCountRead);
                    }

                    // Capture the columns from the last CSV to write
                    writerColumns = reader.Columns;
                }
            }
            Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

            // Without any input file there are no columns to write; fail clearly before creating the output
            if (writerColumns == null)
            {
                throw new InvalidOperationException($"No input files were found in '{inputFolderPath}'.");
            }

            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(writerColumns);
                int[] writerColumnIndexInReader = new int[writerColumns.Count];

                foreach (string inputFilePath in inputFilePaths)
                {
                    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                    {
                        // Look up each output column's position in the input file; -1 marks a column this file lacks
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            if (!reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]))
                            {
                                writerColumnIndexInReader[i] = -1;
                            }
                        }

                        int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                        while (reader.NextRow())
                        {
                            String8 id = reader.Current(idColumnIndex).ToString8();
                            id.ToUpperInvariant();

                            // Copy this row to the output file, *if* it's the latest for this ID
                            Tuple <string, int> latestForID = latestFileAndRowByID[id];
                            if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                            {
                                for (int i = 0; i < writerColumns.Count; ++i)
                                {
                                    int readerColumnIndex = writerColumnIndexInReader[i];
                                    if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                                    {
                                        // Read from the mapped position, not 'i': column order may differ between files
                                        writer.Write(reader.Current(readerColumnIndex).ToString8());
                                    }
                                    else
                                    {
                                        writer.Write(String8.Empty);
                                    }
                                }

                                writer.NextRow();
                            }
                        }
                    }
                }

                WriteSizeSummary(null, writer);
            }
        }
Пример #24
0
        /// <summary>
        ///  Copy 'inputFilePath' to 'outputFilePath', replacing two columns with a single column
        ///  containing "value1 + separator + value2". The new column appears at the position of
        ///  whichever source column comes first; all other columns are copied through in order.
        ///  If both names resolve to the same column, that one column is replaced.
        /// </summary>
        /// <param name="inputFilePath">File to read</param>
        /// <param name="outputFilePath">File to write</param>
        /// <param name="columnName1">Column providing the first part of the concatenated value</param>
        /// <param name="separator">Text written between the two parts</param>
        /// <param name="columnName2">Column providing the second part of the concatenated value</param>
        /// <param name="outputColumnName">Name of the concatenated column in the output</param>
        private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
        {
            String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Find the columns to concatenate
                int columnIndex1 = reader.ColumnIndex(columnName1);
                int columnIndex2 = reader.ColumnIndex(columnName2);

                // Build an output column list and mapping from output order to input index, with '-1' for the concatenated value.
                // The mapping grows with the column list, so it is always the same length as the output columns
                // (a fixed "Count - 1" array overflowed when both names resolved to the same column).
                List <string> outputColumns         = new List <string>();
                List <int>    indexMapping          = new List <int>();
                bool          hasConcatenatedColumn = false;

                for (int i = 0; i < reader.Columns.Count; ++i)
                {
                    // If this is a column to concatenate (matched by resolved index, so only those columns are replaced)...
                    if (i == columnIndex1 || i == columnIndex2)
                    {
                        // .. if it's the first one, the output column will appear at this position
                        if (!hasConcatenatedColumn)
                        {
                            hasConcatenatedColumn = true;

                            indexMapping.Add(-1);
                            outputColumns.Add(outputColumnName);
                        }
                    }
                    else
                    {
                        // Otherwise, copy this column through
                        indexMapping.Add(i);
                        outputColumns.Add(reader.Columns[i]);
                    }
                }

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(outputColumns);

                    while (reader.NextRow())
                    {
                        // Write columns in mapped order
                        foreach (int sourceColumnIndex in indexMapping)
                        {
                            if (sourceColumnIndex == -1)
                            {
                                // Write concatenated column
                                writer.WriteValueStart();
                                writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                                writer.WriteValuePart(separator8);
                                writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                                writer.WriteValueEnd();
                            }
                            else
                            {
                                writer.Write(reader.Current(sourceColumnIndex).ToString8());
                            }
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #25
0
        /// <summary>
        ///  Create a new table (AddMode.Build) or load an existing one, optionally apply settings
        ///  from a JSON file, import the rows of 'csvFilePath' in batches, and save the table.
        ///  Progress and timing are written to the console.
        /// </summary>
        /// <param name="mode">Build creates a new table; other modes load 'tableName'. Decorate updates existing rows only.</param>
        /// <param name="tableName">Name of the table to create or load</param>
        /// <param name="csvFilePath">File to import rows from</param>
        /// <param name="maximumCount">Passed to the Table constructor when building a new table</param>
        /// <param name="columns">Column list passed to SplitAndTrim, or null/empty to use every column in the file</param>
        /// <param name="settingsJsonPath">Optional settings file applied to the table before importing</param>
        private static void Build(AddMode mode, string tableName, string csvFilePath, int maximumCount, string columns, string settingsJsonPath = null)
        {
            Stopwatch timer = Stopwatch.StartNew();

            Console.WriteLine("{0} Arriba table '{1}' from '{2}'...", mode, tableName, csvFilePath);

            // Use the caller's column list when given; otherwise it is filled in from the reader below
            IList <string> columnNames = null;
            if (String.IsNullOrEmpty(columns) == false)
            {
                columnNames = SplitAndTrim(columns);
            }

            // Build or load table
            Table table;
            if (mode != AddMode.Build)
            {
                table = new Table();
                table.Load(tableName);
            }
            else
            {
                table = new Table(tableName, maximumCount);
            }

            // Configure table
            bool hasSettings = !String.IsNullOrEmpty(settingsJsonPath);
            if (hasSettings)
            {
                SetSettings(table, LoadSettings(settingsJsonPath));
            }

            // Missing columns are always added. New rows are added except in 'decorate' mode.
            AddOrUpdateOptions options = new AddOrUpdateOptions();
            options.AddMissingColumns = true;

            if (mode == AddMode.Decorate)
            {
                options.Mode = AddOrUpdateMode.UpdateAndIgnoreAdds;
            }
            else
            {
                options.Mode = AddOrUpdateMode.AddOrUpdate;
            }

            using (ITabularReader reader = TabularFactory.BuildReader(csvFilePath))
            {
                // Default to every column found in the file
                columnNames = columnNames ?? new List <string>(reader.Columns);

                long importedRowCount = 0;

                foreach (DataBlock batch in ReadAsDataBlockBatch(reader, columnNames))
                {
                    table.AddOrUpdate(batch, options);
                    importedRowCount += batch.RowCount;

                    // One dot per batch as a progress indicator
                    Console.Write(".");
                }

                Console.WriteLine();
                Console.WriteLine("Imported {0:n0} rows; table has {1:n0} rows. Saving...", importedRowCount, table.Count);
            }

            table.Save();
            timer.Stop();
            Console.WriteLine("Done in {0}.", timer.Elapsed.ToFriendlyString());
        }
Пример #26
0
        /// <summary>
        ///  Command line entry point. args[0] selects the mode (case-insensitive); the remaining
        ///  arguments are mode-specific. At least three arguments are required.
        /// </summary>
        /// <param name="args">Mode followed by that mode's arguments</param>
        /// <returns>
        ///  0 on success; -1 when too few arguments are passed or an unexpected error occurs
        ///  (with no debugger attached); -2 when a mode is missing required arguments.
        /// </returns>
        public static int Main(string[] args)
        {
            Trace.Listeners.Add(new ConsoleTraceListener());

            if (args == null || args.Length < 3)
            {
                Trace.WriteLine(Usage);
                return(-1);
            }

            string mode = args[0].ToLowerInvariant();

            try
            {
                using (new TraceWatch(String.Empty))
                {
                    switch (mode)
                    {
                    case "copy":
                        Trace.WriteLine(String.Format("Copy \"{0}\" to \"{1}\"...", args[1], args[2]));
                        if (args.Length < 4)
                        {
                            Copy(args[1], args[2]);
                        }
                        else
                        {
                            Copy(args[1], args[2], args[3]);
                        }
                        break;

                    case "concat":
                        Trace.WriteLine(String.Format("Concatenating \"{0}\" values on first column into \"{1}\"...", args[1], args[2]));
                        Concatenate(args[1], args[2], String8.Convert("; ", new byte[2]));
                        break;

                    case "notstartswith":
                        if (args.Length < 5)
                        {
                            throw new UsageException("notStartsWith requires a value and name column to be passed.");
                        }
                        Trace.WriteLine(String.Format("Writing \"{0}\" values into \"{1}\" where !row[{2}].StartsWith(row[{3}])", args[1], args[2], args[3], args[4]));
                        NotStartsWith(args[1], args[2], args[3], args[4]);
                        break;

                    case "compare":
                        if (args.Length < 5)
                        {
                            throw new UsageException("compare requires two input files, an output file, and a column identifier to compare.");
                        }

                        // Compare takes (old file, new file, output file, column); the column identifier is args[4]
                        Trace.WriteLine(String.Format("Comparing values for \"{0}\" values between \"{1}\" and \"{2}\"...", args[4], args[1], args[2]));
                        Compare(args[1], args[2], args[3], args[4]);
                        break;

                    case "onlyin":
                        if (args.Length < 5)
                        {
                            throw new UsageException("onlyIn requires a second input file and column identifier");
                        }
                        Trace.WriteLine(String.Format("Writing \"{0}\" values into \"{1}\" where \"{2}\" also had the same \"{3}\"...", args[1], args[2], args[3], args[4]));
                        OnlyIn(args[1], args[2], args[3], args[4]);
                        break;

                    case "sanitize":
                        if (args.Length < 5)
                        {
                            throw new UsageException("sanitize requires input, output, specFile, hashKey");
                        }
                        Trace.WriteLine(String.Format("Sanitizing \"{0}\" into \"{1}\" using \"{2}\"...", args[1], args[2], args[3]));
                        Xsv.Sanitize.Sanitizer s = new Xsv.Sanitize.Sanitizer(args[3], args[4]);
                        s.Sanitize(args[1], args[2]);
                        break;

                    case "sanitizevalue":
                        if (args.Length < 5)
                        {
                            throw new UsageException("sanitizeValue requires value, columnName, specFile, hashKey");
                        }
                        Trace.WriteLine(String.Format("Sanitizing \"{0}\" from column \"{1}\" using \"{2}\"...", args[1], args[2], args[3]));
                        Trace.WriteLine(new Xsv.Sanitize.Sanitizer(args[3], args[4]).Translate(args[1], args[2]));
                        break;

                    case "where":
                        if (args.Length < 3)
                        {
                            throw new UsageException("row requires input and rowIndex");
                        }

                        // Own the optional output writer here so it is always disposed, even if Where throws.
                        // A 'using' over a null resource is legal and disposes nothing.
                        using (ITabularWriter whereWriter = (args.Length > 4 ? TabularFactory.BuildWriter(args[4]) : null))
                        {
                            Where(args[1], args[2], (args.Length > 3 ? args[3] : null), whereWriter);
                        }
                        break;

                    default:
                        throw new NotSupportedException(String.Format("XSV mode \"{0}\" is unknown. Run without arguments to see valid modes.", mode));
                    }
                }

                return(0);
            }
            catch (UsageException ex)
            {
                Trace.WriteLine(ex.Message);
                Trace.WriteLine(Usage);
                return(-2);
            }
            catch (Exception ex) when(!Debugger.IsAttached)
            {
                // With a debugger attached the filter is false, so the exception propagates to the debugger
                Trace.WriteLine("ERROR: " + ex.ToString());
                return(-1);
            }
        }
Пример #27
0
        /// <summary>
        ///  Find rows in 'inputFilePath' and optionally write them, each prefixed with its row index.
        ///  When 'value' is null, 'columnIndentifier' is parsed as a row index and only that row
        ///  matches. Otherwise rows match when the identified column compares equal to 'value'
        ///  (CompareTo is called with 'true' as its second argument). The match and row counts
        ///  are reported to the console afterward.
        /// </summary>
        /// <param name="inputFilePath">File to read rows from</param>
        /// <param name="columnIndentifier">Row index when 'value' is null; otherwise the column to compare</param>
        /// <param name="value">Value to look for, or null to match by row index</param>
        /// <param name="writer">Writer for matching rows, or null to only count them; not disposed here</param>
        private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
        {
            int matches   = 0;
            int totalRows = 0;

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Exactly one of these is looked up; the other stays at the -1 "not used" sentinel
                int targetRow;
                int targetColumn;

                if (value == null)
                {
                    targetRow    = int.Parse(columnIndentifier);
                    targetColumn = -1;
                }
                else
                {
                    targetRow    = -1;
                    targetColumn = reader.ColumnIndex(columnIndentifier);
                }

                while (reader.NextRow())
                {
                    // Row index filter: skip everything except the requested row
                    if (targetRow != -1 && reader.RowCountRead != targetRow)
                    {
                        continue;
                    }

                    // Column value filter: skip rows too short to have the column or with a different value
                    if (targetColumn != -1)
                    {
                        bool hasColumn = (reader.CurrentRowColumns > targetColumn);
                        if (!hasColumn || reader.Current(targetColumn).ToString8().CompareTo(value, true) != 0)
                        {
                            continue;
                        }
                    }

                    matches++;

                    if (writer != null)
                    {
                        // Before the first row is written, declare the columns: "RowIndex" plus the reader's own
                        if (writer.RowCountWritten == 0)
                        {
                            List <string> outputColumns = new List <string> { "RowIndex" };
                            outputColumns.AddRange(reader.Columns);
                            writer.SetColumns(outputColumns);
                        }

                        writer.Write(reader.RowCountRead);

                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            writer.Write(reader.Current(column).ToString8());
                        }

                        writer.NextRow();
                    }

                    // A row index can match only once, so stop reading
                    if (targetRow != -1)
                    {
                        break;
                    }
                }

                totalRows = reader.RowCountRead;
            }

            Console.WriteLine($"Done. {matches:n0} out of {totalRows:n0} rows matched.");
        }