예제 #1
0
        /// <summary>
        ///  Writes a three-column, three-row file whose rows are terminated three different ways
        ///  (writer default, a lone newline byte, and no terminator at all) and verifies the reader
        ///  returns every row with three columns and an unclipped last cell.
        /// </summary>
        /// <param name="buildWriter">Creates the writer under test over the supplied stream</param>
        /// <param name="buildReader">Creates the reader under test for a file path; this test passes 'true' as the second argument</param>
        public void Reader_NewlineVariations(Func <Stream, ITabularWriter> buildWriter, Func <string, bool, ITabularReader> buildReader)
        {
            string xsvPath = "NewlineVariations.xsv";
            Stream stream  = new FileStream(xsvPath, FileMode.Create, FileAccess.ReadWrite);

            using (ITabularWriter w = buildWriter(stream))
            {
                w.SetColumns(new string[] { "One", "Two", "Three" });

                for (int row = 0; row < 3; ++row)
                {
                    // Cells are 1..3, 4..6, 7..9 so each row's last value is distinct
                    w.Write(3 * row + 1);
                    w.Write(3 * row + 2);
                    w.Write(3 * row + 3);

                    // Capture where the row terminator will start, then let the writer emit it so it can be overridden below.
                    // NOTE(review): this relies on the writer having pushed the row's cells to the stream before Position is read - confirm.
                    long position = stream.Position;
                    w.NextRow();

                    if (row == 0)
                    {
                        // Row 0 - newline only: go back to the terminator start and write a single newline byte there.
                        // The stream is left positioned right after that byte, so the next row is written from there.
                        stream.Seek(position, SeekOrigin.Begin);
                        stream.WriteByte(UTF8.Newline);
                    }
                    else if (row == 2)
                    {
                        // Row 2 - no end of line: truncate the stream back to the end of the last cell
                        stream.SetLength(position);
                    }

                    // Row 1 keeps whatever terminator the writer emitted
                }
            }

            using (ITabularReader r = buildReader(xsvPath, true))
            {
                // Verify column heading not clipped even though no '\r'
                Assert.AreEqual("Three", r.Columns[2]);

                Assert.IsTrue(r.NextRow());
                Assert.AreEqual(3, r.CurrentRowColumns);

                // Verify last column doesn't have extra '\r' when terminated with '\r\n'
                Assert.AreEqual("3", r.Current(2).ToString());

                Assert.IsTrue(r.NextRow());
                Assert.AreEqual(3, r.CurrentRowColumns);

                // Verify last column not clipped when terminated with '\n'
                Assert.AreEqual("6", r.Current(2).ToString());

                Assert.IsTrue(r.NextRow());
                Assert.AreEqual(3, r.CurrentRowColumns);

                // Verify last column not clipped when unterminated [EOF]
                Assert.AreEqual("9", r.Current(2).ToString());

                // There must be no fourth row after the unterminated one
                Assert.IsFalse(r.NextRow(), "Reader didn't stop after last line without newline");
            }
        }
예제 #2
0
        /// <summary>
        ///  Writes the valid sample file, copies it cell by cell through a reader/writer pair,
        ///  and asserts that the copy's text equals the original's text.
        /// </summary>
        /// <param name="buildReader">Creates the reader under test for a file path; this test passes 'true' as the second argument</param>
        /// <param name="buildWriter">Creates the writer under test over the supplied stream</param>
        public void Reader_Roundtrip(Func<string, bool, ITabularReader> buildReader, Func<Stream, ITabularWriter> buildWriter)
        {
            const string sourcePath = "ValidSample.xsv";
            const string copyPath = sourcePath + ".new";

            // Produce the sample file (it contains values which require CSV escaping)
            WriteValidSample(new FileStream(sourcePath, FileMode.Create, FileAccess.ReadWrite), buildWriter);

            // Stream every cell from the reader straight into the writer, so each value is unescaped and then escaped again
            using (ITabularReader source = buildReader(sourcePath, true))
            using (ITabularWriter copy = buildWriter(new FileStream(copyPath, FileMode.Create, FileAccess.ReadWrite)))
            {
                copy.SetColumns(source.Columns);

                while (source.NextRow())
                {
                    int column = 0;
                    while (column < source.CurrentRowColumns)
                    {
                        copy.Write(source.Current(column).ToString8());
                        column++;
                    }

                    copy.NextRow();
                }
            }

            // The round-tripped file must be textually identical to the original
            Assert.AreEqual(File.ReadAllText(sourcePath), File.ReadAllText(copyPath));
        }
예제 #3
0
        /// <summary>
        ///  Copies only the named columns from the input file to the output file, in the order they were listed.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="columnsDelimited">Comma-separated column names; surrounding whitespace on each name is trimmed</param>
        private static void Copy(string inputFilePath, string outputFilePath, string columnsDelimited)
        {
            // Split and trim the requested column names
            string[] rawNames = columnsDelimited.Split(',');
            List<string> selectedColumns = new List<string>();
            for (int n = 0; n < rawNames.Length; ++n)
            {
                selectedColumns.Add(rawNames[n].Trim());
            }

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Resolve each requested name to its index in the input
                int[] sourceIndices = new int[selectedColumns.Count];
                int slot = 0;
                foreach (string selectedColumn in selectedColumns)
                {
                    sourceIndices[slot] = reader.ColumnIndex(selectedColumn);
                    slot++;
                }

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(selectedColumns);

                    while (reader.NextRow())
                    {
                        foreach (int sourceIndex in sourceIndices)
                        {
                            writer.Write(reader.Current(sourceIndex).ToString8());
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #4
0
        /// <summary>
        ///  Copies the input file to the output file, routing the cells of the listed columns
        ///  through WriteHtmlEscaped and writing all other cells unchanged.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="columnsDelimited">Comma-separated column identifiers; surrounding whitespace on each is trimmed</param>
        private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnsDelimited)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Resolve the identifiers once, up front. A set gives a constant-time membership test for
                // every cell, instead of a linear scan of a list in the innermost loop.
                HashSet<int> columnIndicesToEscape = new HashSet<int>(columnsDelimited.Split(',').Select((col) => reader.ColumnIndex(col.Trim())));

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        for (int i = 0; i < reader.CurrentRowColumns; ++i)
                        {
                            String8 cell = reader.Current(i).ToString8();

                            if (columnIndicesToEscape.Contains(i))
                            {
                                WriteHtmlEscaped(cell, writer);
                            }
                            else
                            {
                                writer.Write(cell);
                            }
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #5
0
        /// <summary>
        ///  Echoes every remaining row whose cell in result.ColumnIndex contains result.Value (cast to string),
        ///  incrementing result.MatchCount for each row echoed.
        /// </summary>
        /// <param name="reader">Source of rows; read until NextRow returns false</param>
        /// <param name="writer">Destination handed to EchoRow for each matching row</param>
        /// <param name="result">Supplies the column index and value to look for; receives the match count</param>
        private static void MatchContains(ITabularReader reader, ITabularWriter writer, WhereResult result)
        {
            // Convert the search text to a String8 once, outside the row loop
            string needleText = (string)result.Value;
            byte[] needleBuffer = new byte[String8.GetLength(needleText)];
            String8 needle = String8.Convert(needleText, needleBuffer);

            while (reader.NextRow())
            {
                // Rows too short to have the column are skipped; otherwise the cell must contain the value
                bool hasColumn = reader.CurrentRowColumns > result.ColumnIndex;
                if (hasColumn && reader.Current(result.ColumnIndex).ToString8().IndexOf(needle) != -1)
                {
                    result.MatchCount++;
                    EchoRow(reader, writer);
                }
            }
        }
예제 #6
0
        /// <summary>
        ///  Copies to the output file only those rows whose value-column cell does not start with
        ///  the same row's name-column cell.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="valueColumnIdentifier">Identifier of the column whose cell is tested</param>
        /// <param name="nameColumnIdentifier">Identifier of the column providing the prefix to test for</param>
        private static void NotStartsWith(string inputFilePath, string outputFilePath, string valueColumnIdentifier, string nameColumnIdentifier)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int valueColumn = reader.ColumnIndex(valueColumnIdentifier);
                int nameColumn = reader.ColumnIndex(nameColumnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        String8 prefix = reader.Current(nameColumn).ToString8();
                        String8 candidate = reader.Current(valueColumn).ToString8();

                        // Rows where the value does start with the name are dropped
                        if (candidate.StartsWith(prefix))
                        {
                            continue;
                        }

                        int column = 0;
                        while (column < reader.CurrentRowColumns)
                        {
                            writer.Write(reader.Current(column).ToString8());
                            column++;
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #7
0
        /// <summary>
        ///  Collects the distinct values of one column from two files and writes the values found in only
        ///  one of them: "-" rows for values only in the old file, "+" rows for values only in the new file.
        ///  Counts are reported through Trace.
        /// </summary>
        /// <param name="oldFilePath">Path of the first ("old") file</param>
        /// <param name="newFilePath">Path of the second ("new") file</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter for the difference output</param>
        /// <param name="columnIdentifier">Identifier of the column to compare; also used as the second output column name</param>
        private static void Compare(string oldFilePath, string newFilePath, string outputFilePath, string columnIdentifier)
        {
            // Copies of the cell values are kept in one shared block so they outlive the readers
            String8Block copies = new String8Block();
            HashSet<String8> valuesInOld = new HashSet<String8>();
            HashSet<String8> valuesInNew = new HashSet<String8>();

            using (ITabularReader oldReader = TabularFactory.BuildReader(oldFilePath))
            {
                int column = oldReader.ColumnIndex(columnIdentifier);

                while (oldReader.NextRow())
                {
                    valuesInOld.Add(copies.GetCopy(oldReader.Current(column)));
                }

                Trace.WriteLine(String.Format("Old: {0:n0} values for \"{1}\" in {2:n0} rows.", valuesInOld.Count, columnIdentifier, oldReader.RowCountRead));
            }

            using (ITabularReader newReader = TabularFactory.BuildReader(newFilePath))
            {
                int column = newReader.ColumnIndex(columnIdentifier);

                while (newReader.NextRow())
                {
                    valuesInNew.Add(copies.GetCopy(newReader.Current(column)));
                }

                Trace.WriteLine(String.Format("New: {0:n0} values for \"{1}\" in {2:n0} rows.", valuesInNew.Count, columnIdentifier, newReader.RowCountRead));
            }

            // Set differences in each direction
            HashSet<String8> onlyInOld = new HashSet<String8>(valuesInOld);
            onlyInOld.ExceptWith(valuesInNew);

            HashSet<String8> onlyInNew = new HashSet<String8>(valuesInNew);
            onlyInNew.ExceptWith(valuesInOld);

            Trace.WriteLine(String.Format("{0:n0} values were only in \"{1}\".\r\n{2:n0} values were only in \"{3}\".", onlyInOld.Count, oldFilePath, onlyInNew.Count, newFilePath));

            String8 removedMarker = String8.Convert("-", new byte[1]);
            String8 addedMarker = String8.Convert("+", new byte[1]);

            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(new string[] { "In", columnIdentifier });

                foreach (String8 removed in onlyInOld)
                {
                    writer.Write(removedMarker);
                    writer.Write(removed);
                    writer.NextRow();
                }

                foreach (String8 added in onlyInNew)
                {
                    writer.Write(addedMarker);
                    writer.Write(added);
                    writer.NextRow();
                }
            }
        }
예제 #8
0
        /// <summary>
        ///  Copies the input file to the output file, routing the cells of one column
        ///  through WriteHtmlEscaped and writing all other cells unchanged.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="columnIdentifier">Identifier of the column to pass through WriteHtmlEscaped</param>
        private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnIdentifier)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int escapedColumn = reader.ColumnIndex(columnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        int column = 0;
                        while (column < reader.CurrentRowColumns)
                        {
                            String8 cell = reader.Current(column).ToString8();

                            if (column != escapedColumn)
                            {
                                writer.Write(cell);
                            }
                            else
                            {
                                WriteHtmlEscaped(cell, writer);
                            }

                            column++;
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #9
0
        /// <summary>
        ///  Echoes every remaining row whose cell in result.ColumnIndex converts to a boolean and satisfies
        ///  result.Op when compared to result.Value (cast to bool), incrementing result.MatchCount per row echoed.
        /// </summary>
        /// <param name="reader">Source of rows; read until NextRow returns false</param>
        /// <param name="writer">Destination handed to EchoRow for each matching row</param>
        /// <param name="result">Supplies the column index, operator and value; receives the match count</param>
        private static void MatchBoolCompare(ITabularReader reader, ITabularWriter writer, WhereResult result)
        {
            bool expected = (bool)result.Value;

            while (reader.NextRow())
            {
                // Rows too short to contain the column cannot match
                if (reader.CurrentRowColumns > result.ColumnIndex)
                {
                    // The cell must convert to a boolean and the comparison must satisfy the operator
                    bool actual;
                    if (reader.Current(result.ColumnIndex).ToString8().TryToBoolean(out actual)
                        && result.Op.Matches(actual.CompareTo(expected)))
                    {
                        result.MatchCount++;
                        EchoRow(reader, writer);
                    }
                }
            }
        }
예제 #10
0
        /// <summary>
        ///  Copies the input file to the output file with a leading "ID" column holding a
        ///  sequential integer per row, starting at <paramref name="firstId"/>.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="firstId">ID written for the first row; each following row gets the next integer</param>
        private static void RowId(string inputFilePath, string outputFilePath, int firstId = 1)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                // Output columns are "ID" followed by every input column
                List<string> outputColumns = new List<string> { "ID" };
                outputColumns.AddRange(reader.Columns);
                writer.SetColumns(outputColumns);

                int nextId = firstId;
                while (reader.NextRow())
                {
                    writer.Write(nextId);
                    nextId += 1;

                    int column = 0;
                    while (column < reader.CurrentRowColumns)
                    {
                        writer.Write(reader.Current(column).ToString8());
                        column++;
                    }

                    writer.NextRow();
                }

                WriteSizeSummary(reader, writer);
            }
        }
예제 #11
0
        /// <summary>
        ///  Writes a single-column output file containing each distinct value of one input column,
        ///  in order of first appearance.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="columnIdentifier">Identifier of the column whose distinct values are written</param>
        private static void Distinct(string inputFilePath, string outputFilePath, string columnIdentifier)
        {
            // Values already written; copies are held in the block so they stay valid across rows
            String8Block copies = new String8Block();
            HashSet<String8> seen = new HashSet<String8>();

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int column = reader.ColumnIndex(columnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(new string[] { reader.Columns[column] });

                    while (reader.NextRow())
                    {
                        String8 candidate = reader.Current(column).ToString8();

                        if (seen.Contains(candidate))
                        {
                            continue;
                        }

                        // First time this value appears: remember a copy and emit it
                        seen.Add(copies.GetCopy(candidate));
                        writer.Write(candidate);
                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #12
0
        /// <summary>
        ///  Copies the input file to the output file, stopping early once the writer's
        ///  RowCountWritten equals <paramref name="rowLimit"/>.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="rowLimit">Written-row count at which copying stops; defaults to -1</param>
        private static void Copy(string inputFilePath, string outputFilePath, int rowLimit = -1)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(reader.Columns);

                bool hasRow = reader.NextRow();
                while (hasRow)
                {
                    int column = 0;
                    while (column < reader.CurrentRowColumns)
                    {
                        writer.Write(reader.Current(column).ToString8());
                        column++;
                    }

                    writer.NextRow();

                    // Only ask the reader for another row if the limit has not just been reached
                    hasRow = writer.RowCountWritten != rowLimit && reader.NextRow();
                }

                WriteSizeSummary(reader, writer);
            }
        }
예제 #13
0
        /// <summary>
        ///  Merges consecutive rows which share the same first-column value into a single output row.
        ///  For each other column, values which differ from the previous row's value in that column
        ///  are joined onto the combined value using <paramref name="delimiter"/>.
        /// </summary>
        /// <param name="inputFilePath">Path passed to TabularFactory.BuildReader</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="delimiter">Passed to String8Block.Concatenate between the combined value so far and each added value</param>
        private static void Concatenate(string inputFilePath, string outputFilePath, String8 delimiter)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    // NOTE(review): CurrentRowColumns is read here before the first NextRow() call;
                    // presumably it reflects the header's column count at this point - confirm.
                    String8Block block          = new String8Block();
                    String8[]    lastValues     = new String8[reader.CurrentRowColumns];
                    String8[]    combinedValues = new String8[reader.CurrentRowColumns];

                    while (reader.NextRow())
                    {
                        String8 firstColumn = reader.Current(0).ToString8();

                        // NOTE(review): the first data row is detected as RowCountRead == 2,
                        // which presumably means the header was counted as row 1 - confirm.
                        if (reader.RowCountRead == 2)
                        {
                            // First Row - Get the first ID only
                            combinedValues[0] = block.GetCopy(firstColumn);
                        }
                        else if (firstColumn.CompareTo(combinedValues[0], true) != 0)
                        {
                            // If we have a new ID (and not first row)
                            // NOTE(review): the 'true' argument presumably requests a case-insensitive comparison - confirm.

                            // Write concatenated values for previous ID
                            WriteCombinedRow(writer, combinedValues);

                            // Reset for this ID
                            block.Clear();
                            combinedValues[0] = block.GetCopy(firstColumn);

                            for (int i = 1; i < combinedValues.Length; ++i)
                            {
                                combinedValues[i] = String8.Empty;
                            }

                            // lastValues is not reset here, so a value equal to the previous ID's
                            // last value in the same column is not added for the new ID.
                        }

                        // Concatenate non-duplicate values to "row in progress"
                        // ("duplicate" means equal to the value stored from the previous row in the same column)
                        for (int i = 1; i < reader.CurrentRowColumns; ++i)
                        {
                            String8 value = reader.Current(i).ToString8();

                            if (lastValues[i] != value)
                            {
                                // NOTE(review): 'value' is stored without block.GetCopy(); if the reader reuses its
                                // buffer between rows, the stored String8 may no longer hold the previous row's text - confirm.
                                lastValues[i]     = value;
                                combinedValues[i] = block.Concatenate(combinedValues[i], delimiter, value);
                            }
                        }
                    }

                    // After last row, write out values so far
                    // (this runs even when the loop above read no rows)
                    WriteCombinedRow(writer, combinedValues);
                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #14
0
        /// <summary>
        ///  Lazily reads rows from the reader into a DataBlock, yielding it each time BatchSize rows
        ///  have been filled and once more at the end if a partial batch remains.
        ///  The same DataBlock instance and the same Value arrays are reused for every batch.
        /// </summary>
        /// <param name="reader">Source of rows; read until NextRow returns false</param>
        /// <param name="columnNames">Names of the columns to read, in the order they appear in the DataBlock</param>
        /// <returns>A sequence which yields the shared DataBlock once per batch</returns>
        private static IEnumerable <DataBlock> ReadAsDataBlockBatch(ITabularReader reader, IList <string> columnNames)
        {
            // Build a DataBlock to hold a batch of rows
            int       columnCount = columnNames.Count;
            DataBlock result      = new DataBlock(columnNames, BatchSize);

            // One Value array of BatchSize entries per column, pre-filled so Assign() can be called on each entry
            Value[][] columnArrays = new Value[columnCount][];
            for (int i = 0; i < columnCount; ++i)
            {
                columnArrays[i] = new Value[BatchSize];
                for (int j = 0; j < BatchSize; ++j)
                {
                    columnArrays[i][j] = Value.Create(null);
                }

                result.SetColumn(i, columnArrays[i]);
            }

            // Look up indices of the columns
            int[] columnIndices = new int[columnCount];
            for (int i = 0; i < columnCount; ++i)
            {
                columnIndices[i] = reader.ColumnIndex(columnNames[i]);
            }

            // Fill blocks with rows as we go
            int          currentRowCount = 0;
            String8Block block           = new String8Block();

            while (reader.NextRow())
            {
                for (int i = 0; i < columnCount; ++i)
                {
                    // Copy the cell into the block, then point the Value at the copied bytes
                    String8 cell = block.GetCopy(reader.Current(columnIndices[i]).ToString8());
                    columnArrays[i][currentRowCount].Assign(new ByteBlock(cell.Array, cell.Index, cell.Length));
                    //columnArrays[i][currentRowCount].Assign(cell.ToString());
                }

                currentRowCount++;

                if (currentRowCount == BatchSize)
                {
                    yield return(result);

                    // Execution resumes here when the caller asks for the next batch;
                    // the block holding the previous batch's copies is cleared before refilling.
                    currentRowCount = 0;
                    block.Clear();
                }
            }

            if (currentRowCount > 0)
            {
                // Final partial batch: entries at index currentRowCount and beyond still hold whatever was assigned earlier.
                // NOTE(review): nothing visible here tells the consumer how many rows of this last block are valid - confirm how callers handle it.
                yield return(result);
            }
        }
예제 #15
0
        /// <summary>
        ///  Writes every remaining row of the reader to the writer, cell by cell.
        /// </summary>
        /// <param name="reader">Source of rows; read until NextRow returns false</param>
        /// <param name="writer">Destination; receives each cell followed by NextRow per input row</param>
        private static void CopyRows(ITabularReader reader, ITabularWriter writer)
        {
            while (reader.NextRow())
            {
                int column = 0;
                while (column < reader.CurrentRowColumns)
                {
                    writer.Write(reader.Current(column).ToString8());
                    column++;
                }

                writer.NextRow();
            }
        }
예제 #16
0
        /// <summary>
        ///  Runs the 'sanitize' command end to end against the embedded sample source and spec:
        ///  checks the exit code without a key, checks the columns and mapped values of a successful run,
        ///  and checks that a different key produces different output.
        /// </summary>
        public void Sanitize_EndToEnd()
        {
            Assembly xsvTest = Assembly.GetExecutingAssembly();

            // Extract the embedded sample input and spec next to the test
            Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.csv", "SanitizeSampleSource.csv", xsvTest);
            Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.sanispec", "SanitizeSampleSource.sanispec", xsvTest);

            // Verify UsageException if no key is passed
            Assert.AreEqual(-2, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec" }));

            // Verify success for base sanitize
            File.Delete("SanitizeOutput.csv");
            Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec", "Key1" }));

            // Validate the result
            using (ITabularReader r = TabularFactory.BuildReader("SanitizeOutput.csv"))
            {
                Assert.IsTrue(r.Columns.Contains("ID"), "ID column is kept (no spec line)");
                Assert.IsTrue(r.Columns.Contains("Path"), "Path column is kept (mapped)");
                Assert.IsTrue(r.Columns.Contains("IsEmptyPath"), "IsEmptyPath is kept (Keep line)");
                Assert.IsFalse(r.Columns.Contains("IsUnderXsv"), "IsUnderXsv column is dropped (Drop line)");

                int idColumnIndex          = r.ColumnIndex("ID");
                int pathColumnIndex        = r.ColumnIndex("Path");
                int isEmptyPathColumnIndex = r.ColumnIndex("IsEmptyPath");

                while (r.NextRow())
                {
                    int    id   = r.Current(idColumnIndex).ToInteger();
                    string path = r.Current(pathColumnIndex).ToString();

                    Assert.AreEqual(r.Current(isEmptyPathColumnIndex).ToBoolean(), String.IsNullOrEmpty(path), "IsEmptyPath condition matches whether mapped path is empty");

                    if (id == 5)
                    {
                        Assert.AreEqual("Elfie", path, "'Elfie' is echoed (Echo in spec)");
                    }
                    else if (!String.IsNullOrEmpty(path))
                    {
                        // Ordinal comparison: the mapped prefix is an exact token, so the check must not vary with the current culture
                        Assert.IsTrue(path.StartsWith("WarmBeggedTruth\\", StringComparison.Ordinal), "Verify path is mapped in parts, and 'Elfie' is consistently mapped.");
                    }
                }

                Assert.IsTrue(r.RowCountRead < 7, "Verify sample excluded at least one row.");
            }

            // Run with another key
            Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput2.csv", @"SanitizeSampleSource.sanispec", "Key2" }));

            // Verify mappings are different
            Assert.AreNotEqual(File.ReadAllText("SanitizeOutput2.csv"), File.ReadAllText("SanitizeOutput.csv"));
        }
예제 #17
0
        /// <summary>
        ///  Reads the input file and writes a sanitized copy to the output file, applying this
        ///  Sanitizer's per-column handlers and, when a sample column is configured, row sampling.
        /// </summary>
        /// <param name="inputFile">File Path to input file</param>
        /// <param name="outputFile">File Path to output file</param>
        public void Sanitize(string inputFile, string outputFile)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
            {
                // One handler slot per input column (null where the column has no output), plus the output column list
                List<string> outputColumns;
                IColumnHandler[] handlerByColumn = GetHandlersByColumnIndex(reader.Columns, out outputColumns);

                // Locate the sample column, if one is configured
                int sampleColumnIndex = -1;
                if (!String.IsNullOrEmpty(this.SampleColumnName))
                {
                    sampleColumnIndex = reader.ColumnIndex(this.SampleColumnName);
                }

                // Rows whose sample-value hash is above this cutoff are left out
                uint sampleInclusionCutoff = (uint)(uint.MaxValue * this.SampleProbability);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
                {
                    writer.SetColumns(outputColumns);

                    while (reader.NextRow())
                    {
                        // Sample *without* the hashkey, so the same rows are consistently included or excluded.
                        if (sampleColumnIndex > -1
                            && Hashing.Hash(reader.Current(sampleColumnIndex).ToString8(), 0) > sampleInclusionCutoff)
                        {
                            continue;
                        }

                        // Pass each input cell to its handler; columns without a handler produce no output
                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            IColumnHandler handler = handlerByColumn[column];
                            if (handler == null)
                            {
                                continue;
                            }

                            writer.Write(handler.Sanitize(reader.Current(column).ToString8()));
                        }

                        writer.NextRow();
                    }
                }
            }
        }
예제 #18
0
        /// <summary>
        ///  Builds a TableMetadata for the table at <paramref name="tableRootPath"/> by reading its schema file,
        ///  its metadata file, and its saved query text through the stream provider.
        /// </summary>
        /// <param name="streamProvider">Used to open the schema and metadata files and to read the query text</param>
        /// <param name="tableRootPath">Folder which the schema, metadata and query paths are combined with</param>
        /// <returns>The populated TableMetadata</returns>
        /// <exception cref="NotImplementedException">A metadata row has a Name other than "RowCount"</exception>
        private static TableMetadata Build(IStreamProvider streamProvider, string tableRootPath)
        {
            TableMetadata result = new TableMetadata();

            // Schema: one column per row, described by its Name and Type
            string schemaPath = Path.Combine(tableRootPath, SchemaFileName);
            using (ITabularReader schemaReader = TabularFactory.BuildReader(streamProvider.OpenRead(schemaPath), SchemaFileName))
            {
                int nameColumn = schemaReader.ColumnIndex("Name");
                int typeColumn = schemaReader.ColumnIndex("Type");

                while (schemaReader.NextRow())
                {
                    result.Schema.Add(
                        new ColumnDetails(
                            schemaReader.Current(nameColumn).ToString(),
                            TypeProviderFactory.Get(schemaReader.Current(typeColumn).ToString()).Type));
                }
            }

            // Metadata: Name / Context / Value rows; only "RowCount" is understood
            string metadataPath = Path.Combine(tableRootPath, MetadataFileName);
            using (ITabularReader metadataReader = TabularFactory.BuildReader(streamProvider.OpenRead(metadataPath), MetadataFileName))
            {
                int nameColumn = metadataReader.ColumnIndex("Name");
                int contextColumn = metadataReader.ColumnIndex("Context");
                int valueColumn = metadataReader.ColumnIndex("Value");

                while (metadataReader.NextRow())
                {
                    String8 entryName = metadataReader.Current(nameColumn).ToString8();
                    String8 entryContext = metadataReader.Current(contextColumn).ToString8();
                    ITabularValue entryValue = metadataReader.Current(valueColumn);

                    if (!entryName.Equals("RowCount"))
                    {
                        throw new NotImplementedException($"TableMetadataSerializer.Read doesn't know how to read Metadata '{entryName}'");
                    }

                    result.RowCount = entryValue.ToInteger();
                }
            }

            result.Query = streamProvider.ReadAllText(Path.Combine(tableRootPath, ConfigQueryPath));

            return result;
        }
예제 #19
0
        /// <summary>
        ///  Reads up to <paramref name="desiredCount"/> rows from the underlying reader, copies each cell
        ///  into the per-column cell arrays, and hands those arrays to the columns.
        /// </summary>
        /// <param name="desiredCount">Maximum number of rows to read; when zero or negative no rows are read</param>
        /// <param name="cancellationToken">When cancellation has been requested, nothing is read and zero is returned</param>
        /// <returns>The number of rows read, which is also stored in CurrentRowCount</returns>
        public int Next(int desiredCount, CancellationToken cancellationToken)
        {
            // Stop reading on cancellation
            if (cancellationToken.IsCancellationRequested)
            {
                return 0;
            }

            // Grow the per-column cell arrays if they cannot hold the requested number of rows
            if (_cells[0] == null || _cells[0].Length < desiredCount)
            {
                for (int i = 0; i < _cells.Length; ++i)
                {
                    Allocator.AllocateToSize(ref _cells[i], desiredCount);
                }
            }

            // Copies from the previous call are discarded before refilling
            _block.Clear();
            CurrentRowCount = 0;

            // Check the bound before asking the reader for a row, so that a zero or negative
            // desiredCount neither consumes a row nor writes past the end of the cell arrays.
            while (CurrentRowCount < desiredCount && _reader.NextRow())
            {
                for (int i = 0; i < _cells.Length; ++i)
                {
                    _cells[i][CurrentRowCount] = _block.GetCopy(_reader.Current(i).ToString8());
                }

                CurrentRowCount++;
            }

            for (int i = 0; i < _columns.Length; ++i)
            {
                _columns[i].SetValues(_cells[i]);
            }

            return CurrentRowCount;
        }
예제 #20
0
        /// <summary>
        ///  Copies to the output file only those input rows whose value in the given column
        ///  also appears in that column of a second file.
        /// </summary>
        /// <param name="inputFilePath">Path of the file whose rows are filtered</param>
        /// <param name="outputFilePath">Path passed to TabularFactory.BuildWriter</param>
        /// <param name="onlyInInputFilePath">Path of the file providing the set of allowed values</param>
        /// <param name="onlyInColumnIdentifier">Identifier of the column, looked up in both files</param>
        private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
        {
            String8Block copies = new String8Block();
            HashSet<String8> allowedValues = new HashSet<String8>();

            // Gather the allowed values from 'onlyInInputFilePath'
            using (ITabularReader filterReader = TabularFactory.BuildReader(onlyInInputFilePath))
            {
                int filterColumn = filterReader.ColumnIndex(onlyInColumnIdentifier);

                while (filterReader.NextRow())
                {
                    allowedValues.Add(copies.GetCopy(filterReader.Current(filterColumn)));
                }
            }

            // Copy input rows whose column value is one of the allowed values
            using (ITabularReader inputReader = TabularFactory.BuildReader(inputFilePath))
            {
                int testedColumn = inputReader.ColumnIndex(onlyInColumnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(inputReader.Columns);

                    while (inputReader.NextRow())
                    {
                        if (!allowedValues.Contains(inputReader.Current(testedColumn).ToString8()))
                        {
                            continue;
                        }

                        int column = 0;
                        while (column < inputReader.CurrentRowColumns)
                        {
                            writer.Write(inputReader.Current(column).ToString8());
                            column++;
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(inputReader, writer);
                }
            }
        }
예제 #21
0
        /// <summary>
        ///  Reads the sample file 100 times with the reader under test, converting three columns per row,
        ///  and passes the total bytes processed to Verify.PerformanceByBytes.
        /// </summary>
        /// <param name="sampleFilePath">Path of the file to read; its length is used for the byte count</param>
        /// <param name="buildReader">Creates the reader under test for a file path; this test passes 'true' as the second argument</param>
        public void Reader_Performance(string sampleFilePath, Func<string, bool, ITabularReader> buildReader)
        {
            const int PassCount = 100;

            long totalRowsRead = 0;
            long fileLengthBytes = new FileInfo(sampleFilePath).Length;

            // Goal: 100MB/sec [Surface Book i7]
            Verify.PerformanceByBytes(50 * LongExtensions.Megabyte, () =>
            {
                for (int pass = 0; pass < PassCount; ++pass)
                {
                    using (ITabularReader r = buildReader(sampleFilePath, true))
                    {
                        int lineNumberColumn = r.ColumnIndex("LineNumber");
                        int countColumn = r.ColumnIndex("Count");
                        int descriptionColumn = r.ColumnIndex("Description");

                        while (r.NextRow())
                        {
                            totalRowsRead++;

                            // Rows with fewer than two columns are counted but not converted
                            if (r.CurrentRowColumns >= 2)
                            {
                                int lineNumber;
                                r.Current(lineNumberColumn).TryToInteger(out lineNumber);

                                int count;
                                r.Current(countColumn).TryToInteger(out count);

                                String8 description = r.Current(descriptionColumn).ToString8();
                            }
                        }
                    }
                }

                return PassCount * fileLengthBytes;
            });
        }
예제 #22
0
        /// <summary>
        ///  Merge all files in 'inputFolderPath' into 'outputFilePath', keeping only the last
        ///  row seen for each distinct ID. Files are processed in the order Directory.GetFiles
        ///  returns them; a later file/row overrides earlier rows with the same ID.
        ///  Output columns are those of the last input file read.
        /// </summary>
        /// <param name="inputFolderPath">Folder containing the input files</param>
        /// <param name="outputFilePath">File to write the de-duplicated rows to</param>
        /// <param name="idColumnIdentifier">Column holding the ID; must resolve (via ColumnIndex) in every input file</param>
        private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
        {
            String8Block block = new String8Block();
            Dictionary<String8, Tuple<string, int>> latestFileAndRowByID = new Dictionary<String8, Tuple<string, int>>();
            IReadOnlyList<string> writerColumns = null;

            // List the input files once, so both passes see exactly the same set. Re-listing after the
            // writer is created could pick up the output file (or other new files) whose IDs were never scanned.
            string[] inputFilePaths = Directory.GetFiles(inputFolderPath);

            // Walk the input files to figure out the latest copy of each ID
            Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");
            int rowCountRead = 0;

            foreach (string inputFilePath in inputFilePaths)
            {
                using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                {
                    int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                    while (reader.NextRow())
                    {
                        rowCountRead++;

                        // NOTE(review): the result of ToUpperInvariant is discarded, so it presumably
                        // uppercases the value in place to make ID matching case-insensitive - confirm.
                        String8 id = reader.Current(idColumnIndex).ToString8();
                        id.ToUpperInvariant();

                        // Record the file and row containing this ID, overwriting previous entries.
                        // The key is a copy from 'block' rather than the reader's own value.
                        latestFileAndRowByID[block.GetCopy(id)] = new Tuple<string, int>(inputFilePath, reader.RowCountRead);
                    }

                    // Capture the columns from the last file read; they become the output columns
                    writerColumns = reader.Columns;
                }
            }

            Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(writerColumns);
                int[] writerColumnIndexInReader = new int[writerColumns.Count];

                foreach (string inputFilePath in inputFilePaths)
                {
                    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                    {
                        // Look up each output column's position in this input file.
                        // The range check below treats a negative index as "column not in this file".
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]);
                        }

                        int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                        while (reader.NextRow())
                        {
                            String8 id = reader.Current(idColumnIndex).ToString8();
                            id.ToUpperInvariant();

                            // Copy this row to the output file, *if* it's the latest for this ID
                            Tuple<string, int> latestForID = latestFileAndRowByID[id];
                            if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                            {
                                for (int i = 0; i < writerColumns.Count; ++i)
                                {
                                    int readerColumnIndex = writerColumnIndexInReader[i];
                                    if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                                    {
                                        // Read through the mapping: the writer's column 'i' may sit at a
                                        // different position in this particular input file.
                                        writer.Write(reader.Current(readerColumnIndex).ToString8());
                                    }
                                    else
                                    {
                                        // Column missing from this file (or this short row): emit an empty cell
                                        writer.Write(String8.Empty);
                                    }
                                }

                                writer.NextRow();
                            }
                        }
                    }
                }

                WriteSizeSummary(null, writer);
            }
        }
예제 #23
0
        /// <summary>
        ///  Benchmark comparing several ways of walking a generated sample TSV held in memory:
        ///  raw stream reads, StreamReader.ReadLine + Split, the Elfie tabular reader, and the
        ///  XForm native SplitTsv routine (block-at-a-time and parallel over the whole buffer).
        /// </summary>
        public void TsvSplit()
        {
            // In-memory sample; substitute new FileStream("Sample.tsv", FileMode.Create) to inspect the data on disk.
            Stream tsvStream = new MemoryStream();
            int rowCount = 1000 * 1000;

            WriteSampleTsv(tsvStream, 5, 1000 * 1000);

            // Scratch buffers for the block-at-a-time measurements
            byte[] buffer = new byte[64 * 1024];
            BitVector cellBits = new BitVector(buffer.Length);
            BitVector rowBits = new BitVector(buffer.Length);
            int[] rowEnds = new int[1024];

            // Whole-file copy and matching vectors for the parallel measurement
            byte[] allContent = new byte[tsvStream.Length];
            tsvStream.Seek(0, SeekOrigin.Begin);
            tsvStream.Read(allContent, 0, allContent.Length);
            BitVector allCellBits = new BitVector(allContent.Length);
            BitVector allRowBits = new BitVector(allContent.Length);

            int streamLength = (int)tsvStream.Length;

            using (Benchmarker b = new Benchmarker($"Tsv Parse [{rowCount:n0}] | count", DefaultMeasureMilliseconds))
            {
                // Baseline: pull the stream through the buffer without looking at the bytes
                b.Measure("Read only", streamLength, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);

                    while (tsvStream.Read(buffer, 0, buffer.Length) != 0)
                    {
                    }

                    return rowCount;
                });

                b.Measure("ReadLine | Split", streamLength, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);

                    // Not disposed: disposing the StreamReader would also close tsvStream, which later measurements reuse
                    StreamReader lineReader = new StreamReader(tsvStream);
                    int linesSplit = 0;

                    // Header row
                    lineReader.ReadLine();

                    while (!lineReader.EndOfStream)
                    {
                        // The split result is unused; splitting is the work being timed
                        string[] cellSet = lineReader.ReadLine().Split('\t');
                        linesSplit++;
                    }

                    return linesSplit;
                });

                b.Measure("Elfie TsvReader", streamLength, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);

                    ITabularReader tabularReader = TabularFactory.BuildReader(tsvStream, "Unused.tsv");
                    int rowsSeen = 0;

                    while (tabularReader.NextRow())
                    {
                        rowsSeen++;
                    }

                    return rowsSeen;
                });


                Func<byte[], int, int, ulong[], ulong[], int> splitTsvN = NativeAccelerator.GetMethod<Func<byte[], int, int, ulong[], ulong[], int>>("XForm.Native.String8N", "SplitTsv");
                b.Measure("XForm Native Split", streamLength, () =>
                {
                    tsvStream.Seek(0, SeekOrigin.Begin);

                    // Starts at -1 (presumably to leave the header line out of the total - confirm)
                    int total = -1;

                    for (int lengthRead = tsvStream.Read(buffer, 0, buffer.Length);
                         lengthRead != 0;
                         lengthRead = tsvStream.Read(buffer, 0, buffer.Length))
                    {
                        // Zero the tail of a partially filled buffer so stale bytes from the previous block are gone
                        if (lengthRead < buffer.Length)
                        {
                            Array.Clear(buffer, lengthRead, buffer.Length - lengthRead);
                        }

                        total += splitTsvN(buffer, 0, lengthRead, cellBits.Array, rowBits.Array);

                        // Page the cell bits into rowEnds; the result is unused, the call is part of the timed work
                        int fromRow = 0;
                        int countCopy = cellBits.Page(rowEnds, ref fromRow);
                    }

                    return total;
                });

                b.MeasureParallel("XForm Native Split Parallel", streamLength, (index, length) =>
                    splitTsvN(allContent, index, length, allCellBits.Array, allRowBits.Array) - 1);
            }
        }
예제 #24
0
        /// <summary>
        ///  Copy 'inputFilePath' to 'outputFilePath', replacing the two named columns with a single
        ///  column 'outputColumnName' containing "value1 + separator + value2". The combined column
        ///  appears where the first of the two source columns was; all other columns pass through.
        /// </summary>
        /// <param name="inputFilePath">File to read</param>
        /// <param name="outputFilePath">File to write</param>
        /// <param name="columnName1">Column providing the first part of the combined value</param>
        /// <param name="separator">Text written between the two parts</param>
        /// <param name="columnName2">Column providing the second part of the combined value</param>
        /// <param name="outputColumnName">Name of the combined column in the output</param>
        private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
        {
            String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Find the columns to concatenate
                int columnIndex1 = reader.ColumnIndex(columnName1);
                int columnIndex2 = reader.ColumnIndex(columnName2);

                // Build an output column list and mapping from output order to input index, with '-1' for the concatenated value.
                // The mapping grows alongside the column list, so it is correctly sized whether the two
                // identifiers name distinct columns (one fewer output column) or the same column (same count).
                // A fixed 'Columns.Count - 1' array overflowed in the same-column case.
                List<string> outputColumns = new List<string>();
                List<int> indexMapping = new List<int>();
                bool hasConcatenatedColumn = false;

                for (int i = 0; i < reader.Columns.Count; ++i)
                {
                    string columnName = reader.Columns[i];

                    // If this is a column to concatenate...
                    if (columnName.Equals(reader.Columns[columnIndex1], StringComparison.OrdinalIgnoreCase) ||
                        columnName.Equals(reader.Columns[columnIndex2], StringComparison.OrdinalIgnoreCase))
                    {
                        // .. if it's the first one, the output column will appear at this position
                        if (!hasConcatenatedColumn)
                        {
                            hasConcatenatedColumn = true;

                            indexMapping.Add(-1);
                            outputColumns.Add(outputColumnName);
                        }
                    }
                    else
                    {
                        // Otherwise, copy this column through
                        indexMapping.Add(i);
                        outputColumns.Add(columnName);
                    }
                }

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(outputColumns);

                    while (reader.NextRow())
                    {
                        // Write columns in mapped order
                        for (int i = 0; i < indexMapping.Count; ++i)
                        {
                            int sourceColumnIndex = indexMapping[i];

                            if (sourceColumnIndex == -1)
                            {
                                // Write concatenated column as one value assembled from three parts
                                writer.WriteValueStart();
                                writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                                writer.WriteValuePart(separator8);
                                writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                                writer.WriteValueEnd();
                            }
                            else
                            {
                                writer.Write(reader.Current(sourceColumnIndex).ToString8());
                            }
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
예제 #25
0
        /// <summary>
        ///  Core reader checks for any ITabularReader implementation: missing file, empty file,
        ///  reading with and without a header row, column lookup by name, and per-row value
        ///  access across the sample file (including its empty rows and its one oversized row).
        /// </summary>
        /// <param name="sampleFilePath">Sample file whose first column is LineNumber and third is Description</param>
        /// <param name="buildReader">Factory taking (file path, bool); the tests below show 'true' makes the reader consume a header row</param>
        public void Reader_Basics(string sampleFilePath, Func<string, bool, ITabularReader> buildReader)
        {
            const string emptyFilePath = "Empty.xsv";

            // A missing file must fail on construction
            Verify.Exception<FileNotFoundException>(() => buildReader("NonExistantFile.xsv", false));

            // Create a zero-length file
            File.WriteAllText(emptyFilePath, "");

            // With headers requested, an empty file throws on construction
            Verify.Exception<IOException>(() => buildReader(emptyFilePath, true));

            // Without headers, an empty file simply has no rows
            using (ITabularReader emptyReader = buildReader(emptyFilePath, false))
            {
                Assert.IsFalse(emptyReader.NextRow());
            }

            // When headers are not requested, the header line comes back as the first data row
            using (ITabularReader rawReader = buildReader(sampleFilePath, false))
            {
                Assert.IsTrue(rawReader.NextRow());
                Assert.AreEqual("LineNumber", rawReader.Current(0).ToString());

                // No header row was read, so column lookups by name fail
                Verify.Exception<ColumnNotFoundException>(() => rawReader.ColumnIndex("Missing"));
            }

            // Open the sample the 'expected' way, with headers
            using (ITabularReader r = buildReader(sampleFilePath, true))
            {
                // Exact-case column name
                int lineNumberColumnIndex = r.ColumnIndex("LineNumber");
                Assert.AreEqual(0, lineNumberColumnIndex, "LineNumber column not expected");

                // Different-case column name still resolves
                int descriptionColumnIndex = r.ColumnIndex("deSCRiption");
                Assert.AreEqual(2, descriptionColumnIndex, "Description column not expected");

                // Unknown column name throws
                Verify.Exception<ColumnNotFoundException>(() => r.ColumnIndex("UnknownColumn"));

                while (r.NextRow())
                {
                    int rowIndex = r.RowCountRead;
                    bool isEmptyRow = (rowIndex % 100 == 99);

                    if (isEmptyRow)
                    {
                        // Empty rows report zero columns and throw on value access
                        Assert.AreEqual(0, r.CurrentRowColumns, "Expected column count 0 in empty rows");
                        Verify.Exception<ArgumentOutOfRangeException>(() => { var v = r.Current(lineNumberColumnIndex); });
                        continue;
                    }

                    if (rowIndex == 5000)
                    {
                        // Row with a value over 64k [block resizing logic, row values look right]
                        String8 longDescription = r.Current(descriptionColumnIndex).ToString8();
                        Assert.AreEqual(100000, longDescription.Length);
                        continue;
                    }

                    // Ordinary row: the LineNumber cell must parse and equal the row number
                    String8 lineNumber8 = r.Current(lineNumberColumnIndex).ToString8();
                    int lineNumber = 0;
                    if (!lineNumber8.TryToInteger(out lineNumber))
                    {
                        Assert.Fail(String.Format("\"{0}\" was not converted to an integer.", lineNumber8));
                    }

                    Assert.AreEqual(rowIndex, lineNumber, "Expected line number to equal row number");

                    // Reading values must not advance the row counter
                    Assert.AreEqual(rowIndex, r.RowCountRead, "Expected lines read to equal row number");
                }
            }
        }
예제 #26
0
        /// <summary>
        ///  Find rows in 'inputFilePath' and optionally write them out, then print a match summary.
        ///  When 'value' is non-null, rows match if the column 'columnIndentifier' equals 'value'
        ///  (CompareTo is called with 'true', presumably "ignore case" - confirm).
        ///  When 'value' is null, 'columnIndentifier' is parsed as a row number and only that row
        ///  matches; scanning stops once it is found.
        /// </summary>
        /// <param name="inputFilePath">File to search</param>
        /// <param name="columnIndentifier">Column to compare, or the row number to find when 'value' is null</param>
        /// <param name="value">Value to look for, or null to select by row number</param>
        /// <param name="writer">Writer receiving matching rows prefixed by a "RowIndex" column, or null to only count</param>
        private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
        {
            int matchCount = 0;
            int rowCount = 0;

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Exactly one of these is active; -1 marks the inactive one
                int targetRow = -1;
                int targetColumn = -1;

                if (value != null)
                {
                    targetColumn = reader.ColumnIndex(columnIndentifier);
                }
                else
                {
                    targetRow = int.Parse(columnIndentifier);
                }

                while (reader.NextRow())
                {
                    // Row-number mode: skip every row except the requested one
                    bool rowSelected = (targetRow == -1 || reader.RowCountRead == targetRow);
                    if (!rowSelected)
                    {
                        continue;
                    }

                    // Value mode: skip rows too short to have the column, or whose value differs
                    if (targetColumn != -1)
                    {
                        bool valueMatches =
                            targetColumn < reader.CurrentRowColumns &&
                            reader.Current(targetColumn).ToString8().CompareTo(value, true) == 0;

                        if (!valueMatches)
                        {
                            continue;
                        }
                    }

                    matchCount++;

                    if (writer != null)
                    {
                        // Emit the header lazily, on the first matching row
                        if (writer.RowCountWritten == 0)
                        {
                            List<string> columns = new List<string> { "RowIndex" };
                            columns.AddRange(reader.Columns);
                            writer.SetColumns(columns);
                        }

                        // Row number first, then every cell of the matching row
                        writer.Write(reader.RowCountRead);
                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            writer.Write(reader.Current(column).ToString8());
                        }

                        writer.NextRow();
                    }

                    // In row-number mode there is only one row to find
                    if (targetRow != -1)
                    {
                        break;
                    }
                }

                rowCount = reader.RowCountRead;
            }

            Console.WriteLine($"Done. {matchCount:n0} out of {rowCount:n0} rows matched.");
        }