Пример #1
0
        /// <summary>
        ///  Copy the rows of the input file whose value column does not start with
        ///  the content of the name column in the same row.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to read</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="valueColumnIdentifier">Identifier of the column whose value is tested</param>
        /// <param name="nameColumnIdentifier">Identifier of the column holding the prefix to look for</param>
        private static void NotStartsWith(string inputFilePath, string outputFilePath, string valueColumnIdentifier, string nameColumnIdentifier)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int valueColumnIndex = reader.ColumnIndex(valueColumnIdentifier);
                int nameColumnIndex = reader.ColumnIndex(nameColumnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    // The output keeps the same columns as the input
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        String8 prefix = reader.Current(nameColumnIndex).ToString8();
                        String8 candidate = reader.Current(valueColumnIndex).ToString8();

                        // Rows whose value already begins with the name are left out
                        if (candidate.StartsWith(prefix))
                        {
                            continue;
                        }

                        // Pass the whole row through unchanged
                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            writer.Write(reader.Current(column).ToString8());
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #2
0
        /// <summary>
        ///  End-to-end check of the 'sanitize' command: extracts the sample source and spec
        ///  from embedded resources, verifies the exit code without and with a key, validates
        ///  the sanitized output, and confirms a different key produces different output.
        /// </summary>
        public void Sanitize_EndToEnd()
        {
            Assembly xsvTest = Assembly.GetExecutingAssembly();

            Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.csv", "SanitizeSampleSource.csv", xsvTest);
            Resource.SaveStreamTo("Xsv.Test.Sanitize.SanitizeSampleSource.sanispec", "SanitizeSampleSource.sanispec", xsvTest);

            // Verify UsageException if no key is passed
            Assert.AreEqual(-2, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec" }));

            // Verify success for base sanitize
            File.Delete("SanitizeOutput.csv");
            Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput.csv", @"SanitizeSampleSource.sanispec", "Key1" }));

            // Validate the result
            using (ITabularReader r = TabularFactory.BuildReader("SanitizeOutput.csv"))
            {
                Assert.IsTrue(r.Columns.Contains("ID"), "ID column is kept (no spec line)");
                Assert.IsTrue(r.Columns.Contains("Path"), "Path column is kept (mapped)");
                Assert.IsTrue(r.Columns.Contains("IsEmptyPath"), "IsEmptyPath is kept (Keep line)");
                Assert.IsFalse(r.Columns.Contains("IsUnderXsv"), "IsUnderXsv column is dropped (Drop line)");

                int idColumnIndex          = r.ColumnIndex("ID");
                int pathColumnIndex        = r.ColumnIndex("Path");
                int isEmptyPathColumnIndex = r.ColumnIndex("IsEmptyPath");

                while (r.NextRow())
                {
                    int    id   = r.Current(idColumnIndex).ToInteger();
                    string path = r.Current(pathColumnIndex).ToString();

                    Assert.AreEqual(r.Current(isEmptyPathColumnIndex).ToBoolean(), String.IsNullOrEmpty(path), "IsEmptyPath condition matches whether mapped path is empty");

                    if (id == 5)
                    {
                        Assert.AreEqual("Elfie", path, "'Elfie' is echoed (Echo in spec)");
                    }
                    else if (!String.IsNullOrEmpty(path))
                    {
                        // Ordinal comparison: the mapped prefix is an exact token, not linguistic text
                        Assert.IsTrue(path.StartsWith("WarmBeggedTruth\\", StringComparison.Ordinal), "Verify path is mapped in parts, and 'Elfie' is consistently mapped.");
                    }
                }

                Assert.IsTrue(r.RowCountRead < 7, "Verify sample excluded at least one row.");
            }

            // Run with another key
            Assert.AreEqual(0, Program.Main(new string[] { "sanitize", @"SanitizeSampleSource.csv", "SanitizeOutput2.csv", @"SanitizeSampleSource.sanispec", "Key2" }));

            // Verify mappings are different
            Assert.AreNotEqual(File.ReadAllText("SanitizeOutput2.csv"), File.ReadAllText("SanitizeOutput.csv"));
        }
Пример #3
0
        /// <summary>
        ///  Compare the distinct values of one column between two files and write the
        ///  values found in only one of them. Values only in the old file are written
        ///  with a "-" marker; values only in the new file with a "+" marker.
        /// </summary>
        /// <param name="oldFilePath">File Path to the old (left) file</param>
        /// <param name="newFilePath">File Path to the new (right) file</param>
        /// <param name="outputFilePath">File Path to write the differences to</param>
        /// <param name="columnIdentifier">Identifier of the column to compare</param>
        private static void Compare(string oldFilePath, string newFilePath, string outputFilePath, string columnIdentifier)
        {
            // One block backs the copies for both files, so the copied values outlive the readers
            String8Block copies = new String8Block();
            HashSet<String8> oldValues = new HashSet<String8>();
            HashSet<String8> newValues = new HashSet<String8>();

            // Gather the distinct values from the old file
            using (ITabularReader oldReader = TabularFactory.BuildReader(oldFilePath))
            {
                int oldColumnIndex = oldReader.ColumnIndex(columnIdentifier);

                while (oldReader.NextRow())
                {
                    String8 copy = copies.GetCopy(oldReader.Current(oldColumnIndex));
                    oldValues.Add(copy);
                }

                Trace.WriteLine(String.Format("Old: {0:n0} values for \"{1}\" in {2:n0} rows.", oldValues.Count, columnIdentifier, oldReader.RowCountRead));
            }

            // Gather the distinct values from the new file
            using (ITabularReader newReader = TabularFactory.BuildReader(newFilePath))
            {
                int newColumnIndex = newReader.ColumnIndex(columnIdentifier);

                while (newReader.NextRow())
                {
                    String8 copy = copies.GetCopy(newReader.Current(newColumnIndex));
                    newValues.Add(copy);
                }

                Trace.WriteLine(String.Format("New: {0:n0} values for \"{1}\" in {2:n0} rows.", newValues.Count, columnIdentifier, newReader.RowCountRead));
            }

            // Set differences in each direction
            HashSet<String8> onlyInOld = new HashSet<String8>(oldValues);
            onlyInOld.ExceptWith(newValues);

            HashSet<String8> onlyInNew = new HashSet<String8>(newValues);
            onlyInNew.ExceptWith(oldValues);

            Trace.WriteLine(String.Format("{0:n0} values were only in \"{1}\".\r\n{2:n0} values were only in \"{3}\".", onlyInOld.Count, oldFilePath, onlyInNew.Count, newFilePath));

            String8 removedMarker = String8.Convert("-", new byte[1]);
            String8 addedMarker = String8.Convert("+", new byte[1]);

            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(new string[] { "In", columnIdentifier });

                // Old-only values first, then new-only values
                foreach (String8 removed in onlyInOld)
                {
                    writer.Write(removedMarker);
                    writer.Write(removed);
                    writer.NextRow();
                }

                foreach (String8 added in onlyInNew)
                {
                    writer.Write(addedMarker);
                    writer.Write(added);
                    writer.NextRow();
                }
            }
        }
Пример #4
0
        /// <summary>
        ///  Write each distinct value of one column from the input file to a
        ///  single-column output file, in order of first appearance.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to read</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="columnIdentifier">Identifier of the column to get distinct values for</param>
        private static void Distinct(string inputFilePath, string outputFilePath, string columnIdentifier)
        {
            String8Block copies = new String8Block();
            HashSet<String8> seen = new HashSet<String8>();

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int columnIndex = reader.ColumnIndex(columnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    // The only output column is the one being de-duplicated
                    string outputColumnName = reader.Columns[columnIndex];
                    writer.SetColumns(new string[] { outputColumnName });

                    while (reader.NextRow())
                    {
                        String8 current = reader.Current(columnIndex).ToString8();

                        // Already written once; nothing to do for this row
                        if (seen.Contains(current))
                        {
                            continue;
                        }

                        // Keep a copy in the set; only new values are copied
                        seen.Add(copies.GetCopy(current));

                        writer.Write(current);
                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #5
0
        /// <summary>
        ///  Copy the input file to the output file, passing the values of one column
        ///  through WriteHtmlEscaped and every other column through unchanged.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to read</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="columnIdentifier">Identifier of the column to write via WriteHtmlEscaped</param>
        private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnIdentifier)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int escapedColumnIndex = reader.ColumnIndex(columnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            String8 cell = reader.Current(column).ToString8();

                            if (column != escapedColumnIndex)
                            {
                                // Ordinary columns are copied as-is
                                writer.Write(cell);
                            }
                            else
                            {
                                // The selected column is written by the escaping helper
                                WriteHtmlEscaped(cell, writer);
                            }
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #6
0
        /// <summary>
        ///  Copy selected columns from the input file to the output file, in the
        ///  order they are listed.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to read</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="columnsDelimited">Comma-delimited column names to copy; whitespace around each name is trimmed</param>
        private static void Copy(string inputFilePath, string outputFilePath, string columnsDelimited)
        {
            // Split the requested names and trim each one
            string[] requestedNames = columnsDelimited.Split(',');
            List<string> columns = new List<string>();

            for (int n = 0; n < requestedNames.Length; ++n)
            {
                columns.Add(requestedNames[n].Trim());
            }

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Resolve each requested column to its position in the input
                int[] sourceIndices = new int[columns.Count];
                for (int n = 0; n < sourceIndices.Length; ++n)
                {
                    sourceIndices[n] = reader.ColumnIndex(columns[n]);
                }

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(columns);

                    while (reader.NextRow())
                    {
                        foreach (int sourceIndex in sourceIndices)
                        {
                            writer.Write(reader.Current(sourceIndex).ToString8());
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #7
0
        /// <summary>
        ///  Build a TableMetadata for the table under a root path by reading its schema
        ///  file (column names and types), its metadata file (name/context/value rows),
        ///  and its saved query text.
        /// </summary>
        /// <param name="streamProvider">Provider used to open the files under the table root</param>
        /// <param name="tableRootPath">Root path of the table</param>
        /// <returns>TableMetadata with Schema, RowCount, and Query populated</returns>
        /// <exception cref="NotImplementedException">A metadata row has a Name other than 'RowCount'</exception>
        private static TableMetadata Build(IStreamProvider streamProvider, string tableRootPath)
        {
            TableMetadata metadata = new TableMetadata();

            // Schema: one row per column, giving its name and type
            string schemaFilePath = Path.Combine(tableRootPath, SchemaFileName);
            using (ITabularReader schemaReader = TabularFactory.BuildReader(streamProvider.OpenRead(schemaFilePath), SchemaFileName))
            {
                int nameIndex = schemaReader.ColumnIndex("Name");
                int typeIndex = schemaReader.ColumnIndex("Type");

                while (schemaReader.NextRow())
                {
                    string columnName = schemaReader.Current(nameIndex).ToString();
                    string typeName = schemaReader.Current(typeIndex).ToString();

                    metadata.Schema.Add(new ColumnDetails(columnName, TypeProviderFactory.Get(typeName).Type));
                }
            }

            // Metadata: Name / Context / Value rows; only 'RowCount' is understood
            string metadataFilePath = Path.Combine(tableRootPath, MetadataFileName);
            using (ITabularReader metadataReader = TabularFactory.BuildReader(streamProvider.OpenRead(metadataFilePath), MetadataFileName))
            {
                int nameIndex = metadataReader.ColumnIndex("Name");
                int contextIndex = metadataReader.ColumnIndex("Context");
                int valueIndex = metadataReader.ColumnIndex("Value");

                while (metadataReader.NextRow())
                {
                    String8 name = metadataReader.Current(nameIndex).ToString8();

                    // Context is read but not otherwise used here
                    String8 context = metadataReader.Current(contextIndex).ToString8();
                    ITabularValue value = metadataReader.Current(valueIndex);

                    if (!name.Equals("RowCount"))
                    {
                        throw new NotImplementedException($"TableMetadataSerializer.Read doesn't know how to read Metadata '{name}'");
                    }

                    metadata.RowCount = value.ToInteger();
                }
            }

            // Query text is stored in its own file under the table root
            string queryFilePath = Path.Combine(tableRootPath, ConfigQueryPath);
            metadata.Query = streamProvider.ReadAllText(queryFilePath);

            return metadata;
        }
Пример #8
0
        /// <summary>
        ///  Read rows from a tabular reader in batches of BatchSize, exposing each batch
        ///  as a DataBlock. The same DataBlock instance (and the same backing Value arrays)
        ///  is yielded for every batch and refilled in place, and the String8Block holding
        ///  the cell bytes is cleared after each full batch is consumed.
        /// </summary>
        /// <param name="reader">Reader to read rows from</param>
        /// <param name="columnNames">Names of the columns to include, in output order</param>
        /// <returns>Lazily-evaluated sequence yielding the reused DataBlock once per batch</returns>
        private static IEnumerable<DataBlock> ReadAsDataBlockBatch(ITabularReader reader, IList<string> columnNames)
        {
            int columnCount = columnNames.Count;

            // One reusable block, with one pre-populated Value array per column
            DataBlock batch = new DataBlock(columnNames, BatchSize);
            Value[][] cellsByColumn = new Value[columnCount][];

            for (int column = 0; column < columnCount; ++column)
            {
                Value[] cells = new Value[BatchSize];
                cellsByColumn[column] = cells;

                for (int row = 0; row < BatchSize; ++row)
                {
                    cells[row] = Value.Create(null);
                }

                batch.SetColumn(column, cells);
            }

            // Resolve where each requested column lives in the reader
            int[] sourceIndices = new int[columnCount];
            for (int column = 0; column < columnCount; ++column)
            {
                sourceIndices[column] = reader.ColumnIndex(columnNames[column]);
            }

            int rowsInBatch = 0;
            String8Block cellBytes = new String8Block();

            while (reader.NextRow())
            {
                for (int column = 0; column < columnCount; ++column)
                {
                    // Copy the cell so it stays valid after the reader advances
                    String8 cell = cellBytes.GetCopy(reader.Current(sourceIndices[column]).ToString8());
                    cellsByColumn[column][rowsInBatch].Assign(new ByteBlock(cell.Array, cell.Index, cell.Length));
                }

                rowsInBatch++;

                if (rowsInBatch == BatchSize)
                {
                    yield return batch;

                    // Caller is done with the batch; reuse the arrays and the byte storage
                    rowsInBatch = 0;
                    cellBytes.Clear();
                }
            }

            // NOTE(review): a trailing partial batch yields the same full-size block; rows at or past
            // rowsInBatch still hold values assigned for an earlier batch - confirm consumers handle this.
            if (rowsInBatch > 0)
            {
                yield return batch;
            }
        }
Пример #9
0
        /// <summary>
        ///  Performance check for a reader implementation: reads the sample file 100 times,
        ///  converting three columns on every row with at least two columns, and reports
        ///  the total bytes processed to Verify.PerformanceByBytes.
        /// </summary>
        /// <param name="sampleFilePath">File Path to the sample file to read</param>
        /// <param name="buildReader">Factory building the reader under test from a file path and a flag (passed as true here)</param>
        public void Reader_Performance(string sampleFilePath, Func<string, bool, ITabularReader> buildReader)
        {
            long rowCountRead = 0;
            long xsvLengthBytes = new FileInfo(sampleFilePath).Length;

            // Goal: 100MB/sec [Surface Book i7]
            Verify.PerformanceByBytes(50 * LongExtensions.Megabyte, () =>
            {
                int iterations = 100;

                for (int pass = 0; pass < iterations; ++pass)
                {
                    using (ITabularReader r = buildReader(sampleFilePath, true))
                    {
                        int lineNumberIndex = r.ColumnIndex("LineNumber");
                        int countIndex = r.ColumnIndex("Count");
                        int descriptionIndex = r.ColumnIndex("Description");

                        while (r.NextRow())
                        {
                            rowCountRead++;

                            // Only rows with at least two columns have their values converted
                            if (r.CurrentRowColumns >= 2)
                            {
                                // Results are discarded; the conversions are the work being measured
                                int lineNumber;
                                r.Current(lineNumberIndex).TryToInteger(out lineNumber);

                                int count;
                                r.Current(countIndex).TryToInteger(out count);

                                String8 description = r.Current(descriptionIndex).ToString8();
                            }
                        }
                    }
                }

                return iterations * xsvLengthBytes;
            });
        }
Пример #10
0
        /// <summary>
        ///  Copy the rows of the input file whose value for a column also appears
        ///  in the same column of a second file.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to filter</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="onlyInInputFilePath">File Path to the file providing the set of allowed values</param>
        /// <param name="onlyInColumnIdentifier">Identifier of the column to match, looked up in both files</param>
        private static void OnlyIn(string inputFilePath, string outputFilePath, string onlyInInputFilePath, string onlyInColumnIdentifier)
        {
            String8Block copies = new String8Block();
            HashSet<String8> allowedValues = new HashSet<String8>();

            // Pass 1: collect the column's values from 'onlyInInputFilePath'
            using (ITabularReader allowedReader = TabularFactory.BuildReader(onlyInInputFilePath))
            {
                int allowedColumnIndex = allowedReader.ColumnIndex(onlyInColumnIdentifier);

                while (allowedReader.NextRow())
                {
                    String8 copy = copies.GetCopy(allowedReader.Current(allowedColumnIndex));
                    allowedValues.Add(copy);
                }
            }

            // Pass 2: copy input rows whose value is in the collected set
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                int valueColumnIndex = reader.ColumnIndex(onlyInColumnIdentifier);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        String8 candidate = reader.Current(valueColumnIndex).ToString8();

                        if (!allowedValues.Contains(candidate))
                        {
                            continue;
                        }

                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            writer.Write(reader.Current(column).ToString8());
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #11
0
        /// <summary>
        ///  Run a 'where' match over a reader, choosing the matcher from the type the
        ///  value string converts to (bool, int, DateTime) or, otherwise, from the operator
        ///  (Contains, StartsWith, or a string comparison for anything else).
        /// </summary>
        /// <param name="reader">Reader to read rows from</param>
        /// <param name="columnIdentifier">Identifier of the column to match on</param>
        /// <param name="operatorString">Operator to apply, parsed by OperatorExtensions.Parse</param>
        /// <param name="valueString">Value to compare against, converted by ConvertToBestType</param>
        /// <param name="writer">Writer handed to the selected matcher</param>
        /// <returns>WhereResult describing the column, operator, and value used, with RowCount set from the reader</returns>
        public static WhereResult Where(ITabularReader reader, string columnIdentifier, string operatorString, string valueString, ITabularWriter writer)
        {
            int colIndex = reader.ColumnIndex(columnIdentifier);

            WhereResult result = new WhereResult();
            result.ColumnIndex = colIndex;
            result.ColumnName = reader.Columns[colIndex];
            result.Op = OperatorExtensions.Parse(operatorString);
            result.Value = ConvertToBestType(valueString);

            Type valueType = result.Value.GetType();

            // Typed values get a typed comparison regardless of the operator
            if (valueType == typeof(bool))
            {
                MatchBoolCompare(reader, writer, result);
            }
            else if (valueType == typeof(int))
            {
                MatchIntCompare(reader, writer, result);
            }
            else if (valueType == typeof(DateTime))
            {
                MatchDateTimeCompare(reader, writer, result);
            }
            // Remaining values are dispatched on the operator
            else if (result.Op == Operator.Contains)
            {
                MatchContains(reader, writer, result);
            }
            else if (result.Op == Operator.StartsWith)
            {
                MatchStartsWith(reader, writer, result);
            }
            else
            {
                MatchStringCompare(reader, writer, result);
            }

            // Record how many rows the reader has read once matching is finished
            result.RowCount = reader.RowCountRead;
            return result;
        }
Пример #12
0
        /// <summary>
        ///  Find rows in a file either by row index or by column value, optionally writing
        ///  matches (prefixed with a "RowIndex" column) and printing a match summary.
        ///  When 'value' is null, 'columnIndentifier' is parsed as the row index to find;
        ///  otherwise it identifies the column whose value must equal 'value'.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to search</param>
        /// <param name="columnIndentifier">Column identifier, or the row index to find when 'value' is null</param>
        /// <param name="value">Value to match in the column, or null to match by row index</param>
        /// <param name="writer">Writer to write matching rows to, or null to only count matches</param>
        private static void Where(string inputFilePath, string columnIndentifier, string value, ITabularWriter writer)
        {
            int matchCount = 0;
            int rowCount = 0;

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Exactly one of these is active; -1 marks the mode that is not in use
                int rowIndex = -1;
                int colIndex = -1;

                if (value != null)
                {
                    colIndex = reader.ColumnIndex(columnIndentifier);
                }
                else
                {
                    rowIndex = int.Parse(columnIndentifier);
                }

                while (reader.NextRow())
                {
                    // Row mode: skip everything except the requested row
                    if (rowIndex != -1)
                    {
                        if (reader.RowCountRead != rowIndex)
                        {
                            continue;
                        }
                    }

                    // Column mode: skip rows that are too short or whose value differs
                    if (colIndex != -1)
                    {
                        bool hasColumn = reader.CurrentRowColumns > colIndex;

                        if (!hasColumn || reader.Current(colIndex).ToString8().CompareTo(value, true) != 0)
                        {
                            continue;
                        }
                    }

                    matchCount++;

                    if (writer != null)
                    {
                        // Columns are set just before the first row is written
                        if (writer.RowCountWritten == 0)
                        {
                            List<string> columns = new List<string> { "RowIndex" };
                            columns.AddRange(reader.Columns);
                            writer.SetColumns(columns);
                        }

                        writer.Write(reader.RowCountRead);

                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            writer.Write(reader.Current(column).ToString8());
                        }

                        writer.NextRow();
                    }

                    // Row mode has a single target, so stop once it has been handled
                    if (rowIndex != -1)
                    {
                        break;
                    }
                }

                rowCount = reader.RowCountRead;
            }

            Console.WriteLine($"Done. {matchCount:n0} out of {rowCount:n0} rows matched.");
        }
Пример #13
0
        /// <summary>
        ///  Basic behavior checks for a reader implementation: missing file, empty file with
        ///  and without header reading, column lookup, and row-by-row values in the sample file.
        /// </summary>
        /// <param name="sampleFilePath">File Path to the sample file to read</param>
        /// <param name="buildReader">Factory building the reader under test from a file path and a flag controlling whether the header row is read</param>
        public void Reader_Basics(string sampleFilePath, Func<string, bool, ITabularReader> buildReader)
        {
            // A missing file must be reported as FileNotFoundException
            Verify.Exception<FileNotFoundException>(() => buildReader("NonExistantFile.xsv", false));

            // Create an empty file to test against
            string emptyFilePath = "Empty.xsv";
            File.WriteAllText(emptyFilePath, "");

            // Reading headers from an empty file throws on construction
            Verify.Exception<IOException>(() => buildReader(emptyFilePath, true));

            // Without header reading, an empty file simply has no rows
            using (ITabularReader emptyReader = buildReader(emptyFilePath, false))
            {
                Assert.IsFalse(emptyReader.NextRow());
            }

            // Without header reading, the header row comes back as the first data row
            using (ITabularReader headerless = buildReader(sampleFilePath, false))
            {
                Assert.IsTrue(headerless.NextRow());
                Assert.AreEqual("LineNumber", headerless.Current(0).ToString());

                // Column lookup by name fails when no header row was read
                Verify.Exception<ColumnNotFoundException>(() => headerless.ColumnIndex("Missing"));
            }

            // Open the sample the 'expected' way, with headers
            using (ITabularReader r = buildReader(sampleFilePath, true))
            {
                // Known column
                int lineNumberColumnIndex = r.ColumnIndex("LineNumber");
                Assert.AreEqual(0, lineNumberColumnIndex, "LineNumber column not expected");

                // Known column, requested with different casing
                int descriptionColumnIndex = r.ColumnIndex("deSCRiption");
                Assert.AreEqual(2, descriptionColumnIndex, "Description column not expected");

                // Unknown column
                Verify.Exception<ColumnNotFoundException>(() => r.ColumnIndex("UnknownColumn"));

                while (r.NextRow())
                {
                    int rowIndex = r.RowCountRead;

                    if (rowIndex == 5000 && rowIndex % 100 != 99)
                    {
                        // Row over 64k [block resizing logic, row values look right]
                        String8 longDescription = r.Current(descriptionColumnIndex).ToString8();
                        Assert.AreEqual(100000, longDescription.Length);
                    }
                    else if (rowIndex % 100 != 99)
                    {
                        // Ordinary row: the LineNumber value must parse and equal the row number
                        String8 lineNumber8 = r.Current(lineNumberColumnIndex).ToString8();
                        int lineNumber = 0;

                        if (!lineNumber8.TryToInteger(out lineNumber))
                        {
                            Assert.Fail(String.Format("\"{0}\" was not converted to an integer.", lineNumber8));
                        }
                        else
                        {
                            Assert.AreEqual(rowIndex, lineNumber, "Expected line number to equal row number");
                        }

                        // Rows read so far must agree with the row number
                        Assert.AreEqual(rowIndex, r.RowCountRead, "Expected lines read to equal row number");
                    }
                    else
                    {
                        // Empty row: no columns, and value access throws
                        Assert.AreEqual(0, r.CurrentRowColumns, "Expected column count 0 in empty rows");
                        Verify.Exception<ArgumentOutOfRangeException>(() => { var v = r.Current(lineNumberColumnIndex); });
                    }
                }
            }
        }
Пример #14
0
        /// <summary>
        ///  Copy the input file to the output file, replacing two columns with a single
        ///  column containing the first value, a separator, and the second value. The
        ///  combined column appears where the first of the two source columns was; all
        ///  other columns are copied through in their original order.
        /// </summary>
        /// <param name="inputFilePath">File Path to the tabular file to read</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="columnName1">Identifier of the column written before the separator</param>
        /// <param name="separator">Text written between the two values</param>
        /// <param name="columnName2">Identifier of the column written after the separator</param>
        /// <param name="outputColumnName">Name of the combined output column</param>
        private static void ConcatenateColumn(string inputFilePath, string outputFilePath, string columnName1, string separator, string columnName2, string outputColumnName)
        {
            String8 separator8 = String8.Convert(separator, new byte[String8.GetLength(separator)]);

            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Find the columns to concatenate
                int columnIndex1 = reader.ColumnIndex(columnName1);
                int columnIndex2 = reader.ColumnIndex(columnName2);

                string resolvedName1 = reader.Columns[columnIndex1];
                string resolvedName2 = reader.Columns[columnIndex2];

                // Build an output column list and a mapping from output order to input index, with '-1'
                // for the concatenated value. Lists are used rather than a fixed 'Count - 1' array because
                // the output has as many columns as the input when both identifiers name the same column.
                List<string> outputColumns = new List<string>(reader.Columns.Count);
                List<int> indexMapping = new List<int>(reader.Columns.Count);
                bool hasConcatenatedColumn = false;

                for (int i = 0; i < reader.Columns.Count; ++i)
                {
                    string columnName = reader.Columns[i];

                    bool isSourceColumn =
                        columnName.Equals(resolvedName1, StringComparison.OrdinalIgnoreCase) ||
                        columnName.Equals(resolvedName2, StringComparison.OrdinalIgnoreCase);

                    if (!isSourceColumn)
                    {
                        // Not part of the concatenation; copy this column through
                        indexMapping.Add(i);
                        outputColumns.Add(columnName);
                    }
                    else if (!hasConcatenatedColumn)
                    {
                        // First source column seen; the output column takes this position
                        hasConcatenatedColumn = true;

                        indexMapping.Add(-1);
                        outputColumns.Add(outputColumnName);
                    }
                }

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(outputColumns);

                    while (reader.NextRow())
                    {
                        // Write columns in mapped order
                        for (int i = 0; i < indexMapping.Count; ++i)
                        {
                            int sourceColumnIndex = indexMapping[i];

                            if (sourceColumnIndex == -1)
                            {
                                // Write concatenated column
                                writer.WriteValueStart();
                                writer.WriteValuePart(reader.Current(columnIndex1).ToString8());
                                writer.WriteValuePart(separator8);
                                writer.WriteValuePart(reader.Current(columnIndex2).ToString8());
                                writer.WriteValueEnd();
                            }
                            else
                            {
                                writer.Write(reader.Current(sourceColumnIndex).ToString8());
                            }
                        }

                        writer.NextRow();
                    }

                    WriteSizeSummary(reader, writer);
                }
            }
        }
Пример #15
0
        /// <summary>
        ///  Merge every file in a folder into one output file, keeping only the last row
        ///  seen for each ID. The first pass records which file and row holds the latest
        ///  copy of each ID; the second pass copies just those rows. Output columns are
        ///  taken from the last file read in the first pass; columns a given file or row
        ///  does not have are written as empty values.
        /// </summary>
        /// <param name="inputFolderPath">Folder whose files are all read as input</param>
        /// <param name="outputFilePath">File Path to the tabular file to write</param>
        /// <param name="idColumnIdentifier">Identifier of the ID column, which every input file must have</param>
        private static void OnlyLatest(string inputFolderPath, string outputFilePath, string idColumnIdentifier)
        {
            String8Block block = new String8Block();
            Dictionary<String8, Tuple<string, int>> latestFileAndRowByID = new Dictionary<String8, Tuple<string, int>>();
            IReadOnlyList<string> writerColumns = null;

            // List the folder once so both passes walk the same files; the lookup in the
            // second pass relies on every ID having been recorded during the first.
            string[] inputFilePaths = Directory.GetFiles(inputFolderPath);

            // Walk the input files to figure out the latest copy of each ID
            Trace.WriteLine($"Identifying latest {idColumnIdentifier} in all files in {inputFolderPath}...");
            int rowCountRead = 0;

            foreach (string inputFilePath in inputFilePaths)
            {
                using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                {
                    int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                    while (reader.NextRow())
                    {
                        rowCountRead++;
                        String8 id = reader.Current(idColumnIndex).ToString8();

                        // NOTE(review): return value is unused, so this presumably uppercases in place - confirm
                        id.ToUpperInvariant();

                        // Record the file and row containing this ID, overwriting previous entries
                        latestFileAndRowByID[block.GetCopy(id)] = new Tuple<string, int>(inputFilePath, reader.RowCountRead);
                    }

                    // Capture the columns from the last CSV to write
                    writerColumns = reader.Columns;
                }
            }
            Trace.WriteLine($"Scan Complete. {rowCountRead:n0} rows read; {latestFileAndRowByID.Count:n0} distinct IDs found.");

            // NOTE(review): writerColumns is still null here if the folder contained no files - confirm desired handling
            using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
            {
                writer.SetColumns(writerColumns);
                int[] writerColumnIndexInReader = new int[writerColumns.Count];

                foreach (string inputFilePath in inputFilePaths)
                {
                    using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
                    {
                        // Look up each output column's position in the input file
                        for (int i = 0; i < writerColumns.Count; ++i)
                        {
                            reader.TryGetColumnIndex(writerColumns[i], out writerColumnIndexInReader[i]);
                        }

                        int idColumnIndex = reader.ColumnIndex(idColumnIdentifier);

                        while (reader.NextRow())
                        {
                            String8 id = reader.Current(idColumnIndex).ToString8();
                            id.ToUpperInvariant();

                            // Copy this row to the output file, *if* it's the latest for this ID
                            Tuple<string, int> latestForID = latestFileAndRowByID[id];
                            if (latestForID.Item1 == inputFilePath && latestForID.Item2 == reader.RowCountRead)
                            {
                                for (int i = 0; i < writerColumns.Count; ++i)
                                {
                                    int readerColumnIndex = writerColumnIndexInReader[i];
                                    if (readerColumnIndex >= 0 && readerColumnIndex < reader.CurrentRowColumns)
                                    {
                                        // Read from this file's position for the column, not the output position
                                        writer.Write(reader.Current(readerColumnIndex).ToString8());
                                    }
                                    else
                                    {
                                        // Column is missing from this file or this row; keep the output aligned
                                        writer.Write(String8.Empty);
                                    }
                                }

                                writer.NextRow();
                            }
                        }
                    }
                }

                WriteSizeSummary(null, writer);
            }
        }
Пример #16
0
        /// <summary>
        ///  Read each row of an input file, run this Sanitizer's per-column handlers over it,
        ///  and write the handler output to an output file. If a sample column is configured,
        ///  rows whose sample value hashes above the probability cutoff are left out.
        /// </summary>
        /// <param name="inputFile">File Path to input file</param>
        /// <param name="outputFile">File Path to output file</param>
        public void Sanitize(string inputFile, string outputFile)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
            {
                // Work out the handler (if any) for each input column and the set of columns to emit
                List<string> outputColumns;
                IColumnHandler[] handlerByColumn = GetHandlersByColumnIndex(reader.Columns, out outputColumns);

                // Resolve the optional sample column; -1 means no sampling is performed
                int sampleColumn = -1;
                if (!String.IsNullOrEmpty(this.SampleColumnName))
                {
                    sampleColumn = reader.ColumnIndex(this.SampleColumnName);
                }

                bool samplingEnabled = sampleColumn > -1;

                // Hash values above this cutoff cause the row to be skipped
                uint hashCutoff = (uint)(uint.MaxValue * this.SampleProbability);

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
                {
                    writer.SetColumns(outputColumns);

                    while (reader.NextRow())
                    {
                        if (samplingEnabled)
                        {
                            // Hash with a fixed key of 0 (not the hashkey) so the same rows are consistently included or excluded
                            uint rowHash = Hashing.Hash(reader.Current(sampleColumn).ToString8(), 0);
                            if (rowHash > hashCutoff)
                            {
                                continue;
                            }
                        }

                        // Visit every cell in the row; columns without a handler produce no output
                        for (int column = 0; column < reader.CurrentRowColumns; ++column)
                        {
                            IColumnHandler handler = handlerByColumn[column];
                            if (handler == null)
                            {
                                continue;
                            }

                            String8 original  = reader.Current(column).ToString8();
                            String8 sanitized = handler.Sanitize(original);
                            writer.Write(sanitized);
                        }

                        writer.NextRow();
                    }
                }
            }
        }
Пример #17
0
        /// <summary>
        ///  Copy a tabular file to a new file with the same columns, passing the values of the
        ///  named columns through WriteHtmlEscaped and copying all other values unchanged.
        /// </summary>
        /// <param name="inputFilePath">File Path of the file to read</param>
        /// <param name="outputFilePath">File Path of the file to write</param>
        /// <param name="columnsDelimited">Comma-delimited identifiers of the columns to transform; whitespace around each identifier is trimmed</param>
        private static void HtmlInnerText(string inputFilePath, string outputFilePath, string columnsDelimited)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFilePath))
            {
                // Resolve the identifiers once, before the output file is created.
                // A set (rather than a list) keeps the per-cell membership test below constant-time.
                HashSet<int> columnIndicesToEscape = new HashSet<int>(columnsDelimited.Split(',').Select((col) => reader.ColumnIndex(col.Trim())));

                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFilePath))
                {
                    writer.SetColumns(reader.Columns);

                    while (reader.NextRow())
                    {
                        for (int i = 0; i < reader.CurrentRowColumns; ++i)
                        {
                            if (columnIndicesToEscape.Contains(i))
                            {
                                WriteHtmlEscaped(reader.Current(i).ToString8(), writer);
                            }
                            else
                            {
                                writer.Write(reader.Current(i).ToString8());
                            }
                        }

                        writer.NextRow();
                    }

                    // NOTE(review): presumably reports input/output sizes - confirm against WriteSizeSummary
                    WriteSizeSummary(reader, writer);
                }
            }
        }