Beispiel #1
0
        /// <summary>
        ///  Map each input column position to the handler which should process it.
        ///  Columns listed in DropColumns get a null entry and are left out of the output
        ///  column list; columns with no entry in HandlersByColumn get a new KeepColumnHandler.
        /// </summary>
        /// <param name="columnNames">Input column names, in input order</param>
        /// <param name="columnNamesToOutput">Receives the names of the columns which are not dropped, in input order</param>
        /// <returns>Array parallel to columnNames; an element is null when that column is dropped</returns>
        private IColumnHandler[] GetHandlersByColumnIndex(IReadOnlyList<string> columnNames, out List<string> columnNamesToOutput)
        {
            columnNamesToOutput = new List<string>();

            int columnCount = columnNames.Count;
            IColumnHandler[] result = new IColumnHandler[columnCount];

            for (int index = 0; index < columnCount; index++)
            {
                string name = columnNames[index];

                // Dropped columns keep the array's default null entry and are not output
                if (this.DropColumns.Contains(name))
                {
                    continue;
                }

                columnNamesToOutput.Add(name);

                // Use the configured handler when there is one; otherwise fall back to a KeepColumnHandler
                IColumnHandler configured;
                result[index] = this.HandlersByColumn.TryGetValue(name, out configured)
                    ? configured
                    : new KeepColumnHandler();
            }

            return result;
        }
Beispiel #2
0
        /// <summary>
        ///  Read every row of an input file, run this Sanitizer's per-column handlers over it,
        ///  and write the results to an output file.
        /// </summary>
        /// <remarks>
        ///  When SampleColumnName is set, a row is written only if the hash of its value in that
        ///  column does not exceed a cutoff computed from SampleProbability. Columns whose handler
        ///  is null are not written.
        /// </remarks>
        /// <param name="inputFile">File Path to input file</param>
        /// <param name="outputFile">File Path to output file</param>
        public void Sanitize(string inputFile, string outputFile)
        {
            using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
            {
                // Decide what to do with each input column and which column names get written
                List<string> outputColumnNames;
                IColumnHandler[] handlerByIndex = GetHandlersByColumnIndex(reader.Columns, out outputColumnNames);

                // Locate the sample column (if one is configured) and compute the hash cutoff for keeping rows
                int sampleColumnIndex = -1;
                if (!string.IsNullOrEmpty(this.SampleColumnName))
                {
                    sampleColumnIndex = reader.ColumnIndex(this.SampleColumnName);
                }

                bool samplingEnabled = sampleColumnIndex > -1;
                uint sampleInclusionCutoff = (uint)(uint.MaxValue * this.SampleProbability);

                // The writer is only built once the column setup above has succeeded
                using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
                {
                    writer.SetColumns(outputColumnNames);

                    while (reader.NextRow())
                    {
                        // Sample *without* the hashkey, so the same rows are consistently included or excluded.
                        // Rows whose sample value hashes above the cutoff are skipped entirely.
                        if (samplingEnabled && Hashing.Hash(reader.Current(sampleColumnIndex).ToString8(), 0) > sampleInclusionCutoff)
                        {
                            continue;
                        }

                        // Visit every column in the current row; only columns with a handler produce output
                        for (int column = 0; column < reader.CurrentRowColumns; column++)
                        {
                            IColumnHandler columnHandler = handlerByIndex[column];
                            if (columnHandler == null)
                            {
                                continue;
                            }

                            String8 original = reader.Current(column).ToString8();
                            writer.Write(columnHandler.Sanitize(original));
                        }

                        writer.NextRow();
                    }
                }
            }
        }
 /// <summary>
 ///  Build an EchoColumnHandler from a set of values and an inner handler.
 /// </summary>
 /// <param name="echoValues">Set of values stored in EchoValues (presumably the values to pass through unchanged; confirm against Sanitize)</param>
 /// <param name="inner">Handler stored in Inner (presumably used for values not in the set; confirm against Sanitize)</param>
 public EchoColumnHandler(HashSet<String8> echoValues, IColumnHandler inner)
 {
     // Both arguments are stored as-is; no copy or validation is performed here
     this.Inner = inner;
     this.EchoValues = echoValues;
 }