/// <summary>
///  Build the handler to use for each input column (by column index) and
///  collect the names of the columns which will be written to the output.
/// </summary>
/// <param name="columnNames">Input column names, in input order</param>
/// <param name="columnNamesToOutput">Receives the names of every column not listed in DropColumns, in input order</param>
/// <returns>
///  Array with one entry per input column: null for a column in DropColumns,
///  otherwise the handler registered in HandlersByColumn, or a new
///  KeepColumnHandler when no handler is registered for that column.
/// </returns>
private IColumnHandler[] GetHandlersByColumnIndex(IReadOnlyList<string> columnNames, out List<string> columnNamesToOutput)
{
    List<string> keptNames = new List<string>();
    int columnCount = columnNames.Count;
    IColumnHandler[] result = new IColumnHandler[columnCount];

    for (int index = 0; index < columnCount; ++index)
    {
        string name = columnNames[index];

        // Dropped columns keep the array's default null entry and are left out of the output list
        if (this.DropColumns.Contains(name))
        {
            continue;
        }

        keptNames.Add(name);

        // Use the configured handler if there is one; otherwise fall back to a KeepColumnHandler
        IColumnHandler configured;
        result[index] = this.HandlersByColumn.TryGetValue(name, out configured) ? configured : new KeepColumnHandler();
    }

    columnNamesToOutput = keptNames;
    return result;
}
/// <summary>
///  Sanitize an input file into a given output file using this Sanitizer's configuration.
///  Columns without a handler are omitted from the output, and, when a sample column
///  is configured, rows whose sample value hashes above the inclusion cutoff are skipped.
/// </summary>
/// <param name="inputFile">File Path to input file</param>
/// <param name="outputFile">File Path to output file</param>
public void Sanitize(string inputFile, string outputFile)
{
    using (ITabularReader reader = TabularFactory.BuildReader(inputFile))
    {
        // Decide, per input column, which handler (if any) applies, and which columns get written
        List<string> outputColumnNames;
        IColumnHandler[] handlerByIndex = GetHandlersByColumnIndex(reader.Columns, out outputColumnNames);

        // Sampling is only active when a sample column name has been configured
        int sampleIndex = -1;
        if (!String.IsNullOrEmpty(this.SampleColumnName))
        {
            sampleIndex = reader.ColumnIndex(this.SampleColumnName);
        }

        bool isSampling = (sampleIndex > -1);

        // Rows whose sample value hash is above this cutoff are excluded
        uint hashCutoff = (uint)(uint.MaxValue * this.SampleProbability);

        using (ITabularWriter writer = TabularFactory.BuildWriter(outputFile))
        {
            writer.SetColumns(outputColumnNames);

            while (reader.NextRow())
            {
                if (isSampling)
                {
                    // Hash *without* the hashkey (fixed 0), so the same rows are consistently included or excluded.
                    uint rowHash = Hashing.Hash(reader.Current(sampleIndex).ToString8(), 0);
                    if (rowHash > hashCutoff)
                    {
                        continue;
                    }
                }

                // Visit every column in the current row; only columns with a handler produce output
                for (int column = 0; column < reader.CurrentRowColumns; ++column)
                {
                    IColumnHandler handler = handlerByIndex[column];
                    if (handler == null)
                    {
                        continue;
                    }

                    String8 original = reader.Current(column).ToString8();
                    String8 sanitized = handler.Sanitize(original);
                    writer.Write(sanitized);
                }

                writer.NextRow();
            }
        }
    }
}
/// <summary>
///  Create an EchoColumnHandler from a set of echo values and an inner handler.
///  NOTE(review): presumably values found in the set are echoed as-is and all
///  others are passed to the inner handler - confirm against Sanitize.
/// </summary>
/// <param name="echoValues">Set of values stored in EchoValues</param>
/// <param name="inner">Handler stored in Inner</param>
public EchoColumnHandler(HashSet<String8> echoValues, IColumnHandler inner)
{
    Inner = inner;
    EchoValues = echoValues;
}