/// <summary>
/// Apply a Where filter to a table. This can stream over large data and filter it down.
/// </summary>
/// <param name="table">source table</param>
/// <param name="fpSelector">predicate to execute on each row</param>
/// <returns>a new table that copies out rows from the source table</returns>
public static DataTable Where(DataTable table, Func<Row, bool> fpSelector)
{
    TableWriter writer = new TableWriter(table);

    foreach (Row row in table.Rows)
    {
        bool keep = fpSelector(row);
        if (keep)
        {
            writer.AddRow(row);
        }
    }

    return writer.CloseAndGetTable();
}
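// Example usage: a minimal sketch, assuming a CSV loaded lazily via the
// DataTable.New builder and a hypothetical "Age" column.
//
//   DataTable source = DataTable.New.ReadLazy("data.csv");
//   DataTable adults = Where(source, row => int.Parse(row["Age"]) >= 18);
//
// Because Where streams row by row, the source is never fully materialized
// in memory; the predicate runs once per row as it is enumerated.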
/// <summary>
/// Return a sample that's the top N records from a table.
/// This is useful to sample a large table and then save the sample.
/// </summary>
/// <param name="table">source table</param>
/// <param name="topN">positive value specifying the number of rows to copy from the source table</param>
/// <returns>The topN rows from the source table.</returns>
public static MutableDataTable SampleTopN(DataTable table, int topN)
{
    if (topN <= 0)
    {
        throw new ArgumentOutOfRangeException("topN", "sample size must be a positive integer");
    }

    TableWriter writer = new TableWriter(table);

    foreach (var row in table.Rows)
    {
        topN--;
        writer.AddRow(row);

        if (topN == 0)
        {
            // Break before the enumerator advances again, so we don't pull
            // a wasted row from the source table.
            break;
        }
    }

    return DataTable.New.GetMutableCopy(writer.CloseAndGetTable());
}
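// Example usage: a minimal sketch, assuming a lazy CSV reader and a SaveCSV
// helper on the mutable table (both assumptions about the surrounding API).
//
//   DataTable source = DataTable.New.ReadLazy("huge.csv");
//   MutableDataTable sample = SampleTopN(source, 1000);
//   sample.SaveCSV("sample.csv");
//
// Only the first 1000 rows are ever pulled from the source; the early break
// above keeps the enumerator from reading past the sample.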
/// <summary>
/// Find all rows that have duplicates for the given columns.
/// This uses a multi-pass algorithm so it can operate on a large data file.
/// </summary>
/// <param name="table">original table</param>
/// <param name="columnNames">set of columns to compare when looking for duplicates</param>
/// <returns>a table that's a subset of the original table</returns>
public static DataTable SelectDuplicates(DataTable table, params string[] columnNames)
{
    int[] ci = GetColumnIndexFromNames(table, columnNames);

    // Store hash keys first. Hash keys are compact and efficient for large
    // data sets, but we then need a second pass to handle collisions.
    HashSet<int> allKeys = new HashSet<int>();
    HashSet<int> possibleDups = new HashSet<int>();

    //
    // First pass: store the hash of each row's unique key.
    //
    foreach (Row row in table.Rows)
    {
        var parts = row.Values;
        int hash = CalcHash(parts, ci);
        if (!allKeys.Add(hash))
        {
            possibleDups.Add(hash);
        }
    }
    allKeys = null; // Free up for GC

    //
    // Second pass: compare full key strings to weed out hash collisions.
    //
    Dictionary<string, Row> fullMatch = new Dictionary<string, Row>();
    StringBuilder sb = new StringBuilder();

    TableWriter writer = new TableWriter(table);

    foreach (Row row in table.Rows)
    {
        var parts = row.Values;
        int hash = CalcHash(parts, ci);
        if (!possibleDups.Contains(hash))
        {
            continue;
        }

        // Potential match. Build the full key. (Note: values containing the
        // ',' separator could still produce false positives here.)
        sb.Clear();
        foreach (int i in ci)
        {
            sb.Append(parts[i]);
            sb.Append(',');
        }
        string key = sb.ToString();

        Row firstLine;
        if (fullMatch.TryGetValue(key, out firstLine))
        {
            // Emit the first occurrence exactly once, then null its slot.
            if (firstLine != null)
            {
                writer.AddRow(firstLine);
                fullMatch[key] = null;
            }

            // Real dup!
            writer.AddRow(row);
        }
        else
        {
            fullMatch[key] = row;
        }
    }

    return writer.CloseAndGetTable();
}
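// Example usage: a minimal sketch, assuming hypothetical "FirstName" and
// "LastName" columns in the source file.
//
//   DataTable source = DataTable.New.ReadLazy("customers.csv");
//   DataTable dups = SelectDuplicates(source, "FirstName", "LastName");
//
// The result contains every row whose (FirstName, LastName) pair occurs more
// than once, including the first occurrence of each duplicated pair. The
// two-pass design keeps memory proportional to the duplicate candidates
// rather than to the whole table.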