Ejemplo n.º 1
0
        /// <summary>
        /// Apply a Where filter to a table. This can stream over large data and filter it down.
        /// Every row of <paramref name="table"/> is passed to <paramref name="fpSelector"/> exactly once,
        /// in enumeration order, and the rows it accepts are written to the result in that same order.
        /// </summary>
        /// <param name="table">source table</param>
        /// <param name="fpSelector">predicate to execute on each row; a row is kept when it returns true</param>
        /// <returns>a new table that copies out the accepted rows from the source table</returns>
        public static DataTable Where(DataTable table, Func<Row, bool> fpSelector)
        {
            // The writer is constructed from the source table and collects the accepted rows.
            TableWriter writer = new TableWriter(table);

            foreach (Row row in table.Rows)
            {
                if (fpSelector(row))
                {
                    writer.AddRow(row);
                }
            }

            return writer.CloseAndGetTable();
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Copy the leading rows of a table into a new mutable table.
        /// Handy for carving a small sample out of a large table so that the sample can be saved.
        /// </summary>
        /// <param name="table">table to take the sample from</param>
        /// <param name="topN">maximum number of rows to copy from the source table; must be greater than zero</param>
        /// <returns>
        /// A mutable table holding the first topN rows of the source table, or all of its rows
        /// when the source has fewer than topN.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">topN is zero or negative</exception>
        public static MutableDataTable SampleTopN(DataTable table, int topN)
        {
            if (topN < 1)
            {
                throw new ArgumentOutOfRangeException("topN", "sample must be a positive integer");
            }

            TableWriter sampleWriter = new TableWriter(table);
            int copied = 0;

            foreach (var sourceRow in table.Rows)
            {
                sampleWriter.AddRow(sourceRow);
                copied++;

                // Leave the loop the moment the quota is met, i.e. before the enumerator
                // advances again, so no extra row is pulled from the source table.
                if (copied == topN)
                {
                    break;
                }
            }

            var sample = sampleWriter.CloseAndGetTable();
            return DataTable.New.GetMutableCopy(sample);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Find all rows that have dups for the given columns.
        /// This uses a multi-pass algorithm to operate on a large data file: the first pass
        /// records only an integer hash per row, the second pass builds full string keys, but
        /// only for rows whose hash was seen more than once.
        /// </summary>
        /// <param name="table">original table; its rows are enumerated twice</param>
        /// <param name="columnNames">set of columns to compare to look for duplicates</param>
        /// <returns>
        /// A table that's a subset of the original table, containing every row whose values in the
        /// given columns also occur in at least one other row. The first row of each duplicate
        /// group is written immediately before the second; later members follow in source order.
        /// </returns>
        public static DataTable SelectDuplicates(DataTable table, params string[] columnNames)
        {
            int[] ci = GetColumnIndexFromNames(table, columnNames);

            // Store on hash keys first. Use hash keys because they're compact and efficient for large data sets
            // But then we do need to handle collisions.
            HashSet<int> allKeys = new HashSet<int>();
            HashSet<int> possibleDups = new HashSet<int>();

            //
            // Take a first pass and store the hash of each row's unique Key
            //
            foreach (Row row in table.Rows)
            {
                int hash = CalcHash(row.Values, ci);

                // HashSet.Add returns false when the hash is already present, so a single
                // call both records the hash and tells us whether it has been seen before.
                if (!allKeys.Add(hash))
                {
                    possibleDups.Add(hash);
                }
            }
            allKeys = null; // Free up for GC

            //
            // Now take a second pass through the dups.
            //
            // Maps a full key to the first row seen with that key. The value is reset to null
            // once that first row has been written, so it is emitted only once.
            Dictionary<string, Row> fullMatch = new Dictionary<string, Row>();

            StringBuilder sb = new StringBuilder();

            TableWriter writer = new TableWriter(table);

            foreach (Row row in table.Rows)
            {
                var parts = row.Values;
                int hash = CalcHash(parts, ci);
                if (!possibleDups.Contains(hash))
                {
                    // This hash occurred only once, so the row cannot be a duplicate.
                    continue;
                }

                // Potential match: build the exact key to rule out hash collisions.
                sb.Clear();
                foreach (int i in ci)
                {
                    // Follow each value with its length. A bare separator is ambiguous when a
                    // value itself contains the separator: ("a,b", "c") and ("a", "b,c") would
                    // both yield "a,b,c," and be reported as duplicates of each other.
                    int start = sb.Length;
                    sb.Append(parts[i]);
                    int length = sb.Length - start;
                    sb.Append(',');
                    sb.Append(length);
                    sb.Append(';');
                }
                string key = sb.ToString();

                Row firstLine;
                if (fullMatch.TryGetValue(key, out firstLine))
                {
                    if (firstLine != null)
                    {
                        // Second occurrence of this key: emit the remembered first row too.
                        writer.AddRow(firstLine);
                        fullMatch[key] = null;
                    }

                    // Real dup!
                    writer.AddRow(row);
                }
                else
                {
                    // First time this key is seen; remember the row in case a dup shows up later.
                    fullMatch[key] = row;
                }
            }

            return writer.CloseAndGetTable();
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Apply a Where filter to a table. This can stream over large data and filter it down.
        /// Every row of <paramref name="table"/> is passed to <paramref name="fpSelector"/> exactly once,
        /// in enumeration order, and the rows it accepts are written to the result in that same order.
        /// </summary>
        /// <param name="table">source table</param>
        /// <param name="fpSelector">predicate to execute on each row; a row is kept when it returns true</param>
        /// <returns>a new table that copies out the accepted rows from the source table</returns>
        public static DataTable Where(DataTable table, Func<Row, bool> fpSelector)
        {
            // The writer is constructed from the source table and collects the accepted rows.
            TableWriter writer = new TableWriter(table);

            foreach (Row row in table.Rows)
            {
                if (fpSelector(row))
                {
                    writer.AddRow(row);
                }
            }

            return writer.CloseAndGetTable();
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Find all rows that have dups for the given columns.
        /// This uses a multi-pass algorithm to operate on a large data file: the first pass
        /// records only an integer hash per row, the second pass builds full string keys, but
        /// only for rows whose hash was seen more than once.
        /// </summary>
        /// <param name="table">original table; its rows are enumerated twice</param>
        /// <param name="columnNames">set of columns to compare to look for duplicates</param>
        /// <returns>
        /// A table that's a subset of the original table, containing every row whose values in the
        /// given columns also occur in at least one other row. The first row of each duplicate
        /// group is written immediately before the second; later members follow in source order.
        /// </returns>
        public static DataTable SelectDuplicates(DataTable table, params string[] columnNames)
        {
            int[] ci = GetColumnIndexFromNames(table, columnNames);

            // Store on hash keys first. Use hash keys because they're compact and efficient for large data sets
            // But then we do need to handle collisions.
            HashSet<int> allKeys = new HashSet<int>();
            HashSet<int> possibleDups = new HashSet<int>();

            //
            // Take a first pass and store the hash of each row's unique Key
            //
            foreach (Row row in table.Rows)
            {
                int hash = CalcHash(row.Values, ci);

                // HashSet.Add returns false when the hash is already present, so a single
                // call both records the hash and tells us whether it has been seen before.
                if (!allKeys.Add(hash))
                {
                    possibleDups.Add(hash);
                }
            }
            allKeys = null; // Free up for GC

            //
            // Now take a second pass through the dups.
            //
            // Maps a full key to the first row seen with that key. The value is reset to null
            // once that first row has been written, so it is emitted only once.
            Dictionary<string, Row> fullMatch = new Dictionary<string, Row>();

            StringBuilder sb = new StringBuilder();

            TableWriter writer = new TableWriter(table);

            foreach (Row row in table.Rows)
            {
                var parts = row.Values;
                int hash = CalcHash(parts, ci);
                if (!possibleDups.Contains(hash))
                {
                    // This hash occurred only once, so the row cannot be a duplicate.
                    continue;
                }

                // Potential match: build the exact key to rule out hash collisions.
                sb.Clear();
                foreach (int i in ci)
                {
                    // Follow each value with its length. A bare separator is ambiguous when a
                    // value itself contains the separator: ("a,b", "c") and ("a", "b,c") would
                    // both yield "a,b,c," and be reported as duplicates of each other.
                    int start = sb.Length;
                    sb.Append(parts[i]);
                    int length = sb.Length - start;
                    sb.Append(',');
                    sb.Append(length);
                    sb.Append(';');
                }
                string key = sb.ToString();

                Row firstLine;
                if (fullMatch.TryGetValue(key, out firstLine))
                {
                    if (firstLine != null)
                    {
                        // Second occurrence of this key: emit the remembered first row too.
                        writer.AddRow(firstLine);
                        fullMatch[key] = null;
                    }

                    // Real dup!
                    writer.AddRow(row);
                }
                else
                {
                    // First time this key is seen; remember the row in case a dup shows up later.
                    fullMatch[key] = row;
                }
            }

            return writer.CloseAndGetTable();
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Copy the leading rows of a table into a new mutable table.
        /// Handy for carving a small sample out of a large table so that the sample can be saved.
        /// </summary>
        /// <param name="table">table to take the sample from</param>
        /// <param name="topN">maximum number of rows to copy from the source table; must be greater than zero</param>
        /// <returns>
        /// A mutable table holding the first topN rows of the source table, or all of its rows
        /// when the source has fewer than topN.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">topN is zero or negative</exception>
        public static MutableDataTable SampleTopN(DataTable table, int topN)
        {
            if (topN < 1)
            {
                throw new ArgumentOutOfRangeException("topN", "sample must be a positive integer");
            }

            TableWriter sampleWriter = new TableWriter(table);
            int copied = 0;

            foreach (var sourceRow in table.Rows)
            {
                sampleWriter.AddRow(sourceRow);
                copied++;

                // Leave the loop the moment the quota is met, i.e. before the enumerator
                // advances again, so no extra row is pulled from the source table.
                if (copied == topN)
                {
                    break;
                }
            }

            var sample = sampleWriter.CloseAndGetTable();
            return DataTable.New.GetMutableCopy(sample);
        }