Beispiel #1
0
 // Serializes the table with SaveToStream and hands the resulting text, together with
 // the expected content, to the AssertEquals overload that takes two strings.
 public static void AssertEquals(string content, DataTable dt)
 {
     var buffer = new StringWriter();
     dt.SaveToStream(buffer);

     AssertEquals(content, buffer.ToString());
 }
Beispiel #2
0
        /// <summary>
        /// Builds a frequency histogram of the values found in one column of a table.
        /// Rows that have too few values to contain the requested column are ignored.
        /// </summary>
        /// <param name="table">source table</param>
        /// <param name="columnIdx">0-based index of the column to tally</param>
        /// <returns>array of (value, count) tuples, ordered by count in descending order</returns>
        public static Tuple<string, int>[] AsHistogram(DataTable table, int columnIdx)
        {
            var tally = new Dictionary<string, int>();

            foreach (Row row in table.Rows)
            {
                var cells = row.Values;
                if (columnIdx < cells.Count)
                {
                    string cell = cells[columnIdx];

                    // TryGetValue leaves 'seen' at 0 for a value encountered for the first time.
                    int seen;
                    tally.TryGetValue(cell, out seen);
                    tally[cell] = seen + 1;
                }
                // else: the row is shorter than expected (malformed input) and is skipped.
            }

            return tally
                .OrderByDescending(entry => entry.Value)
                .Select(entry => Tuple.Create(entry.Key, entry.Value))
                .ToArray();
        }
Beispiel #3
0
        /// <summary>
        /// Timer tick handler. Sets scope1.Hold, sleeps for one second, appends the "Audio"
        /// column of Voltage.csv to listBox1, colours led1 depending on whether the text of
        /// textBox2 is present in that list, and finally closes and disposes this instance.
        /// </summary>
        /// <param name="sender">The <see cref="object"/></param>
        /// <param name="e">The <see cref="EventArgs"/></param>
        private void timer1_Tick(object sender, EventArgs e)
        {
            scope1.Hold = true;
            Thread.Sleep(1000);

            // Nothing to do unless the scope still reports that it is held.
            if (scope1.Hold != true)
            {
                return;
            }

            DataAccess.DataTable voltageTable = DataAccess.DataTable.New.ReadCsv(@"Voltage.csv");
            foreach (Row csvRow in voltageTable.Rows)
            {
                listBox1.Items.Add(csvRow["Audio"]);
            }

            // LawnGreen when the entered text is among the listed values, Red otherwise.
            bool matched = listBox1.Items.Contains(textBox2.Text);
            led1.OffColor = matched ? Color.LawnGreen : Color.Red;

            this.Refresh();
            this.Close();
            this.Dispose();
        }
Beispiel #4
0
        /// <summary>
        /// Exercises Analyze.SelectDuplicates with one key column, with two key columns, and with
        /// a key combination for which only the header line is expected in the output.
        /// </summary>
        /// <param name="dtOriginal">table to search for duplicates.
        /// NOTE(review): the expected text below presumes a specific first/last/age fixture — confirm against the caller.</param>
        public void DuplicatTests(DataTable dtOriginal)
        {
            // Select first column
            DataTable dt1 = Analyze.SelectDuplicates(dtOriginal, "first");

            AssertEquals(
            @"first,last,age
            Bob,Smith,12
            Bob,Jones,34
            ", dt1);

            // Select two columns
            DataTable dt2 = Analyze.SelectDuplicates(dtOriginal, "last", "age");

            AssertEquals(
            @"first,last,age
            Bob,Smith,12
            Ed,Smith,12
            ", dt2);

            // Select two columns, empty: only the header line is expected
            DataTable dt3 = Analyze.SelectDuplicates(dtOriginal, "first", "age");

            AssertEquals(
            @"first,last,age
            ", dt3);
        }
Beispiel #5
0
        /// <summary>
        /// Shows the TP2 dialog, then (when the DMM port is open) sends a fetch command, displays
        /// the reply in textBox3 formatted to three decimals and looks that text up among the
        /// "Voltage2" values of Voltage.csv. led3 turns LimeGreen on a match; in every other case
        /// a warning is shown and led3 turns Red.
        /// </summary>
        private void VoltageRead2()
        {
            form10 = new TP2(this);
            form10.ShowDialog(this);

            // Keep pumping messages for as long as the dialog reports itself visible.
            while (form10.Visible)
            {
                Application.DoEvents();
            }

            bool readingAccepted = false;

            if (DMM.IsOpen)
            {
                DMM.DiscardInBuffer();
                DMM.Write(":FETCh? \r");
                System.Threading.Thread.Sleep(3000); // pause before collecting the reply

                string reply = DMM.ReadExisting();
                textBox3.Text = reply;

                // Re-display the numeric reply with exactly three decimals.
                double reading = double.Parse(reply);
                textBox3.Text = reading.ToString("0.000");

                DMM.DiscardOutBuffer();

                DataAccess.DataTable limits = DataAccess.DataTable.New.ReadCsv(@"Voltage.csv");
                foreach (Row limitRow in limits.Rows)
                {
                    listBox2.Items.Add(limitRow["Voltage2"]);
                }

                readingAccepted = listBox2.Items.Contains(textBox3.Text);
            }

            // A closed port and an unmatched reading are reported in the same way.
            if (readingAccepted)
            {
                led3.OffColor = Color.LimeGreen;
            }
            else
            {
                MessageBox.Show("Please check the DMM USB connections");
                led3.OffColor = Color.Red;
            }

            this.Refresh();
        }
Beispiel #6
0
        /// <summary>
        /// When the DMM port is open, sends a fetch command, shows the raw reply in textBox1 and
        /// looks that text up among the "Current" values of Voltage.csv. led1 turns LimeGreen on
        /// a match; in every other case a warning is shown and led1 turns Red.
        /// </summary>
        private void CurrentRead()
        {
            bool readingAccepted = false;

            if (DMM.IsOpen)
            {
                DMM.Write("FETCh?\r");
                System.Threading.Thread.Sleep(3000); // pause before collecting the reply

                // The reply is displayed exactly as received; no numeric formatting is applied.
                string reply = DMM.ReadExisting();
                textBox1.Text = reply;

                DMM.DiscardOutBuffer();

                DataAccess.DataTable limits = DataAccess.DataTable.New.ReadCsv(@"Voltage.csv");
                foreach (Row limitRow in limits.Rows)
                {
                    listBox5.Items.Add(limitRow["Current"]);
                }

                readingAccepted = listBox5.Items.Contains(textBox1.Text);
            }

            // A closed port and an unmatched reading are reported in the same way.
            if (readingAccepted)
            {
                led1.OffColor = Color.LimeGreen;
            }
            else
            {
                MessageBox.Show("Please check the DMM USB connections");
                led1.OffColor = Color.Red;
            }

            this.Refresh();
        }
Beispiel #7
0
        /// <summary>
        /// Apply a Where filter to a table. Rows are visited one at a time and only those
        /// accepted by the predicate are handed to the writer that produces the result.
        /// </summary>
        /// <param name="table">source table</param>
        /// <param name="fpSelector">predicate to execute on each row; return true to keep the row</param>
        /// <returns>a new table that copies out the accepted rows from the source table</returns>
        /// <exception cref="ArgumentNullException"><paramref name="table"/> or <paramref name="fpSelector"/> is null</exception>
        public static DataTable Where(DataTable table, Func<Row, bool> fpSelector)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException mid-enumeration.
            if (table == null)
            {
                throw new ArgumentNullException("table");
            }
            if (fpSelector == null)
            {
                throw new ArgumentNullException("fpSelector");
            }

            TableWriter writer = new TableWriter(table);

            foreach (Row row in table.Rows)
            {
                if (fpSelector(row))
                {
                    writer.AddRow(row);
                }
            }

            return writer.CloseAndGetTable();
        }
Beispiel #8
0
 /// <summary>
 /// Splits a (potentially very large) table into several smaller CSV files, keyed by the values in columnName.
 /// For each value, the destination file name is string.Format(templateFilename, value).
 /// The actual splitting is delegated to the Shred overload that accepts a writer factory.
 /// </summary>
 /// <param name="table">original table to shred</param>
 /// <param name="columnName">column name to use for shredding. <see cref="GetColumnValueCounts"/> can be used
 /// to inspect the variation in each column when choosing a column to shred on.
 /// </param>
 /// <param name="templateFilename">format string that yields the file name for a given column value.</param>
 public static void Shred(DataTable table, string columnName, string templateFilename)
 {
     // Factory invoked with a column value; opens the writer for that value's file.
     Func<string, TextWriter> openWriter =
         value => new StreamWriter(string.Format(templateFilename, value));

     Shred(table, columnName, openWriter);
 }
Beispiel #9
0
        /// <summary>
        /// Reads a CSV file that has METER_ADDRESS, READING_DATE and RAW_TELEGRAM columns into a
        /// new in-memory table named "tblRaw".
        /// </summary>
        /// <param name="filePath">path of the CSV file; the name of its parent directory is stored as GatewayId on every row</param>
        /// <returns>
        /// table with Id (primary key, a fresh Guid per row), MeterAddress, ReadingDate, RawTelegram and
        /// GatewayId columns. When reading or converting fails, the error is written to the console and
        /// the rows added up to that point are returned.
        /// </returns>
        private static System.Data.DataTable ReadFile(string filePath)
        {
            System.Data.DataTable newMeter = new System.Data.DataTable("tblRaw");

            // typeof(...) is checked at compile time, unlike Type.GetType("System....").
            DataColumn idColumn = newMeter.Columns.Add("Id", typeof(Guid));
            newMeter.Columns.Add("MeterAddress", typeof(string));
            newMeter.Columns.Add("ReadingDate", typeof(DateTime));
            newMeter.Columns.Add("RawTelegram", typeof(string));
            newMeter.Columns.Add("GatewayId", typeof(string));

            newMeter.PrimaryKey = new DataColumn[] { idColumn };

            try
            {
                DataAccess.DataTable dt = DataAccess.DataTable.New.ReadCsv(filePath);

                // The gateway is identified by the name of the directory that contains the file.
                var gateway = Path.GetFileName(Path.GetDirectoryName(filePath));

                foreach (Row row in dt.Rows)
                {
                    DataRow rowMeter = newMeter.NewRow();
                    rowMeter["Id"]           = Guid.NewGuid();
                    rowMeter["MeterAddress"] = row["METER_ADDRESS"];
                    rowMeter["ReadingDate"]  = row["READING_DATE"].CSVtoDateDateConvert();
                    rowMeter["RawTelegram"]  = row["RAW_TELEGRAM"];
                    rowMeter["GatewayId"]    = gateway;
                    newMeter.Rows.Add(rowMeter);
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the problem and return whatever was imported so far.
                // The message is written as plain text. Passing it as a composite format string
                // (as the earlier two-argument call did) throws FormatException whenever the
                // message contains '{' or '}'.
                Console.WriteLine($"Contact Admin: {ex.Message}");
            }

            newMeter.AcceptChanges();

            return newMeter;
        }
Beispiel #10
0
        /// <summary>
        /// Summarizes every column of a table: one summary row per source column, holding the column
        /// name, the number of distinct values, and the N most frequent values with their counts.
        /// </summary>
        /// <param name="table">source table</param>
        /// <param name="N">how many of the most frequent values to report per column; must not be negative</param>
        /// <returns>a summary table</returns>
        public static MutableDataTable GetColumnValueCounts(DataTable table, int N)
        {
            if (N < 0)
            {
                throw new ArgumentOutOfRangeException("N");
            }

            string[] sourceNames = table.ColumnNames.ToArray();
            int summaryRows = sourceNames.Length;

            MutableDataTable summary = new MutableDataTable();
            Column nameColumn = new Column("column name", summaryRows);
            Column distinctColumn = new Column("count", summaryRows);

            // Layout: [name, count, value 0, occurrence 0, value 1, occurrence 1, ...]
            const int leadingColumns = 2;
            Column[] layout = new Column[leadingColumns + N * 2];
            layout[0] = nameColumn;
            layout[1] = distinctColumn;

            for (int rank = 0; rank < N; rank++)
            {
                int slot = leadingColumns + rank * 2;
                layout[slot] = new Column("Top Value " + rank, summaryRows);
                layout[slot + 1] = new Column("Top Occurrence " + rank, summaryRows);
            }
            summary.Columns = layout;

            for (int columnId = 0; columnId < sourceNames.Length; columnId++)
            {
                Tuple<string, int>[] histogram = AsHistogram(table, columnId);

                nameColumn.Values[columnId] = sourceNames[columnId];
                distinctColumn.Values[columnId] = histogram.Length.ToString();

                // A column with fewer than N distinct values leaves the remaining cells untouched.
                int reported = Math.Min(N, histogram.Length);
                for (int rank = 0; rank < reported; rank++)
                {
                    int slot = leadingColumns + rank * 2;
                    layout[slot].Values[columnId] = histogram[rank].Item1;
                    layout[slot + 1].Values[columnId] = histogram[rank].Item2.ToString();
                }
            }

            return summary;
        }
Beispiel #11
0
 // Resolve a column name to its 0-based position among the table's column names.
 // The lookup is performed by Utility.GetColumnIndexFromName, which throws if the name is not found.
 private static int GetColumnIndexFromName(DataTable table, string columnName)
 {
     return Utility.GetColumnIndexFromName(table.ColumnNames.ToArray(), columnName);
 }
Beispiel #12
0
 // Returns the text that the table's SaveToStream writes into an in-memory writer.
 private static string TableToString(DataTable dt)
 {
     var buffer = new StringWriter();
     dt.SaveToStream(buffer);

     string text = buffer.ToString();
     return text;
 }
        // Write a DataTable to an AzureTable. An existing table with the same name is deleted and re-created.
        // DataTable's Rows are an unstructured property bag.
        //
        // table           - source rows; must not be null.
        // account         - storage account used to create the table client; must not be null.
        // tableName       - destination table name; checked by ValidateAzureTableName.
        // columnTypes     - type of the column, or null if column should be skipped. Length of columnTypes should be
        //                   the same as number of columns. The caller's array is not modified; a copy is mutated.
        // funcComputeKeys - maps (row index, row) to the partition and row key. When null, the function returned by
        //                   GetPartitionRowKeyFunc(columnNames) is used.
        //
        // Throws ArgumentNullException / ArgumentException for invalid arguments and InvalidOperationException for an
        // unsupported column type or for a 409 conflict reported by a full-batch save. Any other failure of that
        // save is rethrown unchanged rather than being dropped.
        public static void SaveToAzureTable(DataTable table, CloudStorageAccount account, string tableName, Type[] columnTypes, Func<int, Row, PartitionRowKey> funcComputeKeys)
        {
            if (table == null)
            {
                throw new ArgumentNullException("table");
            }
            if (account == null)
            {
                throw new ArgumentNullException("account");
            }
            if (columnTypes == null)
            {
                throw new ArgumentNullException("columnTypes");
            }
            if (tableName == null)
            {
                throw new ArgumentNullException("tableName");
            }
            ValidateAzureTableName(tableName);

            // Azure tables have "special" columns.
            // We can skip these by setting columnTypes[i] to null, which means don't write that column
            string[] columnNames = table.ColumnNames.ToArray();
            if (columnNames.Length != columnTypes.Length)
            {
                throw new ArgumentException(string.Format("columnTypes should have {0} elements", columnNames.Length), "columnTypes");
            }

            columnTypes = columnTypes.ToArray(); // create a copy for mutation.
            for (int i = 0; i < columnNames.Length; i++)
            {
                if (IsSpecialColumnName(columnNames[i]))
                {
                    columnTypes[i] = null;
                }
            }

            if (funcComputeKeys == null)
            {
                funcComputeKeys = GetPartitionRowKeyFunc(columnNames);
            }

            // Validate columnTypes: every non-skipped column must have an entry in _edmNameMapping.
            string[] edmTypeNames = Array.ConvertAll(columnTypes,
                 columnType => {
                     if (columnType == null)
                     {
                         return null;
                     }
                     string edmTypeName;
                     _edmNameMapping.TryGetValue(columnType, out edmTypeName);
                     if (edmTypeName == null)
                     {
                         // Unsupported type!
                         throw new InvalidOperationException(string.Format("Type '{0}' is not a supported type on azure tables", columnType.FullName));
                     }
                     return edmTypeName;
                 });

            var tableClient = account.CreateCloudTableClient();
            var tableReference = tableClient.GetTableReference(tableName);

            // Start from an empty table.
            if (tableReference.Exists())
            {
                tableReference.Delete();
            }

            tableReference.Create();

            GenericTableWriter w = new GenericTableWriter
            {
                _edmTypeNames = edmTypeNames,
                _columnNames = table.ColumnNames.ToArray()
            };

            // Batch rows for performance,
            // but all rows in the batch must have the same partition key
            TableServiceContext ctx = null;
            string lastPartitionKey = null;

            // Keys already placed in the current batch; reset whenever a new batch starts.
            HashSet<PartitionRowKey> dups = new HashSet<PartitionRowKey>();

            int rowCounter = 0;
            int batchSize = 0;
            foreach (Row row in table.Rows)
            {
                GenericWriterEntity entity = new GenericWriterEntity { _source = row };
                // Compute row and partition keys too.
                var partRow = funcComputeKeys(rowCounter, row);
                entity.PartitionKey = partRow.PartitionKey;
                entity.RowKey = partRow.RowKey;
                rowCounter++;

                // A change of partition key closes the current batch.
                if ((ctx != null) && (lastPartitionKey != null) && (lastPartitionKey != entity.PartitionKey))
                {
                    ctx.SaveChangesWithRetries(SaveChangesOptions.Batch | SaveChangesOptions.ReplaceOnUpdate);
                    ctx = null;
                }

                // Start a new batch when none is open.
                if (ctx == null)
                {
                    dups.Clear();
                    lastPartitionKey = null;
                    ctx = tableClient.GetTableServiceContext();
                    ctx.Format.UseAtom();
                    ctx.WritingEntity += new EventHandler<ReadingWritingEntityEventArgs>(w.ctx_WritingEntity);
                    batchSize = 0;
                }

                // Add entity to the current batch
                // Upsert means insert+Replace. But still need uniqueness within a batch.
                bool allowUpsert = true;

                // Check for dups within a batch.
                var key = new PartitionRowKey { PartitionKey = entity.PartitionKey, RowKey = entity.RowKey };
                bool dupWithinBatch = dups.Contains(key);
                dups.Add(key);

                if (allowUpsert)
                {
                    // Upsert allows overwriting existing keys. But still must be unique within a batch,
                    // so a row whose key is already in this batch is not attached.
                    if (!dupWithinBatch)
                    {
                        ctx.AttachTo(tableName, entity);
                        ctx.UpdateObject(entity);
                    }
                }
                else
                {
                    // AddObject requires uniqueness.
                    if (dupWithinBatch)
                    {
                        // Azure REST APIs will give us a horrible cryptic error (400 with no message).
                        // Provide users a useful error instead.
                        throw new InvalidOperationException(string.Format("Table has duplicate keys: {0}", key));
                    }

                    ctx.AddObject(tableName, entity);
                }

                lastPartitionKey = entity.PartitionKey;
                batchSize++;

                if (batchSize % UploadBatchSize == 0)
                {
                    // Beware, if keys collide within a batch, we get a very cryptic error and 400.
                    // If they collide across batches, we get a more useful 409 (conflict).
                    try
                    {
                        ctx.SaveChangesWithRetries(SaveChangesOptions.Batch | SaveChangesOptions.ReplaceOnUpdate);
                    }
                    catch (DataServiceRequestException de)
                    {
                        var e = de.InnerException as DataServiceClientException;
                        if (e != null)
                        {
                            if (e.StatusCode == 409)
                            {
                                // Conflict. Duplicate keys. We don't get the specific duplicate key.
                                // Server shouldn't do this if we support upsert.
                                // (although an old emulator that doesn't yet support upsert may throw it).
                                throw new InvalidOperationException(string.Format("Table has duplicate keys. {0}", e.Message));
                            }
                        }

                        // Any other failure means this batch was not saved. Propagate it instead of
                        // silently continuing with the next batch and losing the rows.
                        throw;
                    }
                    ctx = null;
                }
            }

            // Flush the final, partially filled batch.
            if (ctx != null)
            {
                ctx.SaveChangesWithRetries(SaveChangesOptions.Batch | SaveChangesOptions.ReplaceOnUpdate);
            }
        }
Beispiel #14
0
        /// <summary>
        /// Console entry point. Loads doors.csv, maps rows to DoorsRawData objects (that branch is
        /// currently switched off), calls InsertWorkingTimes for the values 1 through 31, and finally
        /// re-reads the CSV line by line, splitting each line on the double-quote character.
        /// </summary>
        /// <param name="args">command-line arguments (not used)</param>
        static void Main(string[] args)
        {
            const string doorsCsvPath = @"C:\Users\mehme\Documents\doors.csv";

            // NOTE(review): neither context is used below — confirm whether they are still needed.
            var whContext1    = new HR1Entities();
            var verpsContext1 = new VERPSEntities();

            DataAccess.DataTable dt = DataAccess.DataTable.New.ReadCsv(doorsCsvPath);

            foreach (Row row in dt.Rows)
            {
                // "1 == 2" is always false, so this import branch never runs as written.
                if (row["Card ID"].ToString().Trim() != "DA45A9BF000000" && 1 == 2)
                {
                    DoorsRawData drd = new DoorsRawData();

                    drd.Eventdatetime        = DateTime.Parse(row["Event date/time"]);
                    drd.EventdatetimeUTC     = DateTime.Parse(row["Event date/time UTC"]);
                    drd.OperationID          = Int16.Parse(row["Operation ID"]);
                    drd.Isexit               = Int16.Parse(row["Is exit"]);
                    drd.Operationdescription = row["Operation description"];
                    drd.Usertype             = Int16.Parse(row["User type"]);
                    drd.Username             = row["User name"];
                    drd.UserExtID            = row["User ExtID"];
                    drd.UserGPF1             = row["User GPF1"];
                    drd.UserGPF2             = row["User GPF2"];
                    drd.UserGPF3             = row["User GPF3"];
                    drd.UserGPF4             = row["User GPF4"];
                    drd.UserGPF5             = row["User GPF5"];
                    drd.Cardserialnumber     = row["Card serial number"];
                    drd.CardID               = row["Card ID"];
                    drd.Doorname             = row["Door name"];
                    drd.DoorExtID            = row["Door ExtID"];
                    drd.DoorGPF1             = row["Door GPF1"];
                    drd.DoorGPF2             = row["Door GPF2"];

                    // Persisting drd (DoorsRawDatas.Add followed by SaveChanges) is currently disabled.
                }
            }

            for (int i = 1; i < 32; i++)
            {
                InsertWorkingTimes(i);
            }

            List <string> listA = new List <string>();
            List <string> listB = new List <string>();

            // Dispose the reader (and the file stream it owns) once the file has been consumed.
            using (var reader = new StreamReader(File.OpenRead(doorsCsvPath)))
            {
                while (!reader.EndOfStream)
                {
                    var line   = reader.ReadLine();
                    var values = line.Split('"');

                    // NOTE(review): values[1] throws for a line that contains no double quote.
                    listA.Add(values[0]);
                    listB.Add(values[1]);
                }
            }
        }
 // Stores the supplied value list and table reference in this instance's fields.
 internal RowFromStreamingTable(IList<string> values, DataTable table)
 {
     this._table = table;
     this._values = values;
 }
Beispiel #16
0
        // Write a DataTable to an AzureTable. Any existing table with the same name is deleted first.
        // DataTable's Rows are an unstructured property bag.
        // columnTypes - type of the column, or null if column should be skipped. Length of columnTypes should be the same as number of columns.
        // funcComputeKeys - maps (row index, row) to the partition and row key; when null, the function
        //                   returned by GetPartitionRowKeyFunc(columnNames) is used.
        // Throws ArgumentNullException / ArgumentException for invalid arguments and InvalidOperationException
        // for a column type that has no entry in _edmNameMapping.
        public static void SaveToAzureTable(DataTable table, CloudStorageAccount account, string tableName, Type[] columnTypes, Func<int, Row, ParitionRowKey> funcComputeKeys)
        {
            if (table == null)
            {
                throw new ArgumentNullException("table");
            }
            if (account == null)
            {
                throw new ArgumentNullException("account");
            }
            if (columnTypes == null)
            {
                throw new ArgumentNullException("columnTypes");
            }
            if (tableName == null)
            {
                throw new ArgumentNullException("tableName");
            }
            ValidateAzureTableName(tableName);

            // Azure tables have "special" columns.
            // We can skip these by setting columnTypes[i] to null, which means don't write that column
            string[] columnNames = table.ColumnNames.ToArray();
            if (columnNames.Length != columnTypes.Length)
            {
                throw new ArgumentException(string.Format("columnTypes should have {0} elements", columnNames.Length), "columnTypes");
            }

            columnTypes = columnTypes.ToArray(); // create a copy for mutation.
            for (int i = 0; i < columnNames.Length; i++)
            {
                if (IsSpecialColumnName(columnNames[i]))
                {
                    columnTypes[i] = null;
                }
            }

            if (funcComputeKeys == null)
            {
                funcComputeKeys = GetPartitionRowKeyFunc(columnNames);
            }

            // Validate columnTypes: every non-skipped column must map to an EDM type name.
            string [] edmTypeNames = Array.ConvertAll(columnTypes,
                 columnType => {
                     if (columnType == null)
                     {
                         // Skipped column: no EDM type name.
                         return null;
                     }
                     string edmTypeName;
                     _edmNameMapping.TryGetValue(columnType, out edmTypeName);
                     if (edmTypeName == null)
                     {
                         // Unsupported type!
                         throw new InvalidOperationException(string.Format("Type '{0}' is not a supported type on azure tables", columnType.FullName));
                     }
                     return edmTypeName;
                 });

            CloudTableClient tableClient = account.CreateCloudTableClient();

            // Start from an empty table.
            tableClient.DeleteTableIfExist(tableName);
            tableClient.CreateTableIfNotExist(tableName);

            GenericTableWriter w = new GenericTableWriter
            {
                _edmTypeNames = edmTypeNames,
                _columnNames = table.ColumnNames.ToArray()
            };

            // Batch rows for performance,
            // but all rows in the batch must have the same partition key
            TableServiceContext ctx = null;
            string lastPartitionKey = null;

            int rowCounter = 0;
            int batchSize = 0;
            foreach (Row row in table.Rows)
            {
                GenericWriterEntity entity = new GenericWriterEntity { _source = row };
                // Compute row and partition keys too.
                var partRow = funcComputeKeys(rowCounter, row);
                entity.PartitionKey = partRow.PartitionKey;
                entity.RowKey = partRow.RowKey;
                rowCounter++;

                // A change of partition key closes the current batch,
                // because all rows in the batch must have the same partition key
                if ((ctx != null) && (lastPartitionKey != null) && (lastPartitionKey != entity.PartitionKey))
                {
                    ctx.SaveChangesWithRetries(SaveChangesOptions.Batch | SaveChangesOptions.ReplaceOnUpdate);
                    ctx = null;
                }

                // Start a new batch when none is open.
                if (ctx == null)
                {
                    lastPartitionKey = null;
                    ctx = tableClient.GetDataServiceContext();
                    ctx.WritingEntity += new EventHandler<ReadingWritingEntityEventArgs>(w.ctx_WritingEntity);
                    batchSize = 0;
                }

                // Add entity to the current batch
                ctx.AddObject(tableName, entity);
                lastPartitionKey = entity.PartitionKey;
                batchSize++;

                // Save and close the batch after every 50 entities.
                if (batchSize % 50 == 0)
                {
                    ctx.SaveChangesWithRetries(SaveChangesOptions.Batch | SaveChangesOptions.ReplaceOnUpdate);
                    ctx = null;
                }
            }

            // Flush the final, partially filled batch.
            if (ctx != null)
            {
                ctx.SaveChangesWithRetries(SaveChangesOptions.Batch | SaveChangesOptions.ReplaceOnUpdate);
            }
        }
Beispiel #17
0
        // Copies every cell of the table into the 2d dictionary, keyed by (counter as text, column name).
        // The counter is advanced once per row, so consecutive calls keep numbering rows onward.
        static void Add(DataTable table, Dictionary2d<string, string, string> dict, ref int counter)
        {
            foreach (var row in table.Rows)
            {
                string rowKey = counter.ToString();

                // Column names and values are paired up by position.
                int position = 0;
                foreach (var columnName in row.ColumnNames)
                {
                    dict[rowKey, columnName] = row.Values[position];
                    position++;
                }

                counter++;
            }
        }
Beispiel #18
0
        // Copies every cell of the table into the 2d dictionary, keyed by
        // (counter as text, lower-cased column name). The counter is advanced once per row.
        static void Add(DataTable table, Dictionary2d<string, string, string> dict, ref int counter)
        {
            // Lower-case the column names once, up front, using the invariant culture.
            string[] loweredNames = table.ColumnNames
                .Select(columnName => columnName.ToLowerInvariant())
                .ToArray();

            foreach (var row in table.Rows)
            {
                string rowKey = counter.ToString();

                // Names and values are paired up by position.
                for (int position = 0; position < loweredNames.Length; position++)
                {
                    dict[rowKey, loweredNames[position]] = row.Values[position];
                }

                counter++;
            }
        }
Beispiel #19
0
 // Maps each requested column name to its index in the table (via GetColumnIndexFromName),
 // returning the indices in the same order as the names.
 private static int[] GetColumnIndexFromNames(DataTable table, string[] columnNames)
 {
     Converter<string, int> toIndex = name => GetColumnIndexFromName(table, name);
     return Array.ConvertAll(columnNames, toIndex);
 }
Beispiel #20
0
        /// <summary>
        /// Copies at most the first topN rows of a table into a new mutable table.
        /// Handy for taking a small sample of a large table and then saving that sample.
        /// </summary>
        /// <param name="table">source table</param>
        /// <param name="topN">positive number of rows to copy from the source table</param>
        /// <returns>a mutable table holding the first topN rows (fewer if the source is shorter)</returns>
        public static MutableDataTable SampleTopN(DataTable table, int topN)
        {
            if (topN <= 0)
            {
                throw new ArgumentOutOfRangeException("topN", "sample must be a positive integer");
            }

            TableWriter writer = new TableWriter(table);

            int remaining = topN;
            foreach (var row in table.Rows)
            {
                writer.AddRow(row);
                remaining--;

                // Stop right after the last wanted row so that no further row is pulled from the source.
                if (remaining == 0)
                {
                    break;
                }
            }

            var sample = writer.CloseAndGetTable();
            return DataTable.New.GetMutableCopy(sample);
        }
Beispiel #21
0
        /// <summary>
        /// Checks Analyze.GetColumnValueCounts with N = 1: for each source column the summary is
        /// expected to list the column name, its count, and the single top value with its occurrence count.
        /// </summary>
        /// <param name="dtOriginal">table to summarize.
        /// NOTE(review): the expected text below presumes a specific first/last/age fixture — confirm against the caller.</param>
        void GetColumnValueCounts(DataTable dtOriginal)
        {
            MutableDataTable result = Analyze.GetColumnValueCounts(dtOriginal, 1);

            // Expected summary: a header line followed by one line per source column.
            AssertEquals(
            @"column name,count,Top Value 0,Top Occurrence 0
            first,3,Bob,2
            last,2,Smith,3
            age,2,12,2
            ", result);
        }
Beispiel #22
0
        /// <summary>
        /// Find all rows that have dups for the given columns.
        /// This uses a multi-pass algorithm to operate on a large data file.
        /// </summary>
        /// <remarks>
        /// Pass 1 stores only an integer hash per row and notes which hashes occur more than once.
        /// Pass 2 re-reads the table and, for rows whose hash was flagged, compares the actual
        /// column values so that hash collisions are not reported as duplicates.
        /// Within a duplicate group, the first row is written just before the second one is found;
        /// later members are written as they are encountered.
        /// A null value and an empty string in a compared column are treated as equal.
        /// </remarks>
        /// <param name="table">original table</param>
        /// <param name="columnNames">set of columns to compare to look for duplicates</param>
        /// <returns>a table that's a subset of the original table</returns>
        public static DataTable SelectDuplicates(DataTable table, params string[] columnNames)
        {
            int[] ci = GetColumnIndexFromNames(table, columnNames);

            // Store on hash keys first. Use hash keys because they're compact and efficient for large data sets
            // But then we do need to handle collisions.
            HashSet<int> allKeys = new HashSet<int>();
            HashSet<int> possibleDups = new HashSet<int>();

            //
            // Take a first pass and store the hash of each row's unique Key
            //
            foreach (Row row in table.Rows)
            {
                int hash = CalcHash(row.Values, ci);

                // HashSet.Add returns false when the hash was already present,
                // i.e. some earlier row produced the same hash.
                if (!allKeys.Add(hash))
                {
                    possibleDups.Add(hash);
                }
            }
            allKeys = null; // Free up for GC

            //
            // Now take a second pass through the dups.
            //

            // Maps the exact key to the first row seen with that key. The value is set to
            // null once that first row has been written, so it is emitted only once.
            Dictionary<string, Row> fullMatch = new Dictionary<string, Row>();

            StringBuilder sb = new StringBuilder();

            TableWriter writer = new TableWriter(table);

            foreach (Row row in table.Rows)
            {
                var parts = row.Values;
                int hash = CalcHash(parts, ci);
                if (!possibleDups.Contains(hash))
                {
                    continue;
                }

                // Potential match: build an exact key from the compared columns.
                // Each value is length-prefixed so that values which themselves contain
                // the delimiter cannot make two different tuples produce the same key
                // (e.g. ("a,b","c") versus ("a","b,c")).
                sb.Clear();
                foreach (int i in ci)
                {
                    string value = parts[i] ?? string.Empty;
                    sb.Append(value.Length);
                    sb.Append(':');
                    sb.Append(value);
                }
                string key = sb.ToString();

                Row firstLine;
                if (fullMatch.TryGetValue(key, out firstLine))
                {
                    if (firstLine != null)
                    {
                        // Second member of the group: emit the remembered first row now.
                        writer.AddRow(firstLine);
                        fullMatch[key] = null;
                    }

                    // Real dup!
                    writer.AddRow(row);
                }
                else
                {
                    fullMatch[key] = row;
                }
            }

            return writer.CloseAndGetTable();
        }
Beispiel #23
0
 /// <summary>
 /// Extract a column, identified by name, as a histogram sorted in descending order by frequency.
 /// </summary>
 /// <param name="table">source table</param>
 /// <param name="columnName">name of the column within the table to summarize</param>
 /// <returns>collection of tuples, where each tuple is a value and the count of that value within the column</returns>
 public static Tuple<string, int>[] AsHistogram(DataTable table, string columnName)
 {
     // Resolve the name once, then defer to the index-based overload.
     return AsHistogram(table, GetColumnIndexFromName(table, columnName));
 }
Beispiel #24
0
        /// <summary>
        /// Given a potentially extremely large table, shred it into smaller CSV files based on the values in columnName.
        /// This can be very useful for easily building an index for a large file.
        /// For each unique value in the column, funcCreateStream is invoked once with that value to obtain a TextWriter;
        /// a header line followed by every row carrying that value is written to that writer.
        /// The ordering within each small file is preserved.
        /// This stream based overload is useful when you need to avoid writing to the local file system (such as with Azure storage).
        /// </summary>
        /// <param name="table">original table to shred</param>
        /// <param name="columnName">column name to use for shredding. You can use <see cref="GetColumnValueCounts"/>
        /// to see the variation in each column to determine a good column to use for shredding.
        /// </param>
        /// <param name="funcCreateStream">callback function to create a stream for each new table.</param>
        public static void Shred(DataTable table, string columnName, Func<string, TextWriter> funcCreateStream)
        {
            // One open writer per distinct value seen so far in the shredding column.
            Dictionary<string, TextWriter> writersByValue = new Dictionary<string, TextWriter>();

            try
            {
                foreach (Row row in table.Rows)
                {
                    string shredKey = row[columnName];

                    TextWriter target;
                    if (!writersByValue.TryGetValue(shredKey, out target))
                    {
                        // First time this value is seen: create its writer and register it
                        // before writing anything, so the finally block closes it even if
                        // writing the header fails.
                        target = funcCreateStream(shredKey);
                        writersByValue.Add(shredKey, target);
                        CsvWriter.RawWriteLine(table.ColumnNames, target); // header
                    }

                    CsvWriter.RawWriteLine(row.Values, target);
                }
            }
            finally
            {
                // Close every writer that was handed out, whether or not shredding completed.
                foreach (TextWriter opened in writersByValue.Values)
                {
                    opened.Close();
                }
            }
        }
Beispiel #25
0
 /// <summary>
 /// Creates the exporter and immediately saves the given table by calling SaveCSV with the target location.
 /// </summary>
 /// <param name="LocationToSave">destination passed straight through to SaveCSV</param>
 /// <param name="DtPatients">table to save</param>
 public CsvExport(string LocationToSave, DataTable DtPatients)
 {
     // All work happens here; the constructor keeps no state of its own.
     DtPatients.SaveCSV(LocationToSave);
 }
Beispiel #26
0
 /// <summary>
 /// Creates a writer whose output table starts out with a copy of the source table's column names.
 /// </summary>
 /// <param name="source">table whose column names are copied</param>
 public TableWriter(DataTable source)
 {
     _table = new ViewTable
     {
         // ToArray() takes a snapshot, in case source is mutable.
         _ColumnNames = source.ColumnNames.ToArray()
     };
 }