Example #1
0
        /// <summary>
        /// Scans the leading records of a delimited text file, builds one histogram of observed
        /// native types per column, and turns those histograms into a column report.
        /// </summary>
        /// <param name="fileName">Path of the delimited file to scan.</param>
        /// <param name="containsHeader">
        /// When true, the first line is handed to GetColumnOrder to obtain the column names and is
        /// not counted as data. When false, the first line is scanned as the first data record and
        /// the columns are named "Column0", "Column1", ... after their histogram key.
        /// </param>
        /// <param name="delimiter">Character used to split every line into fields.</param>
        /// <param name="firstK">
        /// Number of data records after which scanning stops. The limit is checked after a record
        /// has been processed, so one record is scanned even when this value is zero or negative.
        /// </param>
        /// <returns>
        /// A report holding the columns for which InferNativeType reported success, the names of the
        /// columns for which it did not, and the number of data records that were scanned.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// The file does not exist, or its first line is missing/empty (or yields no columns).
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// A scanned record does not have the same number of fields as the first line.
        /// </exception>
        public static DataColumnReport CreateColumnReport(string fileName, bool containsHeader, char delimiter, int firstK)
        {
            if (!File.Exists(fileName))
            {
                throw new ArgumentException(String.Format("File {0} does not exist", fileName));
            }
            IDictionary <string, int> columnToOrder = null;
            IDictionary <int, Histogram <ColumnNativeType> > histograms;
            var cnt = 0;

            using (var stream = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            {
                using (var reader = new StreamReader(stream))
                {
                    var colCount = -1;
                    var line     = reader.ReadLine();
                    if (!String.IsNullOrEmpty(line))
                    {
                        if (containsHeader)
                        {
                            columnToOrder = GetColumnOrder(line, delimiter);
                            colCount      = columnToOrder.Count;
                            // The header is consumed; move on to the first data record.
                            line = reader.ReadLine();
                        }
                        else
                        {
                            // No header: the line only tells us the column count and is then
                            // scanned as data below. Rewinding the stream instead would leave
                            // the reader's internal buffer out of sync with the stream position
                            // (first record skipped, later records read twice).
                            colCount = line.Split(delimiter).Length;
                        }
                    }
                    if (colCount <= 0)
                    {
                        throw new ArgumentException(String.Format("File {0} is empty", fileName));
                    }
                    histograms = InitializeHistograms(colCount);
                    while (line != null)
                    {
                        var parsed = line.Split(delimiter);
                        if (parsed.Length != colCount)
                        {
                            throw new InvalidOperationException(String.Format("There is a non-conformant record in line {0} of file {1} ", cnt, fileName));
                        }
                        for (int i = 0; i < colCount; i++)
                        {
                            var nativeType = DataColumn.GetNativeType(parsed[i]);
                            var hist       = histograms[i];
                            if (hist.ContainsKey(nativeType))
                            {
                                ++hist[nativeType];
                            }
                            else
                            {
                                hist.Add(nativeType, 1);
                            }
                        }
                        ++cnt;
                        if (cnt >= firstK)
                        {
                            break;
                        }
                        line = reader.ReadLine();
                    }
                }
            }

            // Map every column order back to a name.
            var orderToColumn = new Dictionary <int, string>();

            if (columnToOrder != null)
            {
                // NOTE(review): assumes GetColumnOrder assigns exactly the orders used as
                // histogram keys; confirm against its implementation.
                foreach (var item in columnToOrder)
                {
                    orderToColumn.Add(item.Value, item.Key);
                }
            }
            else
            {
                // Header-less file: there are no names to invert, so synthesize one per
                // histogram key instead of dereferencing the null header map.
                foreach (var item in histograms)
                {
                    orderToColumn.Add(item.Key, String.Format("Column{0}", item.Key));
                }
            }
            var collection  = new DataColumnCollection();
            var listInvalid = new List <string>();

            foreach (var item in histograms)
            {
                var order   = item.Key;
                var colName = orderToColumn[order];
                var hist    = item.Value;
                var kv      = InferNativeType(hist);
                if (kv.Key)
                {
                    // kv.Key is the success flag, kv.Value the inferred native type.
                    var nativeType      = kv.Value;
                    var measurementType = DataColumn.GetDefaultMeasurementType(nativeType);
                    var dc = new DataColumn(colName, order)
                    {
                        NativeType = nativeType, MeasurementType = measurementType
                    };
                    collection.Add(colName, dc);
                }
                else
                {
                    listInvalid.Add(colName);
                }
            }
            return(new DataColumnReport {
                ColumnCollection = collection, InvalidColumns = listInvalid, ScannedNumOfLines = cnt
            });
        }