Exemplo n.º 1
0
        /// <summary>
        /// Loads the CSV formatter model by reading the file named by
        /// <c>inputFormatFile</c> and deserializing its JSON contents.
        /// </summary>
        /// <returns>The <c>CsvFormatter</c> produced by the JSON deserializer.</returns>
        /// <exception cref="Exception">
        /// Any exception raised while reading or deserializing the file is written
        /// to the console in red and then rethrown to the caller.
        /// </exception>
        static CsvFormatter loadModelFormatFile()
        {
            try
            {
                // get file contents
                string json = File.ReadAllText(inputFormatFile);

                // construct the csv formatter model
                return JsonConvert.DeserializeObject<CsvFormatter>(json);
            }
            catch (Exception ex)
            {
                sendToConsole(ConsoleColor.Red, ex.Message.ToString());

                // rethrow with "throw;" (not "throw ex;") so the original stack trace is preserved
                throw;
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Program entry point.  Processes the command line, loads the model format
        /// file, validates the requested partitioning (at most two columns, each a
        /// date or integer column), computes how many template files each partition
        /// produces, checks that those counts are multiples of each other, and then
        /// writes one model format file per template (or one per template and
        /// parallel slot when parallelism is greater than one).
        /// Every validation failure is reported to the console in red and ends the run.
        /// </summary>
        /// <param name="args">Not read directly here; ProcessCmdLineArgs is called without arguments.</param>
        static void Main(string[] args)
        {
            // first process cmd line args
            if (!ProcessCmdLineArgs())
            {
                sendToConsole(ConsoleColor.Red, "Failed to process command line arguments.");
                return;
            }

            // did we just show help?
            if (displayHelp)
            {
                return;
            }

            // we only support 2 partitions at this time
            if (partitionColumns.Length > 2 || partitionSizes.Length > 2)
            {
                sendToConsole(ConsoleColor.Red, "A maximum of two partitioning columns is supported.");
                return;
            }

            // every partitioning column is paired by index with a size below, so the
            // two lists must have the same length (a mismatch previously crashed with
            // an index-out-of-range error)
            if (partitionColumns.Length != partitionSizes.Length)
            {
                sendToConsole(ConsoleColor.Red, "Each partitioning column must have exactly one partition size.");
                return;
            }

            // first we must load the format file
            try
            {
                _csv = loadModelFormatFile();
            }
            catch (Exception)
            {
                sendToConsole(ConsoleColor.Red, String.Format(@"Error occurred loading format file '{0}'", inputFormatFile.ToString()));
                return;
            }

            // initialize prevValue array (elements of a new string[] are already null)
            prevValue = new string[partitionColumns.Length];

            // count the model's columns so partition column ids can be range checked
            int columnCount = 0;

            foreach (CsvFormatter.column counted in _csv.columns)
            {
                columnCount++;
            }

            // parsed column ids, kept so later stages do not need to re-parse them
            int[] columnIds = new int[partitionColumns.Length];

            // at this point we have the model file loaded
            // now we must make sure we are using supported data types
            for (int i = 0; i < partitionColumns.Length; i++)
            {
                int _id;

                if (!int.TryParse(partitionColumns[i], out _id))
                {
                    sendToConsole(ConsoleColor.Red, String.Format(@"Unable to read column id for partition."));
                    return;
                }

                if (_id < 0 || _id >= columnCount)
                {
                    sendToConsole(ConsoleColor.Red, String.Format(@"Partition column id {0} is out of range for the format file.", _id.ToString()));
                    return;
                }

                columnIds[i] = _id;

                if (_csv.columns[_id].dataType != CsvFormatter.dataTypes.Date && _csv.columns[_id].dataType != CsvFormatter.dataTypes.Integer)
                {
                    sendToConsole(ConsoleColor.Red, String.Format(@"Column {0} is not a supported datatype for partitioning.  Supported data types are date and integer.", _id.ToString()));
                    return;
                }

                // NOTE(review): this only rejects a column when BOTH a value list file and a
                // default value are set; the message suggests either one alone may have been
                // meant to be rejected (||) - confirm before changing.
                if (_csv.columns[_id].valueListFile != "" && _csv.columns[_id].defaultValue != "")
                {
                    sendToConsole(ConsoleColor.Red, String.Format(@"Default values and value lists are not supported for partitioning in column {0}", _id.ToString()));
                    return;
                }
            }

            // if using parallelism, we must make sure monotonics are not used.  Parallelism does not support monotonically increasing values because we do not coordinate across processes to make sure duplicates are not introduced.
            // we simply change the row count and file account to allow generation to happen in parallel across multiple cores.
            if (parallelism > 1)
            {
                // we check for monotonics in any column
                foreach (CsvFormatter.column c in _csv.columns)
                {
                    if (c.monotonic == true)
                    {
                        sendToConsole(ConsoleColor.Red, "The use of monotonically increasing values is not supported when parallelism is enabled for templates.");
                        return;
                    }

                    // a date column whose min and max are equal is rejected with the same message
                    if (c.dataType == CsvFormatter.dataTypes.Date && (c.minvalue == c.maxValue))
                    {
                        sendToConsole(ConsoleColor.Red, "The use of monotonically increasing values is not supported when parallelism is enabled for templates.");
                        return;
                    }
                }
            }


            // validate that the ranges are equal multiples of each other
            // here we must calculate the number of files each range will produce and make sure it is a) a multiple of the total run files defined and b) a multiple of the total run rows defined
            // and c) a multiple of the other ranges.  It must be a multiple of other ranges because lower numbered ranges, those with a smaller defined size, will be duplicated and this must be done
            // on the template file boundary.  For example, if we have a range of integers in a column from 1 to 120,000,000 - then a partition size of 1,000,000 will produce 120 template files.  Other
            // partitioned columns must align to this boundary.  For a date column that has a range of one year and is partitioned monthly (30 days), this would work fine as a size of 30 days would yield
            // 12 files and the boundaries between the two align -->  120 % 12 = 0.   If they did not align, for instance we used a range of 100,000,000 and wrote 100 template files on the first partition,
            // we'd have no way to align them correctly as 100 % 12 = 4 (misaligned as we'd need to have two covering ranges for 1 column in that template - not possible).

            // calculate the target file count
            for (int t = 0; t < partitionColumns.Length; t++)
            {
                int _colid = columnIds[t];

                if (_csv.columns[_colid].dataType == CsvFormatter.dataTypes.Date)
                {
                    // calculate range on dates
                    DateTime _max;
                    DateTime _min;
                    int      _size;

                    if (!DateTime.TryParse(_csv.columns[_colid].maxValue, out _max))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out max value for column '{0}' for alignment validation.", partitionColumns[t].ToString()));
                        return;
                    }

                    if (!DateTime.TryParse(_csv.columns[_colid].minvalue, out _min))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out min value for column '{0}' for alignment validation.", partitionColumns[t].ToString()));
                        return;
                    }

                    // now we need the size passed
                    if (!int.TryParse(partitionSizes[t], out _size))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out size value for column '{0}' for alignment validation.", partitionSizes[t].ToString()));
                        return;
                    }

                    // get time span
                    TimeSpan ts = (_max - _min);

                    // get normalized size
                    int ds = normalizeDate(_size);

                    switch (ds)
                    {
                    case 7:
                        values[t] = (Int64)(ts.TotalDays / 7);
                        break;

                    case 30:
                        values[t] = (Int64)(ts.TotalDays / 30);
                        break;

                    case 365:
                    default:
                        // template generation below steps by one year for any other
                        // normalized size, so count files the same way here
                        values[t] = (Int64)(ts.TotalDays / 365);
                        break;
                    }
                }
                else
                {
                    // calculate range on ints
                    Int64 _max;
                    Int64 _min;
                    Int64 _size;

                    if (!Int64.TryParse(_csv.columns[_colid].maxValue, out _max))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out max value for column '{0}' for alignment validation.", partitionColumns[t].ToString()));
                        return;
                    }

                    if (!Int64.TryParse(_csv.columns[_colid].minvalue, out _min))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out min value for column '{0}' for alignment validation.", partitionColumns[t].ToString()));
                        return;
                    }

                    // now we need the size passed
                    if (!Int64.TryParse(partitionSizes[t], out _size))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out size value for column '{0}' for alignment validation.", partitionSizes[t].ToString()));
                        return;
                    }

                    // the size is used as a divisor, so it must be positive
                    if (_size <= 0)
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Partition size '{0}' must be greater than zero.", partitionSizes[t].ToString()));
                        return;
                    }

                    // now we get the file count
                    values[t] = (((_max - _min) + 1) / _size);
                }

                // the file count is used as a divisor in the alignment check and in the
                // generation loop, so a partition that yields no files cannot be processed
                if (values[t] <= 0)
                {
                    sendToConsole(ConsoleColor.Red, String.Format(@"The range and partition size for column '{0}' do not produce at least one template file.", partitionColumns[t].ToString()));
                    return;
                }
            }

            // check for alignment and set file count
            if (partitionColumns.Length > 1)
            {
                Int64 larger  = Math.Max(values[0], values[1]);
                Int64 smaller = Math.Min(values[0], values[1]);

                // the partition producing the most files drives the template count
                templateFiles = larger;

                // now check alignment
                if (larger % smaller != 0)
                {
                    sendToConsole(ConsoleColor.Red, "The partition values supplied are not aligned.  Partition sizes and ranges must be aligned by being multiples of each other.");
                    return;
                }
            }
            else
            {
                // there is only one value
                templateFiles = values[0];
            }


            // generate templates
            for (Int64 f = 0; f < templateFiles; f++)
            {
                // loop once for each file that should be written

                // first get a copy of the model format file
                try
                {
                    _csvPartition = loadModelFormatFile();
                }
                catch (Exception)
                {
                    sendToConsole(ConsoleColor.Red, String.Format(@"Error occurred loading format file '{0}'", inputFormatFile.ToString()));
                    return;
                }

                _csvPartition.numberOfFiles = (_csv.numberOfFiles / (int)templateFiles);
                _csvPartition.numberOfRows  = (_csv.numberOfRows / templateFiles);

                // loop through partitioning columns
                for (int w = 0; w < partitionColumns.Length; w++)
                {
                    // id of the model column this partition applies to; all column
                    // access below uses this id rather than the partition index w
                    int _colid = columnIds[w];

                    // NOTE(review): for the smaller partition this starts a new range every
                    // values[w] files; confirm whether the intended stride is
                    // templateFiles / values[w] instead.
                    if (values[w] == templateFiles || f % values[w] == 0)
                    {
                        // we only want to calculate new ranges *IF* the calculated number of templates files is eql to the templateFiles var - which means we are on the column controlling rows/files ratio or the largest
                        // template count column
                        // --OR--
                        // if we are aligned on count with the number of files into templateFiles

                        if (_csvPartition.columns[_colid].dataType == CsvFormatter.dataTypes.Date)
                        {
                            // we have a date
                            // calculate range on dates
                            DateTime _max;
                            DateTime _min;
                            int      __sze;

                            if (!DateTime.TryParse(_csv.columns[_colid].minvalue, out _min))
                            {
                                sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out min value for column '{0}' for alignment validation.", partitionColumns[w].ToString()));
                                return;
                            }

                            // now we need the size passed
                            if (!int.TryParse(partitionSizes[w], out __sze))
                            {
                                sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out size value for column '{0}' for alignment validation.", partitionSizes[w].ToString()));
                                return;
                            }

                            // normalize
                            __sze = normalizeDate(__sze);

                            // if we have a previous value, use it instead
                            if (prevValue[w] != null)
                            {
                                _min = DateTime.Parse(prevValue[w]);
                            }


                            // now we add the size
                            switch (__sze)
                            {
                            case 7:
                                _max = _min.AddDays(7);
                                break;

                            case 30:
                                _max = _min.AddMonths(1);
                                break;

                            case 365:
                                _max = _min.AddYears(1);
                                break;

                            default:
                                _max = _min.AddYears(1);
                                break;
                            }

                            // now we set (max is exclusive, so the stored upper bound is one day earlier)
                            _csvPartition.columns[_colid].minvalue = _min.ToString("MM/dd/yyyy");
                            _csvPartition.columns[_colid].maxValue = _max.AddDays(-1).ToString("MM/dd/yyyy");

                            // update previous Values
                            prevValue[w] = _max.ToString();
                        }
                        else
                        {
                            // we have an integer
                            Int64 _max;
                            Int64 _min;
                            Int64 __sze;

                            if (!Int64.TryParse(_csv.columns[_colid].minvalue, out _min))
                            {
                                sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out min value for column '{0}' for alignment validation.", partitionColumns[w].ToString()));
                                return;
                            }

                            // now we need the size passed
                            if (!Int64.TryParse(partitionSizes[w], out __sze))
                            {
                                sendToConsole(ConsoleColor.Red, String.Format(@"Unable to parse out size value for column '{0}' for alignment validation.", partitionSizes[w].ToString()));
                                return;
                            }

                            // if we have a previous value, use it instead
                            if (prevValue[w] != null)
                            {
                                _min = Int64.Parse(prevValue[w]);
                            }

                            // now we add the size
                            _max = _min + __sze;

                            // max is exclusive, so the stored upper bound is one less
                            _csvPartition.columns[_colid].minvalue = _min.ToString();
                            _csvPartition.columns[_colid].maxValue = (_max - 1).ToString();

                            // update previous values
                            prevValue[w] = _max.ToString();
                        }
                    }
                    else
                    {
                        // if we don't calculate a new range, we must copy the old range - which we should save
                        _csvPartition.columns[_colid].minvalue = _shadowCSV.columns[_colid].minvalue;
                        _csvPartition.columns[_colid].maxValue = _shadowCSV.columns[_colid].maxValue;
                    }
                } // inner for

                if (parallelism > 1)
                {
                    // split this template's file and row counts across the parallel slots
                    _csvPartition.numberOfFiles = (_csvPartition.numberOfFiles / parallelism);
                    _csvPartition.numberOfRows  = (_csvPartition.numberOfRows / parallelism);

                    for (int p = 0; p < parallelism; p++)
                    {
                        if (!saveModelFormatFile((int)f, (int)p))
                        {
                            sendToConsole(ConsoleColor.Red, String.Format(@"Failed to save parallel model format file #{0}-{1}", f.ToString(), p.ToString()));
                            return;
                        }
                    }
                }
                else
                {
                    if (!saveModelFormatFile((int)f))
                    {
                        sendToConsole(ConsoleColor.Red, String.Format(@"Failed to save model format file #{0}", f.ToString()));
                        return;
                    }
                }

                // save shadow copy
                _shadowCSV = _csvPartition;
            } // outer for
        }     // main