Example #1
0
        /// <summary>
        /// Entry point for profiling and pre-processing the configured file. Pre-processing involves 1) removing
        /// headers and footers, 2) skipping lines, 3) un-quoting fields, and 4) splitting fields. The column list
        /// is built here (taking the split specifiers into account) and the file work is delegated to
        /// ProcessOneFile.
        /// </summary>
        /// <param name="PrepFileFqpn">Receives the fully-qualified path name of the prepped file, exactly as set by
        /// ProcessOneFile. Null when no prepping was performed</param>
        /// <param name="SplitCols">A Dictionary of column splitting specifiers. The key is a column name, and the
        /// value is the text within a field upon which to split. E.g. &lt;"foo",":"&gt; splits the foo column into
        /// foo and foo_descr on the colon character. Pass null to request no splitting</param>
        /// <returns>The List of ProfileColumn instances that was built and handed to ProcessOneFile.</returns>
        public static List <ProfileColumn> Process(out string PrepFileFqpn, Dictionary <string, string> SplitCols)
        {
            // Splitting is requested simply by supplying a (non-null) dictionary of specifiers
            bool SplittingRequested = SplitCols != null;
            List <ProfileColumn> Columns = ProfileColumn.BuildColumnList(SplitCols);

            // The row count returned by ProcessOneFile is intentionally not used here
            ProcessOneFile(Columns, out PrepFileFqpn, SplittingRequested);

            return Columns;
        }
Example #2
0
        /// <summary>
        /// Builds the CREATE TABLE statement for the configured target table and writes it to the log
        /// as an informational message. Nothing is executed against the server.
        /// </summary>
        /// <param name="Cols">A List of ProfileColumn with column name and data type information</param>
        static void ShowDDL(List <ProfileColumn> Cols)
        {
            // Columns are created as varchar whenever typed output was NOT requested
            string Ddl = ProfileColumn.GenerateCreateTableStatement(Cols, Cfg.Tbl, !Cfg.Typed);

            Log.InformationMessage("Target table creation DDL statement:\n====================================\n{0}", Ddl);
        }
Example #3
0
        /// <summary>
        /// Saves the frequency information held by the supplied columns to the configured frequency file,
        /// logging a message before and after the save.
        /// </summary>
        /// <param name="Cols">A List of ProfileColumn with frequency information</param>
        static void GenerateFreqs(List <ProfileColumn> Cols)
        {
            string TargetFile = Cfg.FreqFile;

            Log.InformationMessage("Generating frequencies to frequency file: {0}", TargetFile);
            ProfileColumn.SaveFrequencies(Cols, TargetFile);
            Log.InformationMessage("Frequency generation completed");
        }
Example #4
0
        /// <summary>
        /// Creates the target table by generating its CREATE TABLE statement and executing that statement
        /// against the configured server and database.
        /// </summary>
        /// <param name="Cols">A List of ProfileColumn with column name and data type information</param>
        static void CreateTable(List <ProfileColumn> Cols)
        {
            string Target = Cfg.Tbl;
            Log.InformationMessage("Creating table: {0}", Target);

            // Columns are created as varchar whenever typed output was NOT requested
            string Ddl = ProfileColumn.GenerateCreateTableStatement(Cols, Target, !Cfg.Typed);
            ServerUtils.ExecSql(Cfg.Server, Cfg.Db, Ddl);
        }
Example #5
0
        /// <summary>
        /// Reads a user-specified number of lines (Cfg.Preview) from the top of the file, writes each one to the
        /// log as a comma-joined list of fields, and then returns. Because the row count is checked after a line
        /// has been displayed, at least one line is shown whenever the file yields any line at all.
        /// </summary>
        /// <param name="SplitCols">A Dictionary of column splitting specifiers. The key is a column name, and the
        /// value is the text within a field upon which to split. E.g. &lt;"foo",":"&gt; splits the foo column into
        /// foo and foo_descr on the colon character. When null, lines are displayed without splitting</param>
        public static void Preview(Dictionary <string, string> SplitCols)
        {
            int InRows = 0;
            List <ProfileColumn>     Cols          = ProfileColumn.BuildColumnList(SplitCols);
            Dictionary <int, string> SplitOrdinals = SplitOrdinalsToDict(Cols);

            Log.InformationMessage("Previewing the file");
            using (FileReader Rdr = FileReader.NewFileReader(Cfg.File, Cfg.Prep))
            {
                List <string> InFields = null;
                while ((InFields = Rdr.ReadLine()) != null)
                {
                    if (SplitCols != null)
                    {
                        InFields = ProfileColumn.SplitFields(InFields, SplitOrdinals);
                    }
                    // The joined line is raw file data, so it is passed as an argument rather than as the
                    // message template: data containing '{' or '}' must not be interpreted as format items.
                    Log.InformationMessage("{0}", string.Join(",", InFields.ToArray()));
                    if (++InRows >= Cfg.Preview)
                    {
                        break;
                    }
                }
            }
        }
Example #6
0
        /// <summary>
        /// Profiles and pre-processes a file. Pre-processing involves 1) removing headers and footers, 2) skipping lines,
        /// 3) un-quoting fields, and 4) splitting fields. Each line read is optionally split, checked against the
        /// expected column count, optionally written to the prepped file (tab-delimited), and optionally profiled.
        /// Rows with an unexpected column count are counted as errors, optionally logged to the error file, and
        /// otherwise skipped.
        /// </summary>
        /// <param name="Cols">A List of ProfileColumn instances to populate with data type information accumulated from the
        /// file data</param>
        /// <param name="PrepFileFqpn">The fully-qualified path name of the prepped file generated by the method. If the
        /// method does not perform prepping (Cfg.Prep is false), then the output variable will be set to null</param>
        /// <param name="ShouldSplit">True if column splitting should be performed. If true then the Cols arg will have
        /// already been split and so the method will split the input lines to align with the Cols list</param>
        /// <returns>The total number of rows read from the file, including rows that were counted as errors.</returns>
        /// <exception cref="LoadException">Thrown when the number of column-count errors exceeds Cfg.MaxErrors
        /// (unless Cfg.MaxErrors is negative), or when every row read was an error row. The latter also applies
        /// when no rows were read at all.</exception>
        private static int ProcessOneFile(List <ProfileColumn> Cols, out string PrepFileFqpn, bool ShouldSplit)
        {
            int InRows  = 0;
            int ErrRows = 0;
            {
                // Pick the start-up message that matches the combination of work being done
                string Msg = "Processing file: {0}";
                if (Cfg.Profile && Cfg.Prep)
                {
                    Msg = "Profiling and Prepping file: {0}";
                }
                else if (Cfg.Profile)
                {
                    Msg = "Profiling file: {0}";
                }
                else if (Cfg.Prep)
                {
                    Msg = "Prepping file: {0}";
                }
                Log.InformationMessage(Msg, Cfg.File);
            }
            bool     DoFreqs    = !string.IsNullOrEmpty(Cfg.FreqFile);
            bool     HasErrFile = !string.IsNullOrEmpty(Cfg.ErrFile);
            // Offset added to the in-loop row counter so that reported row numbers refer to lines in the source file
            int      SkipLines  = Math.Max(Cfg.SkipLines, Cfg.HeaderLine);
            // UTC is used so the elapsed time is not distorted by daylight-saving or time zone changes
            DateTime Start      = DateTime.UtcNow;
            long     BytesRead  = 0;
            decimal  FileSize   = new FileInfo(Cfg.File).Length;

            Dictionary <int, string> SplitOrdinals = SplitOrdinalsToDict(Cols);

            using (FileReader Rdr = FileReader.NewFileReader(Cfg.File, Cfg.Prep))
            {
                List <string> InFields       = null;
                StreamWriter  PrepFileWriter = null;
                if ((PrepFileFqpn = Cfg.Prep ? MakePrepFileName(Cfg.File, Cfg.PrepDir) : null) != null)
                {
                    PrepFileWriter = new StreamWriter(PrepFileFqpn);
                }
                try
                {
                    while ((InFields = Rdr.ReadLine()) != null)
                    {
                        if (ShouldSplit)
                        {
                            InFields = ProfileColumn.SplitFields(InFields, SplitOrdinals);
                        }
                        ++InRows;
                        if (Cols != null && InFields.Count != Cols.Count)
                        {
                            // ERROR -- column count mismatch
                            ++ErrRows;
                            if (HasErrFile)
                            {
                                LogErrorRecord(SkipLines + InRows, InFields, Cfg.ErrFile);
                            }
                            if (Cfg.MaxErrors >= 0 && ErrRows > Cfg.MaxErrors) // Cfg.MaxErrors value of -1 means ignore all errors
                            {
                                throw new LoadException(string.Format("File {0} has a different number of columns than expected on row {1}. Expected: {2} Found: {3}", Cfg.File, SkipLines + InRows, Cols.Count, InFields.Count));
                            }
                            // Error rows are neither written to the prepped file nor profiled
                            continue;
                        }
                        if (PrepFileWriter != null)
                        {
                            // prepped file is always tab-delimited
                            PrepFileWriter.WriteLine(string.Join("\t", InFields.ToArray()));
                        }
                        if (Cfg.Profile)
                        {
                            for (int FldNum = 0; FldNum < InFields.Count; ++FldNum)
                            {
                                Cols[FldNum].Profile(InFields[FldNum], DoFreqs, Cfg.Typed);
                            }
                        }
                        if (InRows % 10000 == 0)
                        {
                            decimal PctComplete = Rdr.TotBytesRead / FileSize;
                            Log.InformationMessage("{0} rows ({1:P0})", InRows, PctComplete); // something to look at for large files
                        }
                    }
                    BytesRead = Rdr.TotBytesRead;
                }
                finally
                {
                    // Always release the prepped file, even when a LoadException aborts the read loop
                    if (PrepFileWriter != null)
                    {
                        PrepFileWriter.Close();
                    }
                }
            }
            if (ErrRows != 0)
            {
                // Only claim that records were written when an error file was actually configured
                if (HasErrFile)
                {
                    Log.InformationMessage("{0} error records were written to error file: {1}", ErrRows, Cfg.ErrFile);
                }
                else
                {
                    Log.InformationMessage("{0} error records were encountered (no error file configured)", ErrRows);
                }
            }
            Log.InformationMessage("Rows read: {0:n0} -- Error rows: {1:n0} -- Bytes read: {2:n0} -- Elapsed time (HH:MM:SS.Milli): {3}", InRows, ErrRows, BytesRead, DateTime.UtcNow - Start);
            if (InRows == ErrRows)
            {
                // something went way wrong: every row was rejected (or no rows were read at all)
                throw new LoadException("No records in the input file matched the expected format.");
            }
            return InRows;
        }