/// <summary>
/// Previews the configured input file: reads rows from the top of the file and writes each one to the
/// log as a comma-joined line, stopping once Cfg.Preview rows have been shown or the file is exhausted.
/// </summary>
/// <param name="SplitCols">A Dictionary of column splitting specifiers. The key is a column name, and the
/// value is the value within a field upon which to split. E.g. an entry with key "foo" and value ":" splits
/// the foo column into foo and foo_descr on the colon character. When null, rows are shown un-split.</param>
public static void Preview(Dictionary<string, string> SplitCols)
{
    List<ProfileColumn> Columns = ProfileColumn.BuildColumnList(SplitCols);
    Dictionary<int, string> OrdinalsToSplit = SplitOrdinalsToDict(Columns);

    // The argument never changes while reading, so decide once whether rows get split.
    bool SplitRequested = SplitCols != null;

    Log.InformationMessage("Previewing the file");

    using (FileReader Reader = FileReader.NewFileReader(Cfg.File, Cfg.Prep))
    {
        // RowNumber is the 1-based count of rows shown so far, including the current one.
        for (int RowNumber = 1; ; ++RowNumber)
        {
            List<string> Fields = Reader.ReadLine();
            if (Fields == null)
            {
                // end of input
                break;
            }

            List<string> Shown = SplitRequested
                ? ProfileColumn.SplitFields(Fields, OrdinalsToSplit)
                : Fields;

            Log.InformationMessage(string.Join(",", Shown.ToArray()));

            // Checked after the row is logged, so a file with any rows always shows at least one.
            if (RowNumber >= Cfg.Preview)
            {
                break;
            }
        }
    }
}
/// <summary>
/// Reads the configured input file (Cfg.File) row by row and, depending on configuration, profiles it
/// and/or writes a prepped copy. For each row the method 1) optionally splits fields, 2) rejects rows whose
/// field count differs from the column list, 3) writes accepted rows to the tab-delimited prep file when
/// prepping, and 4) feeds each field to its ProfileColumn when profiling. Header/footer removal, line
/// skipping and un-quoting are not done in this method; presumably the FileReader handles them.
/// </summary>
/// <param name="Cols">A List of ProfileColumn instances to populate with data type information accumulated from the
/// file data</param>
/// <param name="PrepFileFqpn">The fully-qualified path name of the prepped file generated by the method. If the
/// method does not perform prepping, then the output variable will be set to null</param>
/// <param name="ShouldSplit">True if columns splitting should be performed. If true then the Cols arg will have
/// already been split and so the method will split the input lines to align with the Cols list</param>
/// <returns>The number of rows read from the file, including rows that were rejected as error rows</returns>
/// <exception cref="LoadException">Thrown when the number of error rows exceeds Cfg.MaxErrors (unless
/// Cfg.MaxErrors is negative), or when every row read was an error row (which includes the case where
/// no rows were read at all)</exception>
private static int ProcessOneFile(List<ProfileColumn> Cols, out string PrepFileFqpn, bool ShouldSplit)
{
    int InRows = 0;
    int ErrRows = 0;

    {
        string Msg = "Processing file: {0}";
        if (Cfg.Profile && Cfg.Prep)
        {
            Msg = "Profiling and Prepping file: {0}";
        }
        else if (Cfg.Profile)
        {
            Msg = "Profiling file: {0}";
        }
        else if (Cfg.Prep)
        {
            Msg = "Prepping file: {0}";
        }
        Log.InformationMessage(Msg, Cfg.File);
    }

    bool DoFreqs = !string.IsNullOrEmpty(Cfg.FreqFile);
    bool HaveErrFile = !string.IsNullOrEmpty(Cfg.ErrFile);

    // Only used to offset the row numbers reported for error rows.
    int SkipLines = Math.Max(Cfg.SkipLines, Cfg.HeaderLine);

    // UTC so the elapsed time is not distorted by a DST / local-offset change during a long run.
    DateTime Start = DateTime.UtcNow;
    long BytesRead = 0;
    decimal FileSize = new FileInfo(Cfg.File).Length;
    Dictionary<int, string> SplitOrdinals = SplitOrdinalsToDict(Cols);

    using (FileReader Rdr = FileReader.NewFileReader(Cfg.File, Cfg.Prep))
    {
        PrepFileFqpn = Cfg.Prep ? MakePrepFileName(Cfg.File, Cfg.PrepDir) : null;

        // A using statement accepts a null resource, so the writer exists only when prepping and is
        // closed on every exit path.
        using (StreamWriter PrepFileWriter = PrepFileFqpn != null ? new StreamWriter(PrepFileFqpn) : null)
        {
            List<string> InFields = null;
            while ((InFields = Rdr.ReadLine()) != null)
            {
                if (ShouldSplit)
                {
                    InFields = ProfileColumn.SplitFields(InFields, SplitOrdinals);
                }
                ++InRows;

                if (Cols != null && InFields.Count != Cols.Count)
                {
                    // ERROR -- column count mismatch
                    ++ErrRows;
                    if (HaveErrFile)
                    {
                        LogErrorRecord(SkipLines + InRows, InFields, Cfg.ErrFile);
                    }
                    if (Cfg.MaxErrors >= 0 && ErrRows > Cfg.MaxErrors) // Cfg.MaxErrors value of -1 means ignore all errors
                    {
                        throw new LoadException(string.Format("File {0} has a different number of columns than expected on row {1}. Expected: {2} Found: {3}",
                            Cfg.File, SkipLines + InRows, Cols.Count, InFields.Count));
                    }
                    // The error row is neither written to the prep file nor profiled.
                    continue;
                }

                if (PrepFileWriter != null)
                {
                    // prepped file is always tab-delimited
                    PrepFileWriter.WriteLine(string.Join("\t", InFields.ToArray()));
                }

                if (Cfg.Profile)
                {
                    for (int FldNum = 0; FldNum < InFields.Count; ++FldNum)
                    {
                        Cols[FldNum].Profile(InFields[FldNum], DoFreqs, Cfg.Typed);
                    }
                }

                if (InRows % 10000 == 0)
                {
                    // something to look at for large files; guard the division against a zero file size
                    decimal PctComplete = FileSize > 0 ? Rdr.TotBytesRead / FileSize : 0m;
                    Log.InformationMessage("{0} rows ({1:P0})", InRows, PctComplete);
                }
            }
            BytesRead = Rdr.TotBytesRead;
        }
    }

    // Only claim that records were written when an error file was actually configured; the summary
    // line below reports the error row count in every case.
    if (ErrRows != 0 && HaveErrFile)
    {
        Log.InformationMessage("{0} error records were written to error file: {1}", ErrRows, Cfg.ErrFile);
    }
    Log.InformationMessage("Rows read: {0:n0} -- Error rows: {1:n0} -- Bytes read: {2:n0} -- Elapsed time (HH:MM:SS.Milli): {3}",
        InRows, ErrRows, BytesRead, DateTime.UtcNow - Start);

    if (InRows == ErrRows)
    {
        // something went way wrong
        throw new LoadException("No records in the input file matched the expected format.");
    }
    return InRows;
}