/// <summary>
///   Guesses whether the file is most likely NOT a delimited text file.
/// </summary>
/// <param name="setting"><see cref="ICsvFile" /> with the information</param>
/// <returns><c>true</c> if this is most likely not a delimited file</returns>
public static bool GuessNotADelimitedFile(ICsvFile setting)
{
    Contract.Requires(setting != null);
    using (var improvedStream = ImprovedStream.OpenRead(setting))
    using (var streamReader = new StreamReader(improvedStream.Stream, setting.GetEncoding(), setting.ByteOrderMark))
    {
        // Ignore the leading rows the setting asks to skip
        for (int i = 0; i < setting.SkipRows; i++)
        {
            streamReader.ReadLine();
        }

        // NOTE(review): 300 is presumably the maximum number of rows that get sampled - confirm
        var dc = GetDelimiterCounter(streamReader, '\0', 300);

        // A candidate separator that shows up in at least 90% of the counted rows is
        // taken as proof that the file is delimited
        for (var sep = 0; sep < dc.Separators.Length; sep++)
        {
            if (dc.SeparatorRows[sep] >= dc.LastRow * 9 / 10)
            {
                // The message now matches the result: false means the file IS delimited
                Log.Info("Delimited file");
                return false;
            }
        }
    }

    // No candidate separator was present in enough rows
    Log.Info("Not a delimited file");
    return true;
}
/// <summary>
///   Opens the file described by the setting and determines the row the data starts at.
/// </summary>
/// <param name="setting"><see cref="ICsvFile" /> with the information</param>
/// <returns>
///   The number of rows to skip, as determined by the reader based overload
/// </returns>
public static int GuessStartRow(ICsvFile setting)
{
    Contract.Requires(setting != null);
    using (var source = ImprovedStream.OpenRead(setting))
    {
        using (var reader = new StreamReader(source.Stream, setting.GetEncoding(), setting.ByteOrderMark))
        {
            // The actual detection is delegated to the overload working on a reader
            var fieldDelimiter = setting.FileFormat.FieldDelimiterChar;
            var fieldQualifier = setting.FileFormat.FieldQualifierChar;
            return GuessStartRow(reader, fieldDelimiter, fieldQualifier);
        }
    }
}
/// <summary>
///   Opens the file described by the setting, skips the configured number of rows and
///   tries to guess the new line sequence.
/// </summary>
/// <param name="setting"><see cref="ICsvFile" /> with the information</param>
/// <returns>The NewLine combination used, as determined by the reader based overload</returns>
public static string GuessNewline(ICsvFile setting)
{
    Contract.Requires(setting != null);
    using (var source = ImprovedStream.OpenRead(setting))
    {
        using (var reader = new StreamReader(source.Stream, setting.GetEncoding(), setting.ByteOrderMark))
        {
            // Consume the rows that should not be part of the analysis
            var skipped = 0;
            while (skipped < setting.SkipRows)
            {
                reader.ReadLine();
                skipped++;
            }

            var fieldQualifier = setting.FileFormat.FieldQualifierChar;
            return GuessNewline(reader, fieldQualifier);
        }
    }
}
/// <summary>
///   Guesses the delimiter of a file: opens the file described by the setting, skips the
///   configured number of rows and hands the reader to the reader based overload.
/// </summary>
/// <param name="setting">The CSVFile fileSetting</param>
/// <returns>
///   A string with the assumed delimiter for the file, never <c>null</c>
/// </returns>
/// <remarks>
///   According to the original notes the detection does a rather simple csv parsing and
///   prefers the delimiter with the least variance in the read rows, falling back to the
///   delimiter with the highest number of occurrences; that logic lives in the overload.
/// </remarks>
public static string GuessDelimiter(ICsvFile setting)
{
    Contract.Requires(setting != null);
    Contract.Ensures(Contract.Result<string>() != null);
    using (var source = ImprovedStream.OpenRead(setting))
    {
        using (var reader = new StreamReader(source.Stream, setting.GetEncoding(), setting.ByteOrderMark))
        {
            // Consume the rows that should not be part of the analysis
            var skipped = 0;
            while (skipped < setting.SkipRows)
            {
                reader.ReadLine();
                skipped++;
            }

            var escapeCharacter = setting.FileFormat.EscapeCharacterChar;
            return GuessDelimiter(reader, escapeCharacter);
        }
    }
}
/// <summary>
///   Refreshes the text box with up to 32000 characters of the file, starting at the
///   position given by the vertical scroll bar. If reading fails, the exception messages
///   are shown in the text box instead.
/// </summary>
private void UpdateView()
{
    m_DisplayedAt = ScrollBarVertical.Value;
    if (string.IsNullOrEmpty(m_CsvFile.FileName))
    {
        return;
    }

    try
    {
        // procDisp is not referenced, it is only kept alive (and disposed) for the duration of the read
        using (var procDisp = new ProcessDisplayTime(System.Threading.CancellationToken.None))
        using (var istream = ImprovedStream.OpenRead(m_CsvFile))
        using (var sr = new StreamReader(istream.Stream, m_CsvFile.GetEncoding(), m_CsvFile.ByteOrderMark))
        {
            // Some stream do not support seek...
            if (istream.Stream.CanSeek)
            {
                istream.Stream.Seek(m_DisplayedAt, SeekOrigin.Begin);
                if (m_DisplayedAt != 0)
                {
                    // We most likely landed in the middle of a line: read on to the next line break
                    var read = sr.Read();
                    while (read != 13 && read != 10 && !sr.EndOfStream)
                    {
                        read = sr.Read();
                    }

                    // Consume the second character of a CR LF or LF CR pair
                    var next = sr.Peek();
                    if (read == 13 && next == 10 || read == 10 && next == 13)
                    {
                        sr.Read();
                    }
                }
            }
            else
            {
                // Without seeking only the start of the file can be shown, scrolling is pointless
                ScrollBarVertical.Enabled = false;
            }

            var buffer = new char[32000];

            // Read may return fewer characters than requested when the underlying stream
            // hands out data in small pieces; ReadBlock keeps reading until the buffer is
            // full or the end of the stream is reached, so the view is not cut short.
            var len = sr.ReadBlock(buffer, 0, buffer.Length);
            CSVTextBox.Text = new string(buffer, 0, len);
        }
    }
    catch (Exception exc)
    {
        CSVTextBox.Text = exc.ExceptionMessages();
    }
}
/// <summary>
///   Resets the stream, the text reader and the internal buffer to the beginning of the
///   file, resets line and record counters, and then reads past the number of rows
///   configured as SkipRows.
/// </summary>
private void ResetPositionToStart()
{
    // Open the stream on first use
    if (m_ImprovedStream == null)
    {
        m_ImprovedStream = ImprovedStream.OpenRead(m_CsvFile);
    }

    m_ImprovedStream.ResetToStart((Stream source) =>
    {
        var canReuseReader = source.CanSeek && m_TextReader != null;
        if (canReuseReader)
        {
            // The existing reader stays usable, only its buffered data has to go
            m_TextReader.DiscardBufferedData();
            return;
        }

        // Without seeking (or without a reader so far) a new reader on the stream is needed
        if (m_TextReader != null)
        {
            m_TextReader.Dispose();
        }

        m_TextReader = new StreamReader(source, m_CsvFile.GetEncoding(), m_CsvFile.ByteOrderMark);
    });

    m_CsvFile.CurrentEncoding = m_TextReader.CurrentEncoding;

    // Nothing is buffered any more
    m_BufferPos = 0;
    m_BufferFilled = 0;

    // The end line starts out as 1; per the original note the start line is set
    // to this value later on, as the line is read
    EndLineNumber = 1;
    RecordNumber = 0;
    m_EndOfLine = false;
    EndOfFile = false;

    // Read past the configured number of lines; the comparison is <= because
    // EndLineNumber starts at 1
    while (EndLineNumber <= m_CsvFile.SkipRows && !EndOfFile && !CancellationToken.IsCancellationRequested)
    {
        ReadToEOL();
    }
}
/// <summary>
///   Guesses the code page ID of a file by inspecting up to the first 256 kBytes.
/// </summary>
/// <param name="setting">The CSVFile fileSetting</param>
/// <remarks>
///   The result is stored in the setting: <c>CodePageId</c> and <c>ByteOrderMark</c> are set.
/// </remarks>
public static void GuessCodePage(ICsvFile setting)
{
    Contract.Requires(setting != null);

    // Read up to 256 kBytes
    var buff = new byte[262144];
    var length = 0;
    using (var fileStream = ImprovedStream.OpenRead(setting))
    {
        // A single Stream.Read call may return fewer bytes than requested even though
        // the end of the stream is not reached, so keep reading until the buffer is
        // full or the stream is exhausted.
        int read;
        while (length < buff.Length
               && (read = fileStream.Stream.Read(buff, length, buff.Length - length)) > 0)
        {
            length += read;
        }
    }

    // A byte order mark, if present, decides the code page
    if (length >= 2)
    {
        var byBom = EncodingHelper.GetCodePageByByteOrderMark(buff);
        if (byBom != 0)
        {
            setting.ByteOrderMark = true;
            setting.CodePageId = byBom;
            return;
        }
    }

    setting.ByteOrderMark = false;
    var detected = EncodingHelper.GuessCodePageNoBom(buff, length);

    // ASCII will be reported as UTF-8, UTF8 includes ASCII as subset
    if (detected == 20127)
    {
        detected = 65001;
    }

    Log.Info("Detected Code Page: " + EncodingHelper.GetEncodingName(detected, true, setting.ByteOrderMark));
    setting.CodePageId = detected;
}
/// <summary>
///   Checks whether the field qualifier (quoting character) is actually used in the file,
///   meaning it is found at the start of a column, optionally preceded by white space.
/// </summary>
/// <param name="setting">The setting.</param>
/// <param name="token">The cancellation token; the scan ends with <c>false</c> once cancellation is requested.</param>
/// <returns>
///   <c>true</c> if a qualifier was found at the start of a column; otherwise, <c>false</c>.
/// </returns>
public static bool HasUsedQualifier(ICsvFile setting, CancellationToken token)
{
    Contract.Requires(setting != null);

    // Without a qualifier being defined there is nothing to look for
    if (string.IsNullOrEmpty(setting.FileFormat.FieldQualifier) || token.IsCancellationRequested)
    {
        return false;
    }

    using (var source = ImprovedStream.OpenRead(setting))
    {
        using (var reader = new StreamReader(source.Stream, setting.GetEncoding(), setting.ByteOrderMark))
        {
            // Consume the rows that should not be part of the analysis
            var skipped = 0;
            while (skipped < setting.SkipRows)
            {
                reader.ReadLine();
                skipped++;
            }

            var buffer = new char[262144];

            // The very first character is the start of a column
            var atColumnStart = true;
            while (!reader.EndOfStream)
            {
                var charsRead = reader.ReadBlock(buffer, 0, buffer.Length - 1);

                // Look for Delimiter [Whitespace] Qualifier or StartOfLine [Whitespace] Qualifier
                for (var index = 0; index < charsRead; index++)
                {
                    if (token.IsCancellationRequested)
                    {
                        return false;
                    }

                    var ch = buffer[index];
                    if (ch == '\r' || ch == '\n' || ch == setting.FileFormat.FieldDelimiterChar)
                    {
                        // A line break or delimiter means a new column begins with the next character
                        atColumnStart = true;
                    }
                    else if (atColumnStart)
                    {
                        // A qualifier at the start of a column is a real qualifier, we can stop
                        if (ch == setting.FileFormat.FieldQualifierChar)
                        {
                            return true;
                        }

                        // Only white space keeps us at the start of the column:
                        // space and tab up to U+00FF, space separators beyond that
                        if (ch <= '\x00ff')
                        {
                            atColumnStart = ch == ' ' || ch == '\t';
                        }
                        else
                        {
                            atColumnStart = CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.SpaceSeparator;
                        }
                    }
                }
            }
        }
    }

    return false;
}