/// <inheritdoc />
/// <remarks>
/// Memory-maps the entire file and feeds the mapped bytes to the tokenizer
/// directly, with no intermediate read buffer.
/// </remarks>
protected override unsafe void ProcessCore(CsvReaderVisitorBase visitor)
{
    var tokenizer = new CsvTokenizer(_delimiter);
    using (var fl = new FileStream(_csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan))
    {
        long length = fl.Length;
        if (length == 0)
        {
            // empty file: nothing to map, just signal end-of-stream.
            tokenizer.ProcessEndOfStream(visitor);
            return;
        }

        using (var memoryMappedFile = MemoryMappedFile.CreateFromFile(fl, null, 0, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true))
        using (var accessor = memoryMappedFile.CreateViewAccessor(0, 0, MemoryMappedFileAccess.Read))
        {
            var handle = accessor.SafeMemoryMappedViewHandle;
            byte* ptr = null;

            // constrained execution region: ensures the finally block runs so
            // an acquired pointer is always paired with ReleasePointer.
            RuntimeHelpers.PrepareConstrainedRegions();
            try
            {
                handle.AcquirePointer(ref ptr);
                if (_ignoreUTF8ByteOrderMark)
                {
                    // compare against at most the first 3 bytes of the file;
                    // a file shorter than 3 bytes that matches a BOM prefix is
                    // swallowed too (consistent with the other BOM eaters).
                    var head = new ReadOnlySpan<byte>(UTF8BOM, 0, length < 3 ? unchecked((int)length) : 3);
                    if (head.SequenceEqual(new ReadOnlySpan<byte>(ptr, head.Length)))
                    {
                        length -= head.Length;
                        ptr += head.Length;
                    }
                }

                // a span can only cover int.MaxValue bytes, so files larger
                // than that are fed to the tokenizer in maximal chunks.
                while (length > int.MaxValue)
                {
                    tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(ptr, int.MaxValue), visitor);
                    length -= int.MaxValue;
                    ptr += int.MaxValue;
                }

                tokenizer.ProcessNextChunk(new ReadOnlySpan<byte>(ptr, unchecked((int)length)), visitor);
                tokenizer.ProcessEndOfStream(visitor);
            }
            finally
            {
                // only release if AcquirePointer actually succeeded.
                if (ptr != null)
                {
                    handle.ReleasePointer();
                }
            }
        }
    }
}
/// <summary>
/// Reads the first bytes of <paramref name="csvStream"/> and skips a leading
/// UTF-8 BOM if one is present, forwarding any non-BOM bytes to the tokenizer.
/// Returns <see langword="true"/> if the stream ended during the check (the
/// visitor has already been told about end-of-stream), otherwise
/// <see langword="false"/> and the caller should keep reading.
/// </summary>
private static async ValueTask <bool> EatUTF8BOMAsync(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, Stream csvStream, byte[] readBuffer, IProgress <int> progress, CancellationToken cancellationToken)
{
    if (readBuffer.Length < 3)
    {
        // the buffer can't even hold a full BOM; a 3-byte one-off allocation
        // is not worth pooling.
        readBuffer = new byte[3];
    }

    int totalRead = 0;
    while (totalRead < 3)
    {
        int readLength = await csvStream.ReadAsync(readBuffer, totalRead, readBuffer.Length - totalRead, cancellationToken).ConfigureAwait(false);

        // not every stream honors the token it was given, so enforce
        // cancellation ourselves (it's just a volatile read, so keep it light).
        cancellationToken.ThrowIfCancellationRequested();

        if (readLength != 0)
        {
            totalRead += readLength;
            continue;
        }

        // the stream ended before we saw 3 bytes.  forward whatever was
        // buffered, unless it happens to be a prefix of the UTF-8 BOM.
        if (totalRead != 0)
        {
            var partial = new ReadOnlySpan <byte>(readBuffer, 0, totalRead);
            if (!partial.SequenceEqual(new ReadOnlySpan <byte>(UTF8BOM, 0, totalRead)))
            {
                tokenizer.ProcessNextChunk(partial, visitor);
            }

            progress?.Report(totalRead);
        }

        tokenizer.ProcessEndOfStream(visitor);
        progress?.Report(0);
        return true;
    }

    var data = new ReadOnlyMemory <byte>(readBuffer, 0, totalRead);
    if (data.Span.StartsWith(UTF8BOM))
    {
        data = data.Slice(3);
    }

    tokenizer.ProcessNextChunk(data.Span, visitor);
    progress?.Report(totalRead);
    return false;
}
/// <summary>
/// Tokenizes <paramref name="fileData"/> with a header-aware buffering visitor,
/// feeding the tokenizer in chunks of <paramref name="chunkLength"/> bytes, and
/// returns the buffered records.
/// </summary>
public static List <string[]> TokenizeHeaderedCsvFileUsingCursivelyWithTheseHeaderLimits(ReadOnlySpan <byte> fileData, int chunkLength, byte delimiter, int maxHeaderCount, int maxHeaderLength)
{
    var visitor = new HeaderedStringBufferingVisitor(maxHeaderCount, maxHeaderLength);
    var tokenizer = new CsvTokenizer(delimiter);

    // push full-size chunks through, then flush whatever is left over.
    for (; fileData.Length > chunkLength; fileData = fileData.Slice(chunkLength))
    {
        tokenizer.ProcessNextChunk(fileData.Slice(0, chunkLength), visitor);
    }

    tokenizer.ProcessNextChunk(fileData, visitor);
    tokenizer.ProcessEndOfStream(visitor);
    return visitor.Records;
}
/// <inheritdoc />
protected override void ProcessCore(CsvReaderVisitorBase visitor)
{
    var tokenizer = new CsvTokenizer(_delimiter);
    bool skipBOM = _ignoreUTF8ByteOrderMark;
    var sequence = _sequence;

    // a single-segment sequence can take the simpler contiguous-memory path.
    if (sequence.IsSingleSegment)
    {
        CsvReadOnlyMemoryInput.ProcessFullSegment(sequence.First.Span, skipBOM, tokenizer, visitor);
        return;
    }

    var enumerator = sequence.GetEnumerator();
    if (skipBOM && EatUTF8BOM(tokenizer, visitor, ref enumerator))
    {
        // the sequence ended while eating the BOM; EatUTF8BOM has already
        // signaled end-of-stream.
        return;
    }

    while (enumerator.MoveNext())
    {
        tokenizer.ProcessNextChunk(enumerator.Current.Span, visitor);
    }

    tokenizer.ProcessEndOfStream(visitor);
}
/// <summary>
/// Counts the rows in <paramref name="csvFile"/> by running its raw bytes
/// through the tokenizer with a row-counting visitor.
/// </summary>
public long CountRowsUsingCursivelyRaw(CsvFile csvFile)
{
    var tokenizer = new CsvTokenizer();
    var rowCounter = new RowCountingVisitor();

    tokenizer.ProcessNextChunk(csvFile.FileData, rowCounter);
    tokenizer.ProcessEndOfStream(rowCounter);

    return rowCounter.RowCount;
}
/// <summary>
/// Synchronous counterpart of the async BOM eater: skips a leading UTF-8 BOM
/// on <paramref name="csvStream"/>, forwarding any non-BOM bytes it consumed.
/// Returns <see langword="true"/> if the stream ended during the check (the
/// visitor has already seen end-of-stream), otherwise <see langword="false"/>.
/// </summary>
private static bool EatUTF8BOM(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, Stream csvStream, byte[] readBuffer)
{
    if (readBuffer.Length < 3)
    {
        // too small to hold a full BOM; a tiny one-off allocation is fine.
        readBuffer = new byte[3];
    }

    int totalRead = 0;
    while (totalRead < 3)
    {
        int readLength = csvStream.Read(readBuffer, totalRead, readBuffer.Length - totalRead);
        if (readLength != 0)
        {
            totalRead += readLength;
            continue;
        }

        // end of stream before 3 bytes: forward the buffered bytes unless
        // they form a prefix of the UTF-8 BOM.
        if (totalRead != 0)
        {
            var partial = new ReadOnlySpan <byte>(readBuffer, 0, totalRead);
            if (!partial.SequenceEqual(new ReadOnlySpan <byte>(UTF8BOM, 0, totalRead)))
            {
                tokenizer.ProcessNextChunk(partial, visitor);
            }
        }

        tokenizer.ProcessEndOfStream(visitor);
        return true;
    }

    var data = new ReadOnlySpan <byte>(readBuffer, 0, totalRead);
    if (data.StartsWith(UTF8BOM))
    {
        data = data.Slice(3);
    }

    tokenizer.ProcessNextChunk(data, visitor);
    return false;
}
public void NullVisitorShouldBeFine(string filePath)
{
    // arrange
    ReadOnlySpan <byte> fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, filePath));
    var tokenizer = new CsvTokenizer();

    // act: a null visitor must be tolerated by both entry points.
    tokenizer.ProcessNextChunk(fileData, null);
    tokenizer.ProcessEndOfStream(null);

    // assert (empty): not throwing is the assertion.
}
/// <summary>
/// Runs a single contiguous buffer through the tokenizer, optionally skipping
/// a leading UTF-8 BOM (or a BOM prefix, when the buffer is shorter than 3
/// bytes), and then signals end-of-stream.
/// </summary>
internal static void ProcessFullSegment(ReadOnlySpan <byte> bytes, bool ignoreUTF8ByteOrderMark, CsvTokenizer tokenizer, CsvReaderVisitorBase visitor)
{
    if (ignoreUTF8ByteOrderMark)
    {
        int headLength = bytes.Length < 3 ? bytes.Length : 3;
        var head = new ReadOnlySpan <byte>(UTF8BOM, 0, headLength);
        if (bytes.StartsWith(head))
        {
            bytes = bytes.Slice(headLength);
        }
    }

    tokenizer.ProcessNextChunk(bytes, visitor);
    tokenizer.ProcessEndOfStream(visitor);
}
/// <inheritdoc />
protected override async ValueTask ProcessAsyncCore(CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
{
    var tokenizer = new CsvTokenizer(_delimiter);
    var reader = _reader;

    if (_ignoreUTF8ByteOrderMark && await EatUTF8BOMAsync(tokenizer, visitor, progress, cancellationToken).ConfigureAwait(false))
    {
        // the data ended while eating the BOM; everything is already done.
        return;
    }

    bool completed = false;
    while (!completed)
    {
        var readResult = await reader.ReadAsync(cancellationToken).ConfigureAwait(false);
        if (readResult.IsCanceled)
        {
            throw new OperationCanceledException(cancellationToken);
        }

        var buffer = readResult.Buffer;
        foreach (var segment in buffer)
        {
            tokenizer.ProcessNextChunk(segment.Span, visitor);
        }

        reader.AdvanceTo(buffer.End);

        if (progress != null)
        {
            // IProgress<int> can only carry int.MaxValue at a time, so split
            // up counts that overflow it.
            long remaining = buffer.Length;
            for (; remaining > int.MaxValue; remaining -= int.MaxValue)
            {
                progress.Report(int.MaxValue);
            }

            if (remaining != 0)
            {
                progress.Report(unchecked((int)remaining));
            }
        }

        completed = readResult.IsCompleted;
    }

    tokenizer.ProcessEndOfStream(visitor);
    progress?.Report(0);
}
/// <inheritdoc />
protected override async ValueTask ProcessAsyncCore(CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
{
    // not every stream honors the token it was given, so enforce cancellation
    // ourselves (it's just a volatile read, so keep it light).
    cancellationToken.ThrowIfCancellationRequested();

    var tokenizer = new CsvTokenizer(_delimiter);
    var csvStream = _csvStream;
    var readBufferPool = _readBufferPool;
    byte[] readBuffer = readBufferPool is null
        ? new byte[_minReadBufferByteCount]
        : readBufferPool.Rent(_minReadBufferByteCount);

    try
    {
        if (_ignoreUTF8ByteOrderMark && await EatUTF8BOMAsync(tokenizer, visitor, csvStream, readBuffer, progress, cancellationToken).ConfigureAwait(false))
        {
            // stream ended while eating the BOM; end-of-stream has been sent.
            return;
        }

        while (true)
        {
            int cnt = await csvStream.ReadAsync(readBuffer, 0, readBuffer.Length, cancellationToken).ConfigureAwait(false);
            if (cnt == 0)
            {
                break;
            }

            // same reasoning as above: check the token manually per chunk.
            cancellationToken.ThrowIfCancellationRequested();

            tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, cnt), visitor);
            progress?.Report(cnt);
        }
    }
    finally
    {
        // give the buffer back even on failure; clear it in case it held data.
        readBufferPool?.Return(readBuffer, clearArray: true);
    }

    tokenizer.ProcessEndOfStream(visitor);
    progress?.Report(0);
}
/// <inheritdoc />
protected override void ProcessCore(CsvReaderVisitorBase visitor)
{
    var tokenizer = new CsvTokenizer(_delimiter);
    var csvStream = _csvStream;
    var readBufferPool = _readBufferPool;
    byte[] readBuffer = readBufferPool is null
        ? new byte[_minReadBufferByteCount]
        : readBufferPool.Rent(_minReadBufferByteCount);

    try
    {
        if (_ignoreUTF8ByteOrderMark && EatUTF8BOM(tokenizer, visitor, csvStream, readBuffer))
        {
            // stream ended while eating the BOM; end-of-stream has been sent.
            return;
        }

        while (true)
        {
            int cnt = csvStream.Read(readBuffer, 0, readBuffer.Length);
            if (cnt == 0)
            {
                break;
            }

            tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, cnt), visitor);
        }
    }
    finally
    {
        // give the buffer back even on failure; clear it in case it held data.
        readBufferPool?.Return(readBuffer, clearArray: true);
    }

    tokenizer.ProcessEndOfStream(visitor);
}
// Skips a leading UTF-8 BOM in a (multi-segment) sequence, forwarding to the
// tokenizer any consumed bytes that turn out not to be part of a BOM.
// Returns true if the sequence ended during the check (end-of-stream has
// already been signaled to the visitor); returns false when the caller should
// continue tokenizing from the enumerator's current position.
private static bool EatUTF8BOM(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, ref ReadOnlySequence <byte> .Enumerator enumerator)
{
    ReadOnlyMemory <byte> segment;

    // advance to the first non-empty segment; an effectively empty sequence
    // just gets the end-of-stream notification.
    while (true)
    {
        if (!enumerator.MoveNext())
        {
            tokenizer.ProcessEndOfStream(visitor);
            return true;
        }

        segment = enumerator.Current;
        if (!segment.IsEmpty)
        {
            break;
        }
    }

    var span = segment.Span;
    ReadOnlySpan <byte> head = UTF8BOM;

    // this greed should **probably** pay off most of the time.
    if (span.Length >= 3)
    {
        // fast path: any BOM would fit entirely within this first segment.
        if (span.StartsWith(head))
        {
            span = span.Slice(3);
        }

        tokenizer.ProcessNextChunk(span, visitor);
        return false;
    }

    // slow path: the first segment is shorter than a BOM, so match it
    // byte-by-byte, possibly spanning multiple segments.
    int alreadyEaten = 0;
    while (true)
    {
        if (span[0] == head[alreadyEaten])
        {
            span = span.Slice(1);
            if (++alreadyEaten == 3)
            {
                // full BOM matched; forward whatever follows it in this segment.
                tokenizer.ProcessNextChunk(span, visitor);
                return false;
            }
        }
        else
        {
            // mismatch: the bytes eaten so far are exactly a BOM prefix, so
            // replay that prefix before forwarding the rest of this segment.
            tokenizer.ProcessNextChunk(head.Slice(0, alreadyEaten), visitor);
            tokenizer.ProcessNextChunk(span, visitor);
            return false;
        }

        if (span.IsEmpty)
        {
            // current segment exhausted mid-match: advance to the next
            // non-empty segment.  if there is none, the data ended on a
            // partial BOM, which is swallowed silently (consistent with the
            // other BOM eaters in this file).
            while (true)
            {
                if (!enumerator.MoveNext())
                {
                    tokenizer.ProcessEndOfStream(visitor);
                    return true;
                }

                segment = enumerator.Current;
                if (!segment.IsEmpty)
                {
                    break;
                }
            }

            span = segment.Span;
        }
    }
}
/// <summary>
/// Skips a leading UTF-8 BOM on the pipe, forwarding any examined non-BOM
/// bytes to the tokenizer.  Returns <see langword="true"/> if the data ended
/// during the check (end-of-stream has already been signaled), otherwise
/// <see langword="false"/> and the caller should continue reading the pipe.
/// </summary>
/// <remarks>
/// Fix: the end-of-data branch previously called
/// <c>reader.AdvanceTo(buffer.End)</c> after <c>Finish()</c>, but
/// <c>Finish()</c> itself already advances the reader (and, at end-of-data,
/// <c>alreadyEaten</c> equals the whole buffer length).  PipeReader permits
/// only one AdvanceTo per ReadAsync; a second call throws
/// InvalidOperationException, so the redundant call is removed.
/// </remarks>
private async ValueTask <bool> EatUTF8BOMAsync(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
{
    var reader = _reader;
    ReadOnlySequence <byte> buffer;

    // keep asking for more until we've seen either 3+ bytes or the end of the data.
    while (true)
    {
        var result = await reader.ReadAsync(cancellationToken).ConfigureAwait(false);
        if (result.IsCanceled)
        {
            throw new OperationCanceledException(cancellationToken);
        }

        buffer = result.Buffer;
        if (buffer.Length >= 3)
        {
            // we've seen 3+ bytes.
            break;
        }

        if (result.IsCompleted)
        {
            // we've seen the end of the data.  Finish() advances the reader
            // past everything it examined — here, the entire (sub-3-byte)
            // buffer — so no further AdvanceTo may be issued for this read.
            Finish();
            tokenizer.ProcessEndOfStream(visitor);
            progress?.Report(0);
            return true;
        }

        // tell the reader that we've looked at everything it had to give us, and we weren't
        // able to consume any of it, so the next read should have everything we've seen so
        // far, plus at least one more byte.
        reader.AdvanceTo(buffer.Start, buffer.End);
    }

    Finish();
    return false;

    // copies up to the first 3 buffered bytes; if they are not (a prefix of)
    // the UTF-8 BOM they are forwarded to the tokenizer.  either way, the
    // reader is advanced past the examined bytes and progress is reported.
    void Finish()
    {
        Span <byte> upToFirstThreeBytes = stackalloc byte[3];
        int alreadyEaten = 0;
        foreach (var segment in buffer)
        {
            int lengthToCopy = 3 - alreadyEaten;
            if (lengthToCopy > segment.Length)
            {
                lengthToCopy = segment.Length;
            }

            segment.Slice(0, lengthToCopy).Span.CopyTo(upToFirstThreeBytes.Slice(alreadyEaten, lengthToCopy));
            alreadyEaten += lengthToCopy;
            if (alreadyEaten == 3)
            {
                break;
            }
        }

        upToFirstThreeBytes = upToFirstThreeBytes.Slice(0, alreadyEaten);
        var head = new ReadOnlySpan <byte>(UTF8BOM, 0, alreadyEaten);
        if (!upToFirstThreeBytes.SequenceEqual(head))
        {
            tokenizer.ProcessNextChunk(upToFirstThreeBytes, visitor);
        }

        reader.AdvanceTo(buffer.GetPosition(alreadyEaten));
        progress?.Report(alreadyEaten);
    }
}