/// <inheritdoc />
        protected override unsafe void ProcessCore(CsvReaderVisitorBase visitor)
        {
            var tokenizer = new CsvTokenizer(_delimiter);

            // Sequential-scan hint: the file is read front-to-back exactly once.
            using (var fl = new FileStream(_csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan))
            {
                long length = fl.Length;
                if (length == 0)
                {
                    // Empty file: nothing to chunk, just signal end-of-stream.
                    tokenizer.ProcessEndOfStream(visitor);
                    return;
                }

                // Map the entire file and walk it via a raw pointer so the tokenizer
                // consumes the data with no intermediate copies.
                using (var memoryMappedFile = MemoryMappedFile.CreateFromFile(fl, null, 0, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true))
                    using (var accessor = memoryMappedFile.CreateViewAccessor(0, 0, MemoryMappedFileAccess.Read))
                    {
                        var   handle = accessor.SafeMemoryMappedViewHandle;
                        byte *ptr    = null;

                        // Constrained execution region: ensures the finally block (and thus
                        // ReleasePointer) runs even under asynchronous exceptions.
                        RuntimeHelpers.PrepareConstrainedRegions();
                        try
                        {
                            handle.AcquirePointer(ref ptr);

                            if (_ignoreUTF8ByteOrderMark)
                            {
                                // Compare against at most the first 3 bytes; a file shorter than
                                // the BOM that matches a BOM prefix is stripped entirely.
                                var head = new ReadOnlySpan <byte>(UTF8BOM, 0, length < 3 ? unchecked ((int)length) : 3);
                                if (head.SequenceEqual(new ReadOnlySpan <byte>(ptr, head.Length)))
                                {
                                    length -= head.Length;
                                    ptr    += head.Length;
                                }
                            }

                            // ReadOnlySpan is limited to int.MaxValue bytes, so files larger
                            // than that are fed to the tokenizer in int.MaxValue-sized chunks.
                            while (length > int.MaxValue)
                            {
                                tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(ptr, int.MaxValue), visitor);
                                length -= int.MaxValue;
                                ptr    += int.MaxValue;
                            }

                            tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(ptr, unchecked ((int)length)), visitor);
                            tokenizer.ProcessEndOfStream(visitor);
                        }
                        finally
                        {
                            if (ptr != null)
                            {
                                handle.ReleasePointer();
                            }
                        }
                    }
            }
        }
        // ---- Example #2 ----
        /// <summary>
        /// Skips a leading UTF-8 BOM on <paramref name="csvStream"/>, forwarding any
        /// non-BOM leading bytes to the tokenizer.  Returns true when the stream was
        /// fully consumed while looking (end-of-stream already handled).
        /// </summary>
        private static async ValueTask <bool> EatUTF8BOMAsync(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, Stream csvStream, byte[] readBuffer, IProgress <int> progress, CancellationToken cancellationToken)
        {
            // Make sure there is room for the full 3-byte BOM.
            if (readBuffer.Length < 3)
            {
                // don't bother pooling; nobody should really ever care.
                readBuffer = new byte[3];
            }

            int totalRead = 0;

            // Keep reading until we have at least 3 bytes or hit end-of-stream.
            while (totalRead < 3)
            {
                int bytesRead = await csvStream.ReadAsync(readBuffer, totalRead, readBuffer.Length - totalRead, cancellationToken).ConfigureAwait(false);

                // not all streams support cancellation, so we might as well do this ourselves.  it
                // does involve a volatile read, so don't go overboard.
                cancellationToken.ThrowIfCancellationRequested();

                if (bytesRead != 0)
                {
                    totalRead += bytesRead;
                    continue;
                }

                // End of stream before seeing 3 bytes: forward what we have unless it
                // is a prefix of the BOM (a partial BOM at EOF is swallowed).
                if (totalRead != 0)
                {
                    if (!new ReadOnlySpan <byte>(readBuffer, 0, totalRead).SequenceEqual(new ReadOnlySpan <byte>(UTF8BOM, 0, totalRead)))
                    {
                        tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, totalRead), visitor);
                    }

                    progress?.Report(totalRead);
                }

                tokenizer.ProcessEndOfStream(visitor);
                progress?.Report(0);
                return true;
            }

            // At least 3 bytes available: strip a leading BOM, forward the rest.
            var chunk = new ReadOnlyMemory <byte>(readBuffer, 0, totalRead);
            if (chunk.Span.StartsWith(UTF8BOM))
            {
                chunk = chunk.Slice(3);
            }

            tokenizer.ProcessNextChunk(chunk.Span, visitor);
            progress?.Report(totalRead);

            return false;
        }
        // ---- Example #3 ----
        /// <summary>
        /// Tokenizes a headered CSV file by feeding it to the tokenizer in fixed-size
        /// chunks, honoring the given header count/length limits.  Returns the records
        /// collected by the buffering visitor.
        /// </summary>
        public static List <string[]> TokenizeHeaderedCsvFileUsingCursivelyWithTheseHeaderLimits(ReadOnlySpan <byte> fileData, int chunkLength, byte delimiter, int maxHeaderCount, int maxHeaderLength)
        {
            var visitor   = new HeaderedStringBufferingVisitor(maxHeaderCount, maxHeaderLength);
            var tokenizer = new CsvTokenizer(delimiter);

            // Feed full-size chunks until only a final (possibly shorter) piece remains.
            for (; fileData.Length > chunkLength; fileData = fileData.Slice(chunkLength))
            {
                tokenizer.ProcessNextChunk(fileData.Slice(0, chunkLength), visitor);
            }

            tokenizer.ProcessNextChunk(fileData, visitor);
            tokenizer.ProcessEndOfStream(visitor);

            return visitor.Records;
        }
        // ---- Example #4 ----
        /// <inheritdoc />
        protected override void ProcessCore(CsvReaderVisitorBase visitor)
        {
            var  tokenizer = new CsvTokenizer(_delimiter);
            bool ignoreUTF8ByteOrderMark = _ignoreUTF8ByteOrderMark;
            var  sequence = _sequence;

            // Fast path: a single contiguous segment can be handled in one shot.
            if (sequence.IsSingleSegment)
            {
                CsvReadOnlyMemoryInput.ProcessFullSegment(sequence.First.Span, ignoreUTF8ByteOrderMark, tokenizer, visitor);
                return;
            }

            var enumerator = sequence.GetEnumerator();

            // EatUTF8BOM returns true when it already consumed the whole input.
            if (ignoreUTF8ByteOrderMark && EatUTF8BOM(tokenizer, visitor, ref enumerator))
            {
                return;
            }

            // Forward every remaining segment, then signal end-of-stream.
            while (enumerator.MoveNext())
            {
                tokenizer.ProcessNextChunk(enumerator.Current.Span, visitor);
            }

            tokenizer.ProcessEndOfStream(visitor);
        }
        // ---- Example #5 ----
        /// <summary>
        /// Counts the rows in <paramref name="csvFile"/> by running its full contents
        /// through the tokenizer as a single chunk.
        /// </summary>
        public long CountRowsUsingCursivelyRaw(CsvFile csvFile)
        {
            var rowCounter = new RowCountingVisitor();
            var tokenizer  = new CsvTokenizer();

            tokenizer.ProcessNextChunk(csvFile.FileData, rowCounter);
            tokenizer.ProcessEndOfStream(rowCounter);

            return rowCounter.RowCount;
        }
        /// <summary>
        /// Skips a leading UTF-8 BOM on <paramref name="csvStream"/>, forwarding any
        /// non-BOM leading bytes to the tokenizer.  Returns true when the stream was
        /// fully consumed while looking (end-of-stream already handled).
        /// </summary>
        private static bool EatUTF8BOM(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, Stream csvStream, byte[] readBuffer)
        {
            // Make sure there is room for the full 3-byte BOM.
            if (readBuffer.Length < 3)
            {
                // don't bother pooling; nobody should really ever care.
                readBuffer = new byte[3];
            }

            int totalRead = 0;

            // Keep reading until we have at least 3 bytes or hit end-of-stream.
            while (totalRead < 3)
            {
                int bytesRead = csvStream.Read(readBuffer, totalRead, readBuffer.Length - totalRead);
                if (bytesRead != 0)
                {
                    totalRead += bytesRead;
                    continue;
                }

                // End of stream before seeing 3 bytes: forward what we have unless it
                // is a prefix of the BOM (a partial BOM at EOF is swallowed).
                if (totalRead != 0)
                {
                    var partial = new ReadOnlySpan <byte>(readBuffer, 0, totalRead);
                    if (!partial.SequenceEqual(new ReadOnlySpan <byte>(UTF8BOM, 0, totalRead)))
                    {
                        tokenizer.ProcessNextChunk(partial, visitor);
                    }
                }

                tokenizer.ProcessEndOfStream(visitor);
                return true;
            }

            // At least 3 bytes available: strip a leading BOM, forward the rest.
            var chunk = new ReadOnlySpan <byte>(readBuffer, 0, totalRead);
            if (chunk.StartsWith(UTF8BOM))
            {
                chunk = chunk.Slice(3);
            }

            tokenizer.ProcessNextChunk(chunk, visitor);
            return false;
        }
        /// <summary>
        /// Verifies that passing a null visitor through a full tokenize pass does not throw.
        /// </summary>
        public void NullVisitorShouldBeFine(string filePath)
        {
            // arrange
            ReadOnlySpan <byte> fileData = File.ReadAllBytes(Path.Combine(TestCsvFilesFolderPath, filePath));
            var tokenizer = new CsvTokenizer();

            // act: a null visitor must be tolerated by both calls.
            tokenizer.ProcessNextChunk(fileData, null);
            tokenizer.ProcessEndOfStream(null);

            // assert (empty)
        }
        /// <summary>
        /// Runs a single contiguous segment through the tokenizer, optionally stripping
        /// a leading UTF-8 BOM, and signals end-of-stream when done.
        /// </summary>
        internal static void ProcessFullSegment(ReadOnlySpan <byte> bytes, bool ignoreUTF8ByteOrderMark, CsvTokenizer tokenizer, CsvReaderVisitorBase visitor)
        {
            if (ignoreUTF8ByteOrderMark)
            {
                // Compare against at most the first 3 bytes; an input shorter than the
                // BOM that matches a BOM prefix is stripped entirely.
                int headLength = bytes.Length < 3 ? bytes.Length : 3;
                if (bytes.StartsWith(new ReadOnlySpan <byte>(UTF8BOM, 0, headLength)))
                {
                    bytes = bytes.Slice(headLength);
                }
            }

            tokenizer.ProcessNextChunk(bytes, visitor);
            tokenizer.ProcessEndOfStream(visitor);
        }
        /// <inheritdoc />
        protected override async ValueTask ProcessAsyncCore(CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
        {
            var tokenizer = new CsvTokenizer(_delimiter);
            var reader    = _reader;

            // EatUTF8BOMAsync returns true when it already consumed the whole input
            // (i.e., the pipe completed while looking for the BOM).
            if (_ignoreUTF8ByteOrderMark && await EatUTF8BOMAsync(tokenizer, visitor, progress, cancellationToken).ConfigureAwait(false))
            {
                return;
            }

            while (true)
            {
                var result = await reader.ReadAsync(cancellationToken).ConfigureAwait(false);

                if (result.IsCanceled)
                {
                    throw new OperationCanceledException(cancellationToken);
                }

                // Forward every segment of this read before telling the reader we are
                // done with the buffer.
                var buffer = result.Buffer;
                foreach (var segment in buffer)
                {
                    tokenizer.ProcessNextChunk(segment.Span, visitor);
                }

                reader.AdvanceTo(buffer.End);
                if (progress != null)
                {
                    // IProgress<int> can only carry int-sized amounts, so a buffer
                    // longer than int.MaxValue is reported in multiple pieces.
                    long totalLength = buffer.Length;
                    while (totalLength > int.MaxValue)
                    {
                        progress.Report(int.MaxValue);
                        totalLength -= int.MaxValue;
                    }

                    if (totalLength != 0)
                    {
                        // A zero report is skipped here: zero is reserved as the
                        // end-of-stream signal below.
                        progress.Report(unchecked ((int)totalLength));
                    }
                }

                if (result.IsCompleted)
                {
                    break;
                }
            }

            tokenizer.ProcessEndOfStream(visitor);
            progress?.Report(0);
        }
        // ---- Example #10 ----
        /// <inheritdoc />
        protected override async ValueTask ProcessAsyncCore(CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
        {
            // not all streams support cancellation, so we might as well do this ourselves.  it
            // does involve a volatile read, so don't go overboard.
            cancellationToken.ThrowIfCancellationRequested();

            var tokenizer = new CsvTokenizer(_delimiter);
            var csvStream = _csvStream;
            int minReadBufferByteCount = _minReadBufferByteCount;
            var readBufferPool         = _readBufferPool;

            // Rent from the pool when one was provided; otherwise allocate directly.
            byte[] readBuffer = readBufferPool is null
                ? new byte[minReadBufferByteCount]
                : readBufferPool.Rent(minReadBufferByteCount);

            try
            {
                // EatUTF8BOMAsync returns true when it consumed the entire stream.
                if (_ignoreUTF8ByteOrderMark && await EatUTF8BOMAsync(tokenizer, visitor, csvStream, readBuffer, progress, cancellationToken).ConfigureAwait(false))
                {
                    return;
                }

                while (true)
                {
                    int bytesRead = await csvStream.ReadAsync(readBuffer, 0, readBuffer.Length, cancellationToken).ConfigureAwait(false);
                    if (bytesRead == 0)
                    {
                        break;
                    }

                    // not all streams support cancellation, so we might as well do this ourselves.  it
                    // does involve a volatile read, so don't go overboard.
                    cancellationToken.ThrowIfCancellationRequested();

                    tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, bytesRead), visitor);
                    progress?.Report(bytesRead);
                }
            }
            finally
            {
                readBufferPool?.Return(readBuffer, clearArray: true);
            }

            tokenizer.ProcessEndOfStream(visitor);

            // Zero signals end-of-stream to the progress consumer.
            progress?.Report(0);
        }
        /// <inheritdoc />
        protected override void ProcessCore(CsvReaderVisitorBase visitor)
        {
            var tokenizer = new CsvTokenizer(_delimiter);
            var csvStream = _csvStream;
            int minReadBufferByteCount = _minReadBufferByteCount;
            var readBufferPool         = _readBufferPool;

            // Rent from the pool when one was provided; otherwise allocate directly.
            byte[] readBuffer = readBufferPool is null
                ? new byte[minReadBufferByteCount]
                : readBufferPool.Rent(minReadBufferByteCount);

            try
            {
                // EatUTF8BOM returns true when it consumed the entire stream.
                if (_ignoreUTF8ByteOrderMark && EatUTF8BOM(tokenizer, visitor, csvStream, readBuffer))
                {
                    return;
                }

                while (true)
                {
                    int bytesRead = csvStream.Read(readBuffer, 0, readBuffer.Length);
                    if (bytesRead == 0)
                    {
                        break;
                    }

                    tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, bytesRead), visitor);
                }
            }
            finally
            {
                readBufferPool?.Return(readBuffer, clearArray: true);
            }

            tokenizer.ProcessEndOfStream(visitor);
        }
        // ---- Example #12 ----
        /// <summary>
        /// Skips a leading UTF-8 BOM in a multi-segment sequence, forwarding any
        /// non-BOM leading bytes to the tokenizer.  Returns true when the sequence
        /// was fully consumed while looking (end-of-stream already signaled).
        /// </summary>
        private static bool EatUTF8BOM(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, ref ReadOnlySequence <byte> .Enumerator enumerator)
        {
            ReadOnlyMemory <byte> segment;

            // Find the first non-empty segment; if there is none, the input is empty.
            while (true)
            {
                if (!enumerator.MoveNext())
                {
                    tokenizer.ProcessEndOfStream(visitor);
                    return(true);
                }

                segment = enumerator.Current;
                if (!segment.IsEmpty)
                {
                    break;
                }
            }

            var span = segment.Span;

            ReadOnlySpan <byte> head = UTF8BOM;

            // this greed should **probably** pay off most of the time.
            if (span.Length >= 3)
            {
                // The whole BOM (if any) fits in this segment: strip and forward the rest.
                if (span.StartsWith(head))
                {
                    span = span.Slice(3);
                }

                tokenizer.ProcessNextChunk(span, visitor);
                return(false);
            }

            // Slow path: the BOM may straddle segment boundaries, so match byte by byte.
            int alreadyEaten = 0;

            while (true)
            {
                if (span[0] == head[alreadyEaten])
                {
                    span = span.Slice(1);
                    if (++alreadyEaten == 3)
                    {
                        // Full BOM matched: forward whatever remains of this segment.
                        tokenizer.ProcessNextChunk(span, visitor);
                        return(false);
                    }
                }
                else
                {
                    // Mismatch: the bytes eaten so far were not a BOM after all, so
                    // replay them before forwarding the rest of this segment.
                    tokenizer.ProcessNextChunk(head.Slice(0, alreadyEaten), visitor);
                    tokenizer.ProcessNextChunk(span, visitor);
                    return(false);
                }

                if (span.IsEmpty)
                {
                    // Current segment exhausted mid-match; advance to the next non-empty one.
                    while (true)
                    {
                        if (!enumerator.MoveNext())
                        {
                            // Input ended inside a partial BOM; the partial match is swallowed.
                            tokenizer.ProcessEndOfStream(visitor);
                            return(true);
                        }

                        segment = enumerator.Current;
                        if (!segment.IsEmpty)
                        {
                            break;
                        }
                    }

                    span = segment.Span;
                }
            }
        }
        /// <summary>
        /// Reads enough from the pipe to decide whether the input starts with a UTF-8
        /// BOM, forwarding any non-BOM leading bytes to the tokenizer.  Returns true
        /// when the pipe completed while looking (end-of-stream already signaled).
        /// </summary>
        private async ValueTask <bool> EatUTF8BOMAsync(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
        {
            var reader = _reader;

            ReadOnlySequence <byte> buffer;

            // keep asking for more until we've seen either 3+ bytes or the end of the data.
            while (true)
            {
                var result = await reader.ReadAsync(cancellationToken).ConfigureAwait(false);

                if (result.IsCanceled)
                {
                    throw new OperationCanceledException(cancellationToken);
                }

                buffer = result.Buffer;
                if (buffer.Length >= 3)
                {
                    // we've seen 3+ bytes.
                    break;
                }

                if (result.IsCompleted)
                {
                    // we've seen the end of the data.
                    Finish();
                    tokenizer.ProcessEndOfStream(visitor);
                    reader.AdvanceTo(buffer.End);
                    progress?.Report(0);
                    return(true);
                }

                // tell the reader that we've looked at everything it had to give us, and we weren't
                // able to consume any of it, so the next read should have everything we've seen so
                // far, plus at least one more byte.
                reader.AdvanceTo(buffer.Start, buffer.End);
            }

            Finish();
            return(false);

            // Copies up to the first 3 bytes (which may straddle segment boundaries),
            // compares them to the BOM, consumes exactly those bytes from the pipe,
            // and forwards them to the tokenizer only when they are NOT a BOM prefix.
            void Finish()
            {
                Span <byte> upToFirstThreeBytes = stackalloc byte[3];
                int         alreadyEaten        = 0;

                // Gather at most 3 leading bytes across segments.
                foreach (var segment in buffer)
                {
                    int lengthToCopy = 3 - alreadyEaten;
                    if (lengthToCopy > segment.Length)
                    {
                        lengthToCopy = segment.Length;
                    }

                    segment.Slice(0, lengthToCopy).Span.CopyTo(upToFirstThreeBytes.Slice(alreadyEaten, lengthToCopy));
                    alreadyEaten += lengthToCopy;
                    if (alreadyEaten == 3)
                    {
                        break;
                    }
                }

                upToFirstThreeBytes = upToFirstThreeBytes.Slice(0, alreadyEaten);
                var head = new ReadOnlySpan <byte>(UTF8BOM, 0, alreadyEaten);

                // A (possibly partial) BOM is swallowed; anything else is real data.
                if (!upToFirstThreeBytes.SequenceEqual(head))
                {
                    tokenizer.ProcessNextChunk(upToFirstThreeBytes, visitor);
                }

                reader.AdvanceTo(buffer.GetPosition(alreadyEaten));
                progress?.Report(alreadyEaten);
            }
        }