Example #1
        /// <inheritdoc />
        protected override void ProcessCore(CsvReaderVisitorBase visitor)
        {
            var  tokenizer = new CsvTokenizer(_delimiter);
            bool ignoreUTF8ByteOrderMark = _ignoreUTF8ByteOrderMark;
            var  bytes = _sequence;

            if (bytes.IsSingleSegment)
            {
                CsvReadOnlyMemoryInput.ProcessFullSegment(bytes.First.Span, ignoreUTF8ByteOrderMark, tokenizer, visitor);
                return;
            }

            var enumerator = bytes.GetEnumerator();

            if (ignoreUTF8ByteOrderMark && EatUTF8BOM(tokenizer, visitor, ref enumerator))
            {
                return;
            }

            while (enumerator.MoveNext())
            {
                tokenizer.ProcessNextChunk(enumerator.Current.Span, visitor);
            }

            tokenizer.ProcessEndOfStream(visitor);
        }
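For reference, a hedged usage sketch of the sequence-based input shown above. The CsvSyncInput.ForSequence factory name is an assumption (only CsvReadOnlySequenceInput itself and its Process/WithDelimiter members appear in this listing), and RowCountingVisitor is a hypothetical visitor; a sketch of it appears after Example #2 below.

        // Hedged sketch: wrap a byte[] in a single-segment ReadOnlySequence<byte> and push it
        // through Cursively. CsvSyncInput.ForSequence is an assumed factory name.
        using System;
        using System.Buffers;
        using System.IO;
        using Cursively;

        static void CountRows(string path)
        {
            var bytes   = new ReadOnlySequence<byte>(File.ReadAllBytes(path));
            var visitor = new RowCountingVisitor(); // hypothetical visitor, sketched below
            CsvSyncInput.ForSequence(bytes).Process(visitor);
            Console.WriteLine(visitor.RowCount);
        }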
Example #2
        public long CountRowsUsingCursivelyRaw(CsvFile csvFile)
        {
            var visitor   = new RowCountingVisitor();
            var tokenizer = new CsvTokenizer();

            tokenizer.ProcessNextChunk(csvFile.FileData, visitor);
            tokenizer.ProcessEndOfStream(visitor);
            return(visitor.RowCount);
        }
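The RowCountingVisitor used in this benchmark method is not shown in the listing. A minimal sketch, assuming it only needs to count VisitEndOfRecord callbacks (the real benchmark class may differ):

        // Minimal, hypothetical stand-in for RowCountingVisitor: ignores field data and counts records.
        using System;
        using Cursively;

        internal sealed class RowCountingVisitor : CsvReaderVisitorBase
        {
            public long RowCount { get; private set; }

            public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) { } // field bytes ignored

            public override void VisitEndOfField(ReadOnlySpan<byte> chunk) { }           // field boundaries ignored

            public override void VisitEndOfRecord() => RowCount++;                       // one increment per record
        }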
Example #3
        public void CsvTokenizerTest_DoubleQouteOnRecord()
        {
            using CsvTokenizer tokenizer = new CsvTokenizer(new StringReader("aaa\""));

            CsvToken token = tokenizer.NextToken();

            Assert.AreEqual(CsvTokenType.Eof, token.TokenType);
            Assert.AreEqual("aaa\"", token.Value);
        }
Example #4
        /// <summary>
        /// Creates a new instance of the <see cref="CsvAsyncStreamInput"/> class as a copy of this
        /// one, with the given delimiter.
        /// </summary>
        /// <param name="delimiter">
        /// The delimiter to use.  Use <see cref="CsvTokenizer.IsValidDelimiter"/> to test whether
        /// or not a particular value is valid.
        /// </param>
        /// <returns>
        /// A new instance of the <see cref="CsvAsyncStreamInput"/> class as a copy of this one, with
        /// the given delimiter.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="delimiter"/> is one of the illegal values.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <see cref="CsvAsyncInputBase.ProcessAsync"/> has already been called.
        /// </exception>
        public CsvAsyncStreamInput WithDelimiter(byte delimiter)
        {
            if (!CsvTokenizer.IsValidDelimiter(delimiter))
            {
                throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter));
            }

            ThrowIfProcessingHasAlreadyStarted();
            return(new CsvAsyncStreamInput(delimiter, _csvStream, _minReadBufferByteCount, _readBufferPool, _ignoreUTF8ByteOrderMark));
        }
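A hedged usage sketch of the copy-with-delimiter pattern documented above. CsvAsyncInput.ForStream is an assumed factory name, the single-argument ProcessAsync call assumes the progress and cancellation parameters are optional, and RowCountingVisitor is the hypothetical visitor sketched earlier.

        // Hedged sketch: read a tab-delimited file asynchronously with a custom delimiter.
        using System.IO;
        using System.Threading.Tasks;
        using Cursively;

        static async Task CountTsvRowsAsync(string path)
        {
            using (FileStream stream = File.OpenRead(path))
            {
                var visitor = new RowCountingVisitor(); // hypothetical visitor sketched above
                await CsvAsyncInput.ForStream(stream)
                                   .WithDelimiter((byte)'\t')
                                   .ProcessAsync(visitor);
            }
        }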
Example #5
        /// <summary>
        /// Creates a new instance of the <see cref="CsvReadOnlySequenceInput"/> class as a copy of this
        /// one, with the given delimiter.
        /// </summary>
        /// <param name="delimiter">
        /// The delimiter to use.  Use <see cref="CsvTokenizer.IsValidDelimiter"/> to test whether
        /// or not a particular value is valid.
        /// </param>
        /// <returns>
        /// A new instance of the <see cref="CsvReadOnlySequenceInput"/> class as a copy of this one, with
        /// the given delimiter.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="delimiter"/> is one of the illegal values.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <see cref="CsvSyncInputBase.Process"/> has already been called.
        /// </exception>
        public CsvReadOnlySequenceInput WithDelimiter(byte delimiter)
        {
            if (!CsvTokenizer.IsValidDelimiter(delimiter))
            {
                throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter));
            }

            ThrowIfProcessingHasAlreadyStarted();
            return(new CsvReadOnlySequenceInput(delimiter, _sequence, _ignoreUTF8ByteOrderMark));
        }
Example #6
        /// <summary>
        /// Creates a new instance of the <see cref="CsvAsyncStreamInput"/> class as a copy of this
        /// one, with the given delimiter.
        /// </summary>
        /// <param name="delimiter">
        /// The delimiter to use.  Use <see cref="CsvTokenizer.IsValidDelimiter"/> to test whether
        /// or not a particular value is valid.
        /// </param>
        /// <returns>
        /// A new instance of the <see cref="CsvAsyncStreamInput"/> class as a copy of this one, with
        /// the given delimiter.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="delimiter"/> is one of the illegal values.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <see cref="CsvAsyncInputBase.ProcessAsync"/> has already been called.
        /// </exception>
        public CsvAsyncStreamInput WithDelimiter(byte delimiter)
        {
            if (!CsvTokenizer.IsValidDelimiter(delimiter))
            {
#pragma warning disable CA1303 // Do not pass literals as localized parameters
                throw new ArgumentException("Must not be a carriage return, linefeed, or double-quote.", nameof(delimiter));
#pragma warning restore CA1303 // Do not pass literals as localized parameters
            }

            ThrowIfProcessingHasAlreadyStarted();
            return(new CsvAsyncStreamInput(delimiter, _csvStream, _minReadBufferByteCount, _readBufferPool, _ignoreUTF8ByteOrderMark));
        }
Example #7
        /// <inheritdoc />
        protected override unsafe void ProcessCore(CsvReaderVisitorBase visitor)
        {
            var tokenizer = new CsvTokenizer(_delimiter);

            using (var fl = new FileStream(_csvFilePath, FileMode.Open, FileAccess.Read, FileShare.Read, 4096, FileOptions.SequentialScan))
            {
                long length = fl.Length;
                if (length == 0)
                {
                    tokenizer.ProcessEndOfStream(visitor);
                    return;
                }

                using (var memoryMappedFile = MemoryMappedFile.CreateFromFile(fl, null, 0, MemoryMappedFileAccess.Read, HandleInheritability.None, leaveOpen: true))
                    using (var accessor = memoryMappedFile.CreateViewAccessor(0, 0, MemoryMappedFileAccess.Read))
                    {
                        var   handle = accessor.SafeMemoryMappedViewHandle;
                        byte *ptr    = null;
                        RuntimeHelpers.PrepareConstrainedRegions();
                        try
                        {
                            handle.AcquirePointer(ref ptr);

                            if (_ignoreUTF8ByteOrderMark)
                            {
                                var head = new ReadOnlySpan <byte>(UTF8BOM, 0, length < 3 ? unchecked ((int)length) : 3);
                                if (head.SequenceEqual(new ReadOnlySpan <byte>(ptr, head.Length)))
                                {
                                    length -= head.Length;
                                    ptr    += head.Length;
                                }
                            }

                            while (length > int.MaxValue)
                            {
                                tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(ptr, int.MaxValue), visitor);
                                length -= int.MaxValue;
                                ptr    += int.MaxValue;
                            }

                            tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(ptr, unchecked ((int)length)), visitor);
                            tokenizer.ProcessEndOfStream(visitor);
                        }
                        finally
                        {
                            if (ptr != null)
                            {
                                handle.ReleasePointer();
                            }
                        }
                    }
            }
        }
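The UTF8BOM field compared against in this and several later snippets is not shown in the listing; it is presumably just the three bytes of the UTF-8 byte order mark:

        // Assumed definition of the UTF8BOM field referenced throughout these snippets.
        private static readonly byte[] UTF8BOM = { 0xEF, 0xBB, 0xBF };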
Example #8
 public void WriteCsv(string csv)
 {
     if (_noColor)
     {
         Console.Write(csv);
     }
     else
     {
         var tokens = new CsvTokenizer().Tokenize(csv);
         CsvWriter.WriteCsv(tokens, Theme, Console.Out, true);
     }
 }
Example #9
        public void NullVisitorShouldBeFine(string filePath)
        {
            // arrange
            filePath = Path.Combine(TestCsvFilesFolderPath, filePath);
            ReadOnlySpan <byte> fileData = File.ReadAllBytes(filePath);
            var tokenizer = new CsvTokenizer();

            // act
            tokenizer.ProcessNextChunk(fileData, null);
            tokenizer.ProcessEndOfStream(null);

            // assert (empty)
        }
Example #10
        private static async ValueTask <bool> EatUTF8BOMAsync(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, Stream csvStream, byte[] readBuffer, IProgress <int> progress, CancellationToken cancellationToken)
        {
            if (readBuffer.Length < 3)
            {
                // don't bother pooling; nobody should really ever care.
                readBuffer = new byte[3];
            }

            int byteCount = 0;

            while (byteCount < 3)
            {
                int readLength = await csvStream.ReadAsync(readBuffer, byteCount, readBuffer.Length - byteCount, cancellationToken).ConfigureAwait(false);

                // not all streams support cancellation, so we might as well do this ourselves.  it
                // does involve a volatile read, so don't go overboard.
                cancellationToken.ThrowIfCancellationRequested();

                if (readLength == 0)
                {
                    if (byteCount != 0)
                    {
                        if (!new ReadOnlySpan <byte>(readBuffer, 0, byteCount).SequenceEqual(new ReadOnlySpan <byte>(UTF8BOM, 0, byteCount)))
                        {
                            tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, byteCount), visitor);
                        }

                        progress?.Report(byteCount);
                    }

                    tokenizer.ProcessEndOfStream(visitor);
                    progress?.Report(0);
                    return(true);
                }

                byteCount += readLength;
            }

            var buf = new ReadOnlyMemory <byte>(readBuffer, 0, byteCount);

            if (buf.Span.StartsWith(UTF8BOM))
            {
                buf = buf.Slice(3);
            }

            tokenizer.ProcessNextChunk(buf.Span, visitor);
            progress?.Report(byteCount);

            return(false);
        }
Example #11
        /// <inheritdoc />
        protected override async ValueTask ProcessAsyncCore(CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
        {
            var tokenizer = new CsvTokenizer(_delimiter);
            var reader    = _reader;

            if (_ignoreUTF8ByteOrderMark && await EatUTF8BOMAsync(tokenizer, visitor, progress, cancellationToken).ConfigureAwait(false))
            {
                return;
            }

            while (true)
            {
                var result = await reader.ReadAsync(cancellationToken).ConfigureAwait(false);

                if (result.IsCanceled)
                {
                    throw new OperationCanceledException(cancellationToken);
                }

                var buffer = result.Buffer;
                foreach (var segment in buffer)
                {
                    tokenizer.ProcessNextChunk(segment.Span, visitor);
                }

                reader.AdvanceTo(buffer.End);
                if (progress != null)
                {
                    long totalLength = buffer.Length;
                    while (totalLength > int.MaxValue)
                    {
                        progress.Report(int.MaxValue);
                        totalLength -= int.MaxValue;
                    }

                    if (totalLength != 0)
                    {
                        progress.Report(unchecked ((int)totalLength));
                    }
                }

                if (result.IsCompleted)
                {
                    break;
                }
            }

            tokenizer.ProcessEndOfStream(visitor);
            progress?.Report(0);
        }
Example #12
        public static List <string[]> TokenizeHeaderedCsvFileUsingCursivelyWithTheseHeaderLimits(ReadOnlySpan <byte> fileData, int chunkLength, byte delimiter, int maxHeaderCount, int maxHeaderLength)
        {
            var tokenizer = new CsvTokenizer(delimiter);
            var visitor   = new HeaderedStringBufferingVisitor(maxHeaderCount, maxHeaderLength);

            while (fileData.Length > chunkLength)
            {
                tokenizer.ProcessNextChunk(fileData.Slice(0, chunkLength), visitor);
                fileData = fileData.Slice(chunkLength);
            }

            tokenizer.ProcessNextChunk(fileData, visitor);
            tokenizer.ProcessEndOfStream(visitor);
            return(visitor.Records);
        }
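The HeaderedStringBufferingVisitor used above is also not shown. A simplified, hypothetical stand-in that buffers UTF-8 field bytes and groups decoded fields into records; the real class presumably also enforces the maxHeaderCount/maxHeaderLength limits, which this sketch accepts but ignores:

        // Simplified, hypothetical stand-in: each field is decoded only at its end, so multi-byte
        // UTF-8 sequences split across chunks are still decoded correctly.
        using System;
        using System.Collections.Generic;
        using System.Text;
        using Cursively;

        internal sealed class HeaderedStringBufferingVisitor : CsvReaderVisitorBase
        {
            private readonly List<byte> _fieldBytes = new List<byte>();
            private readonly List<string> _currentRecord = new List<string>();

            public HeaderedStringBufferingVisitor(int maxHeaderCount, int maxHeaderLength)
            {
                // Header limits are ignored in this sketch.
            }

            public List<string[]> Records { get; } = new List<string[]>();

            public override void VisitPartialFieldContents(ReadOnlySpan<byte> chunk) => _fieldBytes.AddRange(chunk.ToArray());

            public override void VisitEndOfField(ReadOnlySpan<byte> chunk)
            {
                _fieldBytes.AddRange(chunk.ToArray());
                _currentRecord.Add(Encoding.UTF8.GetString(_fieldBytes.ToArray()));
                _fieldBytes.Clear();
            }

            public override void VisitEndOfRecord()
            {
                Records.Add(_currentRecord.ToArray());
                _currentRecord.Clear();
            }
        }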
Example #13
        /// <inheritdoc />
        protected override async ValueTask ProcessAsyncCore(CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
        {
            // not all streams support cancellation, so we might as well do this ourselves.  it
            // does involve a volatile read, so don't go overboard.
            cancellationToken.ThrowIfCancellationRequested();

            var tokenizer = new CsvTokenizer(_delimiter);
            var csvStream = _csvStream;
            int minReadBufferByteCount = _minReadBufferByteCount;
            var readBufferPool         = _readBufferPool;

            byte[] readBuffer;
            if (readBufferPool is null)
            {
                readBuffer = new byte[minReadBufferByteCount];
            }
            else
            {
                readBuffer = readBufferPool.Rent(minReadBufferByteCount);
            }

            try
            {
                if (_ignoreUTF8ByteOrderMark && await EatUTF8BOMAsync(tokenizer, visitor, csvStream, readBuffer, progress, cancellationToken).ConfigureAwait(false))
                {
                    return;
                }

                int cnt;
                while ((cnt = await csvStream.ReadAsync(readBuffer, 0, readBuffer.Length, cancellationToken).ConfigureAwait(false)) != 0)
                {
                    // not all streams support cancellation, so we might as well do this ourselves.  it
                    // does involve a volatile read, so don't go overboard.
                    cancellationToken.ThrowIfCancellationRequested();

                    tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, cnt), visitor);
                    progress?.Report(cnt);
                }
            }
            finally
            {
                readBufferPool?.Return(readBuffer, clearArray: true);
            }

            tokenizer.ProcessEndOfStream(visitor);
            progress?.Report(0);
        }
Example #14
        public void CsvTokenizerTest_Disposed()
        {
            StringReader reader    = new StringReader("");
            CsvTokenizer tokenizer = new CsvTokenizer(reader);

            tokenizer.Dispose();

            Assert.ThrowsException <ObjectDisposedException>(() =>
            {
                reader.Read();
            });

            Assert.ThrowsException <ObjectDisposedException>(() =>
            {
                tokenizer.NextToken();
            });
        }
Example #15
        public void CsvTokenizerTest_OneLineNormal()
        {
            using CsvTokenizer tokenizer = new CsvTokenizer(new StringReader(@"aaa,bbb,ccc"));
            List <string> records = new List <string>();

            CsvToken token;

            while ((token = tokenizer.NextToken()).TokenType != CsvTokenType.Eof)
            {
                records.Add(token.Value);
            }
            records.Add(token.Value);

            Assert.AreEqual(3, records.Count);
            Assert.AreEqual("aaa", records[0]);
            Assert.AreEqual("bbb", records[1]);
            Assert.AreEqual("ccc", records[2]);
        }
Example #16
        public void CsvTokenizerTest_LRLFInDoubleQoutedValues()
        {
            using CsvTokenizer tokenizer = new CsvTokenizer(new StringReader("aaa\",\"b\r\n\"\"\",ccc"));
            List <string> records = new List <string>();

            CsvToken token;

            while ((token = tokenizer.NextToken()).TokenType != CsvTokenType.Eof)
            {
                records.Add(token.Value);
            }
            records.Add(token.Value);

            Assert.AreEqual(3, records.Count);
            Assert.AreEqual("aaa\"", records[0]);
            Assert.AreEqual("b\r\n\"", records[1]);
            Assert.AreEqual("ccc", records[2]);
        }
Example #17
        private static bool EatUTF8BOM(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, Stream csvStream, byte[] readBuffer)
        {
            if (readBuffer.Length < 3)
            {
                // don't bother pooling; nobody should really ever care.
                readBuffer = new byte[3];
            }

            int byteCount = 0;

            while (byteCount < 3)
            {
                int readLength = csvStream.Read(readBuffer, byteCount, readBuffer.Length - byteCount);
                if (readLength == 0)
                {
                    if (byteCount != 0)
                    {
                        if (!new ReadOnlySpan <byte>(readBuffer, 0, byteCount).SequenceEqual(new ReadOnlySpan <byte>(UTF8BOM, 0, byteCount)))
                        {
                            tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, byteCount), visitor);
                        }
                    }

                    tokenizer.ProcessEndOfStream(visitor);
                    return(true);
                }

                byteCount += readLength;
            }

            var buf = new ReadOnlySpan <byte>(readBuffer, 0, byteCount);

            if (buf.StartsWith(UTF8BOM))
            {
                buf = buf.Slice(3);
            }

            tokenizer.ProcessNextChunk(buf, visitor);

            return(false);
        }
Example #18
        /// <inheritdoc />
        protected override void ProcessCore(CsvReaderVisitorBase visitor)
        {
            var tokenizer = new CsvTokenizer(_delimiter);
            var csvStream = _csvStream;
            int minReadBufferByteCount = _minReadBufferByteCount;
            var readBufferPool         = _readBufferPool;

            byte[] readBuffer;
            if (readBufferPool is null)
            {
                readBuffer = new byte[minReadBufferByteCount];
            }
            else
            {
                readBuffer = readBufferPool.Rent(minReadBufferByteCount);
            }

            try
            {
                if (_ignoreUTF8ByteOrderMark && EatUTF8BOM(tokenizer, visitor, csvStream, readBuffer))
                {
                    return;
                }

                int cnt;
                while ((cnt = csvStream.Read(readBuffer, 0, readBuffer.Length)) != 0)
                {
                    tokenizer.ProcessNextChunk(new ReadOnlySpan <byte>(readBuffer, 0, cnt), visitor);
                }
            }
            finally
            {
                readBufferPool?.Return(readBuffer, clearArray: true);
            }

            tokenizer.ProcessEndOfStream(visitor);
        }
Example #19
            public object Convert(IConversionRequest request)
            {
                var stringValue = request.Text;

                if (stringValue.ToUpper() == StringConverterStrategy.EMPTY || stringValue.Trim().IsEmpty())
                {
                    return(Array.CreateInstance(_innerType, 0));
                }

                var csvTokenizer = new CsvTokenizer();

                csvTokenizer.Read(stringValue);
                var tokens = csvTokenizer.Tokens.Select(t => t.Trim()).ToList();

                var array = Array.CreateInstance(_innerType, tokens.Count);

                for (var i = 0; i < tokens.Count; i++)
                {
                    var value = _inner.Convert(request.AnotherRequest(tokens[i]));
                    array.SetValue(value, i);
                }

                return(array);
            }
Example #20
        internal static void ProcessFullSegment(ReadOnlySpan <byte> bytes, bool ignoreUTF8ByteOrderMark, CsvTokenizer tokenizer, CsvReaderVisitorBase visitor)
        {
            if (ignoreUTF8ByteOrderMark)
            {
                var head = new ReadOnlySpan <byte>(UTF8BOM, 0, bytes.Length < 3 ? bytes.Length : 3);
                if (bytes.StartsWith(head))
                {
                    bytes = bytes.Slice(head.Length);
                }
            }

            tokenizer.ProcessNextChunk(bytes, visitor);
            tokenizer.ProcessEndOfStream(visitor);
        }
Example #21
        protected override void Arrange()
        {
            var tokenizer = new CsvTokenizer(TextReader);

            Parser = new CsvParser(tokenizer);
        }
Example #22
        private static bool EatUTF8BOM(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, ref ReadOnlySequence <byte> .Enumerator enumerator)
        {
            ReadOnlyMemory <byte> segment;

            while (true)
            {
                if (!enumerator.MoveNext())
                {
                    tokenizer.ProcessEndOfStream(visitor);
                    return(true);
                }

                segment = enumerator.Current;
                if (!segment.IsEmpty)
                {
                    break;
                }
            }

            var span = segment.Span;

            ReadOnlySpan <byte> head = UTF8BOM;

            // this greed should **probably** pay off most of the time.
            if (span.Length >= 3)
            {
                if (span.StartsWith(head))
                {
                    span = span.Slice(3);
                }

                tokenizer.ProcessNextChunk(span, visitor);
                return(false);
            }

            int alreadyEaten = 0;

            while (true)
            {
                if (span[0] == head[alreadyEaten])
                {
                    span = span.Slice(1);
                    if (++alreadyEaten == 3)
                    {
                        tokenizer.ProcessNextChunk(span, visitor);
                        return(false);
                    }
                }
                else
                {
                    tokenizer.ProcessNextChunk(head.Slice(0, alreadyEaten), visitor);
                    tokenizer.ProcessNextChunk(span, visitor);
                    return(false);
                }

                if (span.IsEmpty)
                {
                    while (true)
                    {
                        if (!enumerator.MoveNext())
                        {
                            tokenizer.ProcessEndOfStream(visitor);
                            return(true);
                        }

                        segment = enumerator.Current;
                        if (!segment.IsEmpty)
                        {
                            break;
                        }
                    }

                    span = segment.Span;
                }
            }
        }
Example #23
        private async ValueTask <bool> EatUTF8BOMAsync(CsvTokenizer tokenizer, CsvReaderVisitorBase visitor, IProgress <int> progress, CancellationToken cancellationToken)
        {
            var reader = _reader;

            ReadOnlySequence <byte> buffer;

            // keep asking for more until we've seen either 3+ bytes or the end of the data.
            while (true)
            {
                var result = await reader.ReadAsync(cancellationToken).ConfigureAwait(false);

                if (result.IsCanceled)
                {
                    throw new OperationCanceledException(cancellationToken);
                }

                buffer = result.Buffer;
                if (buffer.Length >= 3)
                {
                    // we've seen 3+ bytes.
                    break;
                }

                if (result.IsCompleted)
                {
                    // we've seen the end of the data.
                    Finish();
                    tokenizer.ProcessEndOfStream(visitor);
                    reader.AdvanceTo(buffer.End);
                    progress?.Report(0);
                    return(true);
                }

                // tell the reader that we've looked at everything it had to give us, and we weren't
                // able to consume any of it, so the next read should have everything we've seen so
                // far, plus at least one more byte.
                reader.AdvanceTo(buffer.Start, buffer.End);
            }

            Finish();
            return(false);

            void Finish()
            {
                Span <byte> upToFirstThreeBytes = stackalloc byte[3];
                int         alreadyEaten        = 0;

                foreach (var segment in buffer)
                {
                    int lengthToCopy = 3 - alreadyEaten;
                    if (lengthToCopy > segment.Length)
                    {
                        lengthToCopy = segment.Length;
                    }

                    segment.Slice(0, lengthToCopy).Span.CopyTo(upToFirstThreeBytes.Slice(alreadyEaten, lengthToCopy));
                    alreadyEaten += lengthToCopy;
                    if (alreadyEaten == 3)
                    {
                        break;
                    }
                }

                upToFirstThreeBytes = upToFirstThreeBytes.Slice(0, alreadyEaten);
                var head = new ReadOnlySpan <byte>(UTF8BOM, 0, alreadyEaten);

                if (!upToFirstThreeBytes.SequenceEqual(head))
                {
                    tokenizer.ProcessNextChunk(upToFirstThreeBytes, visitor);
                }

                reader.AdvanceTo(buffer.GetPosition(alreadyEaten));
                progress?.Report(alreadyEaten);
            }
        }