예제 #1
0
        // TODO: Check exception handling. The original Scala implementation throws on malformed input while
        // parsing, but the end-of-input checks at the bottom of this method do not report any CSV errors.
        /// <summary>
        /// Parses one CSV line from <c>_buffer</c>, starting at <c>_position</c>, and advances
        /// <c>_position</c> and <c>_fieldStart</c> past the bytes that were consumed.
        /// </summary>
        /// <param name="requireLineEnd">
        /// When <c>true</c>, columns are returned only if a line terminator (LF, CR or CR LF) was reached.
        /// When <c>false</c>, the end of the buffered data terminates the line as well.
        /// </param>
        /// <returns>
        /// The columns of the parsed line, or <c>null</c> when <paramref name="requireLineEnd"/> is <c>true</c>
        /// and the buffered data does not yet contain a complete line, or when
        /// <paramref name="requireLineEnd"/> is <c>false</c> and the data ends inside an open quoted field.
        /// </returns>
        /// <exception cref="MalformedCsvException">
        /// An escape character is followed by a byte that may not be escaped, the data ends right after an
        /// escape character (only when <paramref name="requireLineEnd"/> is <c>false</c>), or a closing quote
        /// is followed by something other than a delimiter or a line end.
        /// </exception>
        /// <exception cref="UnsupportedCharsetException">
        /// The first data starts with a UTF-16 LE, UTF-16 BE or UTF-32 BE byte order mark.
        /// </exception>
        private List<ByteString> ParseLine(bool requireLineEnd)
        {
            // NOTE(review): when null is returned, _position and _fieldStart have already been advanced;
            // presumably the caller restores them before retrying with more data - confirm against the caller.
            var buffer       = _buffer;
            var columns      = new List<ByteString>();
            var state        = State.LineStart;
            var fieldBuilder = new FieldBuilder(buffer, this);

            void WrongCharEscaped()
            {
                throw new MalformedCsvException($"wrong escaping at {_currentLineNo}:{_position}, only escape or delimiter may be escaped");
            }

            void WrongCharEscapedWithinQuotes()
            {
                throw new MalformedCsvException($"wrong escaping at {_currentLineNo}:{_position}, only escape or quote may be escaped within quotes");
            }

            void NoCharEscaped()
            {
                throw new MalformedCsvException($"wrong escaping at {_currentLineNo}:{_position}, no character after escape");
            }

            // Skips the LF of a CR LF pair, if one directly follows.
            void ReadPastLf()
            {
                if (_position < buffer.Count && buffer[_position] == Lf)
                {
                    _position++;
                }
            }

            // Skips a UTF-8 byte order mark and rejects the other recognised byte order marks.
            void CheckForByteOrderMark()
            {
                if (buffer.Count < 2)
                {
                    return;
                }

                if (buffer.StartsWith(ByteOrderMark.UTF8))
                {
                    _position   = 3;
                    _fieldStart = 3;
                    return;
                }

                // The message names UTF-32 LE too, presumably because its byte order mark begins with
                // the UTF-16 LE one and the two cannot be told apart by this prefix check.
                if (buffer.StartsWith(ByteOrderMark.UTF16_LE))
                {
                    throw new UnsupportedCharsetException("UTF-16 LE and UTF-32 LE");
                }

                if (buffer.StartsWith(ByteOrderMark.UTF16_BE))
                {
                    throw new UnsupportedCharsetException("UTF-16 BE");
                }

                if (buffer.StartsWith(ByteOrderMark.UTF32_BE))
                {
                    throw new UnsupportedCharsetException("UTF-32 BE");
                }
            }

            // Steps over the delimiter at _position and marks the start of the next field.
            void StartNextField()
            {
                state = State.AfterDelimiter;
                _position++;
                _fieldStart = _position;
            }

            // Steps over the line terminator at _position (CR, LF or CR LF) and ends the line.
            void EndLine(bool isCarriageReturn)
            {
                state = State.LineEnd;
                _position++;
                if (isCarriageReturn)
                {
                    ReadPastLf();
                }

                _fieldStart = _position;
            }

            // Consumes the escape sequence that starts at _position and moves to nextState.
            // Returns false when the escaped byte is not buffered yet and a complete line is required,
            // i.e. the sequence may simply be split across two chunks of input.
            bool TryConsumeEscape(bool withinQuotes, State nextState)
            {
                if (_position + 1 >= buffer.Count)
                {
                    if (requireLineEnd)
                    {
                        return false;
                    }

                    NoCharEscaped();
                }

                var escaped = buffer[_position + 1];
                if (withinQuotes)
                {
                    if (escaped != _escapeChar && escaped != _quoteChar)
                    {
                        WrongCharEscapedWithinQuotes();
                    }
                }
                else if (escaped != _escapeChar && escaped != _delimiter)
                {
                    WrongCharEscaped();
                }

                // NOTE(review): Init presumably switches the field from a plain buffer slice to a copied
                // value so the escape byte can be dropped - confirm against FieldBuilder.
                fieldBuilder.Init(escaped);
                state      = nextState;
                _position += 2;
                return true;
            }

            if (_firstData)
            {
                CheckForByteOrderMark();
                _firstData = false;
            }

            while (state != State.LineEnd && _position < buffer.Count)
            {
                var b = buffer[_position];
                switch (state)
                {
                    // The first field of a line is parsed exactly like any field that follows a delimiter,
                    // including a leading escape sequence.
                    case State.LineStart:
                    case State.AfterDelimiter:
                        if (b == _quoteChar)
                        {
                            state = State.QuoteStarted;
                            _position++;
                            _fieldStart = _position;
                        }
                        else if (b == _escapeChar)
                        {
                            if (!TryConsumeEscape(false, State.WithinField))
                            {
                                return null;
                            }
                        }
                        else if (b == _delimiter)
                        {
                            columns.Add(ByteString.Empty);
                            StartNextField();
                        }
                        else if (b == Lf || b == Cr)
                        {
                            columns.Add(ByteString.Empty);
                            EndLine(b == Cr);
                        }
                        else
                        {
                            fieldBuilder.Add(b);
                            state = State.WithinField;
                            _position++;
                        }
                        break;

                    case State.WithinField:
                        if (b == _escapeChar)
                        {
                            if (!TryConsumeEscape(false, State.WithinField))
                            {
                                return null;
                            }
                        }
                        else if (b == _delimiter)
                        {
                            columns.Add(fieldBuilder.Result(_position));
                            StartNextField();
                        }
                        else if (b == Lf || b == Cr)
                        {
                            columns.Add(fieldBuilder.Result(_position));
                            EndLine(b == Cr);
                        }
                        else
                        {
                            fieldBuilder.Add(b);
                            state = State.WithinField;
                            _position++;
                        }
                        break;

                    case State.QuoteStarted:
                    case State.WithinQuotedField:
                        // When the escape and quote characters are the same, a doubled quote is the only
                        // escape form, and it is handled by the quote branch below.
                        if (b == _escapeChar && _escapeChar != _quoteChar)
                        {
                            if (!TryConsumeEscape(true, State.WithinQuotedField))
                            {
                                return null;
                            }
                        }
                        else if (b == _quoteChar)
                        {
                            if (_position + 1 < buffer.Count && buffer[_position + 1] == _quoteChar)
                            {
                                // Doubled quote: a literal quote character inside the quoted field.
                                fieldBuilder.Init(b);
                                state      = State.WithinQuotedField;
                                _position += 2;
                            }
                            else
                            {
                                state = State.QuoteEnd;
                                _position++;
                            }
                        }
                        else
                        {
                            fieldBuilder.Add(b);
                            state = State.WithinQuotedField;
                            _position++;
                        }
                        break;

                    case State.QuoteEnd:
                        // _position is one past the closing quote, so the field ends at _position - 1.
                        if (b == _delimiter)
                        {
                            columns.Add(fieldBuilder.Result(_position - 1));
                            StartNextField();
                        }
                        else if (b == Lf || b == Cr)
                        {
                            columns.Add(fieldBuilder.Result(_position - 1));
                            EndLine(b == Cr);
                        }
                        else
                        {
                            throw new MalformedCsvException($"Expected delimiter or end of line at {_currentLineNo}:{_position}");
                        }
                        break;
                }
            }

            if (requireLineEnd)
            {
                // Without a line terminator the line may still continue in data that has not arrived yet.
                return state == State.LineEnd ? columns : null;
            }

            // The end of the buffered data terminates the line: flush whatever field is in progress.
            switch (state)
            {
                case State.AfterDelimiter:
                    columns.Add(ByteString.Empty);
                    return columns;

                case State.WithinQuotedField:
                    return null;

                case State.WithinField:
                    columns.Add(fieldBuilder.Result(_position));
                    return columns;

                case State.QuoteEnd:
                    columns.Add(fieldBuilder.Result(_position - 1));
                    return columns;

                default:
                    return columns;
            }
        }