예제 #1
0
        /// <summary>
        /// Walks backwards from <paramref name="from"/> over at most one trailing
        /// MandatoryBreak/LineFeed/CarriageReturn code point and then over every
        /// consecutive Space-class code point that precedes it.
        /// </summary>
        /// <param name="from">Index just past the last code point to inspect.</param>
        /// <returns>The index reached after stepping back; never less than zero when <paramref name="from"/> is non-negative.</returns>
        private int FindPriorNonWhitespace(int from)
        {
            // Phase 1: drop a single hard line terminator sitting directly before the index.
            if (from > 0)
            {
                var trailing = Codepoint.ReadAt(_text, from - 1, out var trailingLength);

                switch (trailing.LineBreakClass)
                {
                    case LineBreakClass.MandatoryBreak:
                    case LineBreakClass.LineFeed:
                    case LineBreakClass.CarriageReturn:
                        from -= trailingLength;
                        break;
                }
            }

            // Phase 2: keep stepping back while the preceding code point is a space.
            while (from > 0)
            {
                var previous = Codepoint.ReadAt(_text, from - 1, out var previousLength);

                if (previous.LineBreakClass != LineBreakClass.Space)
                {
                    break;
                }

                from -= previousLength;
            }

            return from;
        }
예제 #2
0
        // Reads the code point at _pos, moves _pos past it and returns its mapped line break class.
        private LineBreakClass ReadCharClass()
        {
            var codepoint = Codepoint.ReadAt(_text, _pos, out var consumed);

            // Advance the cursor by however many code units the code point occupied.
            _pos += consumed;

            var rawClass = codepoint.LineBreakClass;

            return MapClass(rawClass);
        }
예제 #3
0
        /// <summary>
        /// Moves to the next <see cref="Codepoint"/>.
        /// </summary>
        /// <returns>
        /// <c>true</c> if a code point was read into <c>Current</c>; <c>false</c> if the
        /// remaining text was empty, in which case <c>Current</c> is set to
        /// <c>Codepoint.ReplacementCodepoint</c>.
        /// </returns>
        public bool MoveNext()
        {
            if (!_text.IsEmpty)
            {
                // Decode from the head of the remaining text, then drop what was consumed.
                Current = Codepoint.ReadAt(_text, 0, out var consumed);
                _text = _text.Skip(consumed);

                return true;
            }

            Current = Codepoint.ReplacementCodepoint;

            return false;
        }
            /// <summary>
            /// Steps past the current scalar and refreshes <c>CurrentCodepoint</c> and
            /// <c>CurrentType</c>. Once the offset has reached the end of the buffer the
            /// current code point is <c>Codepoint.ReplacementCodepoint</c>.
            /// </summary>
            public void MoveNext()
            {
                // Ill-formed subsequences (for example unpaired UTF-16 surrogates) are left to the
                // decoder, whose default behavior is to treat them as U+FFFD REPLACEMENT CHARACTER.
                // That code point has a boundary property of Other (XX), in line with the
                // modifications made to UAX#29, Rev. 35:
                // https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
                // The UCD files reflect the same change. Unicode 11.0's
                // https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
                // contains "D800..DFFF    ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
                // and that line is gone as of Unicode 12.0.
                //
                // Should a later version of the Unicode Standard revise this guidance, this code
                // needs to follow.

                var hasNext = false;

                if (CurrentCodeUnitOffset != _buffer.Length)
                {
                    // Skip over the scalar we are currently positioned on.
                    CurrentCodeUnitOffset += _codeUnitLengthOfCurrentScalar;
                    hasNext = CurrentCodeUnitOffset < _buffer.Length;
                }

                if (hasNext)
                {
                    CurrentCodepoint = Codepoint.ReadAt(_buffer, CurrentCodeUnitOffset,
                                                        out _codeUnitLengthOfCurrentScalar);
                }
                else
                {
                    CurrentCodepoint = Codepoint.ReplacementCodepoint;
                }

                CurrentType = CurrentCodepoint.GraphemeBreakClass;
            }
예제 #5
0
        /// <summary>
        /// Scans forward from <c>_pos</c> for the next line break and stores it in <c>Current</c>.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a <c>LineBreak</c> was stored in <c>Current</c>; <c>false</c> when the
        /// text has been consumed and the final break at the end of the text was already reported.
        /// </returns>
        public bool MoveNext()
        {
            // get the first char if we're at the beginning of the string
            // A leading space is only peeked (not consumed) and is treated as WordJoiner;
            // any other first code point is consumed and passed through MapFirst.
            if (!_curClass.HasValue)
            {
                _curClass = PeekCharClass() == LineBreakClass.Space ? LineBreakClass.WordJoiner : MapFirst(ReadCharClass());
            }

            while (_pos < _text.Length)
            {
                // Remember where the code point about to be read starts, and the class read
                // on the previous iteration (needed to detect "space before" for indirect breaks).
                _lastPos = _pos;
                var lastClass = _nextClass;
                _nextClass = ReadCharClass();

                // explicit newline
                // Note: && binds tighter than ||, so this reads as
                // "current is MandatoryBreak" OR "current is CarriageReturn and next is not LineFeed".
                if (_curClass.HasValue && (_curClass == LineBreakClass.MandatoryBreak || _curClass == LineBreakClass.CarriageReturn && _nextClass != LineBreakClass.LineFeed))
                {
                    _curClass = MapFirst(MapClass(_nextClass.Value));
                    // Break sits before the code point just read; third argument true is the same
                    // slot that receives `required` in the end-of-text branch below.
                    Current   = new LineBreak(FindPriorNonWhitespace(_lastPos), _lastPos, true);
                    return(true);
                }

                // handle classes not handled by the pair table
                // `cur` stays null when the next class must go through the pair table.
                LineBreakClass?cur = null;
                switch (_nextClass.Value)
                {
                case LineBreakClass.Space:
                    // Spaces do not change the current class.
                    cur = _curClass;
                    break;

                case LineBreakClass.MandatoryBreak:
                case LineBreakClass.LineFeed:
                case LineBreakClass.NextLine:
                    cur = LineBreakClass.MandatoryBreak;
                    break;

                case LineBreakClass.CarriageReturn:
                    cur = LineBreakClass.CarriageReturn;
                    break;

                case LineBreakClass.ContingentBreak:
                    cur = LineBreakClass.BreakAfter;
                    break;
                }

                if (cur != null)
                {
                    _curClass = cur;

                    // Only a literal MandatoryBreak class breaks immediately here; the break is
                    // positioned after it because _pos has already moved past the code point.
                    if (_nextClass.Value == LineBreakClass.MandatoryBreak)
                    {
                        _lastPos = _pos;
                        Current  = new LineBreak(FindPriorNonWhitespace(_lastPos), _lastPos, true);
                        return(true);
                    }

                    continue;
                }

                // if not handled already, use the pair table
                var shouldBreak = false;
                switch (BreakPairTable.Map(_curClass.Value, _nextClass.Value))
                {
                case PairBreakType.DI:     // Direct break
                    shouldBreak = true;
                    // NOTE(review): the break position is moved to after the code point just read
                    // (_pos), unlike the other cases which keep its start - confirm this is intended.
                    _lastPos    = _pos;
                    break;

                case PairBreakType.IN:     // possible indirect break
                    // Break only when the previously read class was a space.
                    shouldBreak = lastClass.HasValue && lastClass.Value == LineBreakClass.Space;
                    break;

                case PairBreakType.CI:
                    shouldBreak = lastClass.HasValue && lastClass.Value == LineBreakClass.Space;
                    if (!shouldBreak)
                    {
                        // `continue` skips the `_curClass = _nextClass` update below.
                        continue;
                    }
                    break;

                case PairBreakType.CP:     // prohibited for combining marks
                    if (!lastClass.HasValue || lastClass.Value != LineBreakClass.Space)
                    {
                        // `continue` skips the `_curClass = _nextClass` update below.
                        continue;
                    }
                    break;
                }

                _curClass = _nextClass;

                if (shouldBreak)
                {
                    Current = new LineBreak(FindPriorNonWhitespace(_lastPos), _lastPos);
                    return(true);
                }
            }

            // End of text: report one final break at the text length. Setting _lastPos to the
            // text length makes the inner condition false on subsequent calls.
            if (_pos >= _text.Length)
            {
                if (_lastPos < _text.Length)
                {
                    _lastPos = _text.Length;
                    // The final break is marked required when the text ends with a
                    // MandatoryBreak, LineFeed or CarriageReturn code point.
                    var  cls      = Codepoint.ReadAt(_text, _text.Length - 1, out _).LineBreakClass;
                    bool required = cls == LineBreakClass.MandatoryBreak || cls == LineBreakClass.LineFeed || cls == LineBreakClass.CarriageReturn;
                    Current = new LineBreak(FindPriorNonWhitespace(_text.Length), _text.Length, required);
                    return(true);
                }
            }

            return(false);
        }
예제 #6
0
 // Returns the mapped line break class of the code point at _pos; _pos is not changed.
 private LineBreakClass PeekCharClass()
 {
     var upcoming = Codepoint.ReadAt(_text, _pos, out _);

     return MapClass(upcoming.LineBreakClass);
 }
예제 #7
0
        // Consumes the code point at _position, updates the rule-tracking flags
        // (_alphaNumericCount, _lb22ex, _lb24ex, _lb25ex, _lb30, _lb31) and returns its mapped class.
        private LineBreakClass NextCharClass()
        {
            var cp  = Codepoint.ReadAt(_text, _position, out var consumed);
            var cls = MapClass(cp);

            _position += consumed;

            // Predicates on the new class that several rules below share.
            var isCombiningMark = cls == LineBreakClass.CombiningMark;
            var isNumericAffixOrSpace = cls == LineBreakClass.PostfixNumeric ||
                                        cls == LineBreakClass.PrefixNumeric ||
                                        cls == LineBreakClass.Space;

            // Count alphanumerics plus any combining marks that follow them (LB22, LB30).
            if (IsAlphaNumeric(_currentClass) || (_alphaNumericCount > 0 && isCombiningMark))
            {
                _alphaNumericCount++;
            }

            // Combining mark exceptions for LB22 and LB31.
            if (isCombiningMark)
            {
                switch (_currentClass)
                {
                    case LineBreakClass.MandatoryBreak:
                    case LineBreakClass.ContingentBreak:
                    case LineBreakClass.Exclamation:
                    case LineBreakClass.LineFeed:
                    case LineBreakClass.NextLine:
                    case LineBreakClass.Space:
                    case LineBreakClass.ZWSpace:
                    case LineBreakClass.CarriageReturn:
                        _lb22ex = true;
                        _lb31   = true;
                        break;

                    case LineBreakClass.ZWJ:
                        // ZWJ only participates in the LB31 exception.
                        _lb31 = true;
                        break;
                }

                if (_first)
                {
                    _lb31 = true;
                }
            }

            // LB31 is also armed by a numeric affix or space at the start or after an alphabetic.
            if (isNumericAffixOrSpace && (_first || _currentClass == LineBreakClass.Alphabetic))
            {
                _lb31 = true;
            }

            // Reset LB31 if next is U+0028 (Left Opening Parenthesis)
            if (_lb31 &&
                cls == LineBreakClass.OpenPunctuation &&
                cp.Value == 0x0028 &&
                _currentClass != LineBreakClass.PostfixNumeric &&
                _currentClass != LineBreakClass.PrefixNumeric)
            {
                _lb31 = false;
            }

            var isLb25Trigger = cls == LineBreakClass.ClosePunctuation ||
                                cls == LineBreakClass.InfixNumeric ||
                                cls == LineBreakClass.BreakSymbols;

            if (_first)
            {
                // Rule LB24
                if (cls == LineBreakClass.ClosePunctuation || cls == LineBreakClass.CloseParenthesis)
                {
                    _lb24ex = true;
                }

                // Rule LB25
                if (isLb25Trigger)
                {
                    _lb25ex = true;
                }
            }

            // Rule LB25 again, this time looking one code point ahead.
            switch (cls)
            {
                case LineBreakClass.Space:
                case LineBreakClass.WordJoiner:
                case LineBreakClass.Alphabetic:
                    var upcoming = PeekNextCharClass();
                    if (upcoming == LineBreakClass.ClosePunctuation ||
                        upcoming == LineBreakClass.InfixNumeric ||
                        upcoming == LineBreakClass.BreakSymbols)
                    {
                        _lb25ex = true;
                    }

                    break;
            }

            // AlphaNumeric + and combining marks can break for OP except.
            // - U+0028 (Left Opening Parenthesis)
            // - U+005B (Opening Square Bracket)
            // - U+007B (Left Curly Bracket)
            // See custom columns|rules in the text pair table.
            // https://www.unicode.org/Public/13.0.0/ucd/auxiliary/LineBreakTest.html
            var isExemptBracket = cp.Value == 0x0028 || cp.Value == 0x005B || cp.Value == 0x007B;

            _lb30 = _alphaNumericCount > 0 &&
                    cls == LineBreakClass.OpenPunctuation &&
                    !isExemptBracket;

            return cls;
        }
예제 #8
0
        // Classifies the code point at _position without moving _position.
        private LineBreakClass PeekNextCharClass() =>
            MapClass(Codepoint.ReadAt(_text, _position, out _));
예제 #9
0
        /// <summary>
        /// Decides whether a break is allowed between <c>_currentClass</c> and <c>_nextClass</c>.
        /// The pair table gives the initial answer; the rule flags tracked on this instance
        /// (LB8a, LB21a, LB22, LB24, LB25, LB30, LB30a, LB30b, LB31) then override it.
        /// </summary>
        /// <param name="lastClass">
        /// Class compared against Space to resolve indirect, CI and CP pair-table entries,
        /// and switched on for the LB22 check.
        /// </param>
        /// <returns>
        /// <c>true</c> if a break is allowed. The CI and CP cases can return <c>false</c> early,
        /// in which case the rule overrides are skipped and <c>_currentClass</c> is not updated.
        /// </returns>
        private bool GetPairTableBreak(LineBreakClass lastClass)
        {
            // If not handled already, use the pair table
            bool shouldBreak = false;

            switch (LineBreakPairTable.Table[(int)_currentClass][(int)_nextClass])
            {
            case LineBreakPairTable.DIBRK:     // Direct break
                shouldBreak = true;
                break;

            // TODO: Rewrite this so that it defaults to true and rules are set as exceptions.
            case LineBreakPairTable.INBRK:     // Possible indirect break

                // The exception flags below are checked in order; the first one that fires
                // allows the break, clears its own flag and skips the remaining checks.

                // LB31
                if (_lb31 && _nextClass == LineBreakClass.OpenPunctuation)
                {
                    shouldBreak = true;
                    _lb31       = false;
                    break;
                }

                // LB30
                if (_lb30)
                {
                    shouldBreak        = true;
                    _lb30              = false;
                    _alphaNumericCount = 0;
                    break;
                }

                // LB25
                if (_lb25ex && (_nextClass == LineBreakClass.PrefixNumeric || _nextClass == LineBreakClass.Numeric))
                {
                    shouldBreak = true;
                    _lb25ex     = false;
                    break;
                }

                // LB24
                if (_lb24ex && (_nextClass == LineBreakClass.PostfixNumeric || _nextClass == LineBreakClass.PrefixNumeric))
                {
                    shouldBreak = true;
                    _lb24ex     = false;
                    break;
                }

                // LB18
                // No exception applied: break only if the preceding class was a space.
                shouldBreak = lastClass == LineBreakClass.Space;
                break;

            case LineBreakPairTable.CIBRK:
                shouldBreak = lastClass == LineBreakClass.Space;
                if (!shouldBreak)
                {
                    // Early exit: none of the rule overrides below run and _currentClass is left as is.
                    return(false);
                }

                break;

            case LineBreakPairTable.CPBRK:     // prohibited for combining marks
                if (lastClass != LineBreakClass.Space)
                {
                    // Early exit: none of the rule overrides below run and _currentClass is left as is.
                    return(false);
                }

                // Falls out with shouldBreak still false when lastClass is Space.
                break;

            case LineBreakPairTable.PRBRK:
                // Leaves shouldBreak at false.
                break;
            }

            // Rule LB22
            // When the next class is Inseparable, the pair-table result is kept only for the
            // listed preceding classes (or a combining mark with _lb22ex set); otherwise the
            // break is suppressed.
            if (_nextClass == LineBreakClass.Inseparable)
            {
                switch (lastClass)
                {
                case LineBreakClass.MandatoryBreak:
                case LineBreakClass.ContingentBreak:
                case LineBreakClass.Exclamation:
                case LineBreakClass.LineFeed:
                case LineBreakClass.NextLine:
                case LineBreakClass.Space:
                case LineBreakClass.ZWSpace:

                    // Allow break
                    break;

                case LineBreakClass.CombiningMark:
                    if (_lb22ex)
                    {
                        // Allow break
                        _lb22ex = false;
                        break;
                    }

                    shouldBreak = false;
                    break;

                default:
                    shouldBreak = false;
                    break;
                }
            }

            // _lb8a suppresses the break; it is only read here, never cleared by this method.
            if (_lb8a)
            {
                shouldBreak = false;
            }

            // Rule LB21a
            // _lb21a is set when the current class is HebrewLetter; on the following call a
            // Hyphen or BreakAfter current class then suppresses the break and clears the flag.
            if (_lb21a && (_currentClass == LineBreakClass.Hyphen || _currentClass == LineBreakClass.BreakAfter))
            {
                shouldBreak = false;
                _lb21a      = false;
            }
            else
            {
                _lb21a = _currentClass == LineBreakClass.HebrewLetter;
            }

            // Rule LB30a
            // _lb30a counts consecutive calls whose current class is RegionalIndicator; on the
            // second one, a following RegionalIndicator forces a break and restarts the count.
            if (_currentClass == LineBreakClass.RegionalIndicator)
            {
                _lb30a++;
                if (_lb30a == 2 && _nextClass == LineBreakClass.RegionalIndicator)
                {
                    shouldBreak = true;
                    _lb30a      = 0;
                }
            }
            else
            {
                _lb30a = 0;
            }

            // Rule LB30b
            if (_nextClass == LineBreakClass.EModifier && _lastPosition > 0)
            {
                // Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
                // Unassigned codepoints with Line_Break=ID in some blocks are also assigned the Extended_Pictographic property.
                // Those blocks are intended for future allocation of emoji characters.
                // NOTE(review): U+1F000..U+1F02F lies outside the BMP; reading at _lastPosition - 1
                // relies on Codepoint.ReadAt decoding the whole code point from that index - confirm.
                var cp = Codepoint.ReadAt(_text, _lastPosition - 1, out int _);

                if (Codepoint.IsInRangeInclusive(cp, 0x1F000, 0x1F02F))
                {
                    shouldBreak = false;
                }
            }

            // The next class becomes the current class for the following call.
            _currentClass = _nextClass;

            return(shouldBreak);
        }