/// <summary>
/// Walks backwards from <paramref name="from"/> over at most one trailing hard line
/// terminator and then over any run of spaces that precedes it.
/// </summary>
/// <param name="from">The position to start scanning backwards from.</param>
/// <returns>The adjusted position; never less than zero when <paramref name="from"/> is non-negative.</returns>
private int FindPriorNonWhitespace(int from)
{
    // Step back over a single mandatory break / line feed / carriage return, if present.
    if (from > 0)
    {
        var terminator = Codepoint.ReadAt(_text, from - 1, out var terminatorLength);

        switch (terminator.LineBreakClass)
        {
            case LineBreakClass.MandatoryBreak:
            case LineBreakClass.LineFeed:
            case LineBreakClass.CarriageReturn:
                from -= terminatorLength;
                break;
        }
    }

    // Then keep stepping back for as long as the preceding code point is a space.
    while (from > 0)
    {
        var previous = Codepoint.ReadAt(_text, from - 1, out var previousLength);

        if (previous.LineBreakClass != LineBreakClass.Space)
        {
            break;
        }

        from -= previousLength;
    }

    return from;
}
/// <summary>
/// Reads the code point at the current position, advances the position past it
/// and returns its line break class after mapping.
/// </summary>
/// <returns>The mapped line break class of the code point that was consumed.</returns>
private LineBreakClass ReadCharClass()
{
    var codepoint = Codepoint.ReadAt(_text, _pos, out var length);

    _pos += length;

    return MapClass(codepoint.LineBreakClass);
}
/// <summary>
/// Moves to the next <see cref="Codepoint"/>.
/// </summary>
/// <returns>
/// <c>true</c> when a code point was read into <c>Current</c> and removed from the remaining text;
/// <c>false</c> when the text is empty, in which case <c>Current</c> is set to the replacement code point.
/// </returns>
public bool MoveNext()
{
    if (!_text.IsEmpty)
    {
        // Decode from the front of the remaining text, then drop what was consumed.
        Current = Codepoint.ReadAt(_text, 0, out var consumed);
        _text = _text.Skip(consumed);

        return true;
    }

    Current = Codepoint.ReplacementCodepoint;

    return false;
}
/// <summary>
/// Advances past the current scalar and refreshes <c>CurrentCodepoint</c> and <c>CurrentType</c>.
/// Once the end of the buffer is reached the current code point becomes the replacement code point.
/// </summary>
public void MoveNext()
{
    // For ill-formed subsequences (like unpaired UTF-16 surrogate code points), we rely on
    // the decoder's default behavior of interpreting these ill-formed subsequences as
    // equivalent to U+FFFD REPLACEMENT CHARACTER. This code point has a boundary property
    // of Other (XX), which matches the modifications made to UAX#29, Rev. 35.
    // See: https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
    // This change is also reflected in the UCD files. For example, Unicode 11.0's UCD file
    // https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
    // has the line "D800..DFFF ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
    // but starting with Unicode 12.0 that line has been removed.
    //
    // If a later version of the Unicode Standard further modifies this guidance we should reflect
    // that here.

    // Only step forward while we are not already parked exactly at the end of the buffer.
    if (CurrentCodeUnitOffset != _buffer.Length)
    {
        CurrentCodeUnitOffset += _codeUnitLengthOfCurrentScalar;
    }

    // Decode the next scalar when one is available; otherwise fall back to U+FFFD.
    CurrentCodepoint = CurrentCodeUnitOffset < _buffer.Length
        ? Codepoint.ReadAt(_buffer, CurrentCodeUnitOffset, out _codeUnitLengthOfCurrentScalar)
        : Codepoint.ReplacementCodepoint;

    CurrentType = CurrentCodepoint.GraphemeBreakClass;
}
/// <summary>
/// Scans forward from the current position until the next line break is found
/// and stores it in <c>Current</c>.
/// </summary>
/// <returns>
/// <c>true</c> when <c>Current</c> was set to a new break; <c>false</c> when the text is
/// exhausted and no further break is reported.
/// </returns>
public bool MoveNext()
{
    // get the first char if we're at the beginning of the string
    // A leading space is not consumed here: the current class is seeded with WordJoiner
    // and the space itself is read by the loop below.
    if (!_curClass.HasValue)
    {
        _curClass = PeekCharClass() == LineBreakClass.Space ? LineBreakClass.WordJoiner : MapFirst(ReadCharClass());
    }

    while (_pos < _text.Length)
    {
        // Remember where the character about to be read starts, and what was read before it.
        _lastPos = _pos;
        var lastClass = _nextClass;
        _nextClass = ReadCharClass();

        // explicit newline
        // Note: && binds tighter than ||, so a CarriageReturn only breaks here when it is
        // not followed by a LineFeed, while a MandatoryBreak always breaks.
        if (_curClass.HasValue && (_curClass == LineBreakClass.MandatoryBreak || _curClass == LineBreakClass.CarriageReturn && _nextClass != LineBreakClass.LineFeed))
        {
            _curClass = MapFirst(MapClass(_nextClass.Value));
            // The third argument is the slot filled by the `required` local at the end of this method.
            Current = new LineBreak(FindPriorNonWhitespace(_lastPos), _lastPos, true);
            return(true);
        }

        // handle classes not handled by the pair table
        LineBreakClass?cur = null;
        switch (_nextClass.Value)
        {
            case LineBreakClass.Space:
                // A space leaves the current class unchanged.
                cur = _curClass;
                break;
            case LineBreakClass.MandatoryBreak:
            case LineBreakClass.LineFeed:
            case LineBreakClass.NextLine:
                cur = LineBreakClass.MandatoryBreak;
                break;
            case LineBreakClass.CarriageReturn:
                cur = LineBreakClass.CarriageReturn;
                break;
            case LineBreakClass.ContingentBreak:
                cur = LineBreakClass.BreakAfter;
                break;
        }

        if (cur != null)
        {
            _curClass = cur;
            if (_nextClass.Value == LineBreakClass.MandatoryBreak)
            {
                // Break after the character just read rather than before it.
                _lastPos = _pos;
                Current = new LineBreak(FindPriorNonWhitespace(_lastPos), _lastPos, true);
                return(true);
            }
            continue;
        }

        // if not handled already, use the pair table
        var shouldBreak = false;
        switch (BreakPairTable.Map(_curClass.Value, _nextClass.Value))
        {
            case PairBreakType.DI: // Direct break
                shouldBreak = true;
                _lastPos = _pos;
                break;
            case PairBreakType.IN: // possible indirect break
                // Break only when the previously read class was a space.
                shouldBreak = lastClass.HasValue && lastClass.Value == LineBreakClass.Space;
                break;
            case PairBreakType.CI:
                shouldBreak = lastClass.HasValue && lastClass.Value == LineBreakClass.Space;
                if (!shouldBreak)
                {
                    // Skips the _curClass update below.
                    continue;
                }
                break;
            case PairBreakType.CP: // prohibited for combining marks
                if (!lastClass.HasValue || lastClass.Value != LineBreakClass.Space)
                {
                    // Skips the _curClass update below.
                    continue;
                }
                // Falling through here leaves shouldBreak false; only _curClass advances.
                break;
        }

        _curClass = _nextClass;
        if (shouldBreak)
        {
            Current = new LineBreak(FindPriorNonWhitespace(_lastPos), _lastPos);
            return(true);
        }
    }

    // End of text: report one final break at the end if the last reported position is short of it.
    if (_pos >= _text.Length)
    {
        if (_lastPos < _text.Length)
        {
            _lastPos = _text.Length;
            // The final break is flagged as required when the text ends in a hard line terminator.
            var cls = Codepoint.ReadAt(_text, _text.Length - 1, out _).LineBreakClass;
            bool required = cls == LineBreakClass.MandatoryBreak || cls == LineBreakClass.LineFeed || cls == LineBreakClass.CarriageReturn;
            Current = new LineBreak(FindPriorNonWhitespace(_text.Length), _text.Length, required);
            return(true);
        }
    }

    return(false);
}
/// <summary>
/// Returns the mapped line break class of the code point at the current position
/// without advancing the position.
/// </summary>
private LineBreakClass PeekCharClass()
{
    var upcoming = Codepoint.ReadAt(_text, _pos, out _);

    return MapClass(upcoming.LineBreakClass);
}
/// <summary>
/// Reads the code point at the current position, advances past it and returns its mapped
/// line break class. While doing so it updates the rule-exception flags
/// (LB22, LB24, LB25, LB30, LB31) that depend on the class just read.
/// </summary>
/// <returns>The mapped line break class of the code point that was consumed.</returns>
private LineBreakClass NextCharClass()
{
    var codepoint = Codepoint.ReadAt(_text, _position, out var length);
    var charClass = MapClass(codepoint);

    _position += length;

    var isCombiningMark = charClass == LineBreakClass.CombiningMark;

    // Keep track of alphanumeric + any combining marks.
    // This is used for LB22 and LB30.
    if (IsAlphaNumeric(_currentClass) || (_alphaNumericCount > 0 && isCombiningMark))
    {
        _alphaNumericCount++;
    }

    if (isCombiningMark)
    {
        // LB31: a combining mark on the first character is an exception.
        if (_first)
        {
            _lb31 = true;
        }

        // Combining mark exceptions keyed on the current class:
        // every class below raises LB31, and all of them except ZWJ also raise LB22.
        switch (_currentClass)
        {
            case LineBreakClass.MandatoryBreak:
            case LineBreakClass.ContingentBreak:
            case LineBreakClass.Exclamation:
            case LineBreakClass.LineFeed:
            case LineBreakClass.NextLine:
            case LineBreakClass.Space:
            case LineBreakClass.ZWSpace:
            case LineBreakClass.CarriageReturn:
                _lb22ex = true;
                _lb31 = true;
                break;
            case LineBreakClass.ZWJ:
                _lb31 = true;
                break;
        }
    }

    // LB31: postfix/prefix numerics and spaces, either first or directly after an alphabetic.
    var isAffixOrSpace = charClass == LineBreakClass.PostfixNumeric
        || charClass == LineBreakClass.PrefixNumeric
        || charClass == LineBreakClass.Space;

    if (isAffixOrSpace && (_first || _currentClass == LineBreakClass.Alphabetic))
    {
        _lb31 = true;
    }

    // Reset LB31 if next is U+0028 (Left Opening Parenthesis)
    var isLeftParenthesis = charClass == LineBreakClass.OpenPunctuation && codepoint.Value == 0x0028;

    if (_lb31
        && _currentClass != LineBreakClass.PostfixNumeric
        && _currentClass != LineBreakClass.PrefixNumeric
        && isLeftParenthesis)
    {
        _lb31 = false;
    }

    // Rules LB24 and LB25: exceptions raised by the class of the first character.
    if (_first)
    {
        switch (charClass)
        {
            case LineBreakClass.ClosePunctuation:
                _lb24ex = true;
                _lb25ex = true;
                break;
            case LineBreakClass.CloseParenthesis:
                _lb24ex = true;
                break;
            case LineBreakClass.InfixNumeric:
            case LineBreakClass.BreakSymbols:
                _lb25ex = true;
                break;
        }
    }

    // Rule LB25: look one character ahead after a space, word joiner or alphabetic.
    switch (charClass)
    {
        case LineBreakClass.Space:
        case LineBreakClass.WordJoiner:
        case LineBreakClass.Alphabetic:
            switch (PeekNextCharClass())
            {
                case LineBreakClass.ClosePunctuation:
                case LineBreakClass.InfixNumeric:
                case LineBreakClass.BreakSymbols:
                    _lb25ex = true;
                    break;
            }

            break;
    }

    // AlphaNumeric + and combining marks can break for OP except.
    // - U+0028 (Left Opening Parenthesis)
    // - U+005B (Opening Square Bracket)
    // - U+007B (Left Curly Bracket)
    // See custom columns|rules in the text pair table.
    // https://www.unicode.org/Public/13.0.0/ucd/auxiliary/LineBreakTest.html
    _lb30 = _alphaNumericCount > 0
        && charClass == LineBreakClass.OpenPunctuation
        && !(codepoint.Value == 0x0028 || codepoint.Value == 0x005B || codepoint.Value == 0x007B);

    return charClass;
}
/// <summary>
/// Returns the mapped line break class of the code point at the current position
/// without advancing the position.
/// </summary>
private LineBreakClass PeekNextCharClass()
{
    return MapClass(Codepoint.ReadAt(_text, _position, out _));
}
/// <summary>
/// Decides whether a break is allowed between <c>_currentClass</c> and <c>_nextClass</c> by
/// consulting the pair table and then applying the rule overrides tracked in the
/// <c>_lb*</c> fields. On the normal exit path <c>_currentClass</c> is advanced to <c>_nextClass</c>.
/// </summary>
/// <param name="lastClass">
/// The class compared against <see cref="LineBreakClass.Space"/> for indirect breaks and
/// inspected by rule LB22. Presumably the class read just before <c>_nextClass</c> — confirm against the caller.
/// </param>
/// <returns><c>true</c> when a break is allowed; otherwise <c>false</c>.</returns>
private bool GetPairTableBreak(LineBreakClass lastClass)
{
    // If not handled already, use the pair table
    bool shouldBreak = false;
    switch (LineBreakPairTable.Table[(int)_currentClass][(int)_nextClass])
    {
        case LineBreakPairTable.DIBRK: // Direct break
            shouldBreak = true;
            break;
        // TODO: Rewrite this so that it defaults to true and rules are set as exceptions.
        case LineBreakPairTable.INBRK: // Possible indirect break
            // The checks below run in order; the first exception flag that matches forces a
            // break, is consumed (reset), and leaves the switch.
            // LB31
            if (_lb31 && _nextClass == LineBreakClass.OpenPunctuation)
            {
                shouldBreak = true;
                _lb31 = false;
                break;
            }
            // LB30
            if (_lb30)
            {
                shouldBreak = true;
                _lb30 = false;
                _alphaNumericCount = 0;
                break;
            }
            // LB25
            if (_lb25ex && (_nextClass == LineBreakClass.PrefixNumeric || _nextClass == LineBreakClass.Numeric))
            {
                shouldBreak = true;
                _lb25ex = false;
                break;
            }
            // LB24
            if (_lb24ex && (_nextClass == LineBreakClass.PostfixNumeric || _nextClass == LineBreakClass.PrefixNumeric))
            {
                shouldBreak = true;
                _lb24ex = false;
                break;
            }
            // LB18
            shouldBreak = lastClass == LineBreakClass.Space;
            break;
        case LineBreakPairTable.CIBRK:
            shouldBreak = lastClass == LineBreakClass.Space;
            if (!shouldBreak)
            {
                // Early exit: none of the rules below run and _currentClass is not advanced.
                return(false);
            }
            break;
        case LineBreakPairTable.CPBRK: // prohibited for combining marks
            if (lastClass != LineBreakClass.Space)
            {
                // Early exit: none of the rules below run and _currentClass is not advanced.
                return(false);
            }
            break;
        case LineBreakPairTable.PRBRK:
            // Leaves shouldBreak at its initial value of false.
            break;
    }

    // Rule LB22
    if (_nextClass == LineBreakClass.Inseparable)
    {
        switch (lastClass)
        {
            case LineBreakClass.MandatoryBreak:
            case LineBreakClass.ContingentBreak:
            case LineBreakClass.Exclamation:
            case LineBreakClass.LineFeed:
            case LineBreakClass.NextLine:
            case LineBreakClass.Space:
            case LineBreakClass.ZWSpace:
                // Allow break
                break;
            case LineBreakClass.CombiningMark:
                if (_lb22ex)
                {
                    // Allow break
                    _lb22ex = false;
                    break;
                }
                shouldBreak = false;
                break;
            default:
                shouldBreak = false;
                break;
        }
    }

    // While _lb8a is set, any break decided above is suppressed.
    if (_lb8a)
    {
        shouldBreak = false;
    }

    // Rule LB21a
    // The flag is armed when the current class is a Hebrew letter and consumed on the next
    // call if the current class is then a hyphen or break-after.
    if (_lb21a && (_currentClass == LineBreakClass.Hyphen || _currentClass == LineBreakClass.BreakAfter))
    {
        shouldBreak = false;
        _lb21a = false;
    }
    else
    {
        _lb21a = _currentClass == LineBreakClass.HebrewLetter;
    }

    // Rule LB30a
    // _lb30a counts consecutive calls whose current class is a regional indicator; when it
    // reaches two and another regional indicator follows, a break is forced and the count resets.
    if (_currentClass == LineBreakClass.RegionalIndicator)
    {
        _lb30a++;
        if (_lb30a == 2 && _nextClass == LineBreakClass.RegionalIndicator)
        {
            shouldBreak = true;
            _lb30a = 0;
        }
    }
    else
    {
        _lb30a = 0;
    }

    // Rule LB30b
    if (_nextClass == LineBreakClass.EModifier && _lastPosition > 0)
    {
        // Mahjong Tiles (Unicode block) are extended pictographics but have a class of ID
        // Unassigned codepoints with Line_Break=ID in some blocks are also assigned the Extended_Pictographic property.
        // Those blocks are intended for future allocation of emoji characters.
        var cp = Codepoint.ReadAt(_text, _lastPosition - 1, out int _);
        // Suppress the break when the code point before _lastPosition lies in U+1F000..U+1F02F.
        if (Codepoint.IsInRangeInclusive(cp, 0x1F000, 0x1F02F))
        {
            shouldBreak = false;
        }
    }

    _currentClass = _nextClass;
    return(shouldBreak);
}