/*
 * GetGraphemeClusterBreakType
 * ===========================
 * Looks up the grapheme cluster boundary class of a single code point.
 * The backing data is derived from
 * https://unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table.
 *
 * rune    - the scalar value to classify.
 * returns - the GraphemeClusterBreakType stored in the segmentation table
 *           at the offset computed for that scalar value.
 */
internal static GraphemeClusterBreakType GetGraphemeClusterBreakType(Rune rune)
{
    // The offset helper performs no bounds checks (per its name), so the value
    // it returns is used directly as a byte offset into the table below.
    nuint tableOffset = GetNumericGraphemeTableOffsetNoBoundsChecks((uint)rune.Value);

    // TODO Add MemoryMarshal and GraphemeSegmentationValues, at which point the
    // lookup below can be replaced with:
    // return (GraphemeClusterBreakType)Unsafe.AddByteOffset(ref MemoryMarshal.GetReference(GraphemeSegmentationValues), offset);
    byte[] table = GetGraphemeSegmentationValues();
    ref byte tableStart = ref MemoryMarshal.GetArrayDataReference(table);
    GraphemeClusterBreakType breakType = (GraphemeClusterBreakType)Unsafe.AddByteOffset(ref tableStart, tableOffset);

    // The value has already been copied out of the table, so the array can be
    // released before returning. NOTE(review): Free() is declared outside this
    // view; presumably it releases the array's storage - confirm.
    table.Free();

    return breakType;
}
/// <summary>
/// Steps past the current scalar, decodes the scalar that follows it, and
/// records that scalar's grapheme cluster break type in <c>CurrentType</c>.
/// </summary>
public void MoveNext()
{
    // Ill-formed input (for example an unpaired UTF-16 surrogate) needs no
    // special handling here: the decoder's default behavior is to treat such
    // a subsequence as U+FFFD REPLACEMENT CHARACTER, whose boundary property
    // is Other (XX). That agrees with the modifications made in UAX#29,
    // Rev. 35: https://www.unicode.org/reports/tr29/tr29-35.html#Modifications
    //
    // The UCD data files changed accordingly. Unicode 11.0's
    // https://www.unicode.org/Public/11.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
    // contains "D800..DFFF ; Control # Cs [2048] <surrogate-D800>..<surrogate-DFFF>",
    // and that line is gone as of Unicode 12.0.
    //
    // Should a later version of the Unicode Standard revise this guidance
    // again, this method is the place to reflect it.

    // Skip over the scalar decoded by the previous call.
    CurrentCodeUnitOffset += _codeUnitLengthOfCurrentScalar;

    // Decode whatever begins at the new offset; the decoder also reports how
    // many code units that scalar occupies, which the next call will skip.
    var remaining = _buffer.Slice(CurrentCodeUnitOffset);
    _decoder(remaining, out Rune decodedScalar, out _codeUnitLengthOfCurrentScalar);

    CurrentType = CharUnicodeInfo.GetGraphemeClusterBreakType(decodedScalar);
}
/// <summary>
/// Walks <paramref name="input"/> from its start and returns the code unit offset
/// at which the first extended grapheme cluster ends, following the boundary
/// rules of UAX #29.
/// </summary>
/// <typeparam name="T">The code unit type of the input span.</typeparam>
/// <param name="input">The text to examine, as a span of code units.</param>
/// <param name="decoder">
/// Callback handed to the <c>Processor</c> together with <paramref name="input"/>.
/// NOTE(review): presumably it decodes the first scalar of a span and reports how
/// many code units it consumed - confirm against the Processor/DecodeFirstRune
/// declarations, which are outside this view.
/// </param>
/// <returns>
/// The value of <c>processor.CurrentCodeUnitOffset</c> at the point where a
/// cluster boundary was detected.
/// </returns>
private static int GetLengthOfFirstExtendedGraphemeCluster <T>(ReadOnlySpan <T> input, DecodeFirstRune <T> decoder)
{
    // Algorithm given at https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.

    // Throughout this method processor.CurrentType is the break type of the scalar
    // that has been decoded but NOT yet accepted into the cluster; calling
    // processor.MoveNext() accepts it and loads the type of the scalar after it.
    Processor <T> processor = new Processor <T>(input, decoder);
    processor.MoveNext();

    // First, consume as many Prepend scalars as we can (rule GB9b).
    while (processor.CurrentType == GraphemeClusterBreakType.Prepend)
    {
        processor.MoveNext();
    }

    // Next, make sure we're not about to violate control character restrictions.
    // Essentially, if we saw Prepend data, we can't have Control | CR | LF data afterward (rule GB5).
    // A non-zero offset here means at least one Prepend scalar was accepted above.
    if (processor.CurrentCodeUnitOffset > 0)
    {
        if (processor.CurrentType == GraphemeClusterBreakType.Control || processor.CurrentType == GraphemeClusterBreakType.CR || processor.CurrentType == GraphemeClusterBreakType.LF)
        {
            goto Return;
        }
    }

    // Now begin the main state machine.
    // Accept the cluster's base scalar and remember its type; each case below then
    // inspects processor.CurrentType (the scalar following it) to decide whether the
    // cluster may be extended. "goto case X" re-enters the switch as though a scalar
    // of type X had just been accepted.
    GraphemeClusterBreakType previousClusterBreakType = processor.CurrentType;
    processor.MoveNext();

    switch (previousClusterBreakType)
    {
        case GraphemeClusterBreakType.CR:
            if (processor.CurrentType != GraphemeClusterBreakType.LF)
            {
                goto Return; // rules GB3 & GB4 (only <LF> can follow <CR>)
            }
            // Accept the LF, then finish exactly as for a lone LF.
            processor.MoveNext();
            goto case GraphemeClusterBreakType.LF;

        case GraphemeClusterBreakType.Control:
        case GraphemeClusterBreakType.LF:
            goto Return; // rule GB4 (no data after Control | LF)

        // Hangul syllable sequences (rules GB6, GB7, GB8).
        case GraphemeClusterBreakType.L:
            if (processor.CurrentType == GraphemeClusterBreakType.L)
            {
                processor.MoveNext(); // rule GB6 (L x L)
                goto case GraphemeClusterBreakType.L;
            }
            else if (processor.CurrentType == GraphemeClusterBreakType.V)
            {
                processor.MoveNext(); // rule GB6 (L x V)
                goto case GraphemeClusterBreakType.V;
            }
            else if (processor.CurrentType == GraphemeClusterBreakType.LV)
            {
                processor.MoveNext(); // rule GB6 (L x LV)
                goto case GraphemeClusterBreakType.LV;
            }
            else if (processor.CurrentType == GraphemeClusterBreakType.LVT)
            {
                processor.MoveNext(); // rule GB6 (L x LVT)
                goto case GraphemeClusterBreakType.LVT;
            }
            else
            {
                break;
            }

        case GraphemeClusterBreakType.LV:
        case GraphemeClusterBreakType.V:
            if (processor.CurrentType == GraphemeClusterBreakType.V)
            {
                processor.MoveNext(); // rule GB7 (LV | V x V)
                goto case GraphemeClusterBreakType.V;
            }
            else if (processor.CurrentType == GraphemeClusterBreakType.T)
            {
                processor.MoveNext(); // rule GB7 (LV | V x T)
                goto case GraphemeClusterBreakType.T;
            }
            else
            {
                break;
            }

        case GraphemeClusterBreakType.LVT:
        case GraphemeClusterBreakType.T:
            if (processor.CurrentType == GraphemeClusterBreakType.T)
            {
                processor.MoveNext(); // rule GB8 (LVT | T x T)
                goto case GraphemeClusterBreakType.T;
            }
            else
            {
                break;
            }

        case GraphemeClusterBreakType.Extended_Pictograph:
            // Attempt processing extended pictographic (rules GB11, GB9).
            // First, drain any Extend scalars that might exist
            while (processor.CurrentType == GraphemeClusterBreakType.Extend)
            {
                processor.MoveNext();
            }
            // Now see if there's a ZWJ + extended pictograph again.
            if (processor.CurrentType != GraphemeClusterBreakType.ZWJ)
            {
                break;
            }
            // Accept the ZWJ. If no pictograph follows, the ZWJ still belongs to the
            // cluster, and the trailer loop after the switch picks up from here.
            processor.MoveNext();
            if (processor.CurrentType != GraphemeClusterBreakType.Extended_Pictograph)
            {
                break;
            }
            // Accept the pictograph and repeat, so that chains of
            // pictograph + Extend* + ZWJ + pictograph stay in one cluster.
            processor.MoveNext();
            goto case GraphemeClusterBreakType.Extended_Pictograph;

        case GraphemeClusterBreakType.Regional_Indicator:
            // We've consumed a single RI scalar. Try to consume another (to make it a pair).
            if (processor.CurrentType == GraphemeClusterBreakType.Regional_Indicator)
            {
                processor.MoveNext();
            }
            // Standalone RI scalars (or a single pair of RI scalars) can only be followed by trailers.
            break; // nothing but trailers after the final RI

        default:
            break;
    }

    // rules GB9, GB9a
    // Every path that left the switch via "break" ends here: accept any run of
    // trailing Extend, ZWJ, or SpacingMark scalars.
    while (processor.CurrentType == GraphemeClusterBreakType.Extend || processor.CurrentType == GraphemeClusterBreakType.ZWJ || processor.CurrentType == GraphemeClusterBreakType.SpacingMark)
    {
        processor.MoveNext();
    }

    // The scalar currently held by the processor was never accepted, so the offset
    // reported here is the boundary immediately before it.
Return:
    return(processor.CurrentCodeUnitOffset); // rules GB2, GB999
}