/// <summary>
/// Writes the UTF-8 byte sequence for <paramref name="codePoint"/> at the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination; must hold at least the number of bytes the code point needs.</param>
/// <param name="encodedBytes">Receives whatever GetNumberOfEncodedBytes reports, including on failure.</param>
/// <returns>true when 1 to 4 bytes were written; false when the buffer is too small or the length is not 1..4.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, ByteSpan buffer, out int encodedBytes)
{
    encodedBytes = GetNumberOfEncodedBytes(codePoint);
    if (encodedBytes > buffer.Length)
    {
        return false;
    }

    var scalar = codePoint.Value;

    if (encodedBytes == 1)
    {
        // 0xxxxxxx
        buffer[0] = (byte)(mask_0111_1111 & scalar);
        return true;
    }

    if (encodedBytes == 2)
    {
        // 110xxxxx 10xxxxxx
        buffer[0] = (byte)(((scalar >> 6) & mask_0001_1111) | mask_1100_0000);
        buffer[1] = (byte)(((scalar >> 0) & mask_0011_1111) | mask_1000_0000);
        return true;
    }

    if (encodedBytes == 3)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buffer[0] = (byte)(((scalar >> 12) & mask_0000_1111) | mask_1110_0000);
        buffer[1] = (byte)(((scalar >> 6) & mask_0011_1111) | mask_1000_0000);
        buffer[2] = (byte)(((scalar >> 0) & mask_0011_1111) | mask_1000_0000);
        return true;
    }

    if (encodedBytes == 4)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buffer[0] = (byte)(((scalar >> 18) & mask_0000_0111) | mask_1111_0000);
        buffer[1] = (byte)(((scalar >> 12) & mask_0011_1111) | mask_1000_0000);
        buffer[2] = (byte)(((scalar >> 6) & mask_0011_1111) | mask_1000_0000);
        buffer[3] = (byte)(((scalar >> 0) & mask_0011_1111) | mask_1000_0000);
        return true;
    }

    // Any other length means the helper rejected the code point.
    return false;
}
/// <summary>
/// Timing scenario: walks the code points of span-backed Utf8String instances from the end,
/// for short/long and ASCII/non-ASCII random strings, and prints the elapsed time per case.
/// </summary>
public unsafe void ReverseEnumerateCodePointsConstructFromSpan()
{
    TestCase[] scenarios =
    {
        new TestCase(GetRandomString(5, 32, 126), "Short ASCII string", 3000000),
        new TestCase(GetRandomString(5, 32, 0xD7FF), "Short string", 3000000),
        new TestCase(GetRandomString(50000, 32, 126), "Long ASCII string", 300),
        new TestCase(GetRandomString(50000, 32, 0xD7FF), "Long string", 300)
    };

    foreach (TestCase scenario in scenarios)
    {
        Utf8String source = new Utf8String(scenario.String);

        // Pin a copy of the bytes so the string under test can be built over a pointer-based span.
        fixed (byte* pinned = source.CopyBytes())
        {
            Utf8String spanBacked = new Utf8String(new Span<byte>(pinned, source.Length));
            int remaining = scenario.Iterations;

            _timer.Restart();
            for (; remaining != 0; remaining--)
            {
                Utf8String.CodePointReverseEnumerator cursor = spanBacked.CodePoints.GetReverseEnumerator();
                while (cursor.MoveNext())
                {
                    // Read Current so the decode work is actually performed.
                    UnicodeCodePoint codePoint = cursor.Current;
                }
            }

            PrintTime(scenario);
        }
    }
}
/// <summary>
/// Decodes one UTF-8 encoded code point from the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Bytes to read from.</param>
/// <param name="codePoint">Receives the decoded value; on a late failure it holds a partial value.</param>
/// <param name="encodedBytes">Receives the sequence length announced by the first byte.</param>
/// <returns>true when the lead byte and all continuation bytes were accepted.</returns>
public static bool TryDecodeCodePoint(ReadOnlySpan<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (buffer.Length == 0)
    {
        codePoint = default(UnicodeCodePoint);
        encodedBytes = default(int);
        return false;
    }

    bool leadAccepted = TryGetFirstByteCodePointValue(buffer[0], out codePoint, out encodedBytes);
    if (!leadAccepted || buffer.Length < encodedBytes)
    {
        return false;
    }

    // TODO: Should we manually inline this for values 1-4 or will compiler do this for us?
    int position = 1;
    while (position < encodedBytes)
    {
        if (!TryReadCodePointByte(buffer[position], ref codePoint))
        {
            return false;
        }
        position++;
    }

    return true;
}
/// <summary>
/// Checks that Utf8String.IndexOf(UnicodeCodePoint) reports <paramref name="expected"/>
/// for the code point <paramref name="codePointValue"/> within <paramref name="s"/>.
/// </summary>
public void IndexOfUnicodeCodePoint(int expected, string s, uint codePointValue)
{
    Utf8String haystack = new Utf8String(s);
    var needle = (UnicodeCodePoint)codePointValue;

    int actual = haystack.IndexOf(needle);

    Assert.Equal(expected, actual);
}
/// <summary>
/// Reads a single UTF-8 encoded code point from the beginning of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Source bytes.</param>
/// <param name="codePoint">Decoded code point; may hold an intermediate value if decoding fails part-way.</param>
/// <param name="encodedBytes">Length of the sequence as declared by its first byte.</param>
/// <returns>false for an empty buffer, a rejected first byte, a truncated sequence or a bad continuation byte.</returns>
public static bool TryDecodeCodePoint(ByteSpan buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (buffer.Length == 0)
    {
        codePoint = default(UnicodeCodePoint);
        encodedBytes = default(int);
        return false;
    }

    byte leadByte = buffer[0];
    if (TryGetFirstByteCodePointValue(leadByte, out codePoint, out encodedBytes) && buffer.Length >= encodedBytes)
    {
        // TODO: Should we manually inline this for values 1-4 or will compiler do this for us?
        bool allContinuationsValid = true;
        for (int offset = 1; allContinuationsValid && offset < encodedBytes; offset++)
        {
            allContinuationsValid = TryReadCodePointByte(buffer[offset], ref codePoint);
        }
        return allContinuationsValid;
    }

    return false;
}
/// <summary>
/// Interprets the first byte of a UTF-8 sequence: obtains the sequence length and seeds
/// <paramref name="codePoint"/> with the payload bits that byte carries.
/// </summary>
/// <param name="first">First byte of the sequence.</param>
/// <param name="codePoint">Payload bits of <paramref name="first"/>, or default on failure.</param>
/// <param name="encodedBytes">Sequence length (1..4); set to 0 when the reported length is outside that range.</param>
/// <returns>true when the length is 1..4.</returns>
private static bool TryGetFirstByteCodePointValue(byte first, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (TryGetNumberOfEncodedBytesFromFirstByte(first, out encodedBytes))
    {
        if (encodedBytes == 1)
        {
            codePoint = (UnicodeCodePoint)(first & b0111_1111U);
            return true;
        }
        if (encodedBytes == 2)
        {
            codePoint = (UnicodeCodePoint)(first & b0001_1111U);
            return true;
        }
        if (encodedBytes == 3)
        {
            codePoint = (UnicodeCodePoint)(first & b0000_1111U);
            return true;
        }
        if (encodedBytes == 4)
        {
            codePoint = (UnicodeCodePoint)(first & b0000_0111U);
            return true;
        }

        // Helper succeeded but produced a length this decoder does not handle.
        encodedBytes = 0;
    }

    codePoint = default(UnicodeCodePoint);
    return false;
}
/// <summary>
/// Creates the automaton that matches exactly one Unicode code point literal.
/// </summary>
/// <param name="node">AST node whose value is a two-character prefix followed by hexadecimal digits</param>
/// <returns>An NFA matching the code point's UTF-16 unit(s), or the epsilon NFA after reporting an error</returns>
private NFA BuildNFAFromCodepoint(ASTNode node)
{
    // Drop the two-character prefix and parse the remaining hexadecimal digits.
    string hexDigits = node.Value.Substring(2);
    int scalar = Convert.ToInt32(hexDigits, 16);

    bool outsideRange = scalar < 0 || scalar >= 0x110000;
    bool inSurrogateRange = scalar >= 0xD800 && scalar <= 0xDFFF;
    if (outsideRange || inSurrogateRange)
    {
        OnError(node.Position, "The value U+{0} is not a supported unicode code point", scalar.ToString("X"));
        return BuildEpsilonNFA();
    }

    UnicodeCodePoint point = new UnicodeCodePoint(scalar);
    NFA result = NFA.NewMinimal();
    char[] units = point.GetUTF16();

    if (units.Length != 1)
    {
        // Two UTF-16 units: chain them through an intermediate state.
        NFAState between = result.AddNewState();
        result.StateEntry.AddTransition(new CharSpan(units[0], units[0]), between);
        between.AddTransition(new CharSpan(units[1], units[1]), result.StateExit);
        return result;
    }

    result.StateEntry.AddTransition(new CharSpan(units[0], units[0]), result.StateExit);
    return result;
}
/// <summary>
/// Encodes every code point from 0 to 0x10FFFF with <paramref name="encoder"/>, decodes the result
/// and checks that each decoded value equals the input. Surrogate positions are filled with code point 0.
/// </summary>
public void BruteTestingRoundtripEncodeDecodeAllUnicodeCodePoints(TextEncoder encoder)
{
    const uint maximumValidCodePoint = 0x10FFFF;
    const uint codePointCount = maximumValidCodePoint + 1;

    var source = new UnicodeCodePoint[codePointCount];
    for (int value = 0; value < source.Length; value++)
    {
        bool isSurrogate = value >= 0xD800 && value <= 0xDFFF;
        if (isSurrogate)
        {
            source[value] = new UnicodeCodePoint(0); // skip surrogate characters
            continue;
        }
        source[value] = new UnicodeCodePoint((uint)value);
    }

    var sourceSpan = new ReadOnlySpan<UnicodeCodePoint>(source);

    // Four bytes per code point is the size the original test reserves.
    uint capacity = 4 * codePointCount;
    var encoded = new Span<byte>(new byte[capacity]);
    int bytesWritten;
    Assert.True(encoder.TryEncodeFromUnicode(sourceSpan, encoded, out bytesWritten));

    var decoded = new Span<UnicodeCodePoint>(new UnicodeCodePoint[codePointCount]);
    Assert.True(encoder.TryDecodeToUnicode(encoded, decoded, out bytesWritten));

    for (int index = 0; index < source.Length; index++)
    {
        Assert.Equal(sourceSpan[index].Value, decoded[index].Value);
    }
}
/// <summary>
/// Extracts the payload bits from the lead byte of a UTF-8 sequence and reports the sequence length.
/// </summary>
/// <param name="first">Lead byte.</param>
/// <param name="codePoint">Lead-byte payload bits on success; default otherwise.</param>
/// <param name="encodedBytes">Sequence length in bytes; reset to 0 when it is not 1, 2, 3 or 4.</param>
/// <returns>true when a supported length was found.</returns>
private static bool TryGetFirstByteCodePointValue(byte first, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (!TryGetNumberOfEncodedBytesFromFirstByte(first, out encodedBytes))
    {
        codePoint = default(UnicodeCodePoint);
        return false;
    }

    if (encodedBytes == 1)
    {
        codePoint = (UnicodeCodePoint)(first & b0111_1111U);
    }
    else if (encodedBytes == 2)
    {
        codePoint = (UnicodeCodePoint)(first & b0001_1111U);
    }
    else if (encodedBytes == 3)
    {
        codePoint = (UnicodeCodePoint)(first & b0000_1111U);
    }
    else if (encodedBytes == 4)
    {
        codePoint = (UnicodeCodePoint)(first & b0000_0111U);
    }
    else
    {
        codePoint = default(UnicodeCodePoint);
        encodedBytes = 0;
        return false;
    }

    return true;
}
/// <summary>
/// Folds one UTF-8 continuation byte (bit pattern 10xxxxxx) into <paramref name="codePoint"/>.
/// </summary>
/// <param name="nextByte">Candidate continuation byte.</param>
/// <param name="codePoint">Accumulator; shifted left by 6 and OR-ed with the byte's low 6 bits on success, untouched on failure.</param>
/// <returns>false when the top two bits of <paramref name="nextByte"/> are not 10.</returns>
private static bool TryReadCodePointByte(byte nextByte, ref UnicodeCodePoint codePoint)
{
    uint unit = nextByte;
    bool isContinuation = (unit & b1100_0000U) == b1000_0000U;
    if (isContinuation)
    {
        codePoint = new UnicodeCodePoint((codePoint.Value << 6) | (b0011_1111U & unit));
    }
    return isContinuation;
}
/// <summary>
/// Encodes every code point from 0 to 0x10FFFF with <paramref name="encoder"/> and compares the bytes
/// with what <paramref name="systemTextEncoder"/> produces for the equivalent string.
/// Surrogate positions are replaced by code point 0 on both sides.
/// </summary>
/// <param name="encoder">Encoder under test.</param>
/// <param name="systemTextEncoder">Reference System.Text encoding.</param>
public void BruteTestingEncodeAllUnicodeCodePoints(TextEncoder encoder, Encoding systemTextEncoder)
{
    const uint maximumValidCodePoint = 0x10FFFF;
    UnicodeCodePoint[] codePoints = new UnicodeCodePoint[maximumValidCodePoint + 1];

    var plainText = new StringBuilder();
    for (int i = 0; i <= maximumValidCodePoint; i++)
    {
        if (i >= 0xD800 && i <= 0xDFFF)
        {
            codePoints[i] = new UnicodeCodePoint(0); // skip surrogate characters
            plainText.Append((char)0); // skip surrogate characters
        }
        else
        {
            codePoints[i] = new UnicodeCodePoint((uint)i);

            if (i > 0xFFFF)
            {
                plainText.Append(char.ConvertFromUtf32(i));
            }
            else
            {
                plainText.Append((char)i);
            }
        }
    }

    ReadOnlySpan<UnicodeCodePoint> codePointsSpan = new ReadOnlySpan<UnicodeCodePoint>(codePoints);
    uint maxBytes = 4 * (maximumValidCodePoint + 1);
    Span<byte> buffer = new Span<byte>(new byte[maxBytes]);
    int bytesWritten;
    Assert.True(encoder.TryEncodeFromUnicode(codePointsSpan, buffer, out bytesWritten));

    string unicodeString = plainText.ToString();
    ReadOnlySpan<char> characters = unicodeString.Slice();
    int byteCount = systemTextEncoder.GetByteCount(unicodeString);
    byte[] buff = new byte[byteCount];
    char[] charArray = characters.ToArray();

    systemTextEncoder.GetBytes(charArray, 0, characters.Length, buff, 0);
    Span<byte> expectedBuffer = new Span<byte>(buff);

    // The byte-by-byte loop below only covers the common prefix, so without this check
    // an encoder that stops early (or writes extra bytes) would still pass.
    Assert.Equal(expectedBuffer.Length, bytesWritten);

    int minLength = Math.Min(expectedBuffer.Length, buffer.Length);
    for (int i = 0; i < minLength; i++)
    {
        Assert.Equal(expectedBuffer[i], buffer[i]);
    }
}
/// <summary>
/// Returns this string without its trailing whitespace code points.
/// </summary>
/// <returns>The substring from offset 0 up to the position where the backwards scan stopped.</returns>
public Utf8String TrimEnd()
{
    CodePointReverseEnumerator cursor = CodePoints.GetReverseEnumerator();

    // Walk backwards until the input is exhausted or a non-whitespace code point is reached.
    while (true)
    {
        if (!cursor.MoveNext())
        {
            break;
        }
        if (!UnicodeCodePoint.IsWhitespace(cursor.Current))
        {
            break;
        }
    }

    return Substring(0, cursor.PositionInCodeUnits);
}
/// <summary>
/// Returns this string without its leading whitespace code points.
/// </summary>
/// <returns>The substring starting at the position where the forward scan stopped.</returns>
public Utf8String TrimStart()
{
    CodePointEnumerator cursor = GetCodePointEnumerator();

    // Advance until the input is exhausted or a non-whitespace code point is reached.
    while (true)
    {
        if (!cursor.MoveNext())
        {
            break;
        }
        if (!UnicodeCodePoint.IsWhitespace(cursor.Current))
        {
            break;
        }
    }

    return Substring(cursor.PositionInCodeUnits);
}
/// <summary>
/// Decodes one code point from UTF-16 data stored low byte first at the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Source bytes; 2 are needed for a single unit, 4 for a surrogate pair.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedBytes">Number of bytes consumed (2 or 4), or 0 on failure.</param>
/// <returns>
/// false when the buffer is too small, the first unit is a low surrogate, or a high surrogate
/// is not followed by a low surrogate.
/// </returns>
public static bool TryDecodeCodePoint(ByteSpan buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (buffer.Length < 2)
    {
        codePoint = default(UnicodeCodePoint);
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);
    encodedBytes = 2;

    // Notice: This is any surrogate, not only high surrogate
    bool isSurrogate = codePointValue >= SpecConstants.HighSurrogateFirstCodePoint
        && codePointValue <= SpecConstants.LowSurrogateLastCodePoint;
    if (isSurrogate)
    {
        bool isHighSurrogate = codePointValue <= SpecConstants.HighSurrogateLastCodePoint;
        if (!isHighSurrogate || buffer.Length < 4)
        {
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            // invalid high surrogate or buffer too small
            return false;
        }

        uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
        if (lowSurrogate < SpecConstants.LowSurrogateFirstCodePoint
            || lowSurrogate > SpecConstants.LowSurrogateLastCodePoint)
        {
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            // invalid low surrogate character
            return false;
        }

        unchecked
        {
            // high surrogate contains 10 first bits, low surrogate the remaining 10 bits
            codePointValue -= SpecConstants.HighSurrogateFirstCodePoint;
            codePointValue <<= 10;
            lowSurrogate -= SpecConstants.LowSurrogateFirstCodePoint;
            codePointValue |= lowSurrogate;

            // A surrogate pair stores (code point - 0x10000); the offset must be added back,
            // otherwise every supplementary code point decodes into the range 0..0xFFFFF.
            codePointValue += 0x10000;
        }
        encodedBytes = 4;
    }

    codePoint = (UnicodeCodePoint)codePointValue;
    return true;
}
/// <summary>
/// Tells whether the first code point of this string equals <paramref name="codePoint"/>.
/// </summary>
/// <param name="codePoint">Code point to compare with.</param>
/// <returns>false when the enumerator yields nothing or the first code point differs.</returns>
public bool StartsWith(UnicodeCodePoint codePoint)
{
    CodePointEnumerator cursor = GetCodePointEnumerator();
    return cursor.MoveNext() && cursor.Current == codePoint;
}
/// <summary>
/// Appends the six payload bits of a UTF-8 continuation byte to <paramref name="codePoint"/>.
/// </summary>
/// <param name="nextByte">Byte expected to have the form 10xxxxxx.</param>
/// <param name="codePoint">Value being accumulated; left unchanged when the byte is rejected.</param>
/// <returns>true when <paramref name="nextByte"/> was a continuation byte and has been merged.</returns>
private static bool TryReadCodePointByte(byte nextByte, ref UnicodeCodePoint codePoint)
{
    uint raw = nextByte;

    uint marker = raw & b1100_0000U;
    if (marker == b1000_0000U)
    {
        uint payload = b0011_1111U & raw;
        codePoint = new UnicodeCodePoint((codePoint.Value << 6) | payload);
        return true;
    }

    return false;
}
/// <summary>
/// Converts this UTF-8 string to a System.String by re-encoding each code point as UTF-16.
/// </summary>
/// <returns>The equivalent string.</returns>
/// <exception cref="Exception">A code point could not be encoded.</exception>
public override string ToString()
{
    // First pass: count UTF-16 units (1 per BMP code point, 2 otherwise).
    // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
    // TODO: is compiler gonna do the right thing here?
    // TODO: Should we use Linq's Count()?
    int charCount = 0;
    foreach (var codePoint in CodePoints)
    {
        charCount += UnicodeCodePoint.IsBmp(codePoint) ? 1 : 2;
    }

    unsafe
    {
        Span<byte> remaining;
        char* onStack = null;
        char[] onHeap = null;

        if (charCount > 256)
        {
            // HACK: Can System.Buffers be used here?
            onHeap = new char[charCount];
            remaining = onHeap.Slice().Cast<char, byte>();
        }
        else
        {
            // Small results are assembled in stack memory.
            char* scratch = stackalloc char[charCount];
            onStack = scratch;
            remaining = new Span<byte>(onStack, charCount * 2);
        }

        // Second pass: encode each code point and advance past the bytes just written.
        foreach (var codePoint in CodePoints)
        {
            int written;
            if (Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, remaining, out written))
            {
                remaining = remaining.Slice(written);
                continue;
            }

            // TODO: Change Exception type
            throw new Exception("invalid character");
        }

        // TODO: We already have a char[] and this will copy, how to avoid that
        if (onStack != null)
        {
            return new string(onStack, 0, charCount);
        }
        return new string(onHeap);
    }
}
/// <summary>
/// Encodes a sequence of code points as UTF-8 into <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoints">Code points to encode.</param>
/// <param name="buffer">Destination bytes.</param>
/// <param name="bytesWritten">Total bytes produced; 0 on failure (bytes already stored in the buffer are not cleared).</param>
/// <returns>false when a code point is unsupported or does not fit in the remaining space.</returns>
public override bool TryEncodeFromUnicode(ReadOnlySpan<UnicodeCodePoint> codePoints, Span<byte> buffer, out int bytesWritten)
{
    int capacity = buffer.Length;
    int count = codePoints.Length;
    int offset = 0;
    bytesWritten = 0;

    for (int index = 0; index < count; index++)
    {
        UnicodeCodePoint current = codePoints[index];
        int width = GetNumberOfEncodedBytes(current);

        bool fits = offset + width <= capacity;
        if (!UnicodeCodePoint.IsSupportedCodePoint(current) || !fits)
        {
            return false;
        }

        var scalar = current.Value;
        if (width == 1)
        {
            buffer[offset] = (byte)(b0111_1111U & scalar);
        }
        else if (width == 2)
        {
            buffer[offset] = (byte)(((scalar >> 6) & b0001_1111U) | b1100_0000U);
            buffer[offset + 1] = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
        }
        else if (width == 3)
        {
            buffer[offset] = (byte)(((scalar >> 12) & b0000_1111U) | b1110_0000U);
            buffer[offset + 1] = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
            buffer[offset + 2] = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
        }
        else if (width == 4)
        {
            buffer[offset] = (byte)(((scalar >> 18) & b0000_0111U) | b1111_0000U);
            buffer[offset + 1] = (byte)(((scalar >> 12) & b0011_1111U) | b1000_0000U);
            buffer[offset + 2] = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
            buffer[offset + 3] = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
        }
        else
        {
            // Length helper returned something other than 1..4.
            return false;
        }

        offset += width;
    }

    bytesWritten = offset;
    return true;
}
/// <summary>
/// Gets the part of this string that precedes the first occurrence of <paramref name="codePoint"/>.
/// </summary>
/// <param name="codePoint">Delimiter to search for.</param>
/// <param name="result">Substring from offset 0 to the match, or default when there is no match.</param>
/// <returns>true when <paramref name="codePoint"/> was found.</returns>
public bool TrySubstringTo(UnicodeCodePoint codePoint, out Utf8String result)
{
    int position = IndexOf(codePoint);
    bool found = position != StringNotFound;

    result = found ? Substring(0, position) : default(Utf8String);
    return found;
}
/// <summary>
/// Decodes one code point from UTF-16 data stored low byte first at the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Source bytes; 2 are needed for a single unit, 4 for a surrogate pair.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedBytes">Number of bytes consumed (2 or 4), or 0 on failure.</param>
/// <returns>
/// false when the buffer is too small, the first unit is not a high surrogate although it is a surrogate,
/// or the second unit is not a low surrogate.
/// </returns>
public static bool TryDecodeCodePoint(Span<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (buffer.Length < 2)
    {
        codePoint = default(UnicodeCodePoint);
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);
    encodedBytes = 2;

    if (UnicodeCodePoint.IsSurrogate((UnicodeCodePoint)codePointValue))
    {
        // TODO: Check if compiler optimized it so codePointValue low range is checked only once
        if (!UnicodeCodePoint.IsHighSurrogate((UnicodeCodePoint)codePointValue) || buffer.Length < 4)
        {
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            // invalid high surrogate or buffer too small
            return false;
        }

        uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
        if (!UnicodeCodePoint.IsLowSurrogate((UnicodeCodePoint)lowSurrogate))
        {
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            // invalid low surrogate character
            return false;
        }

        unchecked
        {
            // high surrogate contains 10 first bits, low surrogate the remaining 10 bits
            codePointValue -= UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            codePointValue <<= 10;
            lowSurrogate -= UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
            codePointValue |= lowSurrogate;

            // A surrogate pair stores (code point - 0x10000); add the offset back so that
            // supplementary code points decode to 0x10000..0x10FFFF instead of 0..0xFFFFF.
            codePointValue += 0x10000;
        }
        encodedBytes = 4;
    }

    codePoint = (UnicodeCodePoint)codePointValue;
    return true;
}
// TODO: Should this be public?
/// <summary>
/// Finds the first occurrence of <paramref name="codePoint"/> in this string.
/// </summary>
/// <param name="codePoint">Code point to look for.</param>
/// <returns>The enumerator's PositionInCodeUnits at the match, or StringNotFound when there is none.</returns>
public int IndexOf(UnicodeCodePoint codePoint)
{
    for (CodePointEnumerator cursor = GetCodePointEnumerator(); cursor.MoveNext();)
    {
        if (cursor.Current == codePoint)
        {
            return cursor.PositionInCodeUnits;
        }
    }

    return StringNotFound;
}
/// <summary>
/// Decodes one code point from UTF-16 data stored low byte first at the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Source bytes; 2 are needed for a single unit, 4 for a surrogate pair.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedBytes">Number of bytes consumed (2 or 4), or 0 on failure.</param>
/// <returns>
/// false when the buffer is too small, the first unit is not a high surrogate although it is a surrogate,
/// or the second unit is not a low surrogate.
/// </returns>
public static bool TryDecodeCodePoint(Span<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (buffer.Length < 2)
    {
        codePoint = default(UnicodeCodePoint);
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);
    encodedBytes = 2;

    if (UnicodeCodePoint.IsSurrogate((UnicodeCodePoint)codePointValue))
    {
        // TODO: Check if compiler optimized it so codePointValue low range is checked only once
        if (!UnicodeCodePoint.IsHighSurrogate((UnicodeCodePoint)codePointValue) || buffer.Length < 4)
        {
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            // invalid high surrogate or buffer too small
            return false;
        }

        uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
        if (!UnicodeCodePoint.IsLowSurrogate((UnicodeCodePoint)lowSurrogate))
        {
            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            // invalid low surrogate character
            return false;
        }

        unchecked
        {
            // high surrogate contains 10 first bits, low surrogate the remaining 10 bits
            codePointValue -= UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            codePointValue <<= 10;
            lowSurrogate -= UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
            codePointValue |= lowSurrogate;

            // The pair encodes (code point - 0x10000); restore the offset, otherwise the
            // result lands in 0..0xFFFFF rather than 0x10000..0x10FFFF.
            codePointValue += 0x10000;
        }
        encodedBytes = 4;
    }

    codePoint = (UnicodeCodePoint)codePointValue;
    return true;
}
/// <summary>
/// Builds the UTF-8 encoding of the code point formed by a UTF-16 surrogate pair.
/// </summary>
/// <param name="highSurrogate">High (leading) surrogate.</param>
/// <param name="lowSurrogate">Low (trailing) surrogate.</param>
/// <exception cref="Exception">The encoder rejected the code point.</exception>
public unsafe Utf8EncodedCodePoint(char highSurrogate, char lowSurrogate) : this()
{
    int scalar = char.ConvertToUtf32(highSurrogate, lowSurrogate);
    UnicodeCodePoint codePoint = (UnicodeCodePoint)(uint)scalar;

    // Encode directly into the 4 bytes starting at _byte0.
    fixed (byte* firstByte = &_byte0)
    {
        Span<byte> storage = new Span<byte>(firstByte, 4);
        bool encoded = Utf8Encoder.TryEncodeCodePoint(codePoint, storage, out _length);
        if (!encoded)
        {
            // TODO: Change exception type
            throw new Exception("Internal error: this should never happen as codePoint should be within acceptable range");
        }
    }
}
/// <summary>
/// Returns how many bytes the UTF-8 encoding of <paramref name="codePoint"/> occupies.
/// </summary>
/// <param name="codePoint">Code point to measure.</param>
/// <returns>1, 2, 3 or 4; -1 when the value is above U+10FFFF.</returns>
internal static int GetNumberOfEncodedBytes(UnicodeCodePoint codePoint)
{
    if (codePoint.Value <= 0x7F)
    {
        return 1;
    }

    if (codePoint.Value <= 0x7FF)
    {
        return 2;
    }

    if (codePoint.Value <= 0xFFFF)
    {
        return 3;
    }

    // The 4-byte form has room for values up to 0x1FFFFF, but Unicode ends at U+10FFFF;
    // anything larger must be rejected rather than encoded.
    if (codePoint.Value <= 0x10FFFF)
    {
        return 4;
    }

    return -1;
}
/// <summary>
/// Decodes UTF-16 units stored low byte first from <paramref name="encoded"/> until
/// <paramref name="decoded"/> is completely filled.
/// </summary>
/// <param name="encoded">Source bytes.</param>
/// <param name="decoded">Destination; one code point is produced per element.</param>
/// <param name="bytesWritten">Number of source bytes consumed; 0 on failure.</param>
/// <returns>false when fewer than 2 source bytes remain before the destination is full.</returns>
public override bool TryDecodeToUnicode(Span<byte> encoded, Span<UnicodeCodePoint> decoded, out int bytesWritten)
{
    int inputLength = encoded.Length;
    int slots = decoded.Length;
    int consumed = 0;
    bytesWritten = 0;

    for (int index = 0; index < slots; index++)
    {
        int remaining = inputLength - consumed;
        if (remaining < 2)
        {
            return false;
        }

        uint firstUnit = (uint)(encoded[consumed + 1] << 8 | encoded[consumed]);
        UnicodeCodePoint current = new UnicodeCodePoint(firstUnit);
        int width = 2;

        if (remaining >= 4)
        {
            uint secondUnit = (uint)(encoded[consumed + 3] << 8 | encoded[consumed + 2]);

            bool firstIsHigh = firstUnit >= UnicodeConstants.Utf16HighSurrogateFirstCodePoint
                && firstUnit <= UnicodeConstants.Utf16HighSurrogateLastCodePoint;
            bool secondIsLow = secondUnit >= UnicodeConstants.Utf16LowSurrogateFirstCodePoint
                && secondUnit <= UnicodeConstants.Utf16LowSurrogateLastCodePoint;

            if (firstIsHigh && secondIsLow)
            {
                // Combine the pair: 10 bits from each unit plus the 0x10000 offset.
                uint combined = (((firstUnit - UnicodeConstants.Utf16HighSurrogateFirstCodePoint) << 10)
                    | (secondUnit - UnicodeConstants.Utf16LowSurrogateFirstCodePoint)) + 0x10000;
                current = new UnicodeCodePoint(combined);
                width = 4;
            }
        }

        decoded[index] = current;
        consumed += width;
    }

    bytesWritten = consumed;
    return true;
}
/// <summary>
/// Writes the UTF-8 encoding of <paramref name="codePoint"/> to the start of <paramref name="buffer"/>,
/// packing the leading bytes into a single multi-byte <c>buffer.Write</c> call.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination bytes.</param>
/// <param name="encodedBytes">Bytes written; 0 when the code point is unsupported or does not fit.</param>
/// <returns>true when the code point was written.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
    {
        encodedBytes = 0;
        return false;
    }

    encodedBytes = GetNumberOfEncodedBytes(codePoint);
    if (encodedBytes > buffer.Length)
    {
        encodedBytes = 0;
        return false;
    }

    var scalar = codePoint.Value;
    byte first;
    byte second;

    // NOTE(review): the packed writes put the first byte in the low bits, which presumably
    // assumes buffer.Write stores little-endian — confirm.
    if (encodedBytes == 1)
    {
        buffer[0] = (byte)(b0111_1111U & scalar);
        return true;
    }

    if (encodedBytes == 2)
    {
        first = (byte)(((scalar >> 6) & b0001_1111U) | b1100_0000U);
        second = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
        buffer.Write((ushort)(first | second << 8));
        return true;
    }

    if (encodedBytes == 3)
    {
        first = (byte)(((scalar >> 12) & b0000_1111U) | b1110_0000U);
        second = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
        buffer.Write((ushort)(first | second << 8));
        buffer[2] = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
        return true;
    }

    if (encodedBytes == 4)
    {
        first = (byte)(((scalar >> 18) & b0000_0111U) | b1111_0000U);
        second = (byte)(((scalar >> 12) & b0011_1111U) | b1000_0000U);
        byte third = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
        byte fourth = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
        buffer.Write((uint)(first | second << 8 | third << 16 | fourth << 24));
        return true;
    }

    return false;
}
/// <summary>
/// Writes the UTF-8 encoding of <paramref name="codePoint"/> to the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination bytes.</param>
/// <param name="encodedBytes">Bytes written; 0 when the code point is unsupported or does not fit.</param>
/// <returns>true when the code point was written.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
    {
        encodedBytes = 0;
        return false;
    }

    encodedBytes = GetNumberOfEncodedBytes(codePoint);
    if (encodedBytes > buffer.Length)
    {
        encodedBytes = 0;
        return false;
    }

    var scalar = codePoint.Value;

    if (encodedBytes == 1)
    {
        // 0xxxxxxx
        buffer[0] = (byte)(0b0111_1111U & scalar);
        return true;
    }

    if (encodedBytes == 2)
    {
        // 110xxxxx 10xxxxxx
        buffer[0] = (byte)(((scalar >> 6) & 0b0001_1111U) | 0b1100_0000U);
        buffer[1] = (byte)(((scalar >> 0) & 0b0011_1111U) | 0b1000_0000U);
        return true;
    }

    if (encodedBytes == 3)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buffer[0] = (byte)(((scalar >> 12) & 0b0000_1111U) | 0b1110_0000U);
        buffer[1] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
        buffer[2] = (byte)(((scalar >> 0) & 0b0011_1111U) | 0b1000_0000U);
        return true;
    }

    if (encodedBytes == 4)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buffer[0] = (byte)(((scalar >> 18) & 0b0000_0111U) | 0b1111_0000U);
        buffer[1] = (byte)(((scalar >> 12) & 0b0011_1111U) | 0b1000_0000U);
        buffer[2] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
        buffer[3] = (byte)(((scalar >> 0) & 0b0011_1111U) | 0b1000_0000U);
        return true;
    }

    return false;
}
/// <summary>
/// Writes <paramref name="codePoint"/> as UTF-16, low byte first, to the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination; needs 2 bytes below U+10000 and 4 bytes otherwise.</param>
/// <param name="encodedBytes">Bytes written (2 or 4), or 0 on failure.</param>
/// <returns>false when the code point is unsupported or the buffer is too small.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
    {
        encodedBytes = default(int);
        return false;
    }

    // TODO: Can we add this in UnicodeCodePoint class?
    // Should be represented as Surrogate?
    uint value = (uint)codePoint;
    encodedBytes = (value >= 0x10000) ? 4 : 2;

    if (buffer.Length < encodedBytes)
    {
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    if (encodedBytes == 2)
    {
        unchecked
        {
            buffer[0] = (byte)value;
            buffer[1] = (byte)(value >> 8);
        }
    }
    else
    {
        unchecked
        {
            // Surrogates carry (code point - 0x10000) split into two 10-bit halves. Without the
            // subtraction the high surrogate is off by 0x40 and, from U+100000 upwards, falls
            // into the low-surrogate range. The low 10 bits are unaffected by the subtraction.
            uint highSurrogate = ((value - 0x10000) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            uint lowSurrogate = (value & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

            buffer[0] = (byte)highSurrogate;
            buffer[1] = (byte)(highSurrogate >> 8);
            buffer[2] = (byte)lowSurrogate;
            buffer[3] = (byte)(lowSurrogate >> 8);
        }
    }

    return true;
}
/// <summary>
/// Writes <paramref name="codePoint"/> as UTF-16, low byte first, to the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination; needs 2 bytes below U+10000 and 4 bytes otherwise.</param>
/// <param name="encodedBytes">Bytes written (2 or 4), or 0 on failure.</param>
/// <returns>false when the code point is unsupported or the buffer is too small.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
    {
        encodedBytes = default(int);
        return false;
    }

    // TODO: Can we add this in UnicodeCodePoint class?
    // Should be represented as Surrogate?
    uint value = (uint)codePoint;
    encodedBytes = (value >= 0x10000) ? 4 : 2;

    if (buffer.Length < encodedBytes)
    {
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    if (encodedBytes == 2)
    {
        unchecked
        {
            buffer[0] = (byte)value;
            buffer[1] = (byte)(value >> 8);
        }
    }
    else
    {
        unchecked
        {
            // The pair encodes (code point - 0x10000): the top 10 bits go into the high surrogate,
            // the bottom 10 into the low surrogate. Omitting the subtraction yields a wrong high
            // surrogate, which leaves the high-surrogate range from U+100000 upwards.
            uint highSurrogate = ((value - 0x10000) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            uint lowSurrogate = (value & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

            buffer[0] = (byte)highSurrogate;
            buffer[1] = (byte)(highSurrogate >> 8);
            buffer[2] = (byte)lowSurrogate;
            buffer[3] = (byte)(lowSurrogate >> 8);
        }
    }

    return true;
}
/// <summary>
/// Writes <paramref name="codePoint"/> as one or two UTF-16 units at <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination pointer; no capacity check is performed here.</param>
/// <param name="encodedChars">Number of chars written (1 or 2), or 0 for an unsupported code point.</param>
/// <returns>false only when the code point is unsupported.</returns>
public unsafe static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, char* buffer, out int encodedChars)
{
    if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
    {
        encodedChars = default(int);
        return false;
    }

    // TODO: Can we add this in UnicodeCodePoint class?
    // Should be represented as Surrogate?
    bool needsSurrogatePair = (uint)codePoint >= 0x10000;

    // No buffer-size check: per the original author's note, callers preallocate 2 chars for
    // every UTF-8 byte and at most 2 chars are written, so running out of space "never happens".
    if (!needsSurrogatePair)
    {
        encodedChars = 1;
        unchecked
        {
            Write(buffer, (ushort)codePoint);
        }
        return true;
    }

    encodedChars = 2;
    unchecked
    {
        uint highSurrogate = ((uint)(codePoint.Value - 0x10000) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
        uint lowSurrogate = ((uint)codePoint & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

        // Both units are emitted with one 32-bit write, high surrogate in the low half.
        Write(buffer, highSurrogate | (lowSurrogate << 16));
    }
    return true;
}
/// <summary>
/// Encodes <paramref name="codePoint"/> into UTF-16 units written through <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination pointer; the method does not verify available space.</param>
/// <param name="encodedChars">1 for values below 0x10000, 2 otherwise, 0 when the code point is unsupported.</param>
/// <returns>true unless the code point is unsupported.</returns>
public unsafe static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, char* buffer, out int encodedChars)
{
    bool supported = UnicodeCodePoint.IsSupportedCodePoint(codePoint);
    if (!supported)
    {
        encodedChars = default(int);
        return false;
    }

    // TODO: Can we add this in UnicodeCodePoint class?
    // Should be represented as Surrogate?
    encodedChars = ((uint)codePoint < 0x10000) ? 1 : 2;

    // A capacity check was deliberately left out by the original author ("Never happens.
    // Max encodedBytes = 4 bytes = 2 chars. We already preallocate 2 chars for every UTF8 byte.").
    unchecked
    {
        if (encodedChars == 1)
        {
            Write(buffer, (ushort)codePoint);
        }
        else
        {
            uint highSurrogate = ((uint)(codePoint.Value - 0x10000) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            uint lowSurrogate = ((uint)codePoint & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
            uint packedPair = highSurrogate | (lowSurrogate << 16);
            Write(buffer, packedPair);
        }
    }

    return true;
}
/// <summary>
/// Writes <paramref name="codePoint"/> as UTF-16, low byte first, to the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode; values above 0x10FFFF are rejected.</param>
/// <param name="buffer">Destination; needs 2 bytes below U+10000 and 4 bytes otherwise.</param>
/// <param name="encodedBytes">Bytes written (2 or 4), or 0 on failure.</param>
/// <returns>false when the code point is out of range or the buffer is too small.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, ByteSpan buffer, out int encodedBytes)
{
    uint value = (uint)codePoint;
    if (value > 0x10FFFF)
    {
        encodedBytes = default(int);
        return false;
    }

    // is Surrogate?
    encodedBytes = (value >= 0x10000) ? 4 : 2;

    if (buffer.Length < encodedBytes)
    {
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    if (encodedBytes == 2)
    {
        unchecked
        {
            buffer[0] = (byte)value;
            buffer[1] = (byte)(value >> 8);
        }
    }
    else
    {
        unchecked
        {
            // The pair encodes (code point - 0x10000) as two 10-bit halves. Skipping the subtraction
            // makes the high surrogate 0x40 too large; for U+100000 and above it is no longer a
            // high surrogate at all (0xDC00 or more).
            uint highSurrogate = ((value - 0x10000) >> 10) + 0xD800;
            uint lowSurrogate = (value & MaskLow10Bits) + 0xDC00;

            buffer[0] = (byte)highSurrogate;
            buffer[1] = (byte)(highSurrogate >> 8);
            buffer[2] = (byte)lowSurrogate;
            buffer[3] = (byte)(lowSurrogate >> 8);
        }
    }

    return true;
}
/// <summary>
/// Encodes <paramref name="codePoint"/> as UTF-8 at the start of <paramref name="buffer"/>; two- and
/// four-byte groups are combined into one value and stored with a single <c>buffer.Write</c>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination bytes.</param>
/// <param name="encodedBytes">Bytes written; 0 when the code point is unsupported or does not fit.</param>
/// <returns>true when the code point was written.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    bool supported = UnicodeCodePoint.IsSupportedCodePoint(codePoint);
    if (supported)
    {
        encodedBytes = GetNumberOfEncodedBytes(codePoint);
    }
    else
    {
        encodedBytes = 0;
        return false;
    }

    if (encodedBytes > buffer.Length)
    {
        encodedBytes = 0;
        return false;
    }

    var scalar = codePoint.Value;

    // NOTE(review): packing puts the lead byte in the lowest bits, which presumably relies on
    // buffer.Write storing little-endian — confirm.
    switch (encodedBytes)
    {
        case 1:
        {
            buffer[0] = (byte)(b0111_1111U & scalar);
            return true;
        }
        case 2:
        {
            byte lead = (byte)(((scalar >> 6) & b0001_1111U) | b1100_0000U);
            byte trail = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
            buffer.Write((ushort)(lead | trail << 8));
            return true;
        }
        case 3:
        {
            byte lead = (byte)(((scalar >> 12) & b0000_1111U) | b1110_0000U);
            byte middle = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
            buffer.Write((ushort)(lead | middle << 8));
            buffer[2] = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
            return true;
        }
        case 4:
        {
            byte lead = (byte)(((scalar >> 18) & b0000_0111U) | b1111_0000U);
            byte second = (byte)(((scalar >> 12) & b0011_1111U) | b1000_0000U);
            byte third = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
            byte fourth = (byte)(((scalar >> 0) & b0011_1111U) | b1000_0000U);
            buffer.Write((uint)(lead | second << 8 | third << 16 | fourth << 24));
            return true;
        }
        default:
            return false;
    }
}
/// <summary>
/// Builds the UTF-8 encoding of a single non-surrogate UTF-16 character.
/// </summary>
/// <param name="character">Character to encode.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="character"/> is a surrogate.</exception>
/// <exception cref="Exception">The encoder rejected the code point.</exception>
public unsafe Utf8EncodedCodePoint(char character) : this()
{
    bool isSurrogate = char.IsSurrogate(character);
    if (isSurrogate)
    {
        throw new ArgumentOutOfRangeException("character", "Surrogate characters are not allowed");
    }

    // Encode directly into the 4 bytes starting at _byte0.
    fixed (byte* firstByte = &_byte0)
    {
        ByteSpan storage = new ByteSpan(firstByte, 4);
        bool encoded = Utf8Encoder.TryEncodeCodePoint((UnicodeCodePoint)(uint)character, storage, out _length);
        if (!encoded)
        {
            // TODO: Change exception type
            throw new Exception("Internal error: this should never happen as codePoint is within acceptable range and is not surrogate");
        }
    }
}
/// <summary>
/// Reads the code point that starts at <paramref name="index"/> in the UTF-16 string <paramref name="s"/>.
/// </summary>
/// <param name="s">Source string.</param>
/// <param name="index">Index of the first char of the code point.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedChars">Number of chars consumed (1 or 2), or 0 on failure.</param>
/// <returns>
/// false when <paramref name="s"/> is null, <paramref name="index"/> is out of range, or the char at
/// <paramref name="index"/> is a surrogate that does not start a valid high/low pair.
/// </returns>
public static bool TryDecodeCodePointFromString(string s, int index, out UnicodeCodePoint codePoint, out int encodedChars)
{
    if (s != null && index >= 0 && index < s.Length)
    {
        char first = s[index];

        if (!char.IsSurrogate(first))
        {
            codePoint = (UnicodeCodePoint)(uint)first;
            encodedChars = 1;
            return true;
        }

        // The pair is validated up front. char.ConvertToUtf32(string, int) throws on a lone low
        // surrogate or a high surrogate followed by a non-low char anywhere in the string, which a
        // Try method must not do; the previous code only guarded the last char.
        if (char.IsHighSurrogate(first) && index + 1 < s.Length && char.IsLowSurrogate(s[index + 1]))
        {
            codePoint = (UnicodeCodePoint)(unchecked((uint)char.ConvertToUtf32(first, s[index + 1])));
            encodedChars = 2;
            return true;
        }
    }

    codePoint = default(UnicodeCodePoint);
    encodedChars = 0;
    return false;
}
/// <summary>
/// Converts this UTF-8 string to a System.String by re-encoding each code point as UTF-16.
/// </summary>
/// <returns>The equivalent string.</returns>
/// <exception cref="Exception">A code point could not be encoded.</exception>
public override string ToString()
{
    // get length first
    // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
    // TODO: is compiler gonna do the right thing here?
    // TODO: Should we use Linq's Count()?
    int len = 0;
    foreach (var codePoint in CodePoints)
    {
        // Code points above U+FFFF need two UTF-16 chars. The previous test (IsSurrogate) looked at
        // the surrogate range instead, so supplementary characters were counted as one char and the
        // buffer below came out too small for them.
        len += codePoint.Value > 0xFFFF ? 2 : 1;
    }

    char[] characters = new char[len];

    unsafe
    {
        fixed (char* pinnedCharacters = characters)
        {
            // View the char array as bytes; each char occupies 2 bytes.
            Span<byte> buffer = new Span<byte>((byte*)pinnedCharacters, len * 2);
            foreach (var codePoint in CodePoints)
            {
                int bytesEncoded;
                if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, buffer, out bytesEncoded))
                {
                    // TODO: Change Exception type
                    throw new Exception("invalid character");
                }
                buffer = buffer.Slice(bytesEncoded);
            }
        }
    }

    // TODO: We already have a char[] and this will copy, how to avoid that
    return new string(characters);
}
/// <summary>
/// Decodes the code point that ends at the end of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Bytes whose last encoded code point is wanted.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedBytes">Length in bytes of the decoded sequence, or 0 on failure.</param>
/// <returns>
/// true only when the backward length scan succeeds, the forward decode of that slice succeeds,
/// and both agree on the sequence length.
/// </returns>
public static bool TryDecodeCodePointBackwards(ReadOnlySpan<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (TryFindEncodedCodePointBytesCountGoingBackwards(buffer, out encodedBytes))
    {
        int realEncodedBytes;
        // TODO: Inline decoding, as the invalid surrogate check can be done faster
        bool decoded = TryDecodeCodePoint(buffer.Slice(buffer.Length - encodedBytes), out codePoint, out realEncodedBytes);

        // The length found by scanning backwards must match the length declared by the first byte.
        // A failed decode must also be reported as failure: previously that case fell through to
        // "return true" with an unusable codePoint.
        if (decoded && encodedBytes == realEncodedBytes)
        {
            return true;
        }
    }

    codePoint = default(UnicodeCodePoint);
    encodedBytes = default(int);
    return false;
}
/// <summary>
/// Decodes the code point that ends at the end of <paramref name="buffer"/>.
/// </summary>
/// <param name="buffer">Bytes whose last encoded code point is wanted.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedBytes">Length in bytes of the decoded sequence, or 0 on failure.</param>
/// <returns>
/// true only when the backward length scan succeeds, the forward decode of that slice succeeds,
/// and both agree on the sequence length.
/// </returns>
public static bool TryDecodeCodePointBackwards(Span<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
{
    if (TryFindEncodedCodePointBytesCountGoingBackwards(buffer, out encodedBytes))
    {
        int realEncodedBytes;
        // TODO: Inline decoding, as the invalid surrogate check can be done faster
        bool decoded = TryDecodeCodePoint(buffer.Slice(buffer.Length - encodedBytes), out codePoint, out realEncodedBytes);

        // Success requires both a successful decode and matching lengths. The earlier condition
        // (ret && lengths differ -> false, otherwise true) returned true when the decode itself failed.
        if (decoded && encodedBytes == realEncodedBytes)
        {
            return true;
        }
    }

    codePoint = default(UnicodeCodePoint);
    encodedBytes = default(int);
    return false;
}
/// <summary>
/// Writes <paramref name="codePoint"/> as UTF-16 to the start of <paramref name="buffer"/> using a
/// single <c>buffer.Write</c> of either one 16-bit unit or a packed surrogate pair.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination; needs 2 bytes for BMP code points and 4 otherwise.</param>
/// <param name="encodedBytes">Bytes written (2 or 4), or 0 on failure.</param>
/// <returns>false when the code point is unsupported or the buffer is too small.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
    {
        encodedBytes = default(int);
        return false;
    }

    bool isBmp = UnicodeCodePoint.IsBmp(codePoint);
    int required = isBmp ? 2 : 4;
    if (buffer.Length < required)
    {
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    encodedBytes = required;
    unchecked
    {
        if (isBmp)
        {
            buffer.Write((ushort)codePoint);
        }
        else
        {
            uint scalar = (uint)codePoint;
            uint highSurrogate = ((scalar - 0x010000u) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            uint lowSurrogate = (scalar & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

            // High surrogate in the low 16 bits, low surrogate in the high 16 bits.
            buffer.Write(highSurrogate | (lowSurrogate << 16));
        }
    }

    return true;
}
/// <summary>
/// Encodes <paramref name="codePoint"/> as UTF-16 at the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination; 2 bytes are required for BMP code points, 4 for the rest.</param>
/// <param name="encodedBytes">2 or 4 on success, 0 on failure.</param>
/// <returns>true when the code point is supported and fits in the buffer.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    bool supported = UnicodeCodePoint.IsSupportedCodePoint(codePoint);
    if (!supported)
    {
        encodedBytes = default(int);
        return false;
    }

    encodedBytes = UnicodeCodePoint.IsBmp(codePoint) ? 2 : 4;
    bool fits = buffer.Length >= encodedBytes;
    if (!fits)
    {
        encodedBytes = default(int);
        // buffer too small
        return false;
    }

    if (encodedBytes != 2)
    {
        unchecked
        {
            // Split (code point - 0x10000) into a high and a low surrogate and store both at once.
            uint scalar = (uint)codePoint;
            uint lowSurrogate = (scalar & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
            uint highSurrogate = ((scalar - 0x010000u) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
            uint packedPair = highSurrogate | (lowSurrogate << 16);
            buffer.Write(packedPair);
        }
        return true;
    }

    unchecked
    {
        buffer.Write((ushort)codePoint);
    }
    return true;
}
/// <summary>
/// Returns the length in bytes of the UTF-8 encoding of <paramref name="codePoint"/>.
/// </summary>
/// <param name="codePoint">Code point to measure.</param>
/// <returns>1, 2, 3 or 4; 0 when the value is above 0x10FFFF.</returns>
private static int GetNumberOfEncodedBytes(UnicodeCodePoint codePoint)
{
    var scalar = codePoint.Value;

    return scalar <= 0x7F ? 1
        : scalar <= 0x7FF ? 2
        : scalar <= 0xFFFF ? 3
        : scalar <= 0x10FFFF ? 4
        : 0;
}
/// <summary>
/// Reads the code point that starts at <paramref name="index"/> in the UTF-16 string <paramref name="s"/>.
/// </summary>
/// <param name="s">Source string.</param>
/// <param name="index">Index of the first char of the code point.</param>
/// <param name="codePoint">Decoded code point, or default on failure.</param>
/// <param name="encodedChars">Number of chars consumed (1 or 2), or 0 on failure.</param>
/// <returns>
/// false when <paramref name="s"/> is null, <paramref name="index"/> is out of range, or the char at
/// <paramref name="index"/> is a surrogate that does not start a valid high/low pair.
/// </returns>
public static bool TryDecodeCodePointFromString(string s, int index, out UnicodeCodePoint codePoint, out int encodedChars)
{
    if (s != null && index >= 0 && index < s.Length)
    {
        char first = s[index];

        if (!char.IsSurrogate(first))
        {
            codePoint = (UnicodeCodePoint)(uint)first;
            encodedChars = 1;
            return true;
        }

        // Check the pair explicitly instead of relying on char.ConvertToUtf32(string, int), which
        // throws for an unpaired surrogate; the old guard only covered the final char of the string.
        if (char.IsHighSurrogate(first) && index + 1 < s.Length && char.IsLowSurrogate(s[index + 1]))
        {
            codePoint = (UnicodeCodePoint)(unchecked((uint)char.ConvertToUtf32(first, s[index + 1])));
            encodedChars = 2;
            return true;
        }
    }

    codePoint = default(UnicodeCodePoint);
    encodedChars = 0;
    return false;
}
/// <summary>
/// Encodes <paramref name="codePoint"/> as UTF-8 at the start of <paramref name="buffer"/>.
/// </summary>
/// <param name="codePoint">Code point to encode.</param>
/// <param name="buffer">Destination bytes.</param>
/// <param name="encodedBytes">Bytes written; 0 when the code point is unsupported or does not fit.</param>
/// <returns>true when the code point was written.</returns>
public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
{
    bool supported = UnicodeCodePoint.IsSupportedCodePoint(codePoint);
    if (supported)
    {
        encodedBytes = GetNumberOfEncodedBytes(codePoint);
    }
    else
    {
        encodedBytes = 0;
        return false;
    }

    if (encodedBytes > buffer.Length)
    {
        encodedBytes = 0;
        return false;
    }

    var scalar = codePoint.Value;

    switch (encodedBytes)
    {
        case 1:
            // 0xxxxxxx
            buffer[0] = (byte)(0b0111_1111U & scalar);
            break;

        case 2:
            // 110xxxxx 10xxxxxx
            buffer[0] = (byte)(((scalar >> 6) & 0b0001_1111U) | 0b1100_0000U);
            buffer[1] = (byte)(((scalar >> 0) & 0b0011_1111U) | 0b1000_0000U);
            break;

        case 3:
            // 1110xxxx 10xxxxxx 10xxxxxx
            buffer[0] = (byte)(((scalar >> 12) & 0b0000_1111U) | 0b1110_0000U);
            buffer[1] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
            buffer[2] = (byte)(((scalar >> 0) & 0b0011_1111U) | 0b1000_0000U);
            break;

        case 4:
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            buffer[0] = (byte)(((scalar >> 18) & 0b0000_0111U) | 0b1111_0000U);
            buffer[1] = (byte)(((scalar >> 12) & 0b0011_1111U) | 0b1000_0000U);
            buffer[2] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
            buffer[3] = (byte)(((scalar >> 0) & 0b0011_1111U) | 0b1000_0000U);
            break;

        default:
            return false;
    }

    return true;
}
/// <summary>
/// Returns how many bytes the UTF-8 encoding of <paramref name="codePoint"/> occupies.
/// </summary>
/// <param name="codePoint">Code point to measure.</param>
/// <returns>1, 2, 3 or 4; 0 when the value is above U+10FFFF.</returns>
internal static int GetNumberOfEncodedBytes(UnicodeCodePoint codePoint)
{
    if (codePoint.Value <= 0x7F)
    {
        return 1;
    }

    if (codePoint.Value <= 0x7FF)
    {
        return 2;
    }

    if (codePoint.Value <= 0xFFFF)
    {
        return 3;
    }

    // Unicode ends at U+10FFFF. The 4-byte layout could hold up to 0x1FFFFF, but such values
    // are not code points and must not be reported as encodable.
    if (codePoint.Value <= 0x10FFFF)
    {
        return 4;
    }

    return 0;
}