Example #1
0
        /// <summary>
        /// Writes the UTF-8 byte sequence of <paramref name="codePoint"/> at the start of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes.</param>
        /// <param name="encodedBytes">
        /// Receives the result of <c>GetNumberOfEncodedBytes</c>. It is assigned before any check,
        /// so it keeps that value even when the method returns false.
        /// </param>
        /// <returns>True when 1 to 4 bytes were written; false when the buffer is too short or the length is not 1-4.</returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, ByteSpan buffer, out int encodedBytes)
        {
            encodedBytes = GetNumberOfEncodedBytes(codePoint);
            if (buffer.Length < encodedBytes)
            {
                return false;
            }

            if (encodedBytes == 1)
            {
                // single byte: 0xxxxxxx
                buffer[0] = (byte)(mask_0111_1111 & codePoint.Value);
            }
            else if (encodedBytes == 2)
            {
                // 110xxxxx 10xxxxxx
                buffer[0] = (byte)(((codePoint.Value >> 6) & mask_0001_1111) | mask_1100_0000);
                buffer[1] = (byte)((codePoint.Value & mask_0011_1111) | mask_1000_0000);
            }
            else if (encodedBytes == 3)
            {
                // 1110xxxx 10xxxxxx 10xxxxxx
                buffer[0] = (byte)(((codePoint.Value >> 12) & mask_0000_1111) | mask_1110_0000);
                buffer[1] = (byte)(((codePoint.Value >> 6) & mask_0011_1111) | mask_1000_0000);
                buffer[2] = (byte)((codePoint.Value & mask_0011_1111) | mask_1000_0000);
            }
            else if (encodedBytes == 4)
            {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                buffer[0] = (byte)(((codePoint.Value >> 18) & mask_0000_0111) | mask_1111_0000);
                buffer[1] = (byte)(((codePoint.Value >> 12) & mask_0011_1111) | mask_1000_0000);
                buffer[2] = (byte)(((codePoint.Value >> 6) & mask_0011_1111) | mask_1000_0000);
                buffer[3] = (byte)((codePoint.Value & mask_0011_1111) | mask_1000_0000);
            }
            else
            {
                // any other length (e.g. a negative "not encodable" result) is rejected
                return false;
            }

            return true;
        }
Example #2
0
        /// <summary>
        /// Timing run: for each test case, rebuilds the Utf8String over a Span of its pinned byte copy
        /// and repeatedly walks its code points in reverse, then prints the elapsed time.
        /// </summary>
        public unsafe void ReverseEnumerateCodePointsConstructFromSpan()
        {
            TestCase[] scenarios =
            {
                new TestCase(GetRandomString(5, 32, 126), "Short ASCII string", 3000000),
                new TestCase(GetRandomString(5, 32, 0xD7FF), "Short string", 3000000),
                new TestCase(GetRandomString(50000, 32, 126), "Long ASCII string", 300),
                new TestCase(GetRandomString(50000, 32, 0xD7FF), "Long string", 300)
            };

            for (int index = 0; index < scenarios.Length; index++)
            {
                TestCase scenario = scenarios[index];
                Utf8String source = new Utf8String(scenario.String);

                fixed (byte* pinned = source.CopyBytes())
                {
                    // The instance under test is built from a Span over the pinned copy.
                    Utf8String text = new Utf8String(new Span<byte>(pinned, source.Length));
                    int remaining = scenario.Iterations;

                    _timer.Restart();
                    for (; remaining != 0; remaining--)
                    {
                        Utf8String.CodePointReverseEnumerator reverse = text.CodePoints.GetReverseEnumerator();
                        while (reverse.MoveNext())
                        {
                            // Current is read only so that the access is part of the timed work.
                            UnicodeCodePoint current = reverse.Current;
                        }
                    }
                    PrintTime(scenario);
                }
            }
        }
Example #3
0
        /// <summary>
        /// Decodes one UTF-8 encoded code point from the start of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Bytes to decode from.</param>
        /// <param name="codePoint">Receives the decoded code point on success.</param>
        /// <param name="encodedBytes">Receives the sequence length derived from the first byte.</param>
        /// <returns>True when the first byte and all continuation bytes were accepted.</returns>
        /// <remarks>
        /// Both out values are reset only for an empty buffer. When the buffer is shorter than the
        /// sequence or a continuation byte is rejected, they keep whatever was computed so far.
        /// </remarks>
        public static bool TryDecodeCodePoint(ReadOnlySpan <byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (buffer.Length == 0)
            {
                codePoint = default(UnicodeCodePoint);
                encodedBytes = default(int);
                return false;
            }

            // The lead byte fixes the sequence length and contributes the top payload bits.
            bool decoded = TryGetFirstByteCodePointValue(buffer[0], out codePoint, out encodedBytes)
                           && encodedBytes <= buffer.Length;

            // TODO: Should we manually inline this for values 1-4 or will compiler do this for us?
            for (int offset = 1; decoded && offset < encodedBytes; offset++)
            {
                decoded = TryReadCodePointByte(buffer[offset], ref codePoint);
            }

            return decoded;
        }
Example #4
0
        /// <summary>
        /// Checks that <c>Utf8String.IndexOf(UnicodeCodePoint)</c> returns <paramref name="expected"/>
        /// when searching <paramref name="s"/> for the code point with value <paramref name="codePointValue"/>.
        /// </summary>
        public void IndexOfUnicodeCodePoint(int expected, string s, uint codePointValue)
        {
            Utf8String haystack = new Utf8String(s);

            int actual = haystack.IndexOf((UnicodeCodePoint)codePointValue);

            Assert.Equal(expected, actual);
        }
Example #5
0
        /// <summary>
        /// Reads a single UTF-8 encoded code point from the beginning of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Source bytes.</param>
        /// <param name="codePoint">Decoded code point when the method returns true.</param>
        /// <param name="encodedBytes">Sequence length taken from the lead byte.</param>
        /// <returns>False for an empty buffer, a rejected lead byte, a truncated sequence or a rejected continuation byte.</returns>
        /// <remarks>Only the empty-buffer path resets both out values; later failures leave partial results in them.</remarks>
        public static bool TryDecodeCodePoint(ByteSpan buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (buffer.Length == 0)
            {
                codePoint = default(UnicodeCodePoint);
                encodedBytes = default(int);
                return false;
            }

            byte lead = buffer[0];
            if (!TryGetFirstByteCodePointValue(lead, out codePoint, out encodedBytes) || buffer.Length < encodedBytes)
            {
                return false;
            }

            // TODO: Should we manually inline this for values 1-4 or will compiler do this for us?
            int index = 1;
            while (index < encodedBytes)
            {
                if (!TryReadCodePointByte(buffer[index], ref codePoint))
                {
                    return false;
                }
                index++;
            }

            return true;
        }
        /// <summary>
        /// Extracts the payload bits carried by the lead byte of a UTF-8 sequence.
        /// </summary>
        /// <param name="first">Lead byte of the sequence.</param>
        /// <param name="codePoint">Receives the masked lead-byte bits, or the default value on failure.</param>
        /// <param name="encodedBytes">Receives the sequence length reported by <c>TryGetNumberOfEncodedBytesFromFirstByte</c>.</param>
        /// <returns>True when the reported length is between 1 and 4.</returns>
        private static bool TryGetFirstByteCodePointValue(byte first, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (!TryGetNumberOfEncodedBytesFromFirstByte(first, out encodedBytes))
            {
                codePoint = default(UnicodeCodePoint);
                return false;
            }

            // The longer the sequence, the fewer payload bits the lead byte holds.
            if (encodedBytes == 1)
            {
                codePoint = (UnicodeCodePoint)(first & b0111_1111U);
            }
            else if (encodedBytes == 2)
            {
                codePoint = (UnicodeCodePoint)(first & b0001_1111U);
            }
            else if (encodedBytes == 3)
            {
                codePoint = (UnicodeCodePoint)(first & b0000_1111U);
            }
            else if (encodedBytes == 4)
            {
                codePoint = (UnicodeCodePoint)(first & b0000_0111U);
            }
            else
            {
                codePoint = default(UnicodeCodePoint);
                encodedBytes = 0;
                return false;
            }

            return true;
        }
Example #7
0
        /// <summary>
        /// Builds a NFA that matches the UTF-16 encoding of a single unicode code point
        /// </summary>
        /// <param name="node">An AST node whose value is a two-character prefix followed by the hexadecimal code point value</param>
        /// <returns>
        /// The equivalent NFA; an epsilon NFA when the value cannot be parsed or is not a supported code point
        /// (in which case an error is reported through OnError)
        /// </returns>
        private NFA BuildNFAFromCodepoint(ASTNode node)
        {
            // extract the code point value: drop the two-character prefix, keep the hexadecimal digits
            string value = node.Value.Substring(2);

            // TryParse instead of Convert.ToInt32 so that digits that do not fit an Int32
            // are reported as an error on the node instead of throwing
            int cpValue;
            bool parsed = int.TryParse(
                value,
                System.Globalization.NumberStyles.AllowHexSpecifier,
                System.Globalization.CultureInfo.InvariantCulture,
                out cpValue);

            if (!parsed)
            {
                OnError(node.Position, "The value U+{0} is not a supported unicode code point", value);
                return(BuildEpsilonNFA());
            }

            // reject negative values (8 hex digits with the top bit set), surrogates and values beyond U+10FFFF
            if (cpValue < 0 || (cpValue >= 0xD800 && cpValue <= 0xDFFF) || cpValue >= 0x110000)
            {
                OnError(node.Position, "The value U+{0} is not a supported unicode code point", cpValue.ToString("X"));
                return(BuildEpsilonNFA());
            }
            UnicodeCodePoint cp = new UnicodeCodePoint(cpValue);
            // build the NFA
            NFA automata = NFA.NewMinimal();

            char[] data = cp.GetUTF16();
            if (data.Length == 1)
            {
                // one UTF-16 code unit: a single transition from entry to exit
                automata.StateEntry.AddTransition(new CharSpan(data[0], data[0]), automata.StateExit);
            }
            else
            {
                // two UTF-16 code units: chain them through an intermediate state
                NFAState intermediate = automata.AddNewState();
                automata.StateEntry.AddTransition(new CharSpan(data[0], data[0]), intermediate);
                intermediate.AddTransition(new CharSpan(data[1], data[1]), automata.StateExit);
            }
            return(automata);
        }
Example #8
0
        /// <summary>
        /// Encodes every code point from 0 to 0x10FFFF (surrogates replaced by 0) with <paramref name="encoder"/>,
        /// decodes the result again and checks that each decoded value equals the original.
        /// </summary>
        /// <param name="encoder">Encoder under test.</param>
        public void BruteTestingRoundtripEncodeDecodeAllUnicodeCodePoints(TextEncoder encoder)
        {
            const uint maximumValidCodePoint = 0x10FFFF;

            UnicodeCodePoint[] source = new UnicodeCodePoint[maximumValidCodePoint + 1];
            for (int value = 0; value < source.Length; value++)
            {
                bool isSurrogate = value >= 0xD800 && value <= 0xDFFF;

                // surrogate characters are skipped by substituting code point 0
                source[value] = isSurrogate
                    ? new UnicodeCodePoint(0)
                    : new UnicodeCodePoint((uint)value);
            }

            ReadOnlySpan <UnicodeCodePoint> sourceSpan = new ReadOnlySpan <UnicodeCodePoint>(source);

            // four bytes per code point is enough room for the encodings exercised here
            uint        capacity = 4 * (maximumValidCodePoint + 1);
            Span <byte> encoded  = new Span <byte>(new byte[capacity]);
            int         bytesWritten;

            bool encodedOk = encoder.TryEncodeFromUnicode(sourceSpan, encoded, out bytesWritten);
            Assert.True(encodedOk);

            Span <UnicodeCodePoint> roundTripped = new Span <UnicodeCodePoint>(new UnicodeCodePoint[maximumValidCodePoint + 1]);

            bool decodedOk = encoder.TryDecodeToUnicode(encoded, roundTripped, out bytesWritten);
            Assert.True(decodedOk);

            for (int index = 0; index < source.Length; index++)
            {
                Assert.Equal(sourceSpan[index].Value, roundTripped[index].Value);
            }
        }
Example #9
0
        /// <summary>
        /// Determines the length of a UTF-8 sequence from its lead byte and returns the lead byte's payload bits.
        /// </summary>
        /// <param name="first">Lead byte.</param>
        /// <param name="codePoint">Masked lead-byte bits on success; default value otherwise.</param>
        /// <param name="encodedBytes">Length reported by <c>TryGetNumberOfEncodedBytesFromFirstByte</c>; set to 0 when that length is not 1-4.</param>
        /// <returns>True when a length between 1 and 4 was reported.</returns>
        private static bool TryGetFirstByteCodePointValue(byte first, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            bool lengthKnown = TryGetNumberOfEncodedBytesFromFirstByte(first, out encodedBytes);
            if (!lengthKnown)
            {
                codePoint = default(UnicodeCodePoint);
                return false;
            }

            switch (encodedBytes)
            {
            case 1:
                codePoint = (UnicodeCodePoint)(first & b0111_1111U);
                break;

            case 2:
                codePoint = (UnicodeCodePoint)(first & b0001_1111U);
                break;

            case 3:
                codePoint = (UnicodeCodePoint)(first & b0000_1111U);
                break;

            case 4:
                codePoint = (UnicodeCodePoint)(first & b0000_0111U);
                break;

            default:
                // a length outside 1-4 is treated as a failure
                codePoint    = default(UnicodeCodePoint);
                encodedBytes = 0;
                return false;
            }

            return true;
        }
Example #10
0
        /// <summary>
        /// Appends the six payload bits of a UTF-8 continuation byte to <paramref name="codePoint"/>.
        /// </summary>
        /// <param name="nextByte">Candidate continuation byte; its top two bits must be <c>10</c>.</param>
        /// <param name="codePoint">Accumulator: shifted left by six and OR-ed with the payload; untouched on failure.</param>
        /// <returns>True when <paramref name="nextByte"/> has the continuation-byte prefix.</returns>
        private static bool TryReadCodePointByte(byte nextByte, ref UnicodeCodePoint codePoint)
        {
            uint unit = nextByte;
            bool isContinuation = (unit & b1100_0000U) == b1000_0000U;

            if (isContinuation)
            {
                codePoint = new UnicodeCodePoint((codePoint.Value << 6) | (b0011_1111U & unit));
            }

            return isContinuation;
        }
Example #11
0
        /// <summary>
        /// Encodes every code point from 0 to 0x10FFFF (surrogates replaced by 0) with <paramref name="encoder"/>
        /// and checks the output against the bytes produced by <paramref name="systemTextEncoder"/>
        /// for the equivalent string.
        /// </summary>
        /// <param name="encoder">Encoder under test.</param>
        /// <param name="systemTextEncoder">Reference encoding that supplies the expected bytes.</param>
        public void BruteTestingEncodeAllUnicodeCodePoints(TextEncoder encoder, Encoding systemTextEncoder)
        {
            const uint maximumValidCodePoint = 0x10FFFF;

            UnicodeCodePoint[] codePoints = new UnicodeCodePoint[maximumValidCodePoint + 1];

            var plainText = new StringBuilder();

            for (int i = 0; i <= maximumValidCodePoint; i++)
            {
                if (i >= 0xD800 && i <= 0xDFFF)
                {
                    codePoints[i] = new UnicodeCodePoint(0); // skip surrogate characters
                    plainText.Append((char)0);               // skip surrogate characters
                }
                else
                {
                    codePoints[i] = new UnicodeCodePoint((uint)i);

                    if (i > 0xFFFF)
                    {
                        // outside the BMP a code point needs a surrogate pair in the reference string
                        plainText.Append(char.ConvertFromUtf32(i));
                    }
                    else
                    {
                        plainText.Append((char)i);
                    }
                }
            }

            ReadOnlySpan <UnicodeCodePoint> codePointsSpan = new ReadOnlySpan <UnicodeCodePoint>(codePoints);
            uint        maxBytes = 4 * (maximumValidCodePoint + 1);
            Span <byte> buffer   = new Span <byte>(new byte[maxBytes]);
            int         bytesWritten;

            Assert.True(encoder.TryEncodeFromUnicode(codePointsSpan, buffer, out bytesWritten));

            string unicodeString           = plainText.ToString();
            ReadOnlySpan <char> characters = unicodeString.Slice();
            int byteCount = systemTextEncoder.GetByteCount(unicodeString);

            byte[] expectedBytes = new byte[byteCount];
            char[] charArray     = characters.ToArray();

            systemTextEncoder.GetBytes(charArray, 0, characters.Length, expectedBytes, 0);

            // The output buffer is oversized, so its length says nothing about how much was produced.
            // Check the reported byte count first; otherwise surplus output would go unnoticed.
            Assert.Equal(byteCount, bytesWritten);

            for (int i = 0; i < byteCount; i++)
            {
                Assert.Equal(expectedBytes[i], buffer[i]);
            }
        }
Example #12
0
        /// <summary>
        /// Returns the substring that remains after skipping whitespace code points from the end.
        /// </summary>
        /// <returns>
        /// <c>Substring(0, position)</c>, where position is the reverse enumerator's
        /// <c>PositionInCodeUnits</c> once it stops on a non-whitespace code point or runs out of input.
        /// </returns>
        public Utf8String TrimEnd()
        {
            CodePointReverseEnumerator cursor = CodePoints.GetReverseEnumerator();

            bool skipping = true;
            while (skipping)
            {
                skipping = cursor.MoveNext() && UnicodeCodePoint.IsWhitespace(cursor.Current);
            }

            return Substring(0, cursor.PositionInCodeUnits);
        }
Example #13
0
        /// <summary>
        /// Returns the substring that remains after skipping whitespace code points from the start.
        /// </summary>
        /// <returns>
        /// <c>Substring(position)</c>, where position is the enumerator's <c>PositionInCodeUnits</c>
        /// once it stops on a non-whitespace code point or runs out of input.
        /// </returns>
        public Utf8String TrimStart()
        {
            CodePointEnumerator cursor = GetCodePointEnumerator();

            for (;;)
            {
                if (!cursor.MoveNext())
                {
                    break;
                }

                if (!UnicodeCodePoint.IsWhitespace(cursor.Current))
                {
                    break;
                }
            }

            return Substring(cursor.PositionInCodeUnits);
        }
Example #14
0
        /// <summary>
        /// Decodes one code point from little-endian UTF-16 bytes at the start of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Source bytes; two bytes per UTF-16 code unit, low byte first.</param>
        /// <param name="codePoint">Receives the decoded code point; default value on failure.</param>
        /// <param name="encodedBytes">Receives 2 for a single code unit, 4 for a surrogate pair, 0 on failure.</param>
        /// <returns>
        /// False when the buffer is too small, the first unit is a low surrogate,
        /// or a high surrogate is not followed by a low surrogate.
        /// </returns>
        public static bool TryDecodeCodePoint(ByteSpan buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (buffer.Length < 2)
            {
                codePoint    = default(UnicodeCodePoint);
                encodedBytes = default(int);
                // buffer too small
                return(false);
            }

            uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);

            encodedBytes = 2;
            // Notice: This is any surrogate, not only high surrogate
            bool isSurrogate = codePointValue >= SpecConstants.HighSurrogateFirstCodePoint && codePointValue <= SpecConstants.LowSurrogateLastCodePoint;

            if (isSurrogate)
            {
                isSurrogate = codePointValue <= SpecConstants.HighSurrogateLastCodePoint;
                if (!isSurrogate || buffer.Length < 4)
                {
                    codePoint    = default(UnicodeCodePoint);
                    encodedBytes = default(int);
                    // invalid high surrogate or buffer too small
                    return(false);
                }
                unchecked
                {
                    codePointValue -= SpecConstants.HighSurrogateFirstCodePoint;
                    encodedBytes   += 2;
                }
                // high surrogate contains 10 first bits of the code point
                codePointValue <<= 10;

                uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
                if (lowSurrogate < SpecConstants.LowSurrogateFirstCodePoint || lowSurrogate > SpecConstants.LowSurrogateLastCodePoint)
                {
                    codePoint    = default(UnicodeCodePoint);
                    encodedBytes = default(int);
                    // invalid low surrogate character
                    return(false);
                }

                unchecked
                {
                    lowSurrogate -= SpecConstants.LowSurrogateFirstCodePoint;
                }
                codePointValue |= lowSurrogate;

                // A surrogate pair carries (code point - 0x10000) in its 20 payload bits,
                // so the offset has to be added back to get the actual code point.
                codePointValue += 0x10000u;
            }

            codePoint = (UnicodeCodePoint)codePointValue;

            return(true);
        }
Example #15
0
        /// <summary>
        /// Tells whether the first code point of this string equals <paramref name="codePoint"/>.
        /// </summary>
        /// <param name="codePoint">Code point to compare with the first one.</param>
        /// <returns>False when the enumerator yields no code point at all; otherwise the result of the comparison.</returns>
        public bool StartsWith(UnicodeCodePoint codePoint)
        {
            CodePointEnumerator cursor = GetCodePointEnumerator();

            return cursor.MoveNext() && cursor.Current == codePoint;
        }
Example #16
0
        /// <summary>
        /// Folds one UTF-8 continuation byte into the code point being decoded.
        /// </summary>
        /// <param name="nextByte">Byte expected to look like <c>10xxxxxx</c>.</param>
        /// <param name="codePoint">Value decoded so far; replaced by (value &lt;&lt; 6) | payload on success.</param>
        /// <returns>False, leaving <paramref name="codePoint"/> unchanged, when the byte lacks the <c>10</c> prefix.</returns>
        private static bool TryReadCodePointByte(byte nextByte, ref UnicodeCodePoint codePoint)
        {
            uint unit = nextByte;
            uint prefix = unit & b1100_0000U;

            if (prefix == b1000_0000U)
            {
                uint payload = b0011_1111U & unit;
                codePoint = new UnicodeCodePoint((codePoint.Value << 6) | payload);
                return true;
            }

            return false;
        }
Example #17
0
        /// <summary>
        /// Converts this string to a <see cref="string"/> by encoding every code point
        /// with <c>Utf16LittleEndianEncoder</c> into a char buffer.
        /// </summary>
        /// <returns>A new string built from the encoded chars.</returns>
        /// <exception cref="Exception">Thrown when the encoder rejects a code point.</exception>
        public override string ToString()
        {
            // get length first
            // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
            // TODO: is compiler gonna do the right thing here?
            // TODO: Should we use Linq's Count()?
            int len = 0;

            // First pass: count chars - one per code point, plus one more
            // for each code point outside the BMP.
            foreach (var codePoint in CodePoints)
            {
                len++;
                if (!UnicodeCodePoint.IsBmp(codePoint))
                {
                    len++;
                }
            }

            unsafe
            {
                // 'buffer' is a byte view over whichever char storage is chosen below;
                // exactly one of stackChars / characters ends up non-null.
                Span <byte> buffer;
                char *      stackChars = null;
                char[]      characters = null;

                if (len <= 256)
                {
                    // Small results are built in stack memory (two bytes per char).
                    char *stackallocedChars = stackalloc char[len];
                    stackChars = stackallocedChars;
                    buffer     = new Span <byte>(stackChars, len * 2);
                }
                else
                {
                    // HACK: Can System.Buffers be used here?
                    characters = new char[len];
                    buffer     = characters.Slice().Cast <char, byte>();
                }

                // Second pass: encode each code point at the front of 'buffer',
                // then advance the view past the bytes just written.
                foreach (var codePoint in CodePoints)
                {
                    int bytesEncoded;
                    if (!Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, buffer, out bytesEncoded))
                    {
                        // TODO: Change Exception type
                        throw new Exception("invalid character");
                    }
                    buffer = buffer.Slice(bytesEncoded);
                }

                // TODO: We already have a char[] and this will copy, how to avoid that
                return(stackChars != null
                    ? new string(stackChars, 0, len)
                    : new string(characters));
            }
        }
Example #18
0
        /// <summary>
        /// Encodes all of <paramref name="codePoints"/> as UTF-8 into <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoints">Code points to encode, in order.</param>
        /// <param name="buffer">Destination bytes.</param>
        /// <param name="bytesWritten">Total number of bytes written on success; 0 on failure.</param>
        /// <returns>
        /// False when a code point is unsupported, does not fit in the remaining space,
        /// or has an encoded length outside 1-4. Bytes written before the failure stay in the buffer.
        /// </returns>
        public override bool TryEncodeFromUnicode(ReadOnlySpan <UnicodeCodePoint> codePoints, Span <byte> buffer, out int bytesWritten)
        {
            int capacity = buffer.Length;
            int count    = codePoints.Length;

            bytesWritten = 0;

            for (int index = 0; index < count; index++)
            {
                UnicodeCodePoint current = codePoints[index];
                int size = GetNumberOfEncodedBytes(current);

                if (!UnicodeCodePoint.IsSupportedCodePoint(current) || capacity < bytesWritten + size)
                {
                    bytesWritten = 0;
                    return false;
                }

                var value = current.Value;
                int at    = bytesWritten;

                if (size == 1)
                {
                    buffer[at] = (byte)(b0111_1111U & value);
                }
                else if (size == 2)
                {
                    buffer[at]     = (byte)(((value >> 6) & b0001_1111U) | b1100_0000U);
                    buffer[at + 1] = (byte)((value & b0011_1111U) | b1000_0000U);
                }
                else if (size == 3)
                {
                    buffer[at]     = (byte)(((value >> 12) & b0000_1111U) | b1110_0000U);
                    buffer[at + 1] = (byte)(((value >> 6) & b0011_1111U) | b1000_0000U);
                    buffer[at + 2] = (byte)((value & b0011_1111U) | b1000_0000U);
                }
                else if (size == 4)
                {
                    buffer[at]     = (byte)(((value >> 18) & b0000_0111U) | b1111_0000U);
                    buffer[at + 1] = (byte)(((value >> 12) & b0011_1111U) | b1000_0000U);
                    buffer[at + 2] = (byte)(((value >> 6) & b0011_1111U) | b1000_0000U);
                    buffer[at + 3] = (byte)((value & b0011_1111U) | b1000_0000U);
                }
                else
                {
                    // unexpected encoded length
                    bytesWritten = 0;
                    return false;
                }

                bytesWritten += size;
            }

            return true;
        }
Example #19
0
        /// <summary>
        /// Gets the part of this string that precedes the first occurrence of <paramref name="codePoint"/>.
        /// </summary>
        /// <param name="codePoint">Code point to search for.</param>
        /// <param name="result"><c>Substring(0, index)</c> when found; default value otherwise.</param>
        /// <returns>True when <c>IndexOf</c> found the code point.</returns>
        public bool TrySubstringTo(UnicodeCodePoint codePoint, out Utf8String result)
        {
            int position = IndexOf(codePoint);
            bool found = position != StringNotFound;

            result = found ? Substring(0, position) : default(Utf8String);
            return found;
        }
        /// <summary>
        /// Decodes one code point from little-endian UTF-16 bytes at the start of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Source bytes; two bytes per UTF-16 code unit, low byte first.</param>
        /// <param name="codePoint">Receives the decoded code point; default value on failure.</param>
        /// <param name="encodedBytes">Receives 2 for a single code unit, 4 for a surrogate pair, 0 on failure.</param>
        /// <returns>
        /// False when the buffer is too small, the first unit is a surrogate but not a high surrogate,
        /// or the second unit is not a low surrogate.
        /// </returns>
        public static bool TryDecodeCodePoint(Span <byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (buffer.Length < 2)
            {
                codePoint    = default(UnicodeCodePoint);
                encodedBytes = default(int);
                // buffer too small
                return(false);
            }

            uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);

            encodedBytes = 2;
            if (UnicodeCodePoint.IsSurrogate((UnicodeCodePoint)codePointValue))
            {
                // TODO: Check if compiler optimized it so codePointValue low range is checked only once
                if (!UnicodeCodePoint.IsHighSurrogate((UnicodeCodePoint)codePointValue) || buffer.Length < 4)
                {
                    codePoint    = default(UnicodeCodePoint);
                    encodedBytes = default(int);
                    // invalid high surrogate or buffer too small
                    return(false);
                }
                unchecked
                {
                    codePointValue -= UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                    encodedBytes   += 2;
                }
                // high surrogate contains 10 first bits of the code point
                codePointValue <<= 10;

                uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
                if (!UnicodeCodePoint.IsLowSurrogate((UnicodeCodePoint)lowSurrogate))
                {
                    codePoint    = default(UnicodeCodePoint);
                    encodedBytes = default(int);
                    // invalid low surrogate character
                    return(false);
                }

                unchecked
                {
                    lowSurrogate -= UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
                }
                codePointValue |= lowSurrogate;

                // A surrogate pair carries (code point - 0x10000) in its 20 payload bits,
                // so the offset has to be added back to get the actual code point.
                codePointValue += 0x10000u;
            }

            codePoint = (UnicodeCodePoint)codePointValue;

            return(true);
        }
Example #21
0
        // TODO: Should this be public?
        /// <summary>
        /// Searches this string for the first code point equal to <paramref name="codePoint"/>.
        /// </summary>
        /// <param name="codePoint">Code point to look for.</param>
        /// <returns>
        /// The enumerator's <c>PositionInCodeUnits</c> at the first match,
        /// or <c>StringNotFound</c> when no code point matches.
        /// </returns>
        public int IndexOf(UnicodeCodePoint codePoint)
        {
            for (CodePointEnumerator cursor = GetCodePointEnumerator(); cursor.MoveNext();)
            {
                if (cursor.Current == codePoint)
                {
                    return cursor.PositionInCodeUnits;
                }
            }

            return StringNotFound;
        }
        /// <summary>
        /// Decodes one code point from little-endian UTF-16 bytes at the start of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Source bytes; two bytes per UTF-16 code unit, low byte first.</param>
        /// <param name="codePoint">Receives the decoded code point; default value on failure.</param>
        /// <param name="encodedBytes">Receives 2 for a single code unit, 4 for a surrogate pair, 0 on failure.</param>
        /// <returns>
        /// False when the buffer is too small, the first unit is a surrogate but not a high surrogate,
        /// or the second unit is not a low surrogate.
        /// </returns>
        public static bool TryDecodeCodePoint(Span<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (buffer.Length < 2)
            {
                codePoint = default(UnicodeCodePoint);
                encodedBytes = default(int);
                // buffer too small
                return false;
            }

            uint codePointValue = (uint)buffer[0] | ((uint)buffer[1] << 8);
            encodedBytes = 2;
            if (UnicodeCodePoint.IsSurrogate((UnicodeCodePoint)codePointValue))
            {
                // TODO: Check if compiler optimized it so codePointValue low range is checked only once
                if (!UnicodeCodePoint.IsHighSurrogate((UnicodeCodePoint)codePointValue) || buffer.Length < 4)
                {
                    codePoint = default(UnicodeCodePoint);
                    encodedBytes = default(int);
                    // invalid high surrogate or buffer too small
                    return false;
                }
                unchecked
                {
                    codePointValue -= UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                    encodedBytes += 2;
                }
                // high surrogate contains 10 first bits of the code point
                codePointValue <<= 10;

                uint lowSurrogate = (uint)buffer[2] | ((uint)buffer[3] << 8);
                if (!UnicodeCodePoint.IsLowSurrogate((UnicodeCodePoint)lowSurrogate))
                {
                    codePoint = default(UnicodeCodePoint);
                    encodedBytes = default(int);
                    // invalid low surrogate character
                    return false;
                }

                unchecked
                {
                    lowSurrogate -= UnicodeConstants.Utf16LowSurrogateFirstCodePoint;
                }
                codePointValue |= lowSurrogate;

                // A surrogate pair carries (code point - 0x10000) in its 20 payload bits,
                // so the offset has to be added back to get the actual code point.
                codePointValue += 0x10000u;
            }

            codePoint = (UnicodeCodePoint)codePointValue;

            return true;
        }
Example #23
0
        /// <summary>
        /// Creates the UTF-8 encoding of the code point formed by a UTF-16 surrogate pair.
        /// </summary>
        /// <param name="highSurrogate">High surrogate of the pair.</param>
        /// <param name="lowSurrogate">Low surrogate of the pair.</param>
        /// <exception cref="Exception">Thrown when <c>Utf8Encoder.TryEncodeCodePoint</c> reports failure.</exception>
        public unsafe Utf8EncodedCodePoint(char highSurrogate, char lowSurrogate) : this()
        {
            uint scalar = (uint)char.ConvertToUtf32(highSurrogate, lowSurrogate);
            UnicodeCodePoint codePoint = (UnicodeCodePoint)scalar;

            // The encoder writes into a four-byte Span that starts at the _byte0 field
            // and stores the encoded length in _length.
            fixed (byte* firstByte = &_byte0)
            {
                bool encoded = Utf8Encoder.TryEncodeCodePoint(codePoint, new Span <byte>(firstByte, 4), out _length);
                if (!encoded)
                {
                    // TODO: Change exception type
                    throw new Exception("Internal error: this should never happen as codePoint should be within acceptable range");
                }
            }
        }
Example #24
0
        /// <summary>
        /// Returns how many bytes the UTF-8 encoding of <paramref name="codePoint"/> occupies.
        /// </summary>
        /// <param name="codePoint">Code point to measure.</param>
        /// <returns>
        /// 1 to 4 for values up to U+10FFFF; -1 for anything larger, because UTF-8 does not
        /// encode values beyond the Unicode range.
        /// </returns>
        internal static int GetNumberOfEncodedBytes(UnicodeCodePoint codePoint)
        {
            if (codePoint.Value <= 0x7F)
                return 1;

            if (codePoint.Value <= 0x7FF)
                return 2;

            if (codePoint.Value <= 0xFFFF)
                return 3;

            // The four-byte form ends at U+10FFFF, the last Unicode code point; the
            // bit pattern could hold up to 0x1FFFFF but those values are not valid UTF-8.
            if (codePoint.Value <= 0x10FFFF)
                return 4;

            return -1;
        }
Example #25
0
        /// <summary>
        /// Decodes little-endian UTF-16 bytes from <paramref name="encoded"/> until every slot of
        /// <paramref name="decoded"/> has been filled.
        /// </summary>
        /// <param name="encoded">Source bytes; two bytes per UTF-16 code unit, low byte first.</param>
        /// <param name="decoded">Destination; exactly <c>decoded.Length</c> code points are produced.</param>
        /// <param name="bytesWritten">Number of source bytes consumed on success; 0 on failure.</param>
        /// <returns>False when fewer than two source bytes remain before all slots are filled.</returns>
        /// <remarks>
        /// A high surrogate followed by a low surrogate is combined into one code point (4 bytes).
        /// Any other unit, including an unpaired surrogate, is stored as-is (2 bytes).
        /// </remarks>
        public override bool TryDecodeToUnicode(Span <byte> encoded, Span <UnicodeCodePoint> decoded, out int bytesWritten)
        {
            int inputLength = encoded.Length;
            int wanted      = decoded.Length;

            bytesWritten = 0;

            for (int slot = 0; slot < wanted; slot++)
            {
                int remaining = inputLength - bytesWritten;
                if (remaining < 2)
                {
                    bytesWritten = 0;
                    return false;
                }

                uint firstUnit = (uint)(encoded[bytesWritten + 1] << 8 | encoded[bytesWritten]);
                UnicodeCodePoint result = new UnicodeCodePoint(firstUnit);
                int consumed = 2;

                if (remaining >= 4)
                {
                    uint secondUnit = (uint)(encoded[bytesWritten + 3] << 8 | encoded[bytesWritten + 2]);

                    bool isHigh = firstUnit >= UnicodeConstants.Utf16HighSurrogateFirstCodePoint &&
                                  firstUnit <= UnicodeConstants.Utf16HighSurrogateLastCodePoint;
                    bool isPair = isHigh &&
                                  secondUnit >= UnicodeConstants.Utf16LowSurrogateFirstCodePoint &&
                                  secondUnit <= UnicodeConstants.Utf16LowSurrogateLastCodePoint;

                    if (isPair)
                    {
                        // 10 bits from each surrogate, plus the supplementary-plane offset
                        uint combined = (((firstUnit - UnicodeConstants.Utf16HighSurrogateFirstCodePoint) << 10)
                                         | (secondUnit - UnicodeConstants.Utf16LowSurrogateFirstCodePoint)) + 0x10000;

                        result   = new UnicodeCodePoint(combined);
                        consumed = 4;
                    }
                }

                decoded[slot] = result;
                bytesWritten += consumed;
            }

            return true;
        }
Example #26
0
        /// <summary>
        /// Writes the UTF-8 byte sequence of <paramref name="codePoint"/> at the start of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes.</param>
        /// <param name="encodedBytes">Number of bytes written (1-4) on success; 0 on every failure path.</param>
        /// <returns>
        /// False when the code point is not supported, the buffer is too short,
        /// or the computed length is not 1-4.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span <byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = 0;
                return(false);
            }

            encodedBytes = GetNumberOfEncodedBytes(codePoint);
            if (encodedBytes > buffer.Length)
            {
                encodedBytes = 0;
                return(false);
            }

            // Each byte is stored individually. Packing them into a ushort/uint and writing
            // that value would make the byte order depend on the host's endianness, and the
            // packed form relied on a (uint) cast of a negative int, which throws in checked builds.
            switch (encodedBytes)
            {
            case 1:
                buffer[0] = (byte)(b0111_1111U & codePoint.Value);
                return(true);

            case 2:
                buffer[0] = (byte)(((codePoint.Value >> 6) & b0001_1111U) | b1100_0000U);
                buffer[1] = (byte)(((codePoint.Value >> 0) & b0011_1111U) | b1000_0000U);
                return(true);

            case 3:
                buffer[0] = (byte)(((codePoint.Value >> 12) & b0000_1111U) | b1110_0000U);
                buffer[1] = (byte)(((codePoint.Value >> 6) & b0011_1111U) | b1000_0000U);
                buffer[2] = (byte)(((codePoint.Value >> 0) & b0011_1111U) | b1000_0000U);
                return(true);

            case 4:
                buffer[0] = (byte)(((codePoint.Value >> 18) & b0000_0111U) | b1111_0000U);
                buffer[1] = (byte)(((codePoint.Value >> 12) & b0011_1111U) | b1000_0000U);
                buffer[2] = (byte)(((codePoint.Value >> 6) & b0011_1111U) | b1000_0000U);
                buffer[3] = (byte)(((codePoint.Value >> 0) & b0011_1111U) | b1000_0000U);
                return(true);

            default:
                // keep the "0 on failure" contract of the other failure paths
                encodedBytes = 0;
                return(false);
            }
        }
Example #27
0
        /// <summary>
        /// Encodes <paramref name="codePoint"/> as UTF-8 into the first bytes of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes.</param>
        /// <param name="encodedBytes">
        /// Number of bytes written on success; 0 when the code point is unsupported or the buffer is too short.
        /// </param>
        /// <returns>True when 1 to 4 bytes were written.</returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span <byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = 0;
                return false;
            }

            encodedBytes = GetNumberOfEncodedBytes(codePoint);
            if (buffer.Length < encodedBytes)
            {
                encodedBytes = 0;
                return false;
            }

            var scalar = codePoint.Value;

            if (encodedBytes == 1)
            {
                // 0xxxxxxx
                buffer[0] = (byte)(0b0111_1111U & scalar);
                return true;
            }

            if (encodedBytes == 2)
            {
                // 110xxxxx 10xxxxxx
                buffer[0] = (byte)(((scalar >> 6) & 0b0001_1111U) | 0b1100_0000U);
                buffer[1] = (byte)((scalar & 0b0011_1111U) | 0b1000_0000U);
                return true;
            }

            if (encodedBytes == 3)
            {
                // 1110xxxx 10xxxxxx 10xxxxxx
                buffer[0] = (byte)(((scalar >> 12) & 0b0000_1111U) | 0b1110_0000U);
                buffer[1] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
                buffer[2] = (byte)((scalar & 0b0011_1111U) | 0b1000_0000U);
                return true;
            }

            if (encodedBytes == 4)
            {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                buffer[0] = (byte)(((scalar >> 18) & 0b0000_0111U) | 0b1111_0000U);
                buffer[1] = (byte)(((scalar >> 12) & 0b0011_1111U) | 0b1000_0000U);
                buffer[2] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
                buffer[3] = (byte)((scalar & 0b0011_1111U) | 0b1000_0000U);
                return true;
            }

            // any other length is rejected; encodedBytes keeps the computed value
            return false;
        }
        /// <summary>
        /// Writes the UTF-16 little-endian encoding of <paramref name="codePoint"/> to the start of
        /// <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes; needs 2 bytes for values below 0x10000, otherwise 4.</param>
        /// <param name="encodedBytes">Number of bytes written (2 or 4); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when the code point is unsupported or the buffer is too short.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = default(int);
                return false;
            }

            // TODO: Can we add this in UnicodeCodePoint class?
            // Should be represented as Surrogate?
            uint value = (uint)codePoint;
            encodedBytes = (value >= 0x10000) ? 4 : 2;

            if (buffer.Length < encodedBytes)
            {
                // buffer too small
                encodedBytes = default(int);
                return false;
            }

            unchecked
            {
                if (encodedBytes == 2)
                {
                    // Single UTF-16 code unit, low byte first.
                    buffer[0] = (byte)value;
                    buffer[1] = (byte)(value >> 8);
                }
                else
                {
                    // A surrogate pair carries (value - 0x10000): its upper 10 bits go into the high
                    // surrogate and its lower 10 bits into the low surrogate. Skipping the subtraction
                    // yields a high surrogate that is 0x40 too large. The low 10 bits are unaffected
                    // because 0x10000 has none of them set.
                    uint highSurrogate = ((value - 0x10000u) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                    uint lowSurrogate = (value & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

                    buffer[0] = (byte)highSurrogate;
                    buffer[1] = (byte)(highSurrogate >> 8);

                    buffer[2] = (byte)lowSurrogate;
                    buffer[3] = (byte)(lowSurrogate >> 8);
                }
            }

            return true;
        }
        /// <summary>
        /// Encodes <paramref name="codePoint"/> as UTF-16 little-endian bytes at the beginning of
        /// <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes; 2 are required below 0x10000, 4 otherwise.</param>
        /// <param name="encodedBytes">Number of bytes written (2 or 4); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when the code point is unsupported or the buffer is too short.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span <byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = default(int);
                return(false);
            }

            // TODO: Can we add this in UnicodeCodePoint class?
            // Should be represented as Surrogate?
            uint value = (uint)codePoint;
            encodedBytes = (value >= 0x10000) ? 4 : 2;

            if (buffer.Length < encodedBytes)
            {
                // buffer too small
                encodedBytes = default(int);
                return(false);
            }

            unchecked
            {
                if (encodedBytes == 2)
                {
                    // One UTF-16 code unit, least significant byte first.
                    buffer[0] = (byte)value;
                    buffer[1] = (byte)(value >> 8);
                }
                else
                {
                    // The pair encodes (value - 0x10000); without the subtraction the high surrogate
                    // comes out 0x40 too large. The low 10 bits are the same either way because
                    // 0x10000 has none of them set.
                    uint highSurrogate = ((value - 0x10000u) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                    uint lowSurrogate  = (value & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

                    buffer[0] = (byte)highSurrogate;
                    buffer[1] = (byte)(highSurrogate >> 8);

                    buffer[2] = (byte)lowSurrogate;
                    buffer[3] = (byte)(lowSurrogate >> 8);
                }
            }

            return(true);
        }
        /// <summary>
        /// Encodes <paramref name="codePoint"/> as one or two UTF-16 code units at <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">
        /// Raw destination pointer. No capacity check is performed here; per the original author's note,
        /// callers preallocate two chars for every UTF-8 byte, so two chars always fit.
        /// </param>
        /// <param name="encodedChars">Number of chars written (1 or 2); 0 when the code point is unsupported.</param>
        /// <returns><c>false</c> only when the code point is unsupported.</returns>
        public unsafe static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, char* buffer, out int encodedChars)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedChars = default(int);
                return false;
            }

            // TODO: Can we add this in UnicodeCodePoint class?
            // Should be represented as Surrogate?
            bool needsSurrogatePair = (uint)codePoint >= 0x10000;
            encodedChars = needsSurrogatePair ? 2 : 1;

            unchecked
            {
                if (!needsSurrogatePair)
                {
                    // Fits in a single code unit.
                    Write(buffer, (ushort)codePoint);
                    return true;
                }

                // High surrogate holds the upper 10 bits of (value - 0x10000), low surrogate the lower 10.
                uint lead = ((uint)(codePoint.Value - 0x10000) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                uint trail = ((uint)codePoint & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

                // Both code units are stored with one 32-bit write: lead in the low half, trail in the high half.
                Write(buffer, lead | (trail << 16));
            }

            return true;
        }
        /// <summary>
        /// Writes <paramref name="codePoint"/> to <paramref name="buffer"/> as one or two UTF-16 code units.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">
        /// Raw destination pointer. Capacity is not checked; the original author's note states that callers
        /// preallocate two chars per UTF-8 byte, so the maximum of two chars always fits.
        /// </param>
        /// <param name="encodedChars">Chars written (1 or 2); 0 when the code point is unsupported.</param>
        /// <returns><c>false</c> only when the code point is unsupported.</returns>
        public unsafe static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, char *buffer, out int encodedChars)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedChars = default(int);
                return(false);
            }

            // TODO: Can we add this in UnicodeCodePoint class?
            // Should be represented as Surrogate?
            bool needsSurrogatePair = (uint)codePoint >= 0x10000;
            encodedChars = needsSurrogatePair ? 2 : 1;

            unchecked
            {
                if (!needsSurrogatePair)
                {
                    // A single code unit is enough.
                    Write(buffer, (ushort)codePoint);
                    return(true);
                }

                // Upper 10 bits of (value - 0x10000) form the high surrogate, lower 10 bits the low one.
                uint lead  = ((uint)(codePoint.Value - 0x10000) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                uint trail = ((uint)codePoint & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

                // One 32-bit store: lead in the low 16 bits, trail in the high 16 bits.
                Write(buffer, lead | (trail << 16));
            }

            return(true);
        }
Example #32
0
        /// <summary>
        /// Writes the UTF-16 little-endian encoding of <paramref name="codePoint"/> to the start of
        /// <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode; values above 0x10FFFF are rejected.</param>
        /// <param name="buffer">Destination bytes; 2 are required below 0x10000, 4 otherwise.</param>
        /// <param name="encodedBytes">Number of bytes written (2 or 4); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when the value is above 0x10FFFF or the buffer is too short.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, ByteSpan buffer, out int encodedBytes)
        {
            uint value = (uint)codePoint;
            if (value > 0x10FFFF)
            {
                encodedBytes = default(int);
                return(false);
            }

            // Values at or above 0x10000 need a surrogate pair (two code units).
            encodedBytes = (value >= 0x10000) ? 4 : 2;

            if (buffer.Length < encodedBytes)
            {
                // buffer too small
                encodedBytes = default(int);
                return(false);
            }

            unchecked
            {
                if (encodedBytes == 2)
                {
                    // One UTF-16 code unit, least significant byte first.
                    buffer[0] = (byte)value;
                    buffer[1] = (byte)(value >> 8);
                }
                else
                {
                    // The pair encodes (value - 0x10000): upper 10 bits in the high surrogate, lower
                    // 10 bits in the low surrogate. Omitting the subtraction makes the high surrogate
                    // 0x40 too large.
                    uint highSurrogate = ((value - 0x10000u) >> 10) + 0xD800;
                    uint lowSurrogate  = (value & MaskLow10Bits) + 0xDC00;

                    buffer[0] = (byte)highSurrogate;
                    buffer[1] = (byte)(highSurrogate >> 8);

                    buffer[2] = (byte)lowSurrogate;
                    buffer[3] = (byte)(lowSurrogate >> 8);
                }
            }

            return(true);
        }
Example #33
0
        /// <summary>
        /// Writes the UTF-8 encoding of <paramref name="codePoint"/> to the start of <paramref name="buffer"/>,
        /// combining adjacent bytes into wider writes where possible.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes; must be at least as long as the encoded sequence.</param>
        /// <param name="encodedBytes">
        /// Length reported by <c>GetNumberOfEncodedBytes</c>; 0 when the code point is unsupported
        /// or the buffer is too short.
        /// </param>
        /// <returns>
        /// <c>true</c> when 1 to 4 bytes were written; <c>false</c> when the code point is unsupported,
        /// the buffer is too short, or the reported length is outside 1..4.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = 0;
                return false;
            }

            encodedBytes = GetNumberOfEncodedBytes(codePoint);
            if (buffer.Length < encodedBytes)
            {
                encodedBytes = 0;
                return false;
            }

            var scalar = codePoint.Value;

            if (encodedBytes == 1)
            {
                buffer[0] = (byte)(b0111_1111U & scalar);
                return true;
            }

            if (encodedBytes == 2)
            {
                byte lead = (byte)(((scalar >> 6) & b0001_1111U) | b1100_0000U);
                byte trail = (byte)((scalar & b0011_1111U) | b1000_0000U);

                // Both bytes go out in one 16-bit write: lead in the low byte, trail in the high byte.
                buffer.Write((ushort)(lead | (trail << 8)));
                return true;
            }

            if (encodedBytes == 3)
            {
                byte lead = (byte)(((scalar >> 12) & b0000_1111U) | b1110_0000U);
                byte middle = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);

                // First two bytes as one 16-bit write, the third individually.
                buffer.Write((ushort)(lead | (middle << 8)));
                buffer[2] = (byte)((scalar & b0011_1111U) | b1000_0000U);
                return true;
            }

            if (encodedBytes == 4)
            {
                byte lead = (byte)(((scalar >> 18) & b0000_0111U) | b1111_0000U);
                byte second = (byte)(((scalar >> 12) & b0011_1111U) | b1000_0000U);
                byte third = (byte)(((scalar >> 6) & b0011_1111U) | b1000_0000U);
                byte fourth = (byte)((scalar & b0011_1111U) | b1000_0000U);

                // All four bytes as one 32-bit write, lead in the lowest byte.
                buffer.Write((uint)(lead | (second << 8) | (third << 16) | (fourth << 24)));
                return true;
            }

            // The length helper reported something other than 1..4: nothing was written.
            return false;
        }
Example #34
0
        /// <summary>
        /// Builds the UTF-8 encoding of a single non-surrogate UTF-16 character.
        /// </summary>
        /// <param name="character">Character to encode; must not be a surrogate.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="character"/> is a surrogate.
        /// </exception>
        /// <exception cref="Exception">The encoder unexpectedly rejected the character.</exception>
        public unsafe Utf8EncodedCodePoint(char character) : this()
        {
            if (char.IsSurrogate(character))
            {
                throw new ArgumentOutOfRangeException("character", "Surrogate characters are not allowed");
            }

            // Encode straight into this instance: the span covers 4 bytes starting at _byte0,
            // and the encoder reports the sequence length into _length.
            fixed (byte* firstByte = &_byte0)
            {
                bool encoded = Utf8Encoder.TryEncodeCodePoint(
                    (UnicodeCodePoint)(uint)character,
                    new ByteSpan(firstByte, 4),
                    out _length);

                if (!encoded)
                {
                    // TODO: Change exception type
                    throw new Exception("Internal error: this should never happen as codePoint is within acceptable range and is not surrogate");
                }
            }
        }
        /// <summary>
        /// Decodes the code point that starts at <paramref name="index"/> in <paramref name="s"/>.
        /// </summary>
        /// <param name="s">Source UTF-16 string.</param>
        /// <param name="index">Index of the first char of the code point.</param>
        /// <param name="codePoint">Decoded code point; default on failure.</param>
        /// <param name="encodedChars">Chars consumed (1, or 2 for a surrogate pair); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when <paramref name="s"/> is null, the index is out of range,
        /// or the char at <paramref name="index"/> is not the start of a well-formed code point
        /// (a lone low surrogate, or a high surrogate not followed by a low surrogate).
        /// </returns>
        public static bool TryDecodeCodePointFromString(string s, int index, out UnicodeCodePoint codePoint, out int encodedChars)
        {
            if (s == null || index < 0 || index >= s.Length)
            {
                codePoint    = default(UnicodeCodePoint);
                encodedChars = 0;
                return(false);
            }

            char first  = s[index];
            bool isPair = char.IsHighSurrogate(first);

            // char.ConvertToUtf32(string, int) throws on malformed surrogates; a Try-method has to
            // report those as failure instead, wherever they occur in the string.
            bool malformed = isPair
                ? (index + 1 >= s.Length || !char.IsLowSurrogate(s[index + 1]))
                : char.IsLowSurrogate(first);

            if (malformed)
            {
                codePoint    = default(UnicodeCodePoint);
                encodedChars = 0;
                return(false);
            }

            // TODO: Should we rewrite this to not use char.ConvertToUtf32 or is it fast enough?
            encodedChars = isPair ? 2 : 1;
            codePoint    = (UnicodeCodePoint)(unchecked ((uint)char.ConvertToUtf32(s, index)));

            return(true);
        }
Example #36
0
        /// <summary>
        /// Converts the code points of this instance into a <see cref="string"/> by encoding each one
        /// as UTF-16 little-endian directly into a pinned char array.
        /// </summary>
        /// <returns>A new string built from the encoded chars.</returns>
        /// <exception cref="Exception">The UTF-16 encoder rejected a code point.</exception>
        public override string ToString()
        {
            // First pass: count the chars needed.
            // TODO: Optimize for characters of length 1 or 2 in UTF-8 representation (no need to read anything)
            // TODO: is compiler gonna do the right thing here?
            // TODO: Should we use Linq's Count()?
            // NOTE(review): this sizing is only right if IsSurrogate is true exactly for the code points
            // the encoder writes as two chars; otherwise the buffer is undersized and encoding fails. Confirm.
            int charCount = 0;
            foreach (var codePoint in CodePoints)
            {
                charCount += UnicodeCodePoint.IsSurrogate(codePoint) ? 2 : 1;
            }

            char[] chars = new char[charCount];
            unsafe
            {
                fixed (char* pinned = chars)
                {
                    // View the char array as bytes (two per char); the span shrinks as code points are written.
                    Span<byte> remaining = new Span<byte>((byte*)pinned, charCount * 2);

                    foreach (var codePoint in CodePoints)
                    {
                        int written;
                        bool encoded = Utf16LittleEndianEncoder.TryEncodeCodePoint(codePoint, remaining, out written);
                        if (!encoded)
                        {
                            // TODO: Change Exception type
                            throw new Exception("invalid character");
                        }

                        remaining = remaining.Slice(written);
                    }
                }
            }

            // TODO: We already have a char[] and this will copy, how to avoid that
            return new string(chars);
        }
Example #37
0
        /// <summary>
        /// Decodes the last code point contained in <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Bytes whose trailing code point is decoded.</param>
        /// <param name="codePoint">Decoded code point; default on failure.</param>
        /// <param name="encodedBytes">Number of bytes the code point occupies; 0 on failure.</param>
        /// <returns>
        /// <c>true</c> only when the backward length scan succeeds, the forward decode of those bytes
        /// succeeds, and both agree on the length.
        /// </returns>
        public static bool TryDecodeCodePointBackwards(ReadOnlySpan <byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (TryFindEncodedCodePointBytesCountGoingBackwards(buffer, out encodedBytes))
            {
                int realEncodedBytes;
                // TODO: Inline decoding, as the invalid surrogate check can be done faster
                bool decoded = TryDecodeCodePoint(buffer.Slice(buffer.Length - encodedBytes), out codePoint, out realEncodedBytes);

                // The backward scan gives one length and the first byte of the sequence encodes another.
                // Success requires the decode itself to succeed AND the two lengths to match; a failed
                // decode must not be reported as success.
                if (decoded && encodedBytes == realEncodedBytes)
                {
                    return(true);
                }
            }

            codePoint    = default(UnicodeCodePoint);
            encodedBytes = default(int);
            return(false);
        }
Example #38
0
        /// <summary>
        /// Decodes the last code point contained in <paramref name="buffer"/>.
        /// </summary>
        /// <param name="buffer">Bytes whose trailing code point is decoded.</param>
        /// <param name="codePoint">Decoded code point; default on failure.</param>
        /// <param name="encodedBytes">Number of bytes the code point occupies; 0 on failure.</param>
        /// <returns>
        /// <c>true</c> only when the backward length scan succeeds, the forward decode of those bytes
        /// succeeds, and both agree on the length.
        /// </returns>
        public static bool TryDecodeCodePointBackwards(Span<byte> buffer, out UnicodeCodePoint codePoint, out int encodedBytes)
        {
            if (TryFindEncodedCodePointBytesCountGoingBackwards(buffer, out encodedBytes))
            {
                int realEncodedBytes;
                // TODO: Inline decoding, as the invalid surrogate check can be done faster
                bool decoded = TryDecodeCodePoint(buffer.Slice(buffer.Length - encodedBytes), out codePoint, out realEncodedBytes);

                // The backward scan gives one length and the first byte of the sequence encodes another.
                // Success requires the decode itself to succeed AND the two lengths to match; a failed
                // decode must not be reported as success.
                if (decoded && encodedBytes == realEncodedBytes)
                {
                    return true;
                }
            }

            codePoint = default(UnicodeCodePoint);
            encodedBytes = default(int);
            return false;
        }
        /// <summary>
        /// Writes the UTF-16 encoding of <paramref name="codePoint"/> to the start of <paramref name="buffer"/>
        /// using a single 16-bit or 32-bit write.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes; 2 are required for BMP code points, 4 otherwise.</param>
        /// <param name="encodedBytes">Number of bytes written (2 or 4); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when the code point is unsupported or the buffer is too short.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = default(int);
                return false;
            }

            bool singleUnit = UnicodeCodePoint.IsBmp(codePoint);
            encodedBytes = singleUnit ? 2 : 4;

            if (encodedBytes > buffer.Length)
            {
                // buffer too small
                encodedBytes = default(int);
                return false;
            }

            unchecked
            {
                if (singleUnit)
                {
                    buffer.Write((ushort)codePoint);
                    return true;
                }

                // The pair encodes (value - 0x10000): upper 10 bits in the high surrogate, lower 10 in the low one.
                uint scalar = (uint)codePoint;
                uint lead = ((scalar - 0x010000u) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                uint trail = (scalar & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

                // One 32-bit write: lead in the low 16 bits, trail in the high 16 bits.
                buffer.Write(lead | (trail << 16));
            }

            return true;
        }
        /// <summary>
        /// Encodes <paramref name="codePoint"/> as UTF-16 at the beginning of <paramref name="buffer"/>,
        /// emitting it with one 16-bit or one 32-bit write.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes; 2 are required for BMP code points, 4 otherwise.</param>
        /// <param name="encodedBytes">Number of bytes written (2 or 4); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when the code point is unsupported or the buffer is too short.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span <byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = default(int);
                return(false);
            }

            bool singleUnit = UnicodeCodePoint.IsBmp(codePoint);
            encodedBytes = singleUnit ? 2 : 4;

            if (encodedBytes > buffer.Length)
            {
                // buffer too small
                encodedBytes = default(int);
                return(false);
            }

            unchecked
            {
                if (singleUnit)
                {
                    buffer.Write((ushort)codePoint);
                    return(true);
                }

                // Upper 10 bits of (value - 0x10000) form the high surrogate, lower 10 bits the low one.
                uint scalar = (uint)codePoint;
                uint lead   = ((scalar - 0x010000u) >> 10) + UnicodeConstants.Utf16HighSurrogateFirstCodePoint;
                uint trail  = (scalar & MaskLow10Bits) + UnicodeConstants.Utf16LowSurrogateFirstCodePoint;

                // Single 32-bit write: lead occupies the low 16 bits, trail the high 16 bits.
                buffer.Write(lead | (trail << 16));
            }

            return(true);
        }
Example #41
0
        /// <summary>
        /// Returns how many bytes the UTF-8 encoding of <paramref name="codePoint"/> occupies.
        /// </summary>
        /// <param name="codePoint">Code point to measure.</param>
        /// <returns>
        /// 1 up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, 4 up to 0x10FFFF, and 0 for anything larger.
        /// </returns>
        private static int GetNumberOfEncodedBytes(UnicodeCodePoint codePoint)
        {
            var scalar = codePoint.Value;

            // Check from the top of the range downwards so each test needs a single bound.
            if (scalar > 0x10FFFF)
            {
                return 0;
            }

            if (scalar > 0xFFFF)
            {
                return 4;
            }

            if (scalar > 0x7FF)
            {
                return 3;
            }

            return scalar > 0x7F ? 2 : 1;
        }
        /// <summary>
        /// Decodes the code point that starts at <paramref name="index"/> in <paramref name="s"/>.
        /// </summary>
        /// <param name="s">Source UTF-16 string.</param>
        /// <param name="index">Index of the first char of the code point.</param>
        /// <param name="codePoint">Decoded code point; default on failure.</param>
        /// <param name="encodedChars">Chars consumed (1, or 2 for a surrogate pair); 0 on failure.</param>
        /// <returns>
        /// <c>true</c> on success; <c>false</c> when <paramref name="s"/> is null, the index is out of range,
        /// or the char at <paramref name="index"/> is not the start of a well-formed code point
        /// (a lone low surrogate, or a high surrogate not followed by a low surrogate).
        /// </returns>
        public static bool TryDecodeCodePointFromString(string s, int index, out UnicodeCodePoint codePoint, out int encodedChars)
        {
            if (s == null || index < 0 || index >= s.Length)
            {
                codePoint = default(UnicodeCodePoint);
                encodedChars = 0;
                return false;
            }

            char first = s[index];
            bool isPair = char.IsHighSurrogate(first);

            // char.ConvertToUtf32(string, int) throws on malformed surrogates; a Try-method has to
            // report those as failure instead, wherever they occur in the string.
            bool malformed = isPair
                ? (index + 1 >= s.Length || !char.IsLowSurrogate(s[index + 1]))
                : char.IsLowSurrogate(first);

            if (malformed)
            {
                codePoint = default(UnicodeCodePoint);
                encodedChars = 0;
                return false;
            }

            // TODO: Should we rewrite this to not use char.ConvertToUtf32 or is it fast enough?
            encodedChars = isPair ? 2 : 1;
            codePoint = (UnicodeCodePoint)(unchecked((uint)char.ConvertToUtf32(s, index)));

            return true;
        }
Example #43
0
        /// <summary>
        /// Encodes <paramref name="codePoint"/> as UTF-8 at the beginning of <paramref name="buffer"/>.
        /// </summary>
        /// <param name="codePoint">Code point to encode.</param>
        /// <param name="buffer">Destination bytes; must be at least as long as the encoded sequence.</param>
        /// <param name="encodedBytes">
        /// Length of the encoded sequence as reported by <c>GetNumberOfEncodedBytes</c>;
        /// 0 when the code point is unsupported or the buffer is too short.
        /// </param>
        /// <returns>
        /// <c>true</c> when 1 to 4 bytes were written; <c>false</c> when the code point is unsupported,
        /// the buffer is too short, or the reported length is outside 1..4.
        /// </returns>
        public static bool TryEncodeCodePoint(UnicodeCodePoint codePoint, Span<byte> buffer, out int encodedBytes)
        {
            if (!UnicodeCodePoint.IsSupportedCodePoint(codePoint))
            {
                encodedBytes = 0;
                return false;
            }

            encodedBytes = GetNumberOfEncodedBytes(codePoint);
            if (buffer.Length < encodedBytes)
            {
                encodedBytes = 0;
                return false;
            }

            var scalar = codePoint.Value;

            if (encodedBytes == 1)
            {
                // 0xxxxxxx
                buffer[0] = (byte)(0b0111_1111U & scalar);
                return true;
            }

            if (encodedBytes == 2)
            {
                // 110xxxxx 10xxxxxx
                buffer[0] = (byte)(((scalar >> 6) & 0b0001_1111U) | 0b1100_0000U);
                buffer[1] = (byte)((scalar & 0b0011_1111U) | 0b1000_0000U);
                return true;
            }

            if (encodedBytes == 3)
            {
                // 1110xxxx 10xxxxxx 10xxxxxx
                buffer[0] = (byte)(((scalar >> 12) & 0b0000_1111U) | 0b1110_0000U);
                buffer[1] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
                buffer[2] = (byte)((scalar & 0b0011_1111U) | 0b1000_0000U);
                return true;
            }

            if (encodedBytes == 4)
            {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                buffer[0] = (byte)(((scalar >> 18) & 0b0000_0111U) | 0b1111_0000U);
                buffer[1] = (byte)(((scalar >> 12) & 0b0011_1111U) | 0b1000_0000U);
                buffer[2] = (byte)(((scalar >> 6) & 0b0011_1111U) | 0b1000_0000U);
                buffer[3] = (byte)((scalar & 0b0011_1111U) | 0b1000_0000U);
                return true;
            }

            // The length helper reported something other than 1..4: nothing was written.
            return false;
        }
Example #44
0
        /// <summary>
        /// Returns how many bytes the UTF-8 encoding of <paramref name="codePoint"/> occupies.
        /// </summary>
        /// <param name="codePoint">Code point to measure.</param>
        /// <returns>
        /// 1 up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, 4 up to 0x10FFFF, and 0 for larger values,
        /// which are not Unicode code points and have no valid UTF-8 encoding.
        /// </returns>
        internal static int GetNumberOfEncodedBytes(UnicodeCodePoint codePoint)
        {
            if (codePoint.Value <= 0x7F)
            {
                return 1;
            }

            if (codePoint.Value <= 0x7FF)
            {
                return 2;
            }

            if (codePoint.Value <= 0xFFFF)
            {
                return 3;
            }

            // The 4-byte form has room for 21 bits (up to 0x1FFFFF), but UTF-8 is restricted to the
            // Unicode range, which ends at U+10FFFF. Anything above that is reported as unencodable.
            if (codePoint.Value <= 0x10FFFF)
            {
                return 4;
            }

            return 0;
        }