Example #1
0
        public override unsafe int FindFirstCharacterToEncodeUtf8(ReadOnlySpan <byte> utf8Text)
        {
            fixed(byte *ptr = utf8Text)
            {
                int idx = 0;

#if NETCOREAPP
                if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
                {
                    sbyte *startingAddress = (sbyte *)ptr;
                    while (utf8Text.Length - 16 >= idx)
                    {
                        Debug.Assert(startingAddress >= ptr && startingAddress <= (ptr + utf8Text.Length - 16));

                        // Load the next 16 bytes.
                        Vector128 <sbyte> sourceValue;
                        bool containsNonAsciiBytes;

                        // Check for ASCII text. Any byte that's not in the ASCII range will already be negative when
                        // casted to signed byte.
                        if (Sse2.IsSupported)
                        {
                            sourceValue           = Sse2.LoadVector128(startingAddress);
                            containsNonAsciiBytes = Sse2Helper.ContainsNonAsciiByte(sourceValue);
                        }
                        else
                        {
                            sourceValue           = AdvSimd.LoadVector128(startingAddress);
                            containsNonAsciiBytes = AdvSimdHelper.ContainsNonAsciiByte(sourceValue);
                        }

                        if (containsNonAsciiBytes)
                        {
                            // At least one of the following 16 bytes is non-ASCII.

                            int processNextSixteen = idx + 16;
                            Debug.Assert(processNextSixteen <= utf8Text.Length);

                            while (idx < processNextSixteen)
                            {
                                Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                                if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                                {
                                    if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                                    {
                                        goto Return;
                                    }
                                    idx++;
                                }
                                else
                                {
                                    OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                                    Debug.Assert(nextScalarValue <= int.MaxValue);
                                    if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                                    {
                                        goto Return;
                                    }

                                    Debug.Assert(opStatus == OperationStatus.Done);
                                    idx += utf8BytesConsumedForScalar;
                                }
                            }
                            startingAddress = (sbyte *)ptr + idx;
                        }
                        else
                        {
                            // Check if any of the 16 bytes need to be escaped.
                            int index;

                            if (Sse2.IsSupported)
                            {
                                Vector128 <sbyte> mask = Sse2Helper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue);
                                index = Sse2Helper.GetIndexOfFirstNonAsciiByte(mask.AsByte());
                            }
                            else
                            {
                                Vector128 <sbyte> mask = AdvSimdHelper.CreateEscapingMask_UnsafeRelaxedJavaScriptEncoder(sourceValue);
                                index = AdvSimdHelper.GetIndexOfFirstNonAsciiByte(mask.AsByte());
                            }

                            // If index >= 16, that means none of the 16 bytes needed to be escaped.
                            if (index < 16)
                            {
                                // Found at least one byte that needs to be escaped, figure out the index of
                                // the first one found that needed to be escaped within the 16 bytes.
                                idx += index;
                                goto Return;
                            }
                            idx             += 16;
                            startingAddress += 16;
                        }
                    }

                    // Process the remaining bytes.
                    Debug.Assert(utf8Text.Length - idx < 16);
                }
#endif

                while (idx < utf8Text.Length)
                {
                    Debug.Assert((ptr + idx) <= (ptr + utf8Text.Length));

                    if (UnicodeUtility.IsAsciiCodePoint(ptr[idx]))
                    {
                        if (!_allowedCharacters.IsUnicodeScalarAllowed(ptr[idx]))
                        {
                            goto Return;
                        }
                        idx++;
                    }
                    else
                    {
                        OperationStatus opStatus = UnicodeHelpers.DecodeScalarValueFromUtf8(utf8Text.Slice(idx), out uint nextScalarValue, out int utf8BytesConsumedForScalar);

                        Debug.Assert(nextScalarValue <= int.MaxValue);
                        if (opStatus != OperationStatus.Done || WillEncode((int)nextScalarValue))
                        {
                            goto Return;
                        }

                        Debug.Assert(opStatus == OperationStatus.Done);
                        idx += utf8BytesConsumedForScalar;
                    }
                }
                Debug.Assert(idx == utf8Text.Length);

                idx = -1; // All bytes are allowed.

Return:
                return(idx);
            }
        }
Example #2
0
        public static string Encode(this TextEncoder encoder, string value)
        {
            if (value == null)
            {
                return(value);
            }

            unsafe
            {
                fixed(char *valuePointer = value)
                {
                    int firstCharacterToEncode = encoder.FindFirstCharacterToEncode(valuePointer, value.Length);

                    if (firstCharacterToEncode == -1)
                    {
                        return(value);
                    }

                    int   bufferSize   = encoder.MaxOutputCharsPerInputChar * value.Length;
                    char *wholebuffer  = stackalloc char[bufferSize];
                    char *buffer       = wholebuffer;
                    int   totalWritten = 0;

                    if (firstCharacterToEncode > 0)
                    {
                        int bytesToCopy = firstCharacterToEncode + firstCharacterToEncode;
                        Buffer.MemoryCopy(valuePointer, buffer, bytesToCopy, bytesToCopy);
                        totalWritten += firstCharacterToEncode;
                        bufferSize   -= firstCharacterToEncode;
                        buffer       += firstCharacterToEncode;
                    }

                    int valueIndex = firstCharacterToEncode;

                    char firstChar        = value[valueIndex];
                    char secondChar       = firstChar;
                    bool wasSurrogatePair = false;
                    int  charsWritten;

                    // this loop processes character pairs (in case they are surrogates).
                    // there is an if block below to process single last character.
                    for (int secondCharIndex = valueIndex + 1; secondCharIndex < value.Length; secondCharIndex++)
                    {
                        if (!wasSurrogatePair)
                        {
                            firstChar = secondChar;
                        }
                        else
                        {
                            firstChar = value[secondCharIndex - 1];
                        }
                        secondChar = value[secondCharIndex];

                        if (!encoder.Encodes(firstChar))
                        {
                            wasSurrogatePair = false;
                            *buffer = firstChar;
                            buffer++;
                            bufferSize--;
                            totalWritten++;
                        }
                        else
                        {
                            int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, secondChar, out wasSurrogatePair);
                            if (!encoder.TryEncodeUnicodeScalar(nextScalar, buffer, bufferSize, out charsWritten))
                            {
                                throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
                            }

                            buffer       += charsWritten;
                            bufferSize   -= charsWritten;
                            totalWritten += charsWritten;
                            if (wasSurrogatePair)
                            {
                                secondCharIndex++;
                            }
                        }
                    }

                    if (!wasSurrogatePair)
                    {
                        firstChar = value[value.Length - 1];
                        int nextScalar = UnicodeHelpers.GetScalarValueFromUtf16(firstChar, null, out wasSurrogatePair);
                        if (!encoder.TryEncodeUnicodeScalar(nextScalar, buffer, bufferSize, out charsWritten))
                        {
                            throw new ArgumentException("Argument encoder does not implement MaxOutputCharsPerInputChar correctly.");
                        }

                        buffer       += charsWritten;
                        bufferSize   -= charsWritten;
                        totalWritten += charsWritten;
                    }

                    var result = new String(wholebuffer, 0, totalWritten);

                    return(result);
                }
            }
        }
Example #3
0
        // skips the call to FindFirstCharacterToEncodeUtf8
        private protected virtual OperationStatus EncodeUtf8Core(
            ReadOnlySpan <byte> utf8Source,
            Span <byte> utf8Destination,
            out int bytesConsumed,
            out int bytesWritten,
            bool isFinalBlock)
        {
            int originalUtf8SourceLength      = utf8Source.Length;
            int originalUtf8DestinationLength = utf8Destination.Length;

            const int   TempUtf16CharBufferLength = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
            Span <char> utf16ScratchBuffer        = stackalloc char[TempUtf16CharBufferLength];

            while (!utf8Source.IsEmpty)
            {
                OperationStatus opStatus = Rune.DecodeFromUtf8(utf8Source, out Rune scalarValue, out int bytesConsumedJustNow);
                if (opStatus != OperationStatus.Done)
                {
                    if (!isFinalBlock && opStatus == OperationStatus.NeedMoreData)
                    {
                        goto NeedMoreData;
                    }

                    Debug.Assert(scalarValue == Rune.ReplacementChar); // DecodeFromUtf8 should've performed substitution
                    goto MustEncode;
                }

                if (!WillEncode(scalarValue.Value))
                {
                    uint utf8lsb    = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)scalarValue.Value);
                    int  dstIdxTemp = 0;
                    do
                    {
                        if ((uint)dstIdxTemp >= (uint)utf8Destination.Length)
                        {
                            goto DestinationTooSmall;
                        }
                        utf8Destination[dstIdxTemp++] = (byte)utf8lsb;
                    } while ((utf8lsb >>= 8) != 0);
                    utf8Source      = utf8Source.Slice(bytesConsumedJustNow);
                    utf8Destination = utf8Destination.Slice(dstIdxTemp);
                    continue;
                }

MustEncode:

                if (!TryEncodeUnicodeScalarUtf8((uint)scalarValue.Value, utf16ScratchBuffer, utf8Destination, out int bytesWrittenJustNow))
                {
                    goto DestinationTooSmall;
                }

                utf8Source      = utf8Source.Slice(bytesConsumedJustNow);
                utf8Destination = utf8Destination.Slice(bytesWrittenJustNow);
            }

            // And we're finished!

            OperationStatus retVal = OperationStatus.Done;

ReturnCommon:
            bytesConsumed = originalUtf8SourceLength - utf8Source.Length;
            bytesWritten  = originalUtf8DestinationLength - utf8Destination.Length;
            return(retVal);

NeedMoreData:
            retVal = OperationStatus.NeedMoreData;
            goto ReturnCommon;

DestinationTooSmall:
            retVal = OperationStatus.DestinationTooSmall;
            goto ReturnCommon;
        }