/// <summary>
/// Scans the UTF-8 buffer <paramref name="pInputBuffer"/> (of <paramref name="inputLength"/> bytes)
/// and returns a pointer to the first byte that does not begin a well-formed UTF-8 sequence.
/// </summary>
/// <remarks>
/// When the whole buffer is well-formed the returned pointer is
/// <c>pInputBuffer + inputLength</c>, i.e. one past the final byte.
/// </remarks>
/// <param name="pInputBuffer">Pointer to the UTF-8 bytes to validate.</param>
/// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>.</param>
/// <param name="utf16CodeUnitCountAdjustment">
/// Zero or negative value which, added to the number of bytes processed, yields the
/// number of UTF-16 code units those bytes represent.
/// </param>
/// <param name="scalarCountAdjustment">
/// Zero or negative value which, added to the UTF-16 code unit count, yields the
/// number of scalar values that were decoded.
/// </param>
/// <returns>Pointer to the first invalid byte, or to the end of the buffer if none was found.</returns>
public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    ReadOnlySpan<byte> buffer = new ReadOnlySpan<byte>(pInputBuffer, inputLength);

    int validByteCount = 0;
    int utf16Units = 0;
    int scalars = 0;

    // Walk forward one scalar at a time; stop at the first sequence that does not decode cleanly.
    while (validByteCount < inputLength
        && Rune.DecodeFromUtf8(buffer.Slice(validByteCount), out Rune decoded, out int sequenceLength) == OperationStatus.Done)
    {
        validByteCount += sequenceLength;
        utf16Units += decoded.Utf16SequenceLength;
        scalars++;
    }

    utf16CodeUnitCountAdjustment = utf16Units - validByteCount;
    scalarCountAdjustment = scalars - utf16Units;

    return pInputBuffer + validByteCount;
}
/// <summary>
/// Attempts to interpret <paramref name="searchTerm"/> as exactly one Unicode scalar value.
/// </summary>
/// <param name="searchTerm">
/// A <see cref="char"/>, <see cref="Rune"/>, <see cref="string"/> or <c>ustring</c>;
/// any other type (including <see langword="null"/>) is rejected.
/// </param>
/// <param name="parsed">The scalar value on success; <c>default</c> on failure.</param>
/// <returns>
/// <see langword="true"/> if the term is a single scalar value with nothing left over;
/// otherwise <see langword="false"/>.
/// </returns>
private static bool TryParseSearchTermAsRune(object searchTerm, out Rune parsed)
{
    switch (searchTerm)
    {
        case char singleChar:
            // A lone surrogate cannot form a Rune, so this may legitimately fail.
            return Rune.TryCreate(singleChar, out parsed);

        case Rune alreadyRune:
            parsed = alreadyRune;
            return true;

        case string utf16Text:
        {
            // The whole string must be consumed by a single scalar value.
            OperationStatus utf16Status = Rune.DecodeFromUtf16(utf16Text, out parsed, out int charsConsumed);
            if (utf16Status == OperationStatus.Done && charsConsumed == utf16Text.Length)
            {
                return true;
            }

            break;
        }

        case ustring utf8Text:
        {
            // Same rule for UTF-8 input: exactly one scalar value, no trailing bytes.
            OperationStatus utf8Status = Rune.DecodeFromUtf8(utf8Text.AsBytes(), out parsed, out int bytesConsumed);
            if (utf8Status == OperationStatus.Done && bytesConsumed == utf8Text.Length)
            {
                return true;
            }

            break;
        }
    }

    // Unsupported type, or the text was not exactly one scalar value.
    parsed = default;
    return false;
}
/// <summary>
/// Transcodes UTF-8 bytes to UTF-16 chars one scalar value at a time.
/// </summary>
/// <param name="pInputBuffer">Pointer to the UTF-8 source bytes.</param>
/// <param name="inputLength">Source length, in bytes.</param>
/// <param name="pOutputBuffer">Pointer to the UTF-16 destination.</param>
/// <param name="outputCharsRemaining">Destination capacity, in chars.</param>
/// <param name="pInputBufferRemaining">On return, where the next byte would have been consumed from.</param>
/// <param name="pOutputBufferRemaining">On return, where the next char would have been written to.</param>
/// <returns>
/// <see cref="OperationStatus.Done"/> if the entire input was transcoded;
/// <see cref="OperationStatus.DestinationTooSmall"/> if the next scalar did not fit in the output;
/// otherwise the status reported by <see cref="Rune.DecodeFromUtf8"/> for the sequence that failed.
/// </returns>
public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLength, char* pOutputBuffer, int outputCharsRemaining, out byte* pInputBufferRemaining, out char* pOutputBufferRemaining)
{
    Debug.Assert(inputLength >= 0, "Input length must not be negative.");
    Debug.Assert(pInputBuffer != null || inputLength == 0, "Input length must be zero if input buffer pointer is null.");

    Debug.Assert(outputCharsRemaining >= 0, "Destination length must not be negative.");
    Debug.Assert(pOutputBuffer != null || outputCharsRemaining == 0, "Destination length must be zero if destination buffer pointer is null.");

    ReadOnlySpan<byte> source = new ReadOnlySpan<byte>(pInputBuffer, inputLength);
    Span<char> destination = new Span<char>(pOutputBuffer, outputCharsRemaining);

    int bytesRead = 0;
    int charsWritten = 0;
    OperationStatus status = OperationStatus.Done;

    while (bytesRead < inputLength)
    {
        status = Rune.DecodeFromUtf8(source.Slice(bytesRead), out Rune scalar, out int sequenceLength);
        if (status != OperationStatus.Done)
        {
            break;
        }

        // Only advance the counters once the scalar has actually been written,
        // so a too-small destination leaves the input position on this scalar.
        if (!scalar.TryEncodeToUtf16(destination.Slice(charsWritten), out int charCount))
        {
            status = OperationStatus.DestinationTooSmall;
            break;
        }

        bytesRead += sequenceLength;
        charsWritten += charCount;
    }

    pInputBufferRemaining = pInputBuffer + bytesRead;
    pOutputBufferRemaining = pOutputBuffer + charsWritten;

    return status;
}
public ValueParseResult TryParse(ReadOnlySpan <byte> readerSpan, out string result, out int consumedLength, out int lineSpan, out int colSpan) { result = null; consumedLength = 0; lineSpan = 1; colSpan = 0; // is input empty if (readerSpan.Length <= 0) { // did any prior processing occur return(this._inString ? _cleanup(this, ValueParseResult.FailureEOF) : ValueParseResult.EOF); } // if we are not continuing, ensure it's a string that's being parsed var startPos = 0; if (!this._inString) { if (readerSpan[consumedLength++] != JsonTokens.QuoteMark) { if (Rune.DecodeFromUtf8(readerSpan, out var rune, out _) != OperationStatus.Done) { rune = default; } return(_cleanup(this, ValueParseResult.Failure("Unexpected token, expected \".", rune))); } startPos = consumedLength; this._inString = true; } // if continuing, check if anything is pending in the buffer var blen = (int)this._buffContent; Span <char> decoded = stackalloc char[512]; switch (this._buffContent) { // short escape: \" \\ \/ \b \f \n \r \t // long escape: \uXXXX case ContentType.EscapeSequence: case ContentType.ExtendedEscapeSequence: if (this._buffContent != ContentType.ExtendedEscapeSequence && readerSpan[0] == JsonTokens.UnicodePrefix) { this._buffContent = ContentType.ExtendedEscapeSequence; blen = (int)this._buffContent; } if (readerSpan.Length + this._buffPos < blen + 1) { readerSpan.CopyTo(this.Buffer[this._buffPos..].Span);
public Utf8Splitter(ReadOnlySpan <byte> span, ReadOnlySpan <byte> separator, StringSplitOptions splitOptions) { ReadOnlySpan <byte> separatorSlice = separator; do { var separatorStatus = Rune.DecodeFromUtf8(separatorSlice, out _, out int consumed); if (separatorStatus == OperationStatus.InvalidData) { throw new ArgumentException("The separator is not valid UTF8.", nameof(separator)); } separatorSlice = separatorSlice[consumed..];
/// <summary>
/// Reports whether <paramref name="input"/> ends with a complete, well-formed
/// multi-byte UTF-8 sequence.
/// </summary>
/// <param name="input">The bytes to inspect.</param>
/// <returns>
/// <see langword="true"/> if the last byte with a value of 0xC0 or above starts a sequence
/// that decodes successfully and runs exactly to the end of <paramref name="input"/>;
/// <see langword="false"/> otherwise, including when no such byte exists.
/// </returns>
private static bool EndsWithValidMultiByteUtf8Sequence(byte[] input)
{
    // Scan backwards for the last byte that could start a multi-byte sequence.
    int leadIndex = input.Length;
    while (--leadIndex >= 0 && input[leadIndex] < 0xC0)
    {
    }

    if (leadIndex < 0)
    {
        return false; // empty input, or nothing but ASCII / continuation bytes
    }

    ReadOnlySpan<byte> tail = input.AsSpan(leadIndex);
    OperationStatus status = Rune.DecodeFromUtf8(tail, out _, out int bytesConsumed);

    // The decoded sequence must account for every remaining byte.
    return status == OperationStatus.Done && bytesConsumed == tail.Length;
}
/// <summary>
/// Finds the byte offset in <paramref name="utf8Data"/> of the first character that is not
/// white space.
/// </summary>
/// <param name="utf8Data">The UTF-8 data to scan from the start.</param>
/// <returns>
/// The index of the first non-whitespace character, or the length of
/// <paramref name="utf8Data"/> if it consists solely of white space.
/// </returns>
public static int GetIndexOfFirstNonWhiteSpaceChar(ReadOnlySpan<byte> utf8Data)
{
    // Tuned for ASCII input where, at most, a handful of leading bytes need trimming.

    int index = 0;

    while (index < utf8Data.Length)
    {
        // Fast reject: reinterpreted as signed, only bytes in [ 21 .. 7F ] compare
        // greater than 0x20, and none of those are white space.
        if ((sbyte)utf8Data[index] > (sbyte)0x20)
        {
            break;
        }

        uint candidate = utf8Data[index];
        int advance;

        if (UnicodeUtility.IsAsciiCodePoint(candidate))
        {
            // ASCII byte at or below 0x20: let Rune decide whether it is white space.
            if (!Rune.IsWhiteSpace(new Rune(candidate)))
            {
                break;
            }

            advance = 1;
        }
        else
        {
            // Non-ASCII: decode the full scalar and consult the Unicode tables.
            Rune.DecodeFromUtf8(utf8Data.Slice(index), out Rune scalar, out int sequenceLength);
            if (!Rune.IsWhiteSpace(scalar))
            {
                break;
            }

            advance = sequenceLength;
        }

        index += advance;
    }

    return index;
}
public ValueParseResult TryParse(ReadOnlySpan <byte> readerSpan, out bool result, out int consumedLength, out int lineSpan, out int colSpan) { result = false; consumedLength = 0; lineSpan = 1; colSpan = 0; // is input empty if (readerSpan.Length <= 0) { // did any prior processing occur return(this._buffPos > 0 ? _cleanup(this, ValueParseResult.FailureEOF) : ValueParseResult.EOF); } // determine what we're reading var expectedLength = 4; var src = this._buffPos > 0 ? this.Buffer.Span : readerSpan; switch (src[0]) { case JsonTokens.TrueFirst: result = true; break; case JsonTokens.FalseFirst: expectedLength = 5; break; default: this._buffPos = 0; if (Rune.DecodeFromUtf8(readerSpan, out var rune, out _) != OperationStatus.Done) { rune = default; } return(ValueParseResult.Failure("Unexpected token, expected true/false.", rune)); } // if reader buffer is too small, copy its contents then signal EOF var tooSmall = readerSpan.Length < expectedLength - this._buffPos; if (tooSmall || this._buffPos > 0) { var tlen = Math.Min(expectedLength - this._buffPos, readerSpan.Length); readerSpan.Slice(0, tlen).CopyTo(this.Buffer.Span[this._buffPos..]);
/// <summary>
/// Locates the first position in <paramref name="utf8Text"/> that cannot be copied through
/// unchanged.
/// </summary>
/// <param name="utf8Text">The UTF-8 text to scan.</param>
/// <returns>
/// The byte index of the first sequence that either fails to decode or for which
/// <c>WillEncode</c> returns <see langword="true"/>; -1 if the whole input passes.
/// </returns>
public virtual int FindFirstCharacterToEncodeUtf8(ReadOnlySpan<byte> utf8Text)
{
    int offset = 0;

    while (offset < utf8Text.Length)
    {
        OperationStatus status = Rune.DecodeFromUtf8(utf8Text.Slice(offset), out Rune scalar, out int sequenceLength);

        // Stop at undecodable data, or at a scalar this encoder wants to escape.
        if (status != OperationStatus.Done || WillEncode(scalar.Value))
        {
            return offset;
        }

        offset += sequenceLength;
    }

    return -1;
}
/// <summary>
/// Returns <paramref name="value"/> unchanged if it is null, empty, or contains only
/// well-formed UTF-8 data; otherwise allocates a new <see cref="Utf8String"/> holding the
/// same data with every invalid UTF-8 sequence replaced by U+FFFD.
/// </summary>
/// <param name="value">The string to validate.</param>
/// <returns>The original instance when no fix-up is needed; otherwise a repaired copy.</returns>
public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
{
    if (Utf8String.IsNullOrEmpty(value))
    {
        return value;
    }

    ReadOnlySpan<byte> allBytes = value.AsBytes();

    int firstInvalidIndex = GetIndexOfFirstInvalidUtf8Sequence(allBytes, out _);
    if (firstInvalidIndex < 0)
    {
        return value;
    }

    // TODO_UTF8STRING: Replace this with the faster implementation once it's available.
    // (The faster implementation is in the dev/utf8string_bak branch currently.)

    MemoryStream repaired = new MemoryStream();

    // Everything before the first bad sequence is already known to be valid.
    repaired.Write(allBytes.Slice(0, firstInvalidIndex));

    int position = firstInvalidIndex;
    do
    {
        OperationStatus status = Rune.DecodeFromUtf8(allBytes.Slice(position), out _, out int sequenceLength);

        if (status == OperationStatus.Done)
        {
            // Well-formed scalar: copy its bytes verbatim.
            repaired.Write(allBytes.Slice(position, sequenceLength));
        }
        else
        {
            // Ill-formed sequence: emit U+FFFD in its place.
            repaired.Write(ReplacementCharSequence);
        }

        position += sequenceLength;
    }
    while (position < allBytes.Length);

    bool gotBuffer = repaired.TryGetBuffer(out ArraySegment<byte> repairedBytes);
    Debug.Assert(gotBuffer, "Couldn't get underlying MemoryStream buffer.");

    return Utf8String.DangerousCreateWithoutValidation(repairedBytes, assumeWellFormed: true);
}
/// <summary>
/// Validates <paramref name="utf8Data"/> and reports where the first ill-formed sequence starts.
/// </summary>
/// <param name="utf8Data">The bytes to validate.</param>
/// <param name="isAscii">
/// <see langword="true"/> if every scalar decoded before the scan stopped was ASCII.
/// </param>
/// <returns>
/// The byte index of the first sequence that fails to decode, or -1 if the entire input
/// is well-formed.
/// </returns>
public static int GetIndexOfFirstInvalidUtf8Sequence(ReadOnlySpan<byte> utf8Data, out bool isAscii)
{
    // TODO_UTF8STRING: Replace this with the faster drop-in replacement when it's available (coreclr #21948).

    bool asciiSoFar = true;
    int offset = 0;

    while (offset < utf8Data.Length)
    {
        if (Rune.DecodeFromUtf8(utf8Data.Slice(offset), out Rune scalar, out int sequenceLength) != OperationStatus.Done)
        {
            // Report the ASCII-ness of the valid prefix along with the failure position.
            isAscii = asciiSoFar;
            return offset;
        }

        asciiSoFar &= scalar.IsAscii;
        offset += sequenceLength;
    }

    isAscii = asciiSoFar;
    return -1;
}
public static void Trim(string input) { // Arrange using BoundedUtf8Span boundedSpan = new BoundedUtf8Span(input); Utf8Span span = boundedSpan.Span; // Act Utf8Span trimmed = span.Trim(); // Assert // Compute the trim manually and ensure it matches the trimmed span's characteristics. ReadOnlySpan <byte> utf8Bytes = span.Bytes; while (!utf8Bytes.IsEmpty) { OperationStatus status = Rune.DecodeFromUtf8(utf8Bytes, out Rune decodedRune, out int bytesConsumed); Assert.Equal(OperationStatus.Done, status); if (!Rune.IsWhiteSpace(decodedRune)) { break; } utf8Bytes = utf8Bytes.Slice(bytesConsumed); } while (!utf8Bytes.IsEmpty) { OperationStatus status = Rune.DecodeLastFromUtf8(utf8Bytes, out Rune decodedRune, out int bytesConsumed); Assert.Equal(OperationStatus.Done, status); if (!Rune.IsWhiteSpace(decodedRune)) { break; } utf8Bytes = utf8Bytes[..^ bytesConsumed];
public ValueParseResult TryParse(ReadOnlySpan <byte> readerSpan, out ImmutableArray <JsonValue> result, out int consumedLength, out int lineSpan, out int colSpan) { result = default; consumedLength = 0; lineSpan = 1; colSpan = 0; // is input empty if (readerSpan.Length <= 0 && this._innerReader == null) { // did any prior processing occur return(this._arr != null ? _cleanup(this, ValueParseResult.FailureEOF) : ValueParseResult.EOF); } // if we are not continuing, ensure it's an object that's being parsed if (this._arr == null) { if (readerSpan[consumedLength++] != JsonTokens.OpeningBracket) { if (Rune.DecodeFromUtf8(readerSpan, out var rune, out _) != OperationStatus.Done) { rune = default; } return(_cleanup(this, ValueParseResult.Failure("Unexpected token, expected {.", rune))); } this._expectedNext = ExpectedToken.ValueOrEnd; this._arr = ImmutableArray.CreateBuilder <JsonValue>(); ++this._colSpan; ++this._streamPos; } // if continuing, check if any value is being parsed if (this._innerReader != null) { // valid only if expecting value if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Invalid internal state.", default))); } // parse inner value ++consumedLength; var innerResult = this.ParseInner(readerSpan, ref consumedLength); switch (innerResult.Type) { case ValueParseResultType.Success: this._innerReader.Reset(); this._innerReader = null; break; case ValueParseResultType.EOF: return(innerResult); case ValueParseResultType.Failure: return(_cleanup(this, innerResult)); } } // read and parse array items var completedParsing = false; while (consumedLength < readerSpan.Length) { switch (readerSpan[consumedLength++]) { case JsonTokens.WhitespaceSpace: ++this._colSpan; ++this._streamPos; break; case JsonTokens.WhitespaceHorizontalTab: this._colSpan += 4; // fite me ++this._streamPos; break; case JsonTokens.WhitespaceCarriageReturn: // usually as part of CRLF, really no 
other reason for it to exist // old macs don't exist break; case JsonTokens.WhitespaceNewline: ++this._lineSpan; this._colSpan = 0; ++this._streamPos; break; case JsonTokens.ItemSeparator: if (this._expectedNext != ExpectedToken.ItemSeparatorOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected item separator.", new Rune(JsonTokens.ItemSeparator)))); } ++this._colSpan; ++this._streamPos; this._expectedNext = ExpectedToken.Value; break; case JsonTokens.ClosingBracket: if (this._expectedNext != ExpectedToken.ItemSeparatorOrEnd && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array end.", new Rune(JsonTokens.ClosingBracket)))); } ++this._colSpan; ++this._streamPos; completedParsing = true; break; case JsonTokens.NullFirst: if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array item (null).", new Rune(JsonTokens.NullFirst)))); } this._innerReader = this._innerReaders.NullReader; break; case JsonTokens.TrueFirst: case JsonTokens.FalseFirst: if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array item (boolean).", new Rune(readerSpan[consumedLength - 1])))); } this._innerReader = this._innerReaders.BooleanReader; break; case JsonTokens.NumberSign: case JsonTokens.Digit0: case JsonTokens.Digit1: case JsonTokens.Digit2: case JsonTokens.Digit3: case JsonTokens.Digit4: case JsonTokens.Digit5: case JsonTokens.Digit6: case JsonTokens.Digit7: case JsonTokens.Digit8: case JsonTokens.Digit9: if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array item (number).", new Rune(readerSpan[consumedLength - 1])))); } this._innerReader = this._innerReaders.NumberReader; break; case 
JsonTokens.QuoteMark: if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array item (string).", new Rune(JsonTokens.QuoteMark)))); } this._innerReader = this._innerReaders.StringReader; break; case JsonTokens.OpeningBracket: if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array item (array).", new Rune(JsonTokens.OpeningBracket)))); } this._innerReader = new JsonArrayReader(this._innerReaders); break; case JsonTokens.OpeningBrace: if (this._expectedNext != ExpectedToken.Value && this._expectedNext != ExpectedToken.ValueOrEnd) { return(_cleanup(this, ValueParseResult.Failure("Unexpected array item (object).", new Rune(JsonTokens.OpeningBracket)))); } this._innerReader = new JsonObjectReader(this._innerReaders); break; default: if (Rune.DecodeFromUtf8(readerSpan[(consumedLength - 1)..], out var rune, out _) != OperationStatus.Done)
/// <summary>
/// Core UTF-8 to UTF-8 encoding loop: copies scalars that need no escaping and escapes the rest.
/// This routine does not itself call <c>FindFirstCharacterToEncodeUtf8</c>.
/// </summary>
/// <param name="utf8Source">The UTF-8 input.</param>
/// <param name="utf8Destination">The buffer receiving the encoded UTF-8 output.</param>
/// <param name="bytesConsumed">On return, the number of source bytes fully processed.</param>
/// <param name="bytesWritten">On return, the number of destination bytes committed.</param>
/// <param name="isFinalBlock">
/// When <see langword="false"/>, an incomplete trailing sequence yields
/// <see cref="OperationStatus.NeedMoreData"/> instead of being encoded as a replacement.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/>, <see cref="OperationStatus.NeedMoreData"/> or
/// <see cref="OperationStatus.DestinationTooSmall"/>.
/// </returns>
private protected virtual OperationStatus EncodeUtf8Core(
    ReadOnlySpan<byte> utf8Source,
    Span<byte> utf8Destination,
    out int bytesConsumed,
    out int bytesWritten,
    bool isFinalBlock)
{
    int sourceLengthAtEntry = utf8Source.Length;
    int destinationLengthAtEntry = utf8Destination.Length;

    const int ScratchCharCount = 24; // arbitrarily chosen, but sufficient for any reasonable implementation
    Span<char> utf16Scratch = stackalloc char[ScratchCharCount];

    OperationStatus result = OperationStatus.Done;

    while (!utf8Source.IsEmpty)
    {
        OperationStatus decodeStatus = Rune.DecodeFromUtf8(utf8Source, out Rune scalar, out int sourceStep);
        bool decodedCleanly = decodeStatus == OperationStatus.Done;

        if (!decodedCleanly)
        {
            // An incomplete tail is only an error once the caller says no more data is coming.
            if (decodeStatus == OperationStatus.NeedMoreData && !isFinalBlock)
            {
                result = OperationStatus.NeedMoreData;
                break;
            }

            Debug.Assert(scalar == Rune.ReplacementChar); // DecodeFromUtf8 should've performed substitution
        }

        int destinationStep;

        if (decodedCleanly && !WillEncode(scalar.Value))
        {
            // Pass-through: the helper packs the UTF-8 bytes into a uint, first byte in the
            // low-order position, so peel them off eight bits at a time.
            uint packedUtf8 = (uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)scalar.Value);
            bool outOfRoom = false;
            destinationStep = 0;

            do
            {
                if ((uint)destinationStep >= (uint)utf8Destination.Length)
                {
                    outOfRoom = true;
                    break;
                }

                utf8Destination[destinationStep++] = (byte)packedUtf8;
            }
            while ((packedUtf8 >>= 8) != 0);

            if (outOfRoom)
            {
                // Bytes already stored for this scalar are not counted as written.
                result = OperationStatus.DestinationTooSmall;
                break;
            }
        }
        else if (!TryEncodeUnicodeScalarUtf8((uint)scalar.Value, utf16Scratch, utf8Destination, out destinationStep))
        {
            // Either the scalar must be escaped or it replaced invalid input, and the
            // escaped form did not fit.
            result = OperationStatus.DestinationTooSmall;
            break;
        }

        utf8Source = utf8Source.Slice(sourceStep);
        utf8Destination = utf8Destination.Slice(destinationStep);
    }

    bytesConsumed = sourceLengthAtEntry - utf8Source.Length;
    bytesWritten = destinationLengthAtEntry - utf8Destination.Length;
    return result;
}
[InlineData(new byte[] { 0xF0, 0x9F, 0x98, 0xB2 }, OperationStatus.Done, 0x1F632, 4)] // [ F0 9F 98 B2 ] is U+1F632 ASTONISHED FACE
public static void DecodeFromUtf8(byte[] data, OperationStatus expectedOperationStatus, int expectedRuneValue, int expectedBytesConsumed)
{
    // Act
    OperationStatus actualOperationStatus = Rune.DecodeFromUtf8(data, out Rune decodedRune, out int bytesConsumed);

    // Assert: status first, then the decoded scalar, then how much input was used.
    Assert.Equal(expectedOperationStatus, actualOperationStatus);
    Assert.Equal(expectedRuneValue, decodedRune.Value);
    Assert.Equal(expectedBytesConsumed, bytesConsumed);
}
/// <summary>
/// Transcodes the UTF-8 <paramref name="source"/> buffer to <paramref name="destination"/> as UTF-16.
/// </summary>
/// <remarks>
/// If <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, invalid UTF-8 sequences
/// in <paramref name="source"/> will be replaced with U+FFFD in <paramref name="destination"/>, and
/// this method will not return <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
/// <param name="source">The UTF-8 bytes to transcode.</param>
/// <param name="destination">The buffer that receives the UTF-16 chars.</param>
/// <param name="bytesRead">On return, the distance in bytes from the start of <paramref name="source"/> to where processing stopped.</param>
/// <param name="charsWritten">On return, the distance in chars from the start of <paramref name="destination"/> to where writing stopped.</param>
/// <param name="replaceInvalidSequences">Whether invalid input is patched with U+FFFD instead of being reported as an error.</param>
/// <param name="isFinalBlock">
/// When <see langword="false"/>, a <see cref="OperationStatus.NeedMoreData"/> result from the
/// transcoder is returned to the caller as-is; when <see langword="true"/>, it is handled like invalid data.
/// </param>
/// <returns>The status of the last transcoding step, adjusted as described above.</returns>
public static unsafe OperationStatus ToUtf16(ReadOnlySpan <byte> source, Span <char> destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    // Throwaway span accesses - workaround for https://github.com/dotnet/runtime/issues/12332

    _ = source.Length;
    _ = destination.Length;

    // The 'source' and 'destination' locals are re-sliced inside the loop below; the
    // pinned pointers taken here keep pointing at the start of the original buffers.

    fixed(byte *pOriginalSource = &MemoryMarshal.GetReference(source))
    fixed(char *pOriginalDestination = &MemoryMarshal.GetReference(destination))
    {
        // We're going to bulk transcode as much as we can in a loop, iterating
        // every time we see bad data that requires replacement.

        OperationStatus operationStatus = OperationStatus.Done;
        byte * pInputBufferRemaining = pOriginalSource;
        char * pOutputBufferRemaining = pOriginalDestination;

        while (!source.IsEmpty)
        {
            // We've pinned the spans at the entry point to this method.
            // It's safe for us to use Unsafe.AsPointer on them during this loop.

            operationStatus = Utf8Utility.TranscodeToUtf16( pInputBuffer: (byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source)), inputLength: source.Length, pOutputBuffer: (char *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination)), outputCharsRemaining: destination.Length, pInputBufferRemaining: out pInputBufferRemaining, pOutputBufferRemaining: out pOutputBufferRemaining);

            // If we finished the operation entirely or we ran out of space in the destination buffer,
            // or if we need more input data and the caller told us that there's possibly more data
            // coming, return immediately.
            // (The '<=' comparison relies on Done and DestinationTooSmall being the two lowest
            // OperationStatus values.)

            if (operationStatus <= OperationStatus.DestinationTooSmall || (operationStatus == OperationStatus.NeedMoreData && !isFinalBlock))
            {
                break;
            }

            // We encountered invalid data, or we need more data but the caller told us we're
            // at the end of the stream. In either case treat this as truly invalid.
            // If the caller didn't tell us to replace invalid sequences, return immediately.

            if (!replaceInvalidSequences)
            {
                operationStatus = OperationStatus.InvalidData; // status code may have been NeedMoreData - force to be error
                break;
            }

            // We're going to attempt to write U+FFFD to the destination buffer.
            // Do we even have enough space to do so?
            // First advance 'destination' past whatever the transcoder already wrote.

            destination = destination.Slice((int)(pOutputBufferRemaining - (char *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination))));
            if (destination.IsEmpty)
            {
                operationStatus = OperationStatus.DestinationTooSmall;
                break;
            }

            destination[0] = (char)UnicodeUtility.ReplacementChar;
            destination = destination.Slice(1);

            // Now figure out how many bytes of the source we must skip over before we should retry
            // the operation. This might be more than 1 byte.
            // Advance 'source' to the failing sequence, then let Rune report that sequence's length.

            source = source.Slice((int)(pInputBufferRemaining - (byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source))));
            Debug.Assert(!source.IsEmpty, "Expected 'Done' if source is fully consumed.");
            Rune.DecodeFromUtf8(source, out _, out int bytesConsumedJustNow);
            source = source.Slice(bytesConsumedJustNow);

            operationStatus = OperationStatus.Done; // we patched the error - if we're about to break out of the loop this is a success case

            // Re-sync the "remaining" pointers with the re-sliced spans so the final
            // bytesRead / charsWritten include the skipped bytes and the replacement char.
            pInputBufferRemaining = (byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(source));
            pOutputBufferRemaining = (char *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(destination));
        }

        // Not possible to make any further progress - report to our caller how far we got.

        bytesRead = (int)(pInputBufferRemaining - pOriginalSource);
        charsWritten = (int)(pOutputBufferRemaining - pOriginalDestination);
        return(operationStatus);
    }
}
public bool MoveNext() { var operationStatus = Rune.DecodeFromUtf8(_remaining, out _current, out var bytesConsumed); _remaining = _remaining[bytesConsumed..];
private void PollSDLEvents() { Span <char> textEditingBuffer = stackalloc char[SDL.Keyboard.TextEditingEvent.TextSize]; while (SDL.PollEvent(out SDL.Event ev) == 1) { switch (ev.Type) { case SDL.EventType.Quit: Interlocked.Increment(ref _isExiting); break; #region Joystick case SDL.EventType.JoyDeviceAdded: Joystick.AddDevice(ev.JoystickDevice.Which); break; case SDL.EventType.JoyDeviceRemoved: Joystick.RemoveDevice(ev.JoystickDevice.Which); break; #endregion #region GameController case SDL.EventType.ControllerDeviceRemoved: GamePad.RemoveDevice(ev.ControllerDevice.Which); break; case SDL.EventType.ControllerButtonUp: case SDL.EventType.ControllerButtonDown: case SDL.EventType.ControllerAxisMotion: GamePad.UpdatePacketInfo(ev.ControllerDevice.Which, ev.ControllerDevice.TimeStamp); break; #endregion #region Mouse case SDL.EventType.MouseWheel: _window.Mouse.ScrollX += ev.MouseWheel.X * MouseWheelDelta; _window.Mouse.ScrollY += ev.MouseWheel.Y * MouseWheelDelta; break; case SDL.EventType.MouseMotion: _window.Mouse.State.X = ev.MouseMotion.X; _window.Mouse.State.Y = ev.MouseMotion.Y; break; #endregion #region Keyboard case SDL.EventType.KeyDown: { bool hasMapping = KeyboardUtil.ToXna(ev.KeyboardKey.Keysym.Sym, out var key); if (hasMapping) { if (!Keyboard._keysDown.Contains(key)) { Keyboard._keysDown.Add(key); } } // TODO: validate rune? Rune.TryCreate(ev.KeyboardKey.Keysym.Sym, out var rune); var inputEv = new TextInputEventArgs(rune, hasMapping ? key : (Keys?)null); _window.OnKeyDown(inputEv); break; } case SDL.EventType.KeyUp: { bool hasMapping = KeyboardUtil.ToXna(ev.KeyboardKey.Keysym.Sym, out var key); if (hasMapping) { Keyboard._keysDown.Remove(key); } // TODO: validate rune? Rune.TryCreate(ev.KeyboardKey.Keysym.Sym, out var rune); _window.OnKeyUp(new TextInputEventArgs(rune, hasMapping ? 
key : (Keys?)null)); break; } #endregion #region Text-Input/Editing case SDL.EventType.TextInput: unsafe { var utf8 = new Span <byte>(ev.TextInput.Text, SDL.Keyboard.TextInputEvent.TextSize); utf8 = SliceToNullTerminator(utf8); while (!utf8.IsEmpty) { var status = Rune.DecodeFromUtf8(utf8, out Rune rune, out int bytesConsumed); if (status != OperationStatus.Done) { // This should never occur if SDL gives use valid data. throw new InvalidDataException("Failed to decode UTF-8 text input: " + status); } utf8 = utf8[bytesConsumed..];
public bool MoveNext() { var status = Rune.DecodeFromUtf8(_utf8, out _current, out int consumed); _utf8 = _utf8[consumed..];
/// <summary>
/// Returns <paramref name="value"/> if it is null, empty, or contains only well-formed UTF-8 data;
/// otherwise allocates a new <see cref="Utf8String"/> instance containing the same data as
/// <paramref name="value"/> but where all invalid UTF-8 sequences have been replaced
/// with U+FFFD.
/// </summary>
/// <param name="value">The string to validate; may be null.</param>
/// <returns>The original instance when no fix-up is needed; otherwise a repaired copy.</returns>
public static Utf8String ValidateAndFixupUtf8String(Utf8String value)
{
    // Honor the documented contract for null input instead of dereferencing it.
    if (value is null || value.Length == 0)
    {
        return value;
    }

    ReadOnlySpan<byte> valueAsBytes = value.AsBytes();

    int idxOfFirstInvalidData = GetIndexOfFirstInvalidUtf8Sequence(valueAsBytes, out _);
    if (idxOfFirstInvalidData < 0)
    {
        return value;
    }

    // TODO_UTF8STRING: Replace this with the faster implementation once it's available.
    // (The faster implementation is in the dev/utf8string_bak branch currently.)

    MemoryStream memStream = new MemoryStream();
#if (!NETSTANDARD2_0 && !NETFRAMEWORK)
    // Everything before the first bad sequence is already known to be valid.
    memStream.Write(valueAsBytes.Slice(0, idxOfFirstInvalidData));

    valueAsBytes = valueAsBytes.Slice(idxOfFirstInvalidData);
    do
    {
        if (Rune.DecodeFromUtf8(valueAsBytes, out _, out int bytesConsumed) == OperationStatus.Done)
        {
            // Valid scalar value - copy data as-is to MemoryStream
            memStream.Write(valueAsBytes.Slice(0, bytesConsumed));
        }
        else
        {
            // Invalid scalar value - copy U+FFFD to MemoryStream
            memStream.Write(ReplacementCharSequence);
        }

        valueAsBytes = valueAsBytes.Slice(bytesConsumed);
    }
    while (!valueAsBytes.IsEmpty);
#else
    // No span-based Stream.Write on these targets, so work on the backing array instead.
    if (!MemoryMarshal.TryGetArray(value.AsMemoryBytes(), out ArraySegment<byte> valueArraySegment))
    {
        Debug.Fail("Utf8String on netstandard should always be backed by an array.");
    }

    // Everything before the first bad sequence is already known to be valid.
    memStream.Write(valueArraySegment.Array, valueArraySegment.Offset, idxOfFirstInvalidData);

    // Skip past the valid prefix. The new offset is relative to the backing array, so the
    // segment's own starting offset has to be carried along.
    valueArraySegment = new ArraySegment<byte>(
        valueArraySegment.Array,
        valueArraySegment.Offset + idxOfFirstInvalidData,
        valueArraySegment.Count - idxOfFirstInvalidData);
    do
    {
        if (Rune.DecodeFromUtf8(valueArraySegment, out _, out int bytesConsumed) == OperationStatus.Done)
        {
            // Valid scalar value - copy data as-is to MemoryStream
            memStream.Write(valueArraySegment.Array, valueArraySegment.Offset, bytesConsumed);
        }
        else
        {
            // Invalid scalar value - copy U+FFFD to MemoryStream
            memStream.Write(ReplacementCharSequence, 0, ReplacementCharSequence.Length);
        }

        valueArraySegment = new ArraySegment<byte>(
            valueArraySegment.Array,
            valueArraySegment.Offset + bytesConsumed,
            valueArraySegment.Count - bytesConsumed);
    }
    while (valueArraySegment.Count > 0);
#endif

    bool success = memStream.TryGetBuffer(out ArraySegment<byte> memStreamBuffer);
    Debug.Assert(success, "Couldn't get underlying MemoryStream buffer.");

    return Utf8String.UnsafeCreateWithoutValidation(memStreamBuffer);
}
/// <summary>
/// Scans <paramref name="readerSpan"/> byte by byte as a JSON number, tracking progress in
/// <c>_lastPart</c> and the shape of the number seen so far in <c>_currentStructure</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the end of this method (everything after the scanning loop) is not present
/// here, so how <paramref name="result"/>, <paramref name="lineSpan"/> and
/// <paramref name="colSpan"/> are finalized cannot be documented from this excerpt.
/// </remarks>
/// <param name="readerSpan">Bytes to scan. An empty span with no parsing in progress yields <c>ValueParseResult.EOF</c>.</param>
/// <param name="result">Initialized to <see cref="double.NaN"/> on entry.</param>
/// <param name="consumedLength">Initialized to 0 and incremented once for every byte read from <paramref name="readerSpan"/>.</param>
/// <param name="lineSpan">Initialized to 1 on entry.</param>
/// <param name="colSpan">Initialized to 0 on entry.</param>
public ValueParseResult TryParse(ReadOnlySpan <byte> readerSpan, out double result, out int consumedLength, out int lineSpan, out int colSpan)
{
    result = double.NaN;
    consumedLength = 0;
    lineSpan = 1;
    colSpan = 0;

    // if span is empty, and no parsing occurred, signal EOF immediately
    if (readerSpan.Length <= 0 && this._lastPart == NumberPart.None)
    {
        return(ValueParseResult.EOF);
    }

    // if we are not continuing, check what we're parsing
    // (an empty Buffer is what this code treats as "not continuing")
    if (this.Buffer.Length == 0)
    {
        switch (readerSpan[consumedLength++])
        {
            // a number in JSON can begin with - or digits 0-9
            case JsonTokens.NumberSign:
                this._currentStructure = NumberStructure.HasSign;
                this._lastPart = NumberPart.NumberSign;
                break;

            // digit zero is a bit special in that if it's the first digit in a number, it becomes the only
            // legal digit before decimal point, hence special handling for it
            case JsonTokens.Digit0:
                this._currentStructure = NumberStructure.LeadingZero;
                this._lastPart = NumberPart.FirstDigit;
                break;

            // digits 1-9 are also valid as starting characters of a number, and unlike 0, they do not
            // restrict pre-decimal point digit count (though IEEE754 64-bit binary float limits still apply)
            case JsonTokens.Digit1:
            case JsonTokens.Digit2:
            case JsonTokens.Digit3:
            case JsonTokens.Digit4:
            case JsonTokens.Digit5:
            case JsonTokens.Digit6:
            case JsonTokens.Digit7:
            case JsonTokens.Digit8:
            case JsonTokens.Digit9:
                this._currentStructure = NumberStructure.LeadingNonzero;
                this._lastPart = NumberPart.FirstDigit;
                break;

            // not a legal character
            // the offending rune is decoded from the start of the span for the error report;
            // if decoding fails, a default rune is reported instead
            // note: unlike the failures in the loop below, this one is returned without going through _cleanup
            default:
                if (Rune.DecodeFromUtf8(readerSpan, out var rune, out _) != OperationStatus.Done)
                {
                    rune = default;
                }
                return(ValueParseResult.Failure("Unexpected token, expected 0-9 or -.", rune));
        }
    }

    // if we got empty when previous parsing occurred, just don't parse, it's an end-of-content marker
    var completedParsing = false;
    if (readerSpan.Length > 0)
    {
        // set when the loop stops on a byte that is not part of the number
        // NOTE(review): presumably used after the loop to give that byte back — confirm against the rest of the method
        var offByOne = false;

        // try reading the number
        while (consumedLength < readerSpan.Length)
        {
            switch (readerSpan[consumedLength++])
            {
                // digit 0 is special
                // if it's the first digit in the non-fractional part, it is the only legal digit before decimal point
                // otherwise it behaves like a regular digit
                // this means it can appear:
                // - as first digit before decimal point
                // - as non-first digit before decimal point, if first digit was not a 0
                // - as a digit after decimal point before exponent mark
                // - as a digit after exponent mark or exponent sign
                // see: https://www.json.org/img/number.png
                case JsonTokens.Digit0:
                    // a second digit directly after a leading zero is rejected
                    if (this._lastPart == NumberPart.FirstDigit && this._currentStructure.HasFlag(NumberStructure.LeadingZero))
                    {
                        return(_cleanup(this, ValueParseResult.Failure("Digit in illegal separator. Expected decimal point.", new Rune(readerSpan[consumedLength - 1]))));
                    }

                    if (this._lastPart == NumberPart.NumberSign)
                    {
                        // first digit after the sign
                        this._currentStructure |= NumberStructure.LeadingZero;
                        this._lastPart = NumberPart.FirstDigit;
                    }
                    else
                    {
                        // advance to the digit state matching the section we are in;
                        // any other state (already inside a run of digits) is left unchanged
                        this._lastPart = this._lastPart switch
                        {
                            NumberPart.FirstDigit => NumberPart.Digit,
                            NumberPart.FractionDot => NumberPart.FractionDigit,
                            NumberPart.ExponentMarker or NumberPart.ExponentSign => NumberPart.ExponentDigit,
                            _ => this._lastPart
                        };
                    }
                    break;

                // non-0 digits can appear:
                // - as first digit before decimal points
                // - as non-first digit before decimal point, if first digit was not a 0
                // - as a digit after decimal point before exponent mark
                // - as a digit after exponent mark or exponent sign
                // see: https://www.json.org/img/number.png
                case JsonTokens.Digit1:
                case JsonTokens.Digit2:
                case JsonTokens.Digit3:
                case JsonTokens.Digit4:
                case JsonTokens.Digit5:
                case JsonTokens.Digit6:
                case JsonTokens.Digit7:
                case JsonTokens.Digit8:
                case JsonTokens.Digit9:
                    // a second digit directly after a leading zero is rejected
                    if (this._lastPart == NumberPart.FirstDigit && this._currentStructure.HasFlag(NumberStructure.LeadingZero))
                    {
                        return(_cleanup(this, ValueParseResult.Failure("Digit in illegal separator. Expected decimal point.", new Rune(readerSpan[consumedLength - 1]))));
                    }

                    if (this._lastPart == NumberPart.NumberSign)
                    {
                        // first digit after the sign
                        this._currentStructure |= NumberStructure.LeadingNonzero;
                        this._lastPart = NumberPart.FirstDigit;
                    }
                    else
                    {
                        // same state advance as for digit 0 above
                        this._lastPart = this._lastPart switch
                        {
                            NumberPart.FirstDigit => NumberPart.Digit,
                            NumberPart.FractionDot => NumberPart.FractionDigit,
                            NumberPart.ExponentMarker or NumberPart.ExponentSign => NumberPart.ExponentDigit,
                            _ => this._lastPart
                        };
                    }
                    break;

                // decimal separator can appear only after at least one digit, and only once
                case JsonTokens.DecimalSeparator:
                    if (this._lastPart != NumberPart.Digit && this._lastPart != NumberPart.FirstDigit)
                    {
                        return(_cleanup(this, ValueParseResult.Failure("Unexpected decimal separator.", new Rune('.'))));
                    }

                    this._currentStructure |= NumberStructure.Fraction;
                    this._lastPart = NumberPart.FractionDot;
                    break;

                // exponent marker can appear only after at least one digit, or at least one digit after
                // decimal point, and only once, regardless of variety
                case JsonTokens.ExponentSmall:
                case JsonTokens.ExponentCapital:
                    if (this._lastPart != NumberPart.FirstDigit && this._lastPart != NumberPart.Digit && this._lastPart != NumberPart.FractionDigit)
                    {
                        return(_cleanup(this, ValueParseResult.Failure("Unexpected exponent marker.", new Rune(readerSpan[consumedLength - 1]))));
                    }

                    this._currentStructure |= NumberStructure.Exponent;
                    this._lastPart = NumberPart.ExponentMarker;

                    // reaching the exponent with a fraction present means the fraction had at least
                    // one digit (a bare dot was rejected by the check above), so flag it as valid
                    if (this._currentStructure.HasFlag(NumberStructure.Fraction))
                    {
                        this._currentStructure |= NumberStructure.FractionValid;
                    }
                    break;

                // exponent sign can appear only after exponent marker
                case JsonTokens.NumberSign:
                case JsonTokens.ExponentSignPositive:
                    if (this._lastPart != NumberPart.ExponentMarker)
                    {
                        return(_cleanup(this, ValueParseResult.Failure("Unexpected exponent sign.", new Rune(readerSpan[consumedLength - 1]))));
                    }

                    this._currentStructure |= NumberStructure.SignedExponent;
                    this._lastPart = NumberPart.ExponentSign;
                    break;

                // this is a situation where a non number-character is encountered
                // this is invalid if immediately after number sign, decimal point, exponent marker, or
                // exponent sign, otherwise consider it a completed number
                default:
                    switch (this._lastPart)
                    {
                        case NumberPart.NumberSign:
                        case NumberPart.FractionDot:
                        case NumberPart.ExponentMarker:
                        case NumberPart.ExponentSign:
                            // decode the offending rune starting at the byte just read; fall back to default on failure
                            if (Rune.DecodeFromUtf8(readerSpan[(consumedLength - 1)..], out var rune, out _) != OperationStatus.Done)
                            {
                                rune = default;
                            }
                            return(_cleanup(this, ValueParseResult.Failure("Unexpected token, expected 0-9.", rune)));
                    }

                    // the byte just read is not part of the number, yet consumedLength already counts it
                    offByOne = true;
                    completedParsing = true;
                    break;
            }

            // if parsing is completed, do not attempt to resume
            if (completedParsing)
            {
                break;
            }
        }
/// <summary>
/// Runs a battery of consistency checks over the bytes exposed by <c>_data</c>, comparing
/// <see cref="Encoding.UTF8"/>, <see cref="Rune"/> enumeration and <c>Utf8.ToUtf16</c> against
/// one another. Progress is written to the console; the first disagreement throws an
/// <see cref="Exception"/> describing the failed check.
/// </summary>
public void RunTest()
{
    Console.WriteLine("-- BEGIN TEST --");

    // Baseline: the UTF-16 length that Encoding.UTF8 reports for the data.
    int expectedCharCount = Encoding.UTF8.GetCharCount(_data.Span);
    Console.WriteLine($"Encoding.UTF8.GetCharCount returned {expectedCharCount}.");

    // Check 1: walking the data rune by rune must add up to the same UTF-16 length.
    {
        int countedChars = 0;
        for (ReadOnlySpan<byte> rest = _data.Span; rest.Length > 0;)
        {
            // The status is deliberately ignored: a U+FFFD substitution still counts.
            Rune.DecodeFromUtf8(rest, out Rune rune, out int step);
            countedChars += rune.Utf16SequenceLength;
            rest = rest.Slice(step);
        }

        Console.WriteLine($"Rune iteration said there were {countedChars} UTF-16 chars.");
        if (countedChars != expectedCharCount)
        {
            throw new Exception("Rune iteration char count mismatch!!");
        }
    }

    // Check 2: GetChars must write exactly as many chars as GetCharCount predicted.
    char[] referenceChars = new char[expectedCharCount];
    int getCharsResult = Encoding.UTF8.GetChars(_data.Span, referenceChars);
    Console.WriteLine($"Encoding.UTF8.GetChars returned {getCharsResult} chars written.");
    if (getCharsResult != expectedCharCount)
    {
        throw new Exception("GetChars return value mismatch!!");
    }

    // Check 3: the UTF-8 input and the UTF-16 reference must enumerate to the same runes.
    {
        ReadOnlySpan<byte> utf8Rest = _data.Span;
        ReadOnlySpan<char> utf16Rest = referenceChars;
        while (utf8Rest.Length > 0 && utf16Rest.Length > 0)
        {
            Rune.DecodeFromUtf8(utf8Rest, out Rune fromUtf8, out int utf8Step);
            Rune.DecodeFromUtf16(utf16Rest, out Rune fromUtf16, out int utf16Step);
            if (fromUtf8 != fromUtf16)
            {
                throw new Exception("Enumerating runes mismatch!!");
            }

            utf8Rest = utf8Rest.Slice(utf8Step);
            utf16Rest = utf16Rest.Slice(utf16Step);
        }

        // The loop stops when either side runs dry; both must have done so together.
        if (utf8Rest.Length != utf16Rest.Length)
        {
            throw new Exception("Rune enumeration returned mismatched lengths!");
        }
    }

    // Check 4: replacing transcode into a buffer of exactly the reference size.
    Console.WriteLine("Running ToUtf16 with replace=true and exact size buffer.");
    {
        char[] exactBuffer = new char[referenceChars.Length];
        OperationStatus status = Utf8.ToUtf16(_data.Span, exactBuffer, out int bytesRead, out int charsProduced, replaceInvalidSequences: true, isFinalBlock: true);

        if (status != OperationStatus.Done)
        {
            throw new Exception("Utf8.ToUtf16 returned wrong OperationStatus!!");
        }

        if (bytesRead != _data.Memory.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't read entire input!!");
        }

        if (charsProduced != exactBuffer.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't fill entire response buffer!!");
        }

        if (!referenceChars.SequenceEqual(exactBuffer))
        {
            throw new Exception("Utf8.ToUtf16 returned different data than Encoding.UTF8.GetChars!!");
        }
    }

    // Check 5: same as above but with slack in the destination.
    Console.WriteLine("Running ToUtf16 with replace=true and extra large buffer.");
    {
        char[] roomyBuffer = new char[referenceChars.Length + 1024];
        OperationStatus status = Utf8.ToUtf16(_data.Span, roomyBuffer, out int bytesRead, out int charsProduced, replaceInvalidSequences: true, isFinalBlock: true);

        if (status != OperationStatus.Done)
        {
            throw new Exception("Utf8.ToUtf16 returned wrong OperationStatus!!");
        }

        if (bytesRead != _data.Memory.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't read entire input!!");
        }

        if (charsProduced != referenceChars.Length)
        {
            throw new Exception("Utf8.ToUtf16 didn't fill entire response buffer!!");
        }

        if (!roomyBuffer.AsSpan(0, charsProduced).SequenceEqual(referenceChars))
        {
            throw new Exception("Utf8.ToUtf16 returned different data than Encoding.UTF8.GetChars!!");
        }
    }

    // Check 6: non-replacing transcode. Each call handles one run of valid data; whatever it
    // read and wrote must decode cleanly and to identical runes on both sides.
    Console.WriteLine("Running ToUtf16 with replace=false and extra large buffer.");
    {
        Span<char> scratch = new char[referenceChars.Length + 1024];
        ReadOnlySpan<byte> pending = _data.Span;
        while (pending.Length > 0)
        {
            OperationStatus status = Utf8.ToUtf16(pending, scratch, out int bytesRead, out int charsProduced, replaceInvalidSequences: false, isFinalBlock: true);

            ReadOnlySpan<byte> readChunk = pending.Slice(0, bytesRead);
            ReadOnlySpan<char> writtenChunk = scratch.Slice(0, charsProduced);
            while (readChunk.Length > 0 && writtenChunk.Length > 0)
            {
                OperationStatus utf8Outcome = Rune.DecodeFromUtf8(readChunk, out Rune fromUtf8, out int utf8Step);
                OperationStatus utf16Outcome = Rune.DecodeFromUtf16(writtenChunk, out Rune fromUtf16, out int utf16Step);

                if (utf8Outcome != OperationStatus.Done)
                {
                    throw new Exception("DecodeFromUtf8 returned unexpected value!!");
                }

                if (utf16Outcome != OperationStatus.Done)
                {
                    throw new Exception("DecodeFromUtf16 returned unexpected value!!");
                }

                if (fromUtf8 != fromUtf16)
                {
                    throw new Exception("Enumerating runes mismatch!!");
                }

                readChunk = readChunk.Slice(utf8Step);
                writtenChunk = writtenChunk.Slice(utf16Step);
            }

            if (readChunk.Length != writtenChunk.Length)
            {
                throw new Exception("Unexpected length mismatch!!");
            }

            pending = pending.Slice(bytesRead);
            if (status != OperationStatus.Done)
            {
                // Skip over invalid data
                Rune.DecodeFromUtf8(pending, out _, out int invalidLength);
                pending = pending.Slice(invalidLength);
            }
        }
    }

    // Check 7: emulate a custom DecoderReplacementFallback by hand and compare with the real one.
    Console.WriteLine("Trying custom decoder replacement.");
    {
        // use a custom replacement string
        Encoding customEncoding = Encoding.GetEncoding("utf-8", EncoderFallback.ExceptionFallback, new DecoderReplacementFallback("{BAD}"));
        string expectedText = customEncoding.GetString(_data.Span);

        ReadOnlySpan<byte> pending = _data.Span;
        char[] scratch = new char[expectedText.Length];
        StringBuilder rebuilt = new StringBuilder();
        while (pending.Length > 0)
        {
            OperationStatus status = Utf8.ToUtf16(pending, scratch, out int bytesRead, out int charsProduced, replaceInvalidSequences: false, isFinalBlock: true);
            rebuilt.Append(scratch, 0, charsProduced);
            pending = pending.Slice(bytesRead);

            if (status != OperationStatus.Done)
            {
                // Skip over invalid data
                Rune.DecodeFromUtf8(pending, out _, out int invalidLength);
                pending = pending.Slice(invalidLength);
                rebuilt.Append("{BAD}");
            }
        }

        if (new string(expectedText) != rebuilt.ToString())
        {
            throw new Exception("Custom decoder replacement failed!!");
        }
    }

    Console.WriteLine("-- END TEST - SUCCESS --");
}
/// <summary>
/// Converts the UTF-8 bytes in <paramref name="source"/> to UTF-16 code units written to
/// <paramref name="destination"/>, one scalar value at a time.
/// </summary>
/// <remarks>
/// When <paramref name="replaceInvalidSequences"/> is <see langword="true"/>, each invalid UTF-8
/// sequence in <paramref name="source"/> is emitted as U+FFFD and this method never returns
/// <see cref="OperationStatus.InvalidData"/>.
/// </remarks>
/// <param name="source">UTF-8 input.</param>
/// <param name="destination">Buffer that receives the UTF-16 output.</param>
/// <param name="numBytesRead">Number of bytes of <paramref name="source"/> that were transcoded.</param>
/// <param name="numCharsWritten">Number of chars written to <paramref name="destination"/>.</param>
/// <param name="replaceInvalidSequences">Whether invalid input is replaced with U+FFFD instead of stopping the conversion.</param>
/// <param name="isFinalBlock">
/// Whether <paramref name="source"/> is the last chunk of input. When <see langword="false"/>, a
/// trailing partial sequence stops the conversion with <see cref="OperationStatus.NeedMoreData"/>;
/// when <see langword="true"/>, it is handled like any other invalid sequence.
/// </param>
/// <returns>
/// <see cref="OperationStatus.Done"/> if all of <paramref name="source"/> was consumed; otherwise the
/// reason the conversion stopped early.
/// </returns>
public static OperationStatus ToUtf16(ReadOnlySpan<byte> source, Span<char> destination, out int numBytesRead, out int numCharsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true)
{
    // Work on cursors so the untouched parameters can serve as the "original length" reference.
    ReadOnlySpan<byte> unreadInput = source;
    Span<char> freeOutput = destination;
    OperationStatus outcome = OperationStatus.Done;

    while (unreadInput.Length > 0)
    {
        OperationStatus decodeOutcome = Rune.DecodeFromUtf8(unreadInput, out Rune scalar, out int utf8Length);

        if (decodeOutcome == OperationStatus.NeedMoreData)
        {
            // The input ends in a truncated sequence. That is only an error when the
            // caller has promised that no further data is coming.
            if (!isFinalBlock)
            {
                outcome = OperationStatus.NeedMoreData;
                break;
            }

            decodeOutcome = OperationStatus.InvalidData;
        }

        if (decodeOutcome == OperationStatus.InvalidData)
        {
            // Either stop and report the bad data, or substitute U+FFFD and keep going.
            if (!replaceInvalidSequences)
            {
                outcome = OperationStatus.InvalidData;
                break;
            }

            scalar = Rune.ReplacementChar;
        }

        // At this point the scalar to emit is known; the only remaining way to fail
        // is running out of room in the destination.
        if (!scalar.TryEncodeToUtf16(freeOutput, out int utf16Length))
        {
            outcome = OperationStatus.DestinationTooSmall;
            break;
        }

        // Advance by the lengths actually reported rather than Rune.Utf16SequenceLength,
        // since the scalar may have been substituted above.
        unreadInput = unreadInput.Slice(utf8Length);
        freeOutput = freeOutput.Slice(utf16Length);
        outcome = OperationStatus.Done;
    }

    numBytesRead = source.Length - unreadInput.Length;
    numCharsWritten = destination.Length - freeOutput.Length;

    Debug.Assert((outcome == OperationStatus.Done) == (numBytesRead == source.Length), "Should report OperationStatus.Done if and only if we've consumed the entire input buffer.");

    return outcome;
}