/// <summary>Trims a string so that it has at most the given token count.</summary>
/// <remarks>
/// Each character costs one token, except characters for which
/// <c>TsString.IsDoubleChar</c> returns true, which cost two.
/// The cut never separates the two halves of a UTF-16 surrogate pair, so the
/// result may be one character shorter than the token budget would allow.
/// </remarks>
/// <param name="value">The string to substring from the left side.</param>
/// <param name="token">The max token count.</param>
/// <returns>
/// The longest safe prefix of <paramref name="value"/> that fits into
/// <paramref name="token"/> tokens, or <paramref name="value"/> itself when it
/// already fits.
/// </returns>
private static string SubstringToken(string value, int token)
{
    int tokens = 0;
    for (int i = 0; i < value.Length; i++)
    {
        int addToken = TsString.IsDoubleChar(value[i]) ? 2 : 1;
        if (tokens + addToken > token)
        {
            // Cutting at i would keep value[i - 1] and drop value[i]. If those two
            // form a surrogate pair the result would end in a lone high surrogate
            // (invalid UTF-16), so drop the whole pair instead.
            bool splitsSurrogatePair = i > 0
                && char.IsHighSurrogate(value[i - 1])
                && char.IsLowSurrogate(value[i]);
            int cut = splitsSurrogatePair ? i - 1 : i;
            return value.Substring(0, cut);
        }

        tokens += addToken;
    }

    return value;
}
/// <summary>
/// Splits a text into chunks whose token count does not exceed
/// <paramref name="maxMessageSize"/>.
/// </summary>
/// <remarks>
/// The text is UTF-8 encoded and every byte costs one token, except bytes for
/// which <c>TsString.IsDoubleChar</c> returns true, which cost two.
/// Unless <paramref name="behaviour"/> is <c>LongTextBehaviour.SplitHard</c>, a
/// chunk is preferably ended right after the last occurrence of a separator byte
/// from <c>SeparatorWeight</c>; entries with a lower index in that table win.
/// When no separator is available the chunk is cut at the size limit, moved back
/// to the start of a UTF-8 sequence so no multi-byte character is torn apart.
/// </remarks>
/// <param name="text">The text to split.</param>
/// <param name="behaviour">
/// How to treat text that does not fit into one chunk. With
/// <c>LongTextBehaviour.Drop</c> an empty sequence is returned in that case.
/// </param>
/// <param name="limit">
/// Maximum number of chunks to produce; the remaining text is discarded once
/// the limit is reached. Values of zero or below never hit the stop condition
/// and therefore do not cap the output.
/// </param>
/// <param name="maxMessageSize">The max token count per chunk. Must be at least 4.</param>
/// <returns>The chunks in their original order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxMessageSize"/> is less than 4.</exception>
public static IEnumerable<string> Transform(string text, LongTextBehaviour behaviour, int limit = int.MaxValue, int maxMessageSize = TsConst.MaxSizeTextMessage)
{
    if (text is null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    if (maxMessageSize < 4)
    {
        throw new ArgumentOutOfRangeException(nameof(maxMessageSize), "The minimum split length must be at least 4 bytes to fit all utf8 characters");
    }

    // Assuming the worst case that each UTF-16 char expands to 4 tokens.
    // If the message is still short enough we can safely return it in 1 block.
    // Compared against the caller's limit (not the global default) and widened
    // to long so that very long inputs cannot overflow the multiplication.
    if ((long)text.Length * 4 <= maxMessageSize)
    {
        return new[] { text };
    }

    var bytes = Encoding.UTF8.GetBytes(text);

    // If the entire UTF-8 encoded text fits in one message, even if every
    // byte costs two tokens, we can return early.
    if ((long)bytes.Length * 2 <= maxMessageSize)
    {
        return new[] { text };
    }

    var list = new List<string>();
    // Per separator kind: position of its last occurrence within the current chunk.
    Span<Ind> splitIndices = stackalloc Ind[SeparatorWeight.Length];
    var block = bytes.AsSpan();

    while (block.Length > 0)
    {
        int tokenCnt = 0;
        int i = 0;
        bool filled = false;

        // Scan forward until the chunk would exceed the limit; block[i] is then
        // the first byte that no longer fits.
        for (; i < block.Length; i++)
        {
            tokenCnt += TsString.IsDoubleChar(block[i]) ? 2 : 1;

            if (tokenCnt > maxMessageSize)
            {
                if (behaviour == LongTextBehaviour.Drop)
                {
                    return Enumerable.Empty<string>();
                }

                filled = true;
                break;
            }

            for (int j = 0; j < SeparatorWeight.Length; j++)
            {
                if (block[i] == SeparatorWeight[j])
                {
                    splitIndices[j] = new Ind(i, tokenCnt);
                }
            }
        }

        if (!filled)
        {
            // The remainder fits completely into one chunk.
            list.Add(block.NewUtf8String());
            break;
        }

        bool hasSplit = false;
        if (behaviour != LongTextBehaviour.SplitHard)
        {
            // Use the first separator kind (in table order) that occurred in this chunk.
            for (int j = 0; j < SeparatorWeight.Length; j++)
            {
                int splitAt = splitIndices[j].i;
                if (splitAt > 0)
                {
                    // The separator itself stays at the end of the emitted chunk.
                    list.Add(block.Slice(0, splitAt + 1).NewUtf8String());
                    block = block.Slice(splitAt + 1);
                    hasSplit = true;
                    break;
                }
            }

            // Recorded positions are relative to the old chunk start; reset them.
            splitIndices.Fill(new Ind());
        }

        if (!hasSplit)
        {
            // UTF-8 adjustment: step back over continuation bytes (10xxxxxx) so
            // the cut lands on the first byte of a character.
            while (i > 0 && (block[i] & 0xC0) == 0x80)
            {
                i--;
            }

            list.Add(block.Slice(0, i).NewUtf8String());
            block = block.Slice(i);
        }

        if (--limit == 0)
        {
            break;
        }
    }

    return list;
}