/// <summary>
/// Checks that <c>CharAt</c> returns the character at a valid index and throws
/// <see cref="System.IndexOutOfRangeException"/> for an index below zero or at the length.
/// </summary>
public virtual void TestCharSequenceCharAt()
{
    CharsRef abc = new CharsRef("abc");

    Assert.AreEqual('b', abc.CharAt(1));

    // Index below the start of the sequence.
    try
    {
        abc.CharAt(-1);
        Assert.Fail();
    }
    catch (System.IndexOutOfRangeException)
    {
        // expected exception
    }

    // Index equal to the length of the sequence.
    try
    {
        abc.CharAt(3);
        Assert.Fail();
    }
    catch (System.IndexOutOfRangeException)
    {
        // expected exception
    }
}
/// <summary>
/// Decodes the bytes referenced by this instance (<c>Bytes</c>, starting at <c>Offset</c>,
/// for <c>Length</c> bytes) as UTF-8 and returns the resulting string.
/// </summary>
/// <returns>The decoded text.</returns>
public string Utf8ToString()
{
    // Scratch buffer sized to the byte count before decoding into it.
    CharsRef decoded = new CharsRef(Length);
    UnicodeUtil.UTF8toUTF16(Bytes, Offset, Length, decoded);
    return decoded.ToString();
}
/// <summary>
/// Checks that the indexer returns the character at a valid index and that an index
/// below zero or at the length raises an exception accepted by
/// <c>IsIndexOutOfBoundsException()</c>.
/// </summary>
public virtual void TestCharSequenceIndexer()
{
    CharsRef abc = new CharsRef("abc");

    Assert.AreEqual('b', abc[1]);

    // Index below the start of the sequence.
    try
    {
        var ignored = abc[-1];
        Assert.Fail();
    }
    catch (Exception e) when (e.IsIndexOutOfBoundsException())
    {
        // expected exception
    }

    // Index equal to the length of the sequence.
    try
    {
        var ignored = abc[3];
        Assert.Fail();
    }
    catch (Exception e) when (e.IsIndexOutOfBoundsException())
    {
        // expected exception
    }
}
/// <summary>
/// Checks that the indexer returns the character at a valid index and throws
/// <see cref="ArgumentOutOfRangeException"/> for an index below zero or at the length.
/// </summary>
public virtual void TestCharSequenceIndexer()
{
    CharsRef abc = new CharsRef("abc");

    Assert.AreEqual('b', abc[1]);

    // Index below the start of the sequence.
    try
    {
        var ignored = abc[-1];
        Assert.Fail();
    }
    catch (ArgumentOutOfRangeException)
    {
        // expected exception
    }

    // Index equal to the length of the sequence.
    try
    {
        var ignored = abc[3];
        Assert.Fail();
    }
    catch (ArgumentOutOfRangeException)
    {
        // expected exception
    }
}
/// <summary>
/// Appends a one-character slice to a <c>CharsRef</c> that views "bcd" inside a larger
/// array and checks that the resulting text is "bcde".
/// </summary>
public virtual void TestAppendChars()
{
    char[] backing = { 'a', 'b', 'c', 'd' };
    char[] suffix = { 'e' };

    // View over backing[1..3], i.e. "bcd".
    CharsRef target = new CharsRef(backing, 1, 3);
    target.Append(suffix, 0, 1);

    Assert.AreEqual("bcde", target.ToString());
}
/// <summary>
/// Copies the content of one <c>CharsRef</c> ("bcde") into another that initially views
/// "bcd" and checks that the target then reads "bcde".
/// </summary>
public virtual void TestCopyCharsRef()
{
    char[] backing = { 'a', 'b', 'c', 'd' };
    // View over backing[1..3], i.e. "bcd".
    CharsRef target = new CharsRef(backing, 1, 3);

    char[] replacement = { 'b', 'c', 'd', 'e' };
    CharsRef source = new CharsRef(replacement, 0, 4);
    target.CopyChars(source);

    Assert.AreEqual("bcde", target.ToString());
}
/// <summary>
/// Interprets the given byte array as UTF-8 and converts it to UTF-16, writing the result into
/// <paramref name="chars"/>. The <see cref="CharsRef"/> buffer is passed through
/// <c>ArrayUtil.Grow</c> with <paramref name="length"/> so that it can hold the worst case of
/// every input byte becoming one UTF-16 code unit.
/// <para/>
/// On return <c>chars.Offset</c> is 0 and <c>chars.Length</c> is the number of UTF-16 code units written.
/// <para/>
/// NOTE: Full characters are read, even if this reads past the length passed (and
/// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <param name="utf8">Source bytes, expected to be valid UTF-8.</param>
/// <param name="offset">Index of the first byte to decode.</param>
/// <param name="length">Number of bytes to decode, starting at <paramref name="offset"/>.</param>
/// <param name="chars">Destination; its <c>Offset</c>, <c>Chars</c> and <c>Length</c> are overwritten.</param>
// TODO: broken if chars.offset != 0
// NOTE(review): the incoming chars.Offset is unconditionally reset to 0 below, so any
// caller-supplied offset is discarded rather than honored.
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
{
    // Output always starts at index 0 of the destination buffer.
    int out_offset = chars.Offset = 0;
    // presumably ArrayUtil.Grow returns an array of at least `length` elements — confirm against ArrayUtil
    char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
    int limit = offset + length;
    while (offset < limit)
    {
        // Lead byte, widened to an unsigned int value.
        int b = utf8[offset++] & 0xff;
        if (b < 0xc0)
        {
            // Single-byte sequence; 0x80..0xBF here would be a stray continuation byte.
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(b < 0x80);
            }
            @out[out_offset++] = (char)b;
        }
        else if (b < 0xe0)
        {
            // Two-byte sequence: 5 payload bits from the lead byte, 6 from the continuation byte.
            @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
        }
        else if (b < 0xf0)
        {
            // Three-byte sequence: 4 + 6 + 6 payload bits.
            @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
            offset += 2;
        }
        else
        {
            // Four-byte sequence: 3 + 6 + 6 + 6 payload bits.
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
            }
            int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
            offset += 3;
            // NOTE(review): UNI_MAX_BMP is declared elsewhere; presumably 0xFFFF — confirm.
            if (ch < UNI_MAX_BMP)
            {
                // Value fits in one UTF-16 code unit.
                @out[out_offset++] = (char)ch;
            }
            else
            {
                // Split into a surrogate pair: high unit based at 0xD800, low unit based at 0xDC00.
                int chHalf = ch - 0x0010000;
                @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
                @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
            }
        }
    }
    // chars.Offset is 0 here, so this is simply the number of code units written.
    chars.Length = out_offset - chars.Offset;
}
/// <summary>
/// Repeatedly copies a random tail slice of a random character array into a fresh
/// <c>CharsRef</c> via <c>CopyChars</c> and checks that its text equals the same slice
/// built as a <see cref="string"/>.
/// </summary>
public virtual void TestCopy()
{
    int iterations = AtLeast(10);
    for (int iter = 0; iter < iterations; iter++)
    {
        CharsRef target = new CharsRef();
        char[] source = TestUtil.RandomRealisticUnicodeString(Random, 1, 100).ToCharArray();

        // The slice runs from a random start to the end of the array.
        int start = Random.Next(source.Length);
        int count = source.Length - start;
        string expected = new string(source, start, count);

        target.CopyChars(source, start, count);

        Assert.AreEqual(expected, target.ToString());
    }
}
/// <summary>
/// Round-trips random strings through <c>BytesRef</c> (UTF-8) and
/// <c>UnicodeUtil.UTF8toUTF16</c> into a <c>CharsRef</c> created over a random
/// array slice, and checks that the decoded text equals the original string.
/// </summary>
public virtual void TestUTF8UTF16CharsRef()
{
    int num = AtLeast(3989);
    for (int i = 0; i < num; i++)
    {
        string unicode = TestUtil.RandomRealisticUnicodeString(Random);
        BytesRef @ref = new BytesRef(unicode);

        // Destination starts as a random (offset, length) view over a randomly sized array.
        char[] arr = new char[1 + Random.Next(100)];
        int offset = Random.Next(arr.Length);
        int len = Random.Next(arr.Length - offset);
        CharsRef cRef = new CharsRef(arr, offset, len);

        UnicodeUtil.UTF8toUTF16(@ref, cRef);

        // NUnit takes (expected, actual): the original string is the expected value,
        // so a failure message reports the decoded text as the actual one.
        Assert.AreEqual(unicode, cRef.ToString());
    }
}
/// <summary>
/// Decodes the UTF-8 content of <paramref name="ref"/> and returns it wrapped in one of
/// several <c>ICharSequence</c> implementations, picked by a single draw from
/// <paramref name="random"/>: a <c>CharsRef</c> (draw 4), a <c>CharBuffer</c> (draw 3),
/// or a <c>StringCharSequence</c> (any other draw).
/// </summary>
/// <param name="ref">UTF-8 bytes to decode.</param>
/// <param name="random">Source of the draw that selects the wrapper type.</param>
/// <returns>The decoded text as an <c>ICharSequence</c>.</returns>
public static ICharSequence BytesToCharSequence(BytesRef @ref, Random random)
{
    int pick = random.Next(5);

    if (pick == 4)
    {
        CharsRef decoded = new CharsRef(@ref.Length);
        UnicodeUtil.UTF8toUTF16(@ref.Bytes, @ref.Offset, @ref.Length, decoded);
        return decoded;
    }

    if (pick == 3)
    {
        return CharBuffer.Wrap(@ref.Utf8ToString());
    }

    return new StringCharSequence(@ref.Utf8ToString());
}
/// <summary>
/// Appends the same random character slices to a <c>CharsRef</c> and to a
/// <see cref="StringBuilder"/>, then checks that both hold the same text.
/// </summary>
public virtual void TestAppend()
{
    CharsRef actual = new CharsRef();
    StringBuilder expected = new StringBuilder();

    int rounds = AtLeast(10);
    for (int round = 0; round < rounds; round++)
    {
        char[] source = TestUtil.RandomRealisticUnicodeString(Random, 1, 100).ToCharArray();

        // The slice runs from a random start to the end of the array.
        int start = Random.Next(source.Length);
        int count = source.Length - start;

        expected.Append(source, start, count);
        actual.Append(source, start, count);
    }

    Assert.AreEqual(expected.ToString(), actual.ToString());
}
/// <summary>
/// Checks that a <c>CharsRef</c> passed through the <c>Clone</c> helper keeps its length,
/// offset and character content, while the copy uses a different array instance
/// from the original.
/// </summary>
public void TestSerialization()
{
    var backing = "The quick brown fox jumped over the lazy dog.".ToCharArray();
    var original = new CharsRef(backing, 8, 10);

    // The original is a view over the very same array.
    Assert.AreEqual(10, original.Length);
    Assert.AreSame(backing, original.Chars);
    Assert.AreEqual(backing, original.Chars);
    Assert.AreEqual(8, original.Offset);

    var copy = Clone(original);

    // The copy has equal content but must not share the array instance.
    Assert.AreEqual(10, copy.Length);
    Assert.AreNotSame(backing, copy.Chars);
    Assert.AreEqual(backing, copy.Chars);
    Assert.AreEqual(8, copy.Offset);
}
/// <summary>
/// For each code point from 0 up to (but not including) 0x10FFFF, skipping the
/// surrogate range 0xD800..0xDFFF, encodes it with <c>UnicodeUtil.UTF16toUTF8</c>,
/// decodes it back with <c>UnicodeUtil.UTF8toUTF16</c>, and compares both directions
/// against <see cref="Encoding.UTF8"/> as the reference implementation.
/// </summary>
public virtual void TestAllUnicodeChars()
{
    BytesRef utf8 = new BytesRef(10);
    CharsRef utf16 = new CharsRef(10);
    char[] chars = new char[2];
    for (int ch = 0; ch < 0x0010FFFF; ch++)
    {
        if (ch == 0xd800) // Skip invalid code points
        {
            ch = 0xe000;
        }

        // Build the UTF-16 form: one code unit for the BMP, a surrogate pair above it.
        int len = 0;
        if (ch <= 0xffff)
        {
            chars[len++] = (char)ch;
        }
        else
        {
            chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
            chars[len++] = (char)(((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
        }

        UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

        // Encoded bytes must decode (via the framework) back to the original text.
        string s1 = new string(chars, 0, len);
        string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length);
        Assert.AreEqual(s1, s2, "codepoint " + ch);

        // The decoder under test must also reproduce the original text.
        UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
        Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch);

        // Byte-for-byte comparison against the framework encoder. NUnit takes
        // (expected, actual): the framework output is the reference, so it goes first.
        var b = s1.GetBytes(Encoding.UTF8);
        Assert.AreEqual(b.Length, utf8.Length);
        for (int j = 0; j < utf8.Length; j++)
        {
            Assert.AreEqual(b[j], utf8.Bytes[j]);
        }
    }
}
/// <summary>
/// Builds the same random strings as <c>BytesRef</c> and as <c>CharsRef</c>, sorts the
/// former with their default ordering and the latter with
/// <c>CharsRef.UTF16SortedAsUTF8Comparer</c>, and checks that both arrays end up in the
/// same order.
/// </summary>
public virtual void TestUTF16InUTF8Order()
{
    int count = AtLeast(1000);
    BytesRef[] asUtf8 = new BytesRef[count];
    CharsRef[] asUtf16 = new CharsRef[count];

    for (int idx = 0; idx < count; idx++)
    {
        string text = TestUtil.RandomUnicodeString(Random());
        asUtf8[idx] = new BytesRef(text);
        asUtf16[idx] = new CharsRef(text);
    }

    Array.Sort(asUtf8);
    Array.Sort(asUtf16, CharsRef.UTF16SortedAsUTF8Comparer);

    // Position by position, both sorted arrays must hold the same text.
    for (int idx = 0; idx < count; idx++)
    {
        Assert.AreEqual(asUtf8[idx].Utf8ToString(), asUtf16[idx].ToString());
    }
}
/// <summary>
/// Encodes all characters of <paramref name="text"/> as UTF-8 into this instance by
/// delegating to <c>UnicodeUtil.UTF16toUTF8</c>.
/// </summary>
/// <param name="text"> Must be well-formed unicode text, with no
/// unpaired surrogates or invalid UTF16 code units. </param>
public void CopyChars(CharsRef text)
{
    // TODO broken if offset != 0
    Debug.Assert(Offset == 0);

    int charCount = text.Length;
    UnicodeUtil.UTF16toUTF8(text, 0, charCount, this);
}
/// <summary>
/// Creates an instance via the parameterless constructor and then fills it with the
/// UTF-8 encoding of the provided <see cref="CharsRef"/>.
/// </summary>
/// <param name="text"> This must be well-formed
/// unicode text, with no unpaired surrogates. </param>
public BytesRef(CharsRef text)
    : this()
{
    this.CopyChars(text);
}
/// <summary>
/// Convenience overload of <see cref="UTF8toUTF16(byte[], int, int, CharsRef)"/> that takes
/// the byte array, start offset and length from <paramref name="bytesRef"/>.
/// </summary>
/// <param name="bytesRef">Supplies the UTF-8 bytes, offset and length to decode.</param>
/// <param name="chars">Destination that receives the decoded UTF-16 text.</param>
/// <seealso cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{
    var utf8 = bytesRef.Bytes;
    var start = bytesRef.Offset;
    var count = bytesRef.Length;
    UTF8toUTF16(utf8, start, count, chars);
}
/// <summary>
/// Encodes characters read from a <see cref="CharsRef"/> source via <c>CharAt</c>, starting at
/// <paramref name="offset"/> for <paramref name="length"/> chars, as UTF-8 into
/// <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0 and
/// <c>result.Length</c> is the number of bytes written.
/// <para/>
/// An unpaired high surrogate or an out-of-order low surrogate is replaced by the
/// three bytes 0xEF 0xBF 0xBD.
/// </summary>
/// <param name="source">Character source; read one char at a time through <c>CharAt</c>.</param>
/// <param name="offset">Index of the first char to encode, as passed to <c>CharAt</c>.</param>
/// <param name="length">Number of chars to encode.</param>
/// <param name="result">Destination; its <c>Bytes</c> may be replaced by a larger array,
/// and its <c>Offset</c> and <c>Length</c> are overwritten.</param>
// TODO: broken if incoming result.offset != 0
// NOTE(review): whether CharAt indexes relative to source.Offset is not visible here — confirm.
public static void UTF16toUTF8(CharsRef source, int offset, int length, BytesRef result)
{
    int upto = 0; // next write position in the output buffer
    int i = offset; // next read position in the source
    int end = offset + length;
    var @out = result.Bytes;
    // Pre-allocate for worst case 4-for-1
    int maxLen = length * 4;
    if (@out.Length < maxLen)
    {
        // Existing buffer is too small: replace it with a fresh array (old content is not copied).
        @out = result.Bytes = new byte[maxLen];
    }
    result.Offset = 0;
    while (i < end)
    {
        int code = (int)source.CharAt(i++);
        if (code < 0x80)
        {
            // One byte.
            @out[upto++] = (byte)code;
        }
        else if (code < 0x800)
        {
            // Two bytes: lead byte 0xC0 | top bits, continuation 0x80 | low 6 bits.
            @out[upto++] = (byte)(0xC0 | (code >> 6));
            @out[upto++] = (byte)(0x80 | (code & 0x3F));
        }
        else if (code < 0xD800 || code > 0xDFFF)
        {
            // Three bytes: any remaining char outside the surrogate range.
            @out[upto++] = (byte)(0xE0 | (code >> 12));
            @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
            @out[upto++] = (byte)(0x80 | (code & 0x3F));
        }
        else
        {
            // surrogate pair
            // confirm valid high surrogate
            if (code < 0xDC00 && i < end)
            {
                // Peek at the following char without consuming it yet.
                var utf32 = (int)source.CharAt(i);
                // confirm valid low surrogate and write pair
                if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
                {
                    // NOTE(review): SURROGATE_OFFSET is declared elsewhere; presumably the constant
                    // that turns (high << 10) + low into the code point — confirm.
                    utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
                    i++; // consume the low surrogate
                    @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
                    @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
                    @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
                    @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
                    // Skip the replacement-bytes fallthrough below.
                    continue;
                }
            }
            // replace unpaired surrogate or out-of-order low surrogate
            // with substitution character
            @out[upto++] = unchecked ((byte)0xEF);
            @out[upto++] = unchecked ((byte)0xBF);
            @out[upto++] = unchecked ((byte)0xBD);
        }
    }
    //assert matches(source, offset, length, out, upto);
    result.Length = upto;
}
/// <summary>
/// Fills a fixed-size buffer with random UTF-16 content via <c>FillUnicode</c>, encodes it
/// with <c>UnicodeUtil.UTF16toUTF8</c> and decodes it back with
/// <c>UnicodeUtil.UTF8toUTF16</c>. When <c>FillUnicode</c> reports no illegal content, the
/// encoded bytes are compared against the framework UTF-8 encoder; the decoded chars are
/// always compared against the <c>expected</c> array that <c>FillUnicode</c> populated.
/// </summary>
public virtual void TestRandomUnicodeStrings()
{
    // Number of UTF-16 code units generated and checked per iteration.
    const int BufferLength = 20;

    char[] buffer = new char[BufferLength];
    char[] expected = new char[BufferLength];
    BytesRef utf8 = new BytesRef(BufferLength);
    CharsRef utf16 = new CharsRef(BufferLength);

    int num = AtLeast(100000);
    for (int iter = 0; iter < num; iter++)
    {
        bool hasIllegal = FillUnicode(buffer, expected, 0, BufferLength);

        UnicodeUtil.UTF16toUTF8(buffer, 0, BufferLength, utf8);
        if (!hasIllegal)
        {
            // Only compared against the framework encoder when the input has no illegal content.
            var b = (new string(buffer, 0, BufferLength)).GetBytes(IOUtils.CHARSET_UTF_8);
            Assert.AreEqual(b.Length, utf8.Length);
            for (int i = 0; i < b.Length; i++)
            {
                Assert.AreEqual(b[i], utf8.Bytes[i]);
            }
        }

        UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);

        // NUnit takes (expected, actual): the known buffer length is the expected value.
        Assert.AreEqual(BufferLength, utf16.Length);
        for (int i = 0; i < BufferLength; i++)
        {
            Assert.AreEqual(expected[i], utf16.Chars[i]);
        }
    }
}