Esempio n. 1
0
        /// <summary>
        /// Checks that <c>CharAt</c> returns the character at a valid index and throws
        /// <see cref="System.IndexOutOfRangeException"/> for the indexes immediately
        /// before the start and immediately past the end of the sequence.
        /// </summary>
        public virtual void TestCharSequenceCharAt()
        {
            CharsRef c = new CharsRef("abc");

            Assert.AreEqual('b', c.CharAt(1));

            try
            {
                c.CharAt(-1); // one position before the first character
                Assert.Fail("CharAt(-1) did not throw");
            }
            catch (System.IndexOutOfRangeException)
            {
                // expected exception
            }

            try
            {
                c.CharAt(3); // one position past the last character of "abc"
                Assert.Fail("CharAt(3) did not throw");
            }
            catch (System.IndexOutOfRangeException)
            {
                // expected exception
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Decodes this instance's bytes (starting at <c>Offset</c>, spanning <c>Length</c> bytes)
        /// as UTF-8 and returns the decoded text.
        /// </summary>
        /// <returns> The string produced by the conversion. </returns>
        public string Utf8ToString()
        {
            // Scratch target for the UTF-8 -> UTF-16 conversion.
            var decoded = new CharsRef(Length);
            UnicodeUtil.UTF8toUTF16(Bytes, Offset, Length, decoded);

            string text = decoded.ToString();
            return text;
        }
Esempio n. 3
0
        /// <summary>
        /// Exercises the indexer of <c>CharsRef</c>: a valid index yields the character,
        /// and indexes just outside either end must raise an exception recognized by
        /// <c>IsIndexOutOfBoundsException()</c>.
        /// </summary>
        public virtual void TestCharSequenceIndexer()
        {
            var sequence = new CharsRef("abc");

            Assert.AreEqual('b', sequence[1]);

            // Index just before the start.
            try
            {
                var _ = sequence[-1];
                Assert.Fail();
            }
            catch (Exception ex) when (ex.IsIndexOutOfBoundsException())
            {
                // expected exception
            }

            // Index just past the end.
            try
            {
                var _ = sequence[3];
                Assert.Fail();
            }
            catch (Exception ex) when (ex.IsIndexOutOfBoundsException())
            {
                // expected exception
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Checks that the <c>CharsRef</c> indexer returns the character at a valid index and
        /// throws <see cref="ArgumentOutOfRangeException"/> for the indexes immediately
        /// before the start and immediately past the end of the sequence.
        /// </summary>
        public virtual void TestCharSequenceIndexer()
        {
            CharsRef c = new CharsRef("abc");

            Assert.AreEqual('b', c[1]);

            try
            {
                var _ = c[-1]; // one position before the first character
                Assert.Fail("c[-1] did not throw");
            }
            // No identifier on the catch: the exception object is never read, so
            // no CS0168 suppression pragma is needed.
            catch (ArgumentOutOfRangeException)
            {
                // expected exception
            }

            try
            {
                var _ = c[3]; // one position past the last character of "abc"
                Assert.Fail("c[3] did not throw");
            }
            catch (ArgumentOutOfRangeException)
            {
                // expected exception
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Appending one character to a <c>CharsRef</c> that views "bcd" inside a larger
        /// array must yield "bcde".
        /// </summary>
        public virtual void TestAppendChars()
        {
            char[] backing = { 'a', 'b', 'c', 'd' };
            var slice = new CharsRef(backing, 1, 3); // views "bcd"

            char[] suffix = { 'e' };
            slice.Append(suffix, 0, 1);

            Assert.AreEqual("bcde", slice.ToString());
        }
Esempio n. 6
0
        /// <summary>
        /// Copying another <c>CharsRef</c> ("bcde") into one that views "bcd" must leave
        /// the target reading "bcde".
        /// </summary>
        public virtual void TestCopyCharsRef()
        {
            char[] backing = { 'a', 'b', 'c', 'd' };
            var target = new CharsRef(backing, 1, 3); // views "bcd"

            char[] replacement = { 'b', 'c', 'd', 'e' };
            var source = new CharsRef(replacement, 0, 4);
            target.CopyChars(source);

            Assert.AreEqual("bcde", target.ToString());
        }
Esempio n. 7
0
        /// <summary>
        /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
        /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
        /// <para/>
        /// NOTE: Full characters are read, even if this reads past the length passed (and
        /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
        /// Explicit checks for valid UTF-8 are not performed.
        /// </summary>
        /// <param name="utf8"> Array holding the UTF-8 encoded input. </param>
        /// <param name="offset"> Index in <paramref name="utf8"/> of the first byte to decode. </param>
        /// <param name="length"> Number of bytes to decode, starting at <paramref name="offset"/>. </param>
        /// <param name="chars"> Receives the result: its <c>Offset</c> is reset to 0, its <c>Chars</c>
        /// array is replaced by the return value of <c>ArrayUtil.Grow</c>, and its <c>Length</c> is set
        /// to the number of UTF-16 code units written. </param>
        // TODO: broken if chars.offset != 0
        public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
        {
            // Output always starts at index 0; any previous offset on chars is discarded.
            int out_offset = chars.Offset = 0;

            // Size the output by the input byte count (presumably Grow guarantees at least
            // `length` elements — confirm against ArrayUtil).
            char[] @out  = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
            int    limit = offset + length;

            while (offset < limit)
            {
                // Lead byte, widened to an unsigned int value.
                int b = utf8[offset++] & 0xff;
                if (b < 0xc0)
                {
                    // One-byte sequence. Values 0x80..0xBF are continuation bytes and are not
                    // valid here; that is only checked when asserts are enabled.
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(b < 0x80);
                    }
                    @out[out_offset++] = (char)b;
                }
                else if (b < 0xe0)
                {
                    // Two-byte sequence: 5 payload bits from the lead byte, 6 from the next byte.
                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
                }
                else if (b < 0xf0)
                {
                    // Three-byte sequence: 4 + 6 + 6 payload bits.
                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
                    offset            += 2;
                }
                else
                {
                    // Four-byte sequence: 3 + 6 + 6 + 6 payload bits.
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
                    }
                    int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
                    offset += 3;
                    if (ch < UNI_MAX_BMP)
                    {
                        // Value fits in a single UTF-16 code unit.
                        @out[out_offset++] = (char)ch;
                    }
                    else
                    {
                        // Split into a surrogate pair: the high half is offset from 0xD800 and the
                        // low half from 0xDC00 (HALF_MASK is defined elsewhere; presumably 0x3FF).
                        int chHalf = ch - 0x0010000;
                        @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
                        @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
                    }
                }
            }
            // Number of code units written (chars.Offset was set to 0 above).
            chars.Length = out_offset - chars.Offset;
        }
Esempio n. 8
0
        /// <summary>
        /// Repeatedly copies a random tail of a random character array into a fresh
        /// <c>CharsRef</c> and checks that its string form equals that tail.
        /// </summary>
        public virtual void TestCopy()
        {
            int iterations = AtLeast(10);

            for (int iter = 0; iter < iterations; iter++)
            {
                var target = new CharsRef();
                char[] source = TestUtil.RandomRealisticUnicodeString(Random, 1, 100).ToCharArray();

                // Take everything from a random start position to the end of the array.
                int start = Random.Next(source.Length);
                int count = source.Length - start;
                string expected = new string(source, start, count);

                target.CopyChars(source, start, count);

                Assert.AreEqual(expected, target.ToString());
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Encodes random strings to UTF-8 via <c>BytesRef</c>, decodes them back into a
        /// <c>CharsRef</c> that starts out with a random array size, offset and length, and
        /// checks that the decoded text equals the original string.
        /// </summary>
        public virtual void TestUTF8UTF16CharsRef()
        {
            int num = AtLeast(3989);

            for (int i = 0; i < num; i++)
            {
                string   unicode = TestUtil.RandomRealisticUnicodeString(Random);
                BytesRef @ref    = new BytesRef(unicode);
                // Start from an arbitrary non-zero-offset view, so the conversion must not
                // depend on the target's initial offset/length.
                char[]   arr     = new char[1 + Random.Next(100)];
                int      offset  = Random.Next(arr.Length);
                int      len     = Random.Next(arr.Length - offset);
                CharsRef cRef    = new CharsRef(arr, offset, len);
                UnicodeUtil.UTF8toUTF16(@ref, cRef);
                // Expected value first, so a failure message labels the two sides correctly.
                Assert.AreEqual(unicode, cRef.ToString());
            }
        }
Esempio n. 10
0
        /// <summary>
        /// Decodes the UTF-8 content of <paramref name="ref"/> and returns it wrapped in one of
        /// several <c>ICharSequence</c> implementations, selected by a single draw from
        /// <paramref name="random"/>.
        /// </summary>
        /// <param name="ref"> UTF-8 bytes to decode. </param>
        /// <param name="random"> Source of the draw that selects the implementation. </param>
        /// <returns> A <c>CharsRef</c> when the draw is 4, a <c>CharBuffer</c> when it is 3,
        /// otherwise a <c>StringCharSequence</c>. </returns>
        public static ICharSequence BytesToCharSequence(BytesRef @ref, Random random)
        {
            int pick = random.Next(5);

            if (pick == 4)
            {
                var decoded = new CharsRef(@ref.Length);
                UnicodeUtil.UTF8toUTF16(@ref.Bytes, @ref.Offset, @ref.Length, decoded);
                return decoded;
            }

            if (pick == 3)
            {
                return CharBuffer.Wrap(@ref.Utf8ToString());
            }

            return new StringCharSequence(@ref.Utf8ToString());
        }
Esempio n. 11
0
        /// <summary>
        /// Appends the same random character ranges to a <c>CharsRef</c> and to a
        /// <see cref="StringBuilder"/>, then checks that both hold the same text.
        /// </summary>
        public virtual void TestAppend()
        {
            var actual = new CharsRef();
            var expected = new StringBuilder();
            int rounds = AtLeast(10);

            for (int round = 0; round < rounds; round++)
            {
                char[] source = TestUtil.RandomRealisticUnicodeString(Random, 1, 100).ToCharArray();

                // Append everything from a random start position to the end of the array.
                int start = Random.Next(source.Length);
                int count = source.Length - start;

                expected.Append(source, start, count);
                actual.Append(source, start, count);
            }

            Assert.AreEqual(expected.ToString(), actual.ToString());
        }
Esempio n. 12
0
        /// <summary>
        /// Checks that a <c>CharsRef</c> passed through <c>Clone</c> keeps its length, offset and
        /// character content while no longer sharing the original backing array.
        /// </summary>
        public void TestSerialization()
        {
            char[] sentence = "The quick brown fox jumped over the lazy dog.".ToCharArray();

            var original = new CharsRef(sentence, 8, 10);

            // The source instance wraps the array it was given.
            Assert.AreEqual(10, original.Length);
            Assert.AreSame(sentence, original.Chars);
            Assert.AreEqual(sentence, original.Chars);
            Assert.AreEqual(8, original.Offset);

            var copy = Clone(original);

            // The copy has equal content but its own array instance.
            Assert.AreEqual(10, copy.Length);
            Assert.AreNotSame(sentence, copy.Chars);
            Assert.AreEqual(sentence, copy.Chars);
            Assert.AreEqual(8, copy.Offset);
        }
        /// <summary>
        /// For each code point in the loop range (surrogates skipped), builds its UTF-16 form,
        /// encodes it with <c>UnicodeUtil.UTF16toUTF8</c>, and cross-checks the result against
        /// <see cref="Encoding.UTF8"/> in both directions, then decodes it back with
        /// <c>UnicodeUtil.UTF8toUTF16</c>.
        /// </summary>
        public virtual void TestAllUnicodeChars()
        {
            BytesRef utf8 = new BytesRef(10);
            CharsRef utf16 = new CharsRef(10);
            char[] chars = new char[2];
            // NOTE(review): the upper bound is exclusive, so U+10FFFF itself is not exercised — confirm this is intended.
            for (int ch = 0; ch < 0x0010FFFF; ch++)
            {
                if (ch == 0xd800)
                // Skip invalid code points
                {
                    ch = 0xe000;
                }

                // Build the UTF-16 form: one code unit for the BMP, a surrogate pair above it.
                int len = 0;
                if (ch <= 0xffff)
                {
                    chars[len++] = (char)ch;
                }
                else
                {
                    chars[len++] = (char)(((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
                    chars[len++] = (char)(((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
                }

                UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

                // The framework decoder must turn our bytes back into the same text.
                string s1 = new string(chars, 0, len);
                string s2 = Encoding.UTF8.GetString(utf8.Bytes, utf8.Offset, utf8.Length);
                Assert.AreEqual(s1, s2, "codepoint " + ch);

                // Our own decoder must round-trip as well.
                UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
                Assert.AreEqual(s1, new string(utf16.Chars, 0, utf16.Length), "codepoint " + ch);

                // The framework encoder is the reference: it is passed as the expected value and
                // our output as the actual value.
                var b = s1.GetBytes(Encoding.UTF8);
                Assert.AreEqual(b.Length, utf8.Length);
                for (int j = 0; j < utf8.Length; j++)
                {
                    Assert.AreEqual(b[j], utf8.Bytes[j]);
                }
            }
        }
Esempio n. 14
0
        /// <summary>
        /// Builds the same random strings as <c>BytesRef</c> and as <c>CharsRef</c>, sorts the former
        /// with its default ordering and the latter with <c>CharsRef.UTF16SortedAsUTF8Comparer</c>,
        /// and checks that both arrays end up in the same order.
        /// </summary>
        public virtual void TestUTF16InUTF8Order()
        {
            int count = AtLeast(1000);

            var asUtf8 = new BytesRef[count];
            var asUtf16 = new CharsRef[count];

            for (int n = 0; n < count; n++)
            {
                string text = TestUtil.RandomUnicodeString(Random());
                asUtf8[n] = new BytesRef(text);
                asUtf16[n] = new CharsRef(text);
            }

            Array.Sort(asUtf8);
            Array.Sort(asUtf16, CharsRef.UTF16SortedAsUTF8Comparer);

            // After sorting, position n must hold the same text in both arrays.
            for (int n = 0; n < count; n++)
            {
                string fromUtf8 = asUtf8[n].Utf8ToString();
                string fromUtf16 = asUtf16[n].ToString();
                Assert.AreEqual(fromUtf8, fromUtf16);
            }
        }
Esempio n. 15
0
 /// <summary>
 /// Encodes the characters of <paramref name="text"/> as UTF-8 into this instance.
 /// </summary>
 /// <param name="text"> Must be well-formed unicode text, with no
 /// unpaired surrogates or invalid UTF16 code units. </param>
 public void CopyChars(CharsRef text)
 {
     // TODO broken if offset != 0
     Debug.Assert(Offset == 0);

     int charCount = text.Length;
     UnicodeUtil.UTF16toUTF8(text, 0, charCount, this);
 }
Esempio n. 16
0
 /// <summary>
 /// Creates an instance whose bytes are the UTF-8 encoding of the
 /// characters in <paramref name="text"/>.
 /// </summary>
 /// <param name="text"> this must be well-formed
 /// unicode text, with no unpaired surrogates. </param>
 public BytesRef(CharsRef text)
     : this()
 {
     // The parameterless constructor runs first; then the text is encoded in.
     this.CopyChars(text);
 }
Esempio n. 17
0
 /// <summary>
 /// Convenience overload that forwards the bytes, offset and length of
 /// <paramref name="bytesRef"/> to <see cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>.
 /// </summary>
 /// <param name="bytesRef"> UTF-8 encoded input. </param>
 /// <param name="chars"> Receives the UTF-16 result. </param>
 /// <seealso cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>
 public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
 {
     var utf8 = bytesRef.Bytes;
     var start = bytesRef.Offset;
     var count = bytesRef.Length;

     UTF8toUTF16(utf8, start, count, chars);
 }
Esempio n. 18
0
        /// <summary>
        /// Encodes <paramref name="length"/> UTF-16 code units of <paramref name="source"/>, read via
        /// <c>source.CharAt</c> starting at index <paramref name="offset"/>, as UTF-8 into
        /// <paramref name="result"/>. After encoding, result.Offset will always be 0.
        /// <para/>
        /// An unpaired high surrogate or an out-of-order low surrogate is replaced by the
        /// bytes EF BF BD (the UTF-8 encoding of U+FFFD).
        /// </summary>
        /// <param name="source"> Character sequence to encode. </param>
        /// <param name="offset"> Index passed to <c>source.CharAt</c> for the first code unit. </param>
        /// <param name="length"> Number of code units to encode. </param>
        /// <param name="result"> Receives the output: <c>Bytes</c> is replaced by a new array when the
        /// existing one is shorter than <c>length * 4</c>, <c>Offset</c> is set to 0 and <c>Length</c>
        /// to the number of bytes written. </param>
        // TODO: broken if incoming result.offset != 0
        public static void UTF16toUTF8(CharsRef source, int offset, int length, BytesRef result)
        {
            int upto = 0;
            int i    = offset;
            int end  = offset + length;
            var @out = result.Bytes;
            // Pre-allocate for worst case 4-for-1
            int maxLen = length * 4;

            if (@out.Length < maxLen)
            {
                // A fresh array is allocated; previous contents are not copied over.
                @out = result.Bytes = new byte[maxLen];
            }
            result.Offset = 0;

            while (i < end)
            {
                int code = (int)source.CharAt(i++);

                if (code < 0x80)
                {
                    // One byte: value below 0x80 is stored as-is.
                    @out[upto++] = (byte)code;
                }
                else if (code < 0x800)
                {
                    // Two bytes: 110xxxxx 10xxxxxx.
                    @out[upto++] = (byte)(0xC0 | (code >> 6));
                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
                }
                else if (code < 0xD800 || code > 0xDFFF)
                {
                    // Three bytes: any remaining non-surrogate code unit.
                    @out[upto++] = (byte)(0xE0 | (code >> 12));
                    @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
                }
                else
                {
                    // surrogate pair
                    // confirm valid high surrogate
                    if (code < 0xDC00 && i < end)
                    {
                        // Peek at the next code unit; it is consumed only if it is a low surrogate.
                        var utf32 = (int)source.CharAt(i);
                        // confirm valid low surrogate and write pair
                        if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
                        {
                            // Combine both halves into one code point (SURROGATE_OFFSET is defined
                            // elsewhere; presumably it folds in the surrogate bases and 0x10000 — confirm).
                            utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
                            i++;
                            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
                            @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
                            @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
                            @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
                            @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
                            continue;
                        }
                    }
                    // replace unpaired surrogate or out-of-order low surrogate
                    // with substitution character
                    @out[upto++] = unchecked ((byte)0xEF);
                    @out[upto++] = unchecked ((byte)0xBF);
                    @out[upto++] = unchecked ((byte)0xBD);
                }
            }
            //assert matches(source, offset, length, out, upto);
            result.Length = upto;
        }
        /// <summary>
        /// Fills a 20-character buffer with random UTF-16 content (via <c>FillUnicode</c>), encodes it
        /// with <c>UnicodeUtil.UTF16toUTF8</c> and decodes it back with <c>UnicodeUtil.UTF8toUTF16</c>.
        /// When the buffer contains no illegal sequences, the encoded bytes are also compared with
        /// the framework's UTF-8 encoding of the same text.
        /// </summary>
        public virtual void TestRandomUnicodeStrings()
        {
            char[] buffer = new char[20];
            char[] expected = new char[20];

            BytesRef utf8 = new BytesRef(20);
            CharsRef utf16 = new CharsRef(20);

            int num = AtLeast(100000);
            for (int iter = 0; iter < num; iter++)
            {
                // `expected` receives the text the round trip should produce for `buffer`.
                bool hasIllegal = FillUnicode(buffer, expected, 0, 20);

                UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
                if (!hasIllegal)
                {
                    // Only well-formed input is compared against the framework encoder.
                    var b = (new string(buffer, 0, 20)).GetBytes(IOUtils.CHARSET_UTF_8);
                    Assert.AreEqual(b.Length, utf8.Length);
                    for (int i = 0; i < b.Length; i++)
                    {
                        Assert.AreEqual(b[i], utf8.Bytes[i]);
                    }
                }

                UnicodeUtil.UTF8toUTF16(utf8.Bytes, 0, utf8.Length, utf16);
                // Expected value first, so a failure message labels the two sides correctly.
                Assert.AreEqual(20, utf16.Length);
                for (int i = 0; i < 20; i++)
                {
                    Assert.AreEqual(expected[i], utf16.Chars[i]);
                }
            }
        }