Example #1
0
        /// <summary>
        /// Returns the number of bytes that Compress() would write for the
        /// given source string, without writing anything.
        /// </summary>
        ///
        /// <param name="source">text source string</param>
        /// <returns>the length of the BOCU result, i.e. the sum of LengthOfDiff()
        /// over the same differences that Compress() passes to WriteDiff()</returns>
        /// <seealso cref="M:IBM.ICU.Impl.BOCU.Compress(System.String, System.Byte[], System.Int32)"/>
        public static int GetCompressionLength(String source)
        {
            int prev   = 0;
            int result = 0;
            UCharacterIterator iterator = IBM.ICU.Text.UCharacterIterator.GetInstance(source);
            int codepoint = iterator.NextCodePoint();

            while (codepoint != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE)
            {
                // Derive the base value from the previous code point,
                // exactly as Compress() does.
                if (prev < 0x4e00 || prev >= 0xa000)
                {
                    prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
                }
                else
                {
                    // Unihan U+4e00..U+9fa5:
                    // double-bytes down from the upper end
                    prev = 0x9fff - SLOPE_REACH_POS_2_;
                }

                // Measure the CURRENT code point first and only then advance.
                // Advancing first would skip the first code point and would
                // measure the DONE sentinel on the last iteration, so the
                // result would not match what Compress() writes.
                result   += LengthOfDiff(codepoint - prev);
                prev      = codepoint;
                codepoint = iterator.NextCodePoint();
            }
            return(result);
        }
Example #2
0
        // public constructors --------------------------------------------------

        // public methods -------------------------------------------------------

        /// <summary>
        /// <p>
        /// Walks the code points of <paramref name="source"/> and, for each one,
        /// writes its difference from a base value (derived from the preceding
        /// code point) into <paramref name="buffer"/> via WriteDiff().
        /// </p>
        /// <p>
        /// The buffer size needed can be preflighted with
        /// GetCompressionLength(String).
        /// </p>
        /// </summary>
        ///
        /// <param name="source">text source</param>
        /// <param name="buffer">output buffer</param>
        /// <param name="offset">position in <paramref name="buffer"/> at which writing starts</param>
        /// <returns>end offset where the writing stopped</returns>
        /// <seealso cref="M:IBM.ICU.Impl.BOCU.GetCompressionLength(System.String)"/>
        /// <remarks>
        /// No bounds checking is done here; a buffer that is too small is
        /// presumably reported by the array writes inside WriteDiff()
        /// (the Java original documented ArrayIndexOutOfBoundsException) - TODO confirm.
        /// </remarks>
        public static int Compress(String source, byte[] buffer, int offset)
        {
            UCharacterIterator iterator = IBM.ICU.Text.UCharacterIterator.GetInstance(source);
            int prev = 0;

            for (int codepoint = iterator.NextCodePoint();
                 codepoint != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
                 codepoint = iterator.NextCodePoint())
            {
                int basePoint;
                if (prev >= 0x4e00 && prev < 0xa000)
                {
                    // Previous code point is in the Unihan range U+4e00..U+9fff:
                    // use a fixed base near the upper end of that range.
                    basePoint = 0x9fff - SLOPE_REACH_POS_2_;
                }
                else
                {
                    // Otherwise align the previous code point down to a
                    // 128-code-point boundary and shift by the negative reach.
                    basePoint = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
                }

                offset = WriteDiff(codepoint - basePoint, buffer, offset);
                prev   = codepoint;
            }

            return offset;
        }
Example #3
0
        /// <summary>
        /// Iterates by code point over a char array made only of surrogate code
        /// units, backwards from index 1 and forwards from index 5, and checks
        /// that each loop terminates and yields only the expected lone surrogate.
        /// </summary>
        public void TestJitterbug1952()
        {
            char[] surrogates = { '\uDC00', '\uD800', '\uDC01', '\uD802', '\uDC02', '\uDC03' };
            UCharacterIterator iter = UCharacterIterator.GetInstance(surrogates);

            // Backwards from index 1: only the unpaired trail surrogate at
            // index 0 is in front of the iterator. If this loop never reaches
            // DONE the iterator is broken.
            iter.Index = 1;
            for (int cp = iter.PreviousCodePoint();
                 cp != UCharacterIterator.DONE;
                 cp = iter.PreviousCodePoint())
            {
                if (cp != 0xDc00)
                {
                    Errln("iter.PreviousCodePoint() failed");
                }
            }

            // Forwards from index 5: only the last code unit remains.
            iter.Index = 5;
            for (int cp = iter.NextCodePoint();
                 cp != UCharacterIterator.DONE;
                 cp = iter.NextCodePoint())
            {
                if (cp != 0xDC03)
                {
                    Errln("iter.NextCodePoint() failed");
                }
            }
        }
Example #4
0
        /// <summary>
        /// Maps <paramref name="src"/> with Map(), then scans the mapped text
        /// for prohibited code points and checks its bidirectional properties.
        /// </summary>
        /// <param name="src">the input text</param>
        /// <param name="options">options forwarded unchanged to Map()</param>
        /// <returns>a new StringBuffer holding the mapped text</returns>
        /// <exception cref="StringPrepParseException">
        /// thrown when a prohibited code point other than U+0020 is found, when
        /// the text mixes left-to-right and right-to-left code points, or when
        /// right-to-left text does not both start and end with a right-to-left
        /// code point
        /// </exception>
        public StringBuffer Prepare(String src, StringPrepOptions options)
        {
            String             mapOut = Map(src, options);
            UCharacterIterator iter   = UCharacterIterator.GetInstance(mapOut);

            // CharDirectionCount serves as the "nothing seen yet" marker.
            UCharacterDirection direction    = UCharacterDirectionExtensions.CharDirectionCount;
            UCharacterDirection firstCharDir = UCharacterDirectionExtensions.CharDirectionCount;
            int  rtlPos      = -1;
            int  ltrPos      = -1;
            bool rightToLeft = false;
            bool leftToRight = false;

            for (int ch = iter.NextCodePoint(); ch != UCharacterIterator.Done; ch = iter.NextCodePoint())
            {
                if (transform.prohibitedSet.Contains(ch) && ch != 0x0020)
                {
                    throw new StringPrepParseException("A prohibited code point was found in the input",
                                                       StringPrepErrorType.ProhibitedError,
                                                       iter.GetText(), iter.Index);
                }

                // After the loop, 'direction' holds the direction of the last code point.
                direction = UChar.GetDirection(ch);
                if (firstCharDir == UCharacterDirectionExtensions.CharDirectionCount)
                {
                    firstCharDir = direction;
                }

                if (direction == UCharacterDirection.LeftToRight)
                {
                    ltrPos      = iter.Index - 1;
                    leftToRight = true;
                }

                if (direction == UCharacterDirection.RightToLeft || direction == UCharacterDirection.RightToLeftArabic)
                {
                    rtlPos      = iter.Index - 1;
                    rightToLeft = true;
                }
            }

            // satisfy 2: left-to-right and right-to-left code points must not be mixed
            if (leftToRight && rightToLeft)
            {
                throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
                                                   StringPrepErrorType.CheckBiDiError, iter.GetText(), (rtlPos > ltrPos) ? rtlPos : ltrPos);
            }

            // satisfy 3: right-to-left text must begin AND end with a right-to-left code point
            bool startsRightToLeft = firstCharDir == UCharacterDirection.RightToLeft ||
                                     firstCharDir == UCharacterDirection.RightToLeftArabic;
            bool endsRightToLeft   = direction == UCharacterDirection.RightToLeft ||
                                     direction == UCharacterDirection.RightToLeftArabic;
            if (rightToLeft && (!startsRightToLeft || !endsRightToLeft))
            {
                throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
                                                   StringPrepErrorType.CheckBiDiError, iter.GetText(), (rtlPos > ltrPos) ? rtlPos : ltrPos);
            }

            return new StringBuffer(mapOut);
        }
Example #5
0
        /// <summary>
        /// Runs <paramref name="src"/> through mapTransform and, unless the
        /// ALLOW_UNASSIGNED option is set, rejects the result if it contains a
        /// code point from transform.unassignedSet.
        /// </summary>
        /// <param name="src">the text to map</param>
        /// <param name="options">option flags; only ALLOW_UNASSIGNED is inspected here</param>
        /// <returns>the transliterated text</returns>
        /// <exception cref="StringPrepParseException">
        /// thrown when an unassigned code point is found and unassigned code
        /// points are not allowed
        /// </exception>
        private String Map(String src, StringPrepOptions options)
        {
            bool   allowUnassigned = ((options & ALLOW_UNASSIGNED) > 0);
            String caseMapOut      = mapTransform.Transliterate(src);

            UCharacterIterator iter = UCharacterIterator.GetInstance(caseMapOut);
            for (int ch = iter.NextCodePoint(); ch != UCharacterIterator.Done; ch = iter.NextCodePoint())
            {
                if (!transform.unassignedSet.Contains(ch))
                {
                    continue;
                }
                if (allowUnassigned)
                {
                    continue;
                }
                throw new StringPrepParseException("An unassigned code point was found in the input",
                                                   StringPrepErrorType.UnassignedError);
            }

            return caseMapOut;
        }
Example #6
0
        /// <summary>
        /// Convenience overload: splits <paramref name="input"/> into code points,
        /// calls the array-based Encode() overload, and retries with a four times
        /// larger output array for as long as it reports punycode_big_output.
        /// </summary>
        /// <param name="input">the text to encode</param>
        /// <param name="case_flags">passed through unchanged to the array-based overload</param>
        /// <returns>
        /// a new StringBuffer holding the first outLen[0] chars of the output on
        /// success; on any other status GetException() is called first
        /// (presumably it throws - TODO confirm) and an empty buffer is returned
        /// </returns>
        public static StringBuffer Encode(StringBuffer input, char[] case_flags)
        {
            // One slot per UTF-16 code unit is always enough for the code points.
            int[] codePoints     = new int[input.Length];
            int   codePointCount = 0;
            StringBuffer       result = new StringBuffer();
            UCharacterIterator iter   = UCharacterIterator.GetInstance(input);

            for (int cp = iter.NextCodePoint(); cp != UCharacterIterator.Done; cp = iter.NextCodePoint())
            {
                codePoints[codePointCount++] = cp;
            }

            // outLen is a one-element array so the callee can hand back a length.
            int[]  outLen = { input.Length * 4 };
            char[] output = new char[outLen[0]];

            int rc;
            while ((rc = Encode(codePointCount, codePoints, case_flags, outLen, output)) == punycode_big_output)
            {
                // Output did not fit: quadruple the capacity and try again.
                outLen[0] *= 4;
                output     = new char[outLen[0]];
            }

            if (rc != punycode_success)
            {
                GetException(rc);
                return result;
            }

            return result.Append(output, 0, outLen[0]);
        }
Example #7
0
        /// <summary>
        /// Checks basic positioning on ITERATION_STRING_ and then verifies that
        /// code-unit iteration (Next()/Previous()) and code-point iteration
        /// (NextCodePoint()/PreviousCodePoint()) agree, forwards and backwards.
        /// </summary>
        public void TestIteration()
        {
            UCharacterIterator iterator = UCharacterIterator.GetInstance(
                ITERATION_STRING_);
            UCharacterIterator iterator2 = UCharacterIterator.GetInstance(
                ITERATION_STRING_);

            iterator.SetToStart();
            if (iterator.Current != ITERATION_STRING_[0])
            {
                Errln("Iterator failed retrieving first character");
            }
            iterator.SetToLimit();
            if (iterator.Previous() != ITERATION_STRING_[
                    ITERATION_STRING_.Length - 1])
            {
                Errln("Iterator failed retrieving last character");
            }
            if (iterator.Length != ITERATION_STRING_.Length)
            {
                Errln("Iterator failed determining begin and end index");
            }

            // Forward pass: 'iterator' steps by code unit, 'iterator2' by code point.
            iterator2.Index = 0;
            iterator.Index  = 0;
            int ch = 0;

            while (ch != UCharacterIterator.DONE)
            {
                int index = iterator2.Index;
                ch = iterator2.NextCodePoint();
                if (index != ITERATION_SUPPLEMENTARY_INDEX)
                {
                    if (ch != iterator.Next() &&
                        ch != UCharacterIterator.DONE)
                    {
                        Errln("Error mismatch in next() and nextCodePoint()");
                    }
                }
                else
                {
                    // The supplementary code point spans two code units: lead, then trail.
                    if (UTF16.GetLeadSurrogate(ch) != iterator.Next() ||
                        UTF16.GetTrailSurrogate(ch) != iterator.Next())
                    {
                        Errln("Error mismatch in next and nextCodePoint for " +
                              "supplementary characters");
                    }
                }
            }

            // Backward pass.
            iterator.Index  = ITERATION_STRING_.Length;
            iterator2.Index = ITERATION_STRING_.Length;
            // The forward pass leaves ch == DONE; without this reset the loop
            // below would never execute and the backward direction would go untested.
            ch = 0;
            while (ch != UCharacterIterator.DONE)
            {
                int index = iterator2.Index;
                ch = iterator2.PreviousCodePoint();
                // Going backwards, the supplementary code point is read when the
                // iterator sits just past its two code units (assumes, like the
                // forward pass, that the pair starts at ITERATION_SUPPLEMENTARY_INDEX).
                if (index != ITERATION_SUPPLEMENTARY_INDEX + 2)
                {
                    if (ch != iterator.Previous() &&
                        ch != UCharacterIterator.DONE)
                    {
                        Errln("Error mismatch in previous() and " +
                              "previousCodePoint()");
                    }
                }
                else
                {
                    // Stepping backwards by code unit yields the trail surrogate
                    // first and the lead surrogate second.
                    if (UTF16.GetTrailSurrogate(ch) != iterator.Previous() ||
                        UTF16.GetLeadSurrogate(ch) != iterator.Previous())
                    {
                        Errln("Error mismatch in previous and " +
                              "previousCodePoint for supplementary characters");
                    }
                }
            }
        }
Example #8
0
        /// <summary>
        /// Drives <paramref name="iter"/> and a reference UCharIterator over a
        /// fixed list of expected code points with the same scripted sequence of
        /// previous/current/next moves. Reports the first step at which the
        /// returned code points or the positions disagree, then stops.
        /// </summary>
        /// <param name="iter">the iterator under test; its Index is set to 4 before the moves start</param>
        public void previousNext(UCharacterIterator iter)
        {
            // Code points the reference iterator walks over.
            int[] expect = { 0x2f999, 0x1d15f, 0xc4, 0x1ed0 };

            // Looked up with iter.Index and compared against iter32.Index.
            // Two entries for each of the first two code points, one for each
            // of the last two, plus one for the end position.
            int[] expectIndex = { 0, 0, 1, 1, 2, 3, 4 };

            // Starting positions for the iterator under test and the reference iterator.
            int SRC_MIDDLE    = 4;
            int EXPECT_MIDDLE = 2;

            // Scripted moves: '-' = previous, '0' = current, anything else ('+') = next.
            String moves = "0+0+0--0-0-+++0--+++++++0--------";

            UCharIterator iter32 = new UCharIterator(expect, expect.Length,
                                                     EXPECT_MIDDLE);
            iter.Index = SRC_MIDDLE;

            // 'step' is 1-based so that moves.Substring(0, step) is the history
            // up to and including the current move.
            for (int step = 1; step <= moves.Length; step++)
            {
                int c1;
                int c2;

                switch (moves[step - 1])
                {
                    case '-':
                        c1 = iter.PreviousCodePoint();
                        c2 = iter32.Previous();
                        break;

                    case '0':
                        c1 = iter.CurrentCodePoint;
                        c2 = iter32.Current;
                        break;

                    default:
                        c1 = iter.NextCodePoint();
                        c2 = iter32.Next();
                        break;
                }

                // Code points must agree.
                if (c1 != c2)
                {
                    String history = moves.Substring(0, step);
                    Errln("error: mismatch in Normalizer iteration at " + history + ": "
                          + "got c1= " + Hex(c1) + " != expected c2= " + Hex(c2));
                    break;
                }

                // Positions must agree.
                if (expectIndex[iter.Index] != iter32.Index)
                {
                    String history = moves.Substring(0, step);
                    Errln("error: index mismatch in Normalizer iteration at "
                          + history + " : " + "Normalizer index " + iter.Index
                          + " expected " + expectIndex[iter32.Index]);
                    break;
                }
            }
        }
Example #9
0
        /// <summary>
        /// Exercises code-point-wise navigation of a UCharacterIterator over a
        /// string that mixes BMP characters with surrogate pairs: GetText(),
        /// Index, CurrentCodePoint, MoveCodePointIndex(), Next() and a full
        /// forward walk with NextCodePoint().
        /// </summary>
        public void TestIterationUChar32()
        {
            // UTF-16 layout of the literal: index 0 'a', 1 'b', 2-3 surrogate pair,
            // 4 U+20AC, 5 U+D7FF, 6-7 surrogate pair, 8-9 surrogate pair, 10 'a' (length 11).
            String text = "\u0061\u0062\ud841\udc02\u20ac\ud7ff\ud842\udc06\ud801\udc00\u0061";
            int    c;
            int    i;
            {
                UCharacterIterator iter = UCharacterIterator.GetInstance(text);

                // The iterator must hand back the text it was created from.
                String iterText = iter.GetText();
                if (!iterText.Equals(text))
                {
                    Errln("iter.getText() failed");
                }

                iter.Index = (1);
                if (iter.CurrentCodePoint != UTF16.CharAt(text, 1))
                {
                    Errln("Iterator didn't start out in the right place.");
                }

                // One code point forward from the start is expected to land on index 1.
                iter.SetToStart();
                c = iter.CurrentCodePoint;
                i = 0;
                i = iter.MoveCodePointIndex(1);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, 1) || i != 1)
                {
                    Errln("moveCodePointIndex(1) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, 1)) + " i= " + i);
                }

                // Two more code points ('b' and the pair at 2-3) are expected to land on index 4.
                i = iter.MoveCodePointIndex(2);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, 4) || i != 4)
                {
                    Errln("moveCodePointIndex(2) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, 4)) + " i= " + i);
                }

                // Two code points back again is expected to return to index 1.
                i = iter.MoveCodePointIndex(-2);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, 1) || i != 1)
                {
                    Errln("moveCodePointIndex(-2) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, 1)) + " i= " + i);
                }

                // From the limit, two code points back crosses the final 'a' and the
                // pair at 8-9, i.e. three code units, hence text.Length - 3.
                iter.SetToLimit();
                i = iter.MoveCodePointIndex(-2);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, (text.Length - 3)) || i != (text.Length - 3))
                {
                    Errln("moveCodePointIndex(-2) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, (text.Length - 3))) + " i= " + i);
                }

                iter.SetToStart();
                c = iter.CurrentCodePoint;
                i = 0;

                // Next() from the start must return the first character and move
                // the index past it. (The "first32PostInc" / "nextCodePointPostInc"
                // names in the messages below do not appear as methods in this block.)
                i = 0;
                iter.SetToStart();
                c = iter.Next();
                if (c != UTF16.CharAt(text, i))
                {
                    Errln("first32PostInc failed.  Expected->" + Hex(UTF16.CharAt(text, i)) + " Got-> " + Hex(c));
                }
                if (iter.Index != UTF16.GetCharCount(c) + i)
                {
                    Errln("getIndex() after first32PostInc() failed");
                }

                iter.SetToStart();
                i = 0;
                if (iter.Index != 0)
                {
                    Errln("setToStart failed");
                }

                // Full forward walk by code point. On entry c still holds the value
                // returned by Next() above, so the first iteration does advance.
                // i tracks the expected UTF-16 index in text.
                Logln("Testing forward iteration...");
                do
                {
                    if (c != UCharacterIterator.DONE)
                    {
                        c = iter.NextCodePoint();
                    }

                    if (c != UTF16.CharAt(text, i))
                    {
                        Errln("Character mismatch at position " + i + ", iterator has " + Hex(c) + ", string has " + Hex(UTF16.CharAt(text, i)));
                    }

                    // Advance the expected index by the width of the code point just read.
                    i += UTF16.GetCharCount(c);
                    if (iter.Index != i)
                    {
                        Errln("getIndex() aftr nextCodePointPostInc() isn't working right");
                    }
                    // c now holds the code point at the new position; DONE here ends the walk.
                    c = iter.CurrentCodePoint;
                    if (c != UCharacterIterator.DONE && c != UTF16.CharAt(text, i))
                    {
                        Errln("current() after nextCodePointPostInc() isn't working right");
                    }
                } while (c != UCharacterIterator.DONE);
                // Once the walk has finished, a further NextCodePoint() must keep returning DONE.
                c = iter.NextCodePoint();
                if (c != UCharacterIterator.DONE)
                {
                    Errln("nextCodePointPostInc() didn't return DONE at the beginning");
                }
            }
        }
Example #10
0
        /// <summary>
        /// Is there an exception at this point? Walks backwards from
        /// <paramref name="n"/> through the text while feeding each code point to
        /// backwardsTrie, and remembers the last position at which the trie
        /// reported a value.
        /// </summary>
        /// <param name="n">The location of the possible break.</param>
        /// <returns>
        /// <c>true</c> if the remembered trie value equals
        /// SimpleFilteredSentenceBreakIteratorBuilder.Match, or if it equals
        /// SimpleFilteredSentenceBreakIteratorBuilder.Partial, forwardsPartialTrie
        /// is not null, and running forwards from the remembered position through
        /// forwardsPartialTrie ends in a match; otherwise <c>false</c>.
        /// </returns>
        /// <remarks>
        /// Moves text.Index and resets backwardsTrie (and possibly
        /// forwardsPartialTrie); the text position is not restored before returning.
        /// </remarks>
        private bool BreakExceptionAt(int n)
        {
            // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()

            // Position and trie value of the best backwards match; -1 means none found.
            int bestPosn  = -1;
            int bestValue = -1;

            // loops while 'n' points to an exception
            text.Index = n;
            backwardsTrie.Reset();
            int uch;



            // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
            // If the code point just before n is a space it stays consumed;
            // otherwise the NextCodePoint() call below steps forward again to undo the peek.
            if ((uch = text.PreviousCodePoint()) == ' ')
            { // TODO: skip a class of chars here??
              // TODO only do this the 1st time?
            }
            else
            {
                uch = text.NextCodePoint();
            }

            Result r = Result.IntermediateValue;

            // Because of the short-circuit &&, r is only updated while text remains;
            // if the start of the text is reached, r keeps the result of the last trie step.
            while ((uch = text.PreviousCodePoint()) != UCharacterIterator.Done && // more to consume backwards and..
                   ((r = backwardsTrie.NextForCodePoint(uch)).HasNext()))
            {                                                                     // more in the trie
                if (r.HasValue())
                {                                                                 // remember the best match so far
                    bestPosn  = text.Index;
                    bestValue = backwardsTrie.GetValue();
                }
            }

            // The last trie step may itself be a match; if so it replaces any earlier best match.
            if (r.Matches())
            { // exact match?
                bestValue = backwardsTrie.GetValue();
                bestPosn  = text.Index;
            }

            if (bestPosn >= 0)
            {
                if (bestValue == SimpleFilteredSentenceBreakIteratorBuilder.Match)
                {                 // exact match!
                    return(true); // Exception here.
                }
                else if (bestValue == SimpleFilteredSentenceBreakIteratorBuilder.Partial && forwardsPartialTrie != null)
                {
                    // make sure there's a forward trie
                    // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
                    // to see if it matches something going forward.
                    forwardsPartialTrie.Reset();

                    Result rfwd = Result.IntermediateValue;
                    text.Index = bestPosn; // hope that's close ..
                    // Empty body on purpose: the condition does all the work, advancing
                    // the text and the forwards trie until either one is exhausted.
                    // NOTE(review): the end sentinel here is BreakIterator.Done, while the
                    // backwards loop uses UCharacterIterator.Done - presumably the same value; confirm.
                    while ((uch = text.NextCodePoint()) != BreakIterator.Done &&
                           ((rfwd = forwardsPartialTrie.NextForCodePoint(uch)).HasNext()))
                    {
                    }
                    if (rfwd.Matches())
                    {
                        // Exception here
                        return(true);
                    } // else fall through
                }     // else fall through
            }         // else fall through
            return(false); // No exception here.
        }