コード例 #1
0
        /// <summary>
        /// Runs the shared <c>normalizer</c> over <paramref name="str"/> one value at a time,
        /// walking either forward or backward, and collects the output in <paramref name="buf"/>.
        /// </summary>
        /// <param name="str">Iterator over the text handed to the normalizer.</param>
        /// <param name="mode">Normalization mode set on the normalizer.</param>
        /// <param name="buf">Scratch buffer; emptied first, then filled with the result.</param>
        /// <param name="dir">Greater than zero walks forward; any other value walks backward.</param>
        /// <param name="options">Option bits switched on after every option has been cleared.</param>
        /// <returns>The text accumulated in <paramref name="buf"/>.</returns>
        private String iterativeNorm(StringCharacterIterator str, Normalizer.Mode mode,
                                     StringBuffer buf, int dir, int options)
        {
            normalizer.SetText(str);
            normalizer.SetMode(mode);
            buf.Length = 0;

            // Clear every option bit before enabling only the requested ones.
            normalizer.SetOption(-1, false);
            normalizer.SetOption(options, true);

            int value;

            if (dir > 0)
            {
                // Forward walk: each value is appended at the end of the buffer.
                value = normalizer.First();
                while (value != Normalizer.DONE)
                {
                    buf.Append(UTF16.ValueOf(value));
                    value = normalizer.Next();
                }
            }
            else
            {
                // Backward walk: each value is inserted at the front so the text stays in order.
                value = normalizer.Last();
                while (value != Normalizer.DONE)
                {
                    buf.Insert(0, UTF16.ValueOf(value));
                    value = normalizer.Previous();
                }
            }

            return buf.ToString();
        }
コード例 #2
0
        /// <summary>
        /// Drives a <c>CSCharacterIterator</c> and a <c>StringCharacterIterator</c> over the same
        /// text with identical calls and asserts that every result matches.
        /// </summary>
        public void TestAPI()
        {
            const int StepCount = 50;
            String sample = "Hello, World";

            ICharSequence     sequence  = sample.ToCharSequence();
            CharacterIterator candidate = new CSCharacterIterator(sequence);
            CharacterIterator reference = new StringCharacterIterator(sample);

            // Each call mutates iterator state, so the order of these checks matters.
            assertEquals("", reference.SetIndex(6), candidate.SetIndex(6));
            assertEquals("", reference.Index, candidate.Index);
            assertEquals("", reference.Current, candidate.Current);
            assertEquals("", reference.Previous(), candidate.Previous());
            assertEquals("", reference.Next(), candidate.Next());
            assertEquals("", reference.BeginIndex, candidate.BeginIndex);
            assertEquals("", reference.EndIndex, candidate.EndIndex);
            assertEquals("", reference.First(), candidate.First());
            assertEquals("", reference.Last(), candidate.Last());

            candidate.SetIndex(4);
            reference.SetIndex(4);
            CharacterIterator copy = (CharacterIterator)candidate.Clone();

            // 50 steps from index 4 of a 12-character text runs well past the end,
            // and the walk back runs well past the start.
            int remaining = StepCount;
            while (remaining-- > 0)
            {
                assertEquals("", reference.Next(), copy.Next());
            }

            remaining = StepCount;
            while (remaining-- > 0)
            {
                assertEquals("", reference.Previous(), copy.Previous());
            }
        }
コード例 #3
0
        /// <summary>
        /// Tests whether another object is a StringCharacterIterator with the same state. </summary>
        /// <param name="obj"> the object to compare against this iterator. </param>
        /// <returns> true when <paramref name="obj"/> is this very instance, or is a
        /// StringCharacterIterator whose hash code, text, position, begin and end all
        /// match this one; false otherwise. </returns>
        public override bool Equals(Object obj)
        {
            // Same instance: nothing further to compare.
            if (object.ReferenceEquals(this, obj))
            {
                return true;
            }

            StringCharacterIterator other = obj as StringCharacterIterator;
            if (object.ReferenceEquals(other, null))
            {
                return false;
            }

            // Short-circuit evaluation keeps the order of checks: hash, text, then the indices.
            return HashCode() == other.HashCode()
                   && Text_Renamed.Equals(other.Text_Renamed)
                   && Pos == other.Pos
                   && Begin == other.Begin
                   && End == other.End;
        }
コード例 #4
0
        /// <summary>
        /// Checks the offsets reported by collation element iterators created from a
        /// <c>StringCharacterIterator</c>, and that a null iterator is rejected.
        /// </summary>
        public void TestGetCollationElementIteratorCharacterIterator()
        {
            // Locale("es", "", "TRADITIONAL") over "cha".
            {
                String spanishText      = "cha";
                int[]  spanishExpected  = { 0, 1, 2, 3 };
                Locale spanishLocale    = new Locale("es", "", "TRADITIONAL");
                RuleBasedCollator spanishCollator =
                    (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(spanishLocale);
                StringCharacterIterator spanishSource = new StringCharacterIterator(spanishText);
                CollationElementIterator spanishElements =
                    spanishCollator.GetCollationElementIterator(spanishSource);

                int spanishOffset = spanishElements.GetOffset();
                NUnit.Framework.Assert.AreEqual(spanishExpected[0], spanishOffset);
                for (int step = 1; spanishOffset != spanishText.Length; step++)
                {
                    spanishElements.Next();
                    spanishOffset = spanishElements.GetOffset();
                    NUnit.Framework.Assert.AreEqual(spanishExpected[step], spanishOffset);
                }
            }

            // Locale("de", "DE") over "\u00E6b"; the expected offsets repeat 1.
            {
                String germanText      = "\u00E6b";
                int[]  germanExpected  = { 0, 1, 1, 2 };
                Locale germanLocale    = new Locale("de", "DE");
                RuleBasedCollator germanCollator =
                    (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(germanLocale);
                StringCharacterIterator germanSource = new StringCharacterIterator(germanText);
                CollationElementIterator germanElements =
                    germanCollator.GetCollationElementIterator(germanSource);

                int germanOffset = germanElements.GetOffset();
                NUnit.Framework.Assert.AreEqual(germanExpected[0], germanOffset);
                for (int step = 1; germanOffset != germanText.Length; step++)
                {
                    germanElements.Next();
                    germanOffset = germanElements.GetOffset();
                    NUnit.Framework.Assert.AreEqual(germanExpected[step], germanOffset);
                }
            }

            // Regression for HARMONY-1352
            try
            {
                RuleBasedCollator ruleCollator = new RuleBasedCollator("< a< b< c< d");
                ruleCollator.GetCollationElementIterator((CharacterIterator)null);
                NUnit.Framework.Assert.Fail("NullPointerException expected");
            }
            catch (NullReferenceException)
            {
                // expected
            }
        }
コード例 #5
0
        /// <summary>
        /// Exercises the Normalizer API surface: construction from a CharacterIterator and
        /// from a string, <c>Clone()</c>, begin/end index accessors, option get/set, the
        /// <c>UnicodeVersion</c> property, and the deprecated static helpers.
        /// </summary>
        public void TestNormalizerAPI()
        {
            // instantiate a Normalizer from a CharacterIterator
            string s = Utility.Unescape("a\u0308\uac00\\U0002f800");
            // make s a bit longer and more interesting
            CharacterIterator iter = new StringCharacterIterator(s + s);
            //test deprecated constructors
            Normalizer norm = new Normalizer(iter, NormalizerMode.NFC, 0);

            // Under NFC, "a" + U+0308 is expected to come back as U+00E4.
            if (norm.Next() != 0xe4)
            {
                Errln("error in Normalizer(CharacterIterator).next()");
            }
            Normalizer norm2 = new Normalizer(s, NormalizerMode.NFC, 0);

            if (norm2.Next() != 0xe4)
            {
                // Name the string constructor so a failure here can be told apart
                // from the CharacterIterator case above.
                Errln("error in Normalizer(String).next()");
            }

            // test clone() by comparing the begin/end indexes of the copy and the original
            Normalizer clone = (Normalizer)norm.Clone();

            if (clone.GetBeginIndex() != norm.GetBeginIndex())
            {
                Errln("error in Normalizer.getBeginIndex()");
            }

            if (clone.GetEndIndex() != norm.GetEndIndex())
            {
                Errln("error in Normalizer.getEndIndex()");
            }

            // test setOption() and getOption()
            // 0xaa0000 is switched on, then 0x20000 (one of those bits) is switched off;
            // 0x880000 must still read as set and 0x20000 must read as clear.
            clone.SetOption(0xaa0000, true);
            clone.SetOption(0x20000, false);
            if (clone.GetOption(0x880000) == 0 || clone.GetOption(0x20000) == 1)
            {
                Errln("error in Normalizer::setOption() or Normalizer::getOption()");
            }

            // ICU4N specific - test setting normalizer options via enum
            clone.UnicodeVersion = NormalizerUnicodeVersion.Unicode3_2;
            assertEquals("error in Normalizer.UnicodeVersion property", NormalizerUnicodeVersion.Unicode3_2, clone.UnicodeVersion);
            clone.UnicodeVersion = NormalizerUnicodeVersion.Default;
            assertEquals("error in Normalizer.UnicodeVersion property", NormalizerUnicodeVersion.Default, clone.UnicodeVersion);

            // The calls below only check that the deprecated entry points run; results are ignored.
            //test deprecated normalize method
            Normalizer.Normalize(s, NormalizerMode.NFC, 0);
            //test deprecated compose method
            Normalizer.Compose(s, false, 0);
            //test deprecated decompose method
            Normalizer.Decompose(s, false, 0);
        }
コード例 #6
0
 /// <summary>
 /// Duplicates this iterator by delegating to the base class's <c>Clone</c>. </summary>
 /// <returns> The duplicate, cast to StringCharacterIterator before being returned. </returns>
 public Object Clone()
 {
     try
     {
         return (StringCharacterIterator)base.Clone();
     }
     catch (CloneNotSupportedException cause)
     {
         // Rethrown as InternalError, carrying the original exception along.
         throw new InternalError(cause);
     }
 }
コード例 #7
0
ファイル: Text.cs プロジェクト: orf53975/hadoop.net
        /// <summary>
        /// For the given string, returns the number of UTF-8 bytes
        /// required to encode the string.
        /// </summary>
        /// <remarks>
        /// The string is scanned by index instead of through a CharacterIterator. The
        /// iterator signals end-of-text with a sentinel char value (U+FFFF in
        /// java.text.CharacterIterator), so the previous loop stopped counting at the
        /// first literal U+FFFF in the text. A high surrogate followed by a low surrogate
        /// counts as 4 bytes; an unpaired surrogate counts as 3 bytes.
        /// </remarks>
        /// <param name="string">text to encode</param>
        /// <returns>number of UTF-8 bytes required to encode</returns>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="string"/> is null.
        /// </exception>
        public static int Utf8Length(string @string)
        {
            if (@string == null)
            {
                throw new System.ArgumentNullException("string");
            }

            int size   = 0;
            int length = @string.Length;

            for (int i = 0; i < length; i++)
            {
                char ch = @string[i];

                if (ch < 0x80)
                {
                    // one byte
                    size++;
                }
                else if (ch < 0x800)
                {
                    // two bytes
                    size += 2;
                }
                else if (char.IsHighSurrogate(ch)
                         && i + 1 < length
                         && char.IsLowSurrogate(@string[i + 1]))
                {
                    // valid surrogate pair: one supplementary code point, four bytes;
                    // skip the trail unit so it is not counted again
                    size += 4;
                    i++;
                }
                else
                {
                    // any other char up to U+FFFF, including an unpaired surrogate
                    size += 3;
                }
            }
            return size;
        }
コード例 #8
0
        /// <summary>
        /// Checks that <c>SetText</c> on a break iterator replaces the text and resets the
        /// iteration position to 0, and that the <c>ICharSequence</c> overload yields the
        /// expected line-break positions.
        /// </summary>
        public void TestGetSetText()
        {
            Logln("Testing getText setText ");
            String firstText  = "first string.";
            String secondText = "Second string.";
            RuleBasedBreakIterator wordBreaker =
                (RuleBasedBreakIterator)BreakIterator.GetWordInstance(CultureInfo.CurrentCulture);
            CharacterIterator firstTextIterator = new StringCharacterIterator(firstText);

            wordBreaker.SetText(firstText);
            bool textMatches = wordBreaker.Text.Equals(firstTextIterator);
            if (!textMatches)
            {
                Errln("ERROR:1 error in setText or getText ");
            }
            if (wordBreaker.Current != 0)
            {
                Errln("ERROR:1 setText did not set the iteration position to the beginning of the text, it is"
                      + wordBreaker.Current + "\n");
            }

            // Move away from the start, then confirm a new SetText brings the position back to 0.
            wordBreaker.Next(2);
            wordBreaker.SetText(secondText);
            if (wordBreaker.Current != 0)
            {
                Errln("ERROR:2 setText did not reset the iteration position to the beginning of the text, it is"
                      + wordBreaker.Current + "\n");
            }

            // Test the CharSequence overload of setText() for a simple case.
            BreakIterator lineBreaker = BreakIterator.GetLineInstance(new CultureInfo("en"));
            ICharSequence lineText    = "Hello, World. ".ToCharSequence();
            // Expected Line Brks  ^      ^      ^
            //                     0123456789012345
            List <int> expectedBreaks = new List <int> { 0, 7, 14 };

            lineBreaker.SetText(lineText);
            int position = lineBreaker.First();
            while (position != BreakIterator.Done)
            {
                assertTrue("", expectedBreaks.Contains(position));
                position = lineBreaker.Next();
            }
            assertEquals("", lineText.Length, lineBreaker.Current);
        }
コード例 #9
0
        /// <summary>
        /// Checks that <c>SetText</c> on a collation element iterator, through each of its
        /// three overloads used here, makes it equivalent to an iterator created directly
        /// over the same text.
        /// </summary>
        public void TestSetText(/* char* par */)
        {
            RuleBasedCollator        collator  = (RuleBasedCollator)Collator.GetInstance(new CultureInfo("en-US"));
            CollationElementIterator reference = collator.GetCollationElementIterator(test1);
            CollationElementIterator subject   = collator.GetCollationElementIterator(test2);

            // Run through the second iterator just to exercise it:
            // one initial Next(), then at most nine more while elements keep coming.
            int order = subject.Next();
            for (int step = 1; step < 10 && order != CollationElementIterator.NULLORDER; step++)
            {
                try
                {
                    order = subject.Next();
                }
                catch (Exception)
                {
                    Errln("iter2.Next() returned an error.");
                    break;
                }
            }

            // Now set it to point to the same string as the first iterator
            try
            {
                subject.SetText(test1);
            }
            catch (Exception)
            {
                Errln("call to iter2->setText(test1) failed.");
                return;
            }
            assertEqual(reference, subject);

            reference.Reset();
            // Same check through the SetText overload that takes a CharacterIterator.
            CharacterIterator textIterator = new StringCharacterIterator(test1);

            try
            {
                subject.SetText(textIterator);
            }
            catch (Exception)
            {
                Errln("call to iter2->setText(chariter(test1)) failed.");
                return;
            }
            assertEqual(reference, subject);

            reference.Reset();
            // Same check through the SetText overload that takes a UCharacterIterator.
            UCharacterIterator unicodeIterator = UCharacterIterator.GetInstance(test1);

            try
            {
                subject.SetText(unicodeIterator);
            }
            catch (Exception)
            {
                Errln("call to iter2->setText(uchariter(test1)) failed.");
                return;
            }
            assertEqual(reference, subject);
        }
コード例 #10
0
        /// <summary>
        /// Compares the CharacterIterator obtained from a <c>UCharacterIterator</c> against a
        /// plain <c>StringCharacterIterator</c> over the same text by replaying a scripted
        /// sequence of moves on both and checking characters, indexes, bounds and cloning.
        /// </summary>
        public void TestUCharacterIteratorWrapper()
        {
            String             source   = "asdfasdfjoiuyoiuy2341235679886765";
            UCharacterIterator wrapped  = UCharacterIterator.GetInstance(source);
            CharacterIterator  wrap_ci  = wrapped.GetCharacterIterator();
            CharacterIterator  ci       = new StringCharacterIterator(source);

            wrap_ci.SetIndex(10);
            ci.SetIndex(10);

            // Script: '-' steps back, '0' reads the current char, anything else ('+') steps forward.
            String moves = "0+0+0--0-0-+++0--+++++++0--------++++0000----0-";

            for (int done = 1; done <= moves.Length; done++)
            {
                int  c1;
                int  c2;
                char move = moves[done - 1];

                switch (move)
                {
                    case '-':
                        c1 = wrap_ci.Previous();
                        c2 = ci.Previous();
                        break;

                    case '0':
                        c1 = wrap_ci.Current;
                        c2 = ci.Current;
                        break;

                    default:
                        // move == '+'
                        c1 = wrap_ci.Next();
                        c2 = ci.Next();
                        break;
                }

                // compare results
                if (c1 != c2)
                {
                    // report the moves made so far, including the current one, and stop
                    String history = moves.Substring(0, done);
                    Errln("error: mismatch in Normalizer iteration at " + history + ": "
                          + "got c1= " + Hex(c1) + " != expected c2= " + Hex(c2));
                    break;
                }

                // compare indexes
                if (wrap_ci.Index != ci.Index)
                {
                    // report the moves made so far, including the current one, and stop
                    String history = moves.Substring(0, done);
                    Errln("error: index mismatch in Normalizer iteration at "
                          + history + " : " + "Normalizer index " + wrap_ci.Index
                          + " expected " + ci.Index);
                    break;
                }
            }

            // First()/Last() reposition the iterators, so the calls stay inline and in order.
            if (ci.First() != wrap_ci.First())
            {
                Errln("CharacterIteratorWrapper.First() failed. expected: " + ci.First() + " got: " + wrap_ci.First());
            }
            if (ci.Last() != wrap_ci.Last())
            {
                Errln("CharacterIteratorWrapper.Last() failed expected: " + ci.Last() + " got: " + wrap_ci.Last());
            }
            if (ci.BeginIndex != wrap_ci.BeginIndex)
            {
                Errln("CharacterIteratorWrapper.BeginIndex failed expected: " + ci.BeginIndex + " got: " + wrap_ci.BeginIndex);
            }
            if (ci.EndIndex != wrap_ci.EndIndex)
            {
                Errln("CharacterIteratorWrapper.EndIndex failed expected: " + ci.EndIndex + " got: " + wrap_ci.EndIndex);
            }

            try
            {
                CharacterIterator copy = (CharacterIterator)wrap_ci.Clone();
                if (wrap_ci.Index != copy.Index)
                {
                    Errln("CharacterIteratorWrapper.Clone() failed expected: " + wrap_ci.Index + " got: " + copy.Index);
                }
            }
            catch (Exception)
            {
                Errln("CharacterIterator.Clone() failed");
            }
        }
コード例 #11
0
        /// <summary>
        /// Segments the text of <paramref name="inText"/> between <paramref name="startPos"/>
        /// (inclusive) and <paramref name="endPos"/> (exclusive) using dictionary matches and a
        /// Katakana-run heuristic, and pushes the resulting break positions onto
        /// <paramref name="foundBreaks"/>.
        /// </summary>
        /// <remarks>
        /// The range is copied into a string, NFKC-normalized if it is not already, and a
        /// dynamic-programming pass picks the segmentation with the lowest accumulated value.
        /// Boundaries are then mapped back through <c>charPositions</c> and offset by
        /// <paramref name="startPos"/> before being pushed. The position of
        /// <paramref name="inText"/> is changed by this method.
        /// </remarks>
        /// <param name="inText">Iterator over the text to segment.</param>
        /// <param name="startPos">Index in <paramref name="inText"/> where the range starts.</param>
        /// <param name="endPos">Index in <paramref name="inText"/> where the range ends.</param>
        /// <param name="foundBreaks">Stack-like collection that receives new break positions.</param>
        /// <returns>
        /// 0 when the range is empty; otherwise the number of positions pushed, reduced by one
        /// if a break at <paramref name="endPos"/> was popped again at the end.
        /// </returns>
        public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
                                                    DequeI foundBreaks)
        {
            // Empty or inverted range: nothing to segment.
            if (startPos >= endPos)
            {
                return(0);
            }

            inText.SetIndex(startPos);

            int inputLength = endPos - startPos;

            // charPositions[k] holds the string offset at which the k-th counted character ends
            // (charPositions[0] is 0); it is filled in by one of the two branches below.
            int[]        charPositions = new int[inputLength + 1];
            StringBuffer s             = new StringBuffer("");

            // NOTE(review): the index was already set to startPos a few lines up; this second
            // call appears redundant.
            inText.SetIndex(startPos);
            // Copy the chars of the range [startPos, endPos) into a string.
            while (inText.Index < endPos)
            {
                s.Append(inText.Current);
                inText.Next();
            }
            string prenormstr = s.ToString();

            // Try the quick check first; fall back to the full IsNormalized test only if the
            // quick check did not answer Yes.
#pragma warning disable 612, 618
            bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes ||
                                Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
            CharacterIterator text;
            int numChars = 0;
            if (isNormalized)
            {
                // Already NFKC: iterate the copied string directly and record, for each code
                // point, the offset just past it.
                text = new StringCharacterIterator(prenormstr);
                int index = 0;
                charPositions[0] = 0;
                while (index < prenormstr.Length)
                {
                    int codepoint = prenormstr.CodePointAt(index);
                    index += Character.CharCount(codepoint);
                    numChars++;
                    charPositions[numChars] = index;
                }
            }
            else
            {
                // Not NFKC: segment the normalized string, and step a Normalizer over the
                // un-normalized string to record an index after each normalized character.
                // NOTE(review): normalizer.Index presumably reports a position in prenormstr
                // (the source text) - confirm against the Normalizer documentation.
#pragma warning disable 612, 618
                string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
                text          = new StringCharacterIterator(normStr);
                charPositions = new int[normStr.Length + 1];
                Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
                int        index      = 0;
                charPositions[0] = 0;
                while (index < normalizer.EndIndex)
                {
                    normalizer.Next();
                    numChars++;
                    index = normalizer.Index;
                    charPositions[numChars] = index;
                }
#pragma warning restore 612, 618
            }

            // From here on out, do the algorithm. Note that our indices
            // refer to indices within the normalized string.

            // bestSnlp[i]: lowest accumulated value found so far for reaching position i.
            // kint32max marks a position that has not been reached.
            int[] bestSnlp = new int[numChars + 1];
            bestSnlp[0] = 0;
            for (int i = 1; i <= numChars; i++)
            {
                bestSnlp[i] = kint32max;
            }

            // prev[i]: the position from which the best path arrives at i; -1 means none yet.
            int[] prev = new int[numChars + 1];
            for (int i = 0; i <= numChars; i++)
            {
                prev[i] = -1;
            }

            int   maxWordSize = 20;
            // Scratch arrays filled by fDictionary.Matches for the current start position.
            int[] values      = new int[numChars];
            int[] lengths     = new int[numChars];
            // dynamic programming to find the best segmentation
            bool is_prev_katakana = false;
            for (int i = 0; i < numChars; i++)
            {
                // NOTE(review): i counts characters (entries of charPositions) while SetIndex
                // takes an iterator index; the two coincide only when every character is a
                // single char - confirm this is intended.
                text.SetIndex(i);
                // Position i is unreachable, so no word can start here.
                if (bestSnlp[i] == kint32max)
                {
                    continue;
                }

                // Look ahead at most maxWordSize characters, or to the end of the text.
                int   maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
                // One-element array used as an out-parameter for the number of matches.
                int[] count_          = new int[1];
                fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
                int count = count_[0];

                // if there are no single character matches found in the dictionary
                // starting with this character, treat character as a 1-character word
                // with the highest value possible (i.e. the least likely to occur).
                // Exclude Korean characters from this treatment, as they should be
                // left together by default.
                text.SetIndex(i);  // fDictionary.matches() advances the text position; undo that.
                if ((count == 0 || lengths[0] != 1) && CharacterIteration.Current32(text) != CharacterIteration.Done32 && !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
                {
                    values[count]  = maxSnlp;
                    lengths[count] = 1;
                    count++;
                }

                // Relax every candidate word starting at i: keep it if it gives a lower
                // accumulated value at the position where the word ends.
                for (int j = 0; j < count; j++)
                {
                    int newSnlp = bestSnlp[i] + values[j];
                    if (newSnlp < bestSnlp[lengths[j] + i])
                    {
                        bestSnlp[lengths[j] + i] = newSnlp;
                        prev[lengths[j] + i]     = i;
                    }
                }

                // In Japanese, single-character Katakana words are pretty rare.
                // So we apply the following heuristic to Katakana: any continuous
                // run of Katakana characters is considered a candidate word with
                // a default cost specified in the katakanaCost table according
                // to its length.
                bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
                // Only the first character of a Katakana run starts a candidate.
                if (!is_prev_katakana && is_katakana)
                {
                    int j = i + 1;
                    CharacterIteration.Next32(text);
                    // Extend j to the end of the run, capped at kMaxKatakanaGroupLength.
                    while (j < numChars && (j - i) < kMaxKatakanaGroupLength && IsKatakana(CharacterIteration.Current32(text)))
                    {
                        CharacterIteration.Next32(text);
                        ++j;
                    }

                    // Runs that hit the cap are not offered as a candidate word.
                    if ((j - i) < kMaxKatakanaGroupLength)
                    {
                        int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                        if (newSnlp < bestSnlp[j])
                        {
                            bestSnlp[j] = newSnlp;
                            prev[j]     = i;
                        }
                    }
                }
                is_prev_katakana = is_katakana;
            }

            // Collect boundaries from the end of the text backwards (t_boundary is in
            // descending order of position).
            int[] t_boundary = new int[numChars + 1];
            int   numBreaks  = 0;
            if (bestSnlp[numChars] == kint32max)
            {
                // The end was never reached: report only the end of the text as a boundary.
                t_boundary[numBreaks] = numChars;
                numBreaks++;
            }
            else
            {
                // Follow the prev[] chain from the end back to position 0.
                for (int i = numChars; i > 0; i = prev[i])
                {
                    t_boundary[numBreaks] = i;
                    numBreaks++;
                }
                Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
            }

            // Also consider position 0 when no break has been recorded yet, or the most
            // recent one lies before this range.
            if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
            {
                t_boundary[numBreaks++] = 0;
            }

            // Walk the boundaries in ascending order, map each back through charPositions,
            // offset by startPos, and push those not already present and not equal to startPos.
            int correctedNumBreaks = 0;
            for (int i = numBreaks - 1; i >= 0; i--)
            {
                int pos = charPositions[t_boundary[i]] + startPos;
                if (!(foundBreaks.Contains(pos) || pos == startPos))
                {
                    foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
                    correctedNumBreaks++;
                }
            }

            // A break exactly at endPos is removed again and not counted.
            if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
            {
                foundBreaks.Pop();
                correctedNumBreaks--;
            }
            // Leave the caller's iterator positioned at the most recent break.
            if (!foundBreaks.IsEmpty)
            {
                inText.SetIndex(foundBreaks.Peek());
            }
            return(correctedNumBreaks);
        }