/// <summary>
/// Runs the iterative normalizer API over <paramref name="str"/> in the
/// requested direction and returns everything it produced as one string.
/// </summary>
/// <param name="str">text handed to the normalizer</param>
/// <param name="mode">mode handed to the normalizer</param>
/// <param name="buf">scratch buffer; emptied first, then filled with the output</param>
/// <param name="dir">greater than zero to walk forward from the first element;
/// any other value walks backward from the last element</param>
/// <param name="options">option bits switched on after all options were switched off</param>
/// <returns>the contents of <paramref name="buf"/> once the walk has finished</returns>
private String iterativeNorm(StringCharacterIterator str, Normalizer.Mode mode, StringBuffer buf, int dir, int options)
{
    normalizer.SetText(str);
    normalizer.SetMode(mode);
    buf.Length = 0;

    // Switch every option off, then enable only the requested ones.
    normalizer.SetOption(-1, false);
    normalizer.SetOption(options, true);

    bool forward = dir > 0;
    int codePoint = forward ? normalizer.First() : normalizer.Last();
    while (codePoint != Normalizer.DONE)
    {
        if (forward)
        {
            buf.Append(UTF16.ValueOf(codePoint));
            codePoint = normalizer.Next();
        }
        else
        {
            // Walking backward: prepend so the text still reads left to right.
            buf.Insert(0, UTF16.ValueOf(codePoint));
            codePoint = normalizer.Previous();
        }
    }

    return buf.ToString();
}
public void TestAPI()
{
    // A CSCharacterIterator over a char sequence must report exactly what a
    // StringCharacterIterator over the same text reports, call for call.
    const int ProbeIndex = 6;
    const int CloneStartIndex = 4;
    const int StepCount = 50;

    String sample = "Hello, World";
    ICharSequence sequence = sample.ToCharSequence();
    CharacterIterator actual = new CSCharacterIterator(sequence);
    CharacterIterator reference = new StringCharacterIterator(sample);

    assertEquals("", reference.SetIndex(ProbeIndex), actual.SetIndex(ProbeIndex));
    assertEquals("", reference.Index, actual.Index);
    assertEquals("", reference.Current, actual.Current);
    assertEquals("", reference.Previous(), actual.Previous());
    assertEquals("", reference.Next(), actual.Next());
    assertEquals("", reference.BeginIndex, actual.BeginIndex);
    assertEquals("", reference.EndIndex, actual.EndIndex);
    assertEquals("", reference.First(), actual.First());
    assertEquals("", reference.Last(), actual.Last());

    // From here on a clone of the iterator under test is compared instead.
    actual.SetIndex(CloneStartIndex);
    reference.SetIndex(CloneStartIndex);
    CharacterIterator cloned = (CharacterIterator)actual.Clone();

    int remaining = StepCount;
    while (remaining > 0)
    {
        assertEquals("", reference.Next(), cloned.Next());
        remaining--;
    }

    remaining = StepCount;
    while (remaining > 0)
    {
        assertEquals("", reference.Previous(), cloned.Previous());
        remaining--;
    }
}
/// <summary>
/// Determines whether another object is a StringCharacterIterator that is
/// equal to this one.
/// </summary>
/// <param name="obj">the object to compare against</param>
/// <returns>
/// true when <paramref name="obj"/> is this very instance, or is a
/// StringCharacterIterator with the same hash code, equal text and identical
/// Pos, Begin and End values; false otherwise.
/// </returns>
public override bool Equals(Object obj)
{
    if (this == obj)
    {
        return true;
    }

    if (!(obj is StringCharacterIterator))
    {
        return false;
    }

    StringCharacterIterator other = (StringCharacterIterator)obj;

    // Short-circuits in the same order as the individual checks: hash code,
    // then text, then the three positions.
    return HashCode() == other.HashCode()
        && Text_Renamed.Equals(other.Text_Renamed)
        && Pos == other.Pos
        && Begin == other.Begin
        && End == other.End;
}
public void TestGetCollationElementIteratorCharacterIterator()
{
    // Collator for Locale("es", "", "TRADITIONAL") walking "cha".
    RuleBasedCollator spanishCollator = (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator
            .GetInstance(new Locale("es", "", "TRADITIONAL"));
    AssertOffsetsReportedWhileIterating(spanishCollator, "cha", new int[] { 0, 1, 2, 3 });

    // Collator for Locale("de", "DE") walking "\u00E6b".
    RuleBasedCollator germanCollator = (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator
            .GetInstance(new Locale("de", "DE"));
    AssertOffsetsReportedWhileIterating(germanCollator, "\u00E6b", new int[] { 0, 1, 1, 2 });

    // Regression for HARMONY-1352
    try
    {
        new RuleBasedCollator("< a< b< c< d")
                .GetCollationElementIterator((CharacterIterator)null);
        NUnit.Framework.Assert.Fail("NullPointerException expected");
    }
    catch (NullReferenceException)
    {
        // expected
    }
}

// Creates a collation element iterator over `text` and checks the offset it
// reports before the first Next() call and after every following one, until
// the reported offset equals the text length.
private static void AssertOffsetsReportedWhileIterating(RuleBasedCollator collator, String text, int[] expectedOffsets)
{
    StringCharacterIterator source = new StringCharacterIterator(text);
    CollationElementIterator elements = collator.GetCollationElementIterator(source);

    for (int step = 0; ; step++)
    {
        int offset = elements.GetOffset();
        NUnit.Framework.Assert.AreEqual(expectedOffsets[step], offset);
        if (offset == text.Length)
        {
            break;
        }
        elements.Next();
    }
}
public void TestNormalizerAPI()
{
    // Source text for both constructors; s + s makes the iterated text a bit
    // longer and more interesting.
    string s = Utility.Unescape("a\u0308\uac00\\U0002f800");
    CharacterIterator iter = new StringCharacterIterator(s + s);

    // Deprecated constructor taking a CharacterIterator.
    Normalizer norm = new Normalizer(iter, NormalizerMode.NFC, 0);
    if (norm.Next() != 0xe4)
    {
        Errln("error in Normalizer(CharacterIterator).next()");
    }

    // Deprecated constructor taking a string. The failure message names this
    // overload so a failure is not mistaken for the CharacterIterator one.
    Normalizer norm2 = new Normalizer(s, NormalizerMode.NFC, 0);
    if (norm2.Next() != 0xe4)
    {
        Errln("error in Normalizer(String).next()");
    }

    // A clone must report the same begin and end index as its source.
    Normalizer clone = (Normalizer)norm.Clone();
    if (clone.GetBeginIndex() != norm.GetBeginIndex())
    {
        Errln("error in Normalizer.getBeginIndex()");
    }

    if (clone.GetEndIndex() != norm.GetEndIndex())
    {
        Errln("error in Normalizer.getEndIndex()");
    }

    // SetOption()/GetOption(): switch on 0xaa0000, switch off 0x20000, then
    // probe with the mask 0x880000 (must be non-zero) and 0x20000 (must not be 1).
    clone.SetOption(0xaa0000, true);
    clone.SetOption(0x20000, false);
    if (clone.GetOption(0x880000) == 0 || clone.GetOption(0x20000) == 1)
    {
        Errln("error in Normalizer::setOption() or Normalizer::getOption()");
    }

    // ICU4N specific - the UnicodeVersion property must read back what was set.
    clone.UnicodeVersion = NormalizerUnicodeVersion.Unicode3_2;
    assertEquals("error in Normalizer.UnicodeVersion property", NormalizerUnicodeVersion.Unicode3_2, clone.UnicodeVersion);

    clone.UnicodeVersion = NormalizerUnicodeVersion.Default;
    assertEquals("error in Normalizer.UnicodeVersion property", NormalizerUnicodeVersion.Default, clone.UnicodeVersion);

    // The deprecated static helpers are only invoked; their results are not inspected.
    Normalizer.Normalize(s, NormalizerMode.NFC, 0);
    Normalizer.Compose(s, false, 0);
    Normalizer.Decompose(s, false, 0);
}
/// <summary>
/// Produces a copy of this iterator by delegating to the base class clone.
/// </summary>
/// <returns>the copy, typed as a StringCharacterIterator</returns>
public Object Clone()
{
    try
    {
        return (StringCharacterIterator)base.Clone();
    }
    catch (CloneNotSupportedException cause)
    {
        // Surface a failed base clone as an InternalError wrapping the cause.
        throw new InternalError(cause);
    }
}
/// <summary>
/// For the given string, returns the number of UTF-8 bytes
/// required to encode the string.
/// </summary>
/// <remarks>
/// The string is scanned by index rather than through a character iterator
/// whose end-of-text sentinel is itself a character value, so a U+FFFF inside
/// the text is counted (3 bytes) instead of ending the scan early.
/// An unpaired surrogate is counted as 3 bytes.
/// </remarks>
/// <param name="string">text to encode</param>
/// <returns>number of UTF-8 bytes required to encode</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="string"/> is null.
/// </exception>
public static int Utf8Length(string @string)
{
    if (@string == null)
    {
        throw new System.ArgumentNullException("string");
    }

    int size = 0;
    int length = @string.Length;
    for (int i = 0; i < length; i++)
    {
        char ch = @string[i];
        if (ch < 0x80)
        {
            size++;
        }
        else if (ch < 0x800)
        {
            size += 2;
        }
        else if (ch >= 0xD800 && ch < 0xDC00)
        {
            // High surrogate: it forms a 4-byte sequence only when a low
            // surrogate (0xDC00..0xDFFF) follows immediately.
            bool hasTrail = i + 1 < length
                && @string[i + 1] > 0xDBFF
                && @string[i + 1] < 0xE000;
            if (hasTrail)
            {
                size += 4;
                i++; // the trail unit has been accounted for
            }
            else
            {
                // Unpaired high surrogate; the next unit is examined on its own.
                size += 3;
            }
        }
        else
        {
            // Remaining BMP characters, including unpaired low surrogates.
            size += 3;
        }
    }

    return size;
}
public void TestGetSetText()
{
    Logln("Testing getText setText ");

    String firstText = "first string.";
    String secondText = "Second string.";

    RuleBasedBreakIterator wordBreaks = (RuleBasedBreakIterator)BreakIterator.GetWordInstance(CultureInfo.CurrentCulture);
    CharacterIterator firstTextIterator = new StringCharacterIterator(firstText);

    // After SetText the Text property must equal an iterator over the same
    // string, and the current position must be 0.
    wordBreaks.SetText(firstText);
    if (!wordBreaks.Text.Equals(firstTextIterator))
    {
        Errln("ERROR:1 error in setText or getText ");
    }

    if (wordBreaks.Current != 0)
    {
        Errln("ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordBreaks.Current + "\n");
    }

    // Move away from the start, then make sure a new SetText resets the position.
    wordBreaks.Next(2);
    wordBreaks.SetText(secondText);
    if (wordBreaks.Current != 0)
    {
        Errln("ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordBreaks.Current + "\n");
    }

    // Simple case for the char-sequence overload of SetText: every boundary
    // the line iterator reports must be one of the listed positions.
    BreakIterator lineBreaks = BreakIterator.GetLineInstance(new CultureInfo("en"));
    ICharSequence sequence = "Hello, World. ".ToCharSequence();
    List<int> allowedBreaks = new List<int> { 0, 7, 14 };

    lineBreaks.SetText(sequence);
    int boundary = lineBreaks.First();
    while (boundary != BreakIterator.Done)
    {
        assertTrue("", allowedBreaks.Contains(boundary));
        boundary = lineBreaks.Next();
    }

    assertEquals("", sequence.Length, lineBreaks.Current);
}
public void TestSetText(/* char* par */)
{
    RuleBasedCollator collator = (RuleBasedCollator)Collator.GetInstance(new CultureInfo("en-US"));
    CollationElementIterator reference = collator.GetCollationElementIterator(test1);
    CollationElementIterator retargeted = collator.GetCollationElementIterator(test2);

    // Advance the second iterator a few elements just to exercise it before
    // it is pointed at different text.
    int order = retargeted.Next();
    for (int step = 1; step < 10 && order != CollationElementIterator.NULLORDER; step++)
    {
        try
        {
            order = retargeted.Next();
        }
        catch (Exception)
        {
            Errln("iter2.Next() returned an error.");
            break;
        }
    }

    // Re-target it at the first iterator's string.
    try
    {
        retargeted.SetText(test1);
    }
    catch (Exception)
    {
        Errln("call to iter2->setText(test1) failed.");
        return;
    }

    assertEqual(reference, retargeted);
    reference.Reset();

    // Same check through the SetText overload that takes a CharacterIterator.
    CharacterIterator characterSource = new StringCharacterIterator(test1);
    try
    {
        retargeted.SetText(characterSource);
    }
    catch (Exception)
    {
        Errln("call to iter2->setText(chariter(test1)) failed.");
        return;
    }

    assertEqual(reference, retargeted);
    reference.Reset();

    // Same check through the SetText overload that takes a UCharacterIterator.
    UCharacterIterator ucharacterSource = UCharacterIterator.GetInstance(test1);
    try
    {
        retargeted.SetText(ucharacterSource);
    }
    catch (Exception)
    {
        Errln("call to iter2->setText(uchariter(test1)) failed.");
        return;
    }

    assertEqual(reference, retargeted);
}
public void TestUCharacterIteratorWrapper()
{
    // The CharacterIterator obtained from a UCharacterIterator must behave
    // like a StringCharacterIterator over the same text.
    String source = "asdfasdfjoiuyoiuy2341235679886765";
    UCharacterIterator it = UCharacterIterator.GetInstance(source);
    CharacterIterator wrap_ci = it.GetCharacterIterator();
    CharacterIterator ci = new StringCharacterIterator(source);
    wrap_ci.SetIndex(10);
    ci.SetIndex(10);

    // Script of moves applied to both iterators in lock step:
    // '-' = Previous(), '0' = Current, anything else ('+') = Next().
    String moves = "0+0+0--0-0-+++0--+++++++0--------++++0000----0-";
    int c1, c2;
    char m;
    int movesIndex = 0;

    while (movesIndex < moves.Length)
    {
        m = moves[movesIndex++];
        if (m == '-')
        {
            c1 = wrap_ci.Previous();
            c2 = ci.Previous();
        }
        else if (m == '0')
        {
            c1 = wrap_ci.Current;
            c2 = ci.Current;
        }
        else
        {
            // m == '+'
            c1 = wrap_ci.Next();
            c2 = ci.Next();
        }

        // compare results
        if (c1 != c2)
        {
            // the moves up to and including the current one (m)
            String history = moves.Substring(0, movesIndex);
            Errln("error: mismatch in Normalizer iteration at " + history + ": "
                  + "got c1= " + Hex(c1) + " != expected c2= " + Hex(c2));
            break;
        }

        // compare indexes
        if (wrap_ci.Index != ci.Index)
        {
            // the moves up to and including the current one (m)
            String history = moves.Substring(0, movesIndex);
            Errln("error: index mismatch in Normalizer iteration at " + history + " : "
                  + "Normalizer index " + wrap_ci.Index + " expected " + ci.Index);
            break;
        }
    }

    // The message literal is kept on a single line: a regular C# string
    // literal cannot contain a raw line break.
    if (ci.First() != wrap_ci.First())
    {
        Errln("CharacterIteratorWrapper.First() failed. expected: " + ci.First() + " got: " + wrap_ci.First());
    }

    if (ci.Last() != wrap_ci.Last())
    {
        Errln("CharacterIteratorWrapper.Last() failed expected: " + ci.Last() + " got: " + wrap_ci.Last());
    }

    if (ci.BeginIndex != wrap_ci.BeginIndex)
    {
        Errln("CharacterIteratorWrapper.BeginIndex failed expected: " + ci.BeginIndex + " got: " + wrap_ci.BeginIndex);
    }

    if (ci.EndIndex != wrap_ci.EndIndex)
    {
        Errln("CharacterIteratorWrapper.EndIndex failed expected: " + ci.EndIndex + " got: " + wrap_ci.EndIndex);
    }

    // A clone of the wrapper must start at the wrapper's current index.
    try
    {
        CharacterIterator cloneWCI = (CharacterIterator)wrap_ci.Clone();
        if (wrap_ci.Index != cloneWCI.Index)
        {
            Errln("CharacterIteratorWrapper.Clone() failed expected: " + wrap_ci.Index + " got: " + cloneWCI.Index);
        }
    }
    catch (Exception)
    {
        Errln("CharacterIterator.Clone() failed");
    }
}
/// <summary>
/// Segments the range [<paramref name="startPos"/>, <paramref name="endPos"/>) of
/// <paramref name="inText"/> into words using matches from fDictionary plus a
/// Katakana-run heuristic, and pushes the resulting boundaries onto
/// <paramref name="foundBreaks"/>.
/// </summary>
/// <param name="inText">Text to segment. Its index is moved while reading and,
/// if <paramref name="foundBreaks"/> is not empty at the end, is left at the
/// value on top of <paramref name="foundBreaks"/>.</param>
/// <param name="startPos">Start of the range, as an index into <paramref name="inText"/>.</param>
/// <param name="endPos">End of the range (the read loop stops once the index reaches it).</param>
/// <param name="foundBreaks">Receives boundaries, expressed as charPositions[...] + startPos.</param>
/// <returns>0 when startPos &gt;= endPos; otherwise the number of boundaries
/// pushed, reduced by one when a boundary equal to endPos is popped from the
/// top of <paramref name="foundBreaks"/> at the end.</returns>
public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)
{
    if (startPos >= endPos)
    {
        return(0);
    }

    inText.SetIndex(startPos);

    int inputLength = endPos - startPos;
    int[] charPositions = new int[inputLength + 1];
    StringBuffer s = new StringBuffer("");
    // Copy the characters of the range into a string.
    inText.SetIndex(startPos);
    while (inText.Index < endPos)
    {
        s.Append(inText.Current);
        inText.Next();
    }
    string prenormstr = s.ToString();
    // The quick check is tried first; the full IsNormalized test only runs
    // when the quick check does not answer Yes.
#pragma warning disable 612, 618
    bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes ||
                        Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
    CharacterIterator text;
    int numChars = 0;
    if (isNormalized)
    {
        // Already NFKC: iterate over the text as is. charPositions[k] becomes
        // the char index in prenormstr at which the k-th code point starts.
        text = new StringCharacterIterator(prenormstr);
        int index = 0;
        charPositions[0] = 0;
        while (index < prenormstr.Length)
        {
            int codepoint = prenormstr.CodePointAt(index);
            index += Character.CharCount(codepoint);
            numChars++;
            charPositions[numChars] = index;
        }
    }
    else
    {
#pragma warning disable 612, 618
        // Not normalized: the algorithm runs on the NFKC form, and
        // charPositions[k] records normalizer.Index after the k-th Next() call.
        // NOTE(review): presumably that index refers to the un-normalized
        // input, which is what lets boundaries be mapped back - confirm
        // against Normalizer.
        string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
        text = new StringCharacterIterator(normStr);
        charPositions = new int[normStr.Length + 1];
        Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
        int index = 0;
        charPositions[0] = 0;
        while (index < normalizer.EndIndex)
        {
            normalizer.Next();
            numChars++;
            index = normalizer.Index;
            charPositions[numChars] = index;
        }
#pragma warning restore 612, 618
    }

    // From here on out, do the algorithm. Note that our indices
    // refer to indices within the normalized string.

    // bestSnlp[i] is the lowest accumulated cost found so far for a
    // segmentation ending at position i; kint32max marks "not reached".
    int[] bestSnlp = new int[numChars + 1];
    bestSnlp[0] = 0;
    for (int i = 1; i <= numChars; i++)
    {
        bestSnlp[i] = kint32max;
    }

    // prev[i] is the start position of the word that ends at i on the best
    // path found so far; -1 means none recorded.
    int[] prev = new int[numChars + 1];
    for (int i = 0; i <= numChars; i++)
    {
        prev[i] = -1;
    }

    // Upper bound passed to the dictionary lookup for a single candidate.
    int maxWordSize = 20;
    // Filled by fDictionary.Matches for the current start position.
    int[] values = new int[numChars];
    int[] lengths = new int[numChars];
    // dynamic programming to find the best segmentation
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; i++)
    {
        text.SetIndex(i);
        // Positions that no candidate word ends at cannot start a word.
        if (bestSnlp[i] == kint32max)
        {
            continue;
        }

        int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
        // One-element array used as an out parameter for the match count.
        int[] count_ = new int[1];
        fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
        int count = count_[0];

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible (i.e. the least likely to occur).
        // Exclude Korean characters from this treatment, as they should be
        // left together by default.
        text.SetIndex(i);  // fDictionary.matches() advances the text position; undo that.
        if ((count == 0 || lengths[0] != 1) && CharacterIteration.Current32(text) != CharacterIteration.Done32 && !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
        {
            values[count] = maxSnlp;
            lengths[count] = 1;
            count++;
        }

        // Relax: keep the cheaper way of reaching the end of each candidate.
        for (int j = 0; j < count; j++)
        {
            int newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i])
            {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, single-character Katakana words are pretty rare.
        // So we apply the following heuristic to Katakana: any continuous
        // run of Katakana characters is considered a candidate word with
        // a default cost specified in the katakanaCost table according
        // to its length.
        bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
        // Only the first character of a Katakana run starts a candidate.
        if (!is_prev_katakana && is_katakana)
        {
            int j = i + 1;
            CharacterIteration.Next32(text);
            // Extend j to the end of the run, bounded by kMaxKatakanaGroupLength.
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength && IsKatakana(CharacterIteration.Current32(text)))
            {
                CharacterIteration.Next32(text);
                ++j;
            }

            // Runs that hit the length bound are not added as candidates.
            if ((j - i) < kMaxKatakanaGroupLength)
            {
                int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j])
                {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Collect boundaries by following prev[] back from the end, so t_boundary
    // holds them from the last boundary to the first.
    int[] t_boundary = new int[numChars + 1];
    int numBreaks = 0;
    if (bestSnlp[numChars] == kint32max)
    {
        // The end was never reached by any candidate: record only the end.
        t_boundary[numBreaks] = numChars;
        numBreaks++;
    }
    else
    {
        for (int i = numChars; i > 0; i = prev[i])
        {
            t_boundary[numBreaks] = i;
            numBreaks++;
        }
        Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
    }

    // Add position 0 as a candidate when nothing at or after startPos has
    // been recorded yet.
    if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
    {
        t_boundary[numBreaks++] = 0;
    }

    // Walk t_boundary from its end so boundaries are pushed in increasing
    // order, skipping ones already present and the one equal to startPos.
    int correctedNumBreaks = 0;
    for (int i = numBreaks - 1; i >= 0; i--)
    {
        int pos = charPositions[t_boundary[i]] + startPos;
        if (!(foundBreaks.Contains(pos) || pos == startPos))
        {
            foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
            correctedNumBreaks++;
        }
    }

    // A boundary at endPos is removed again and not counted.
    if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
    {
        foundBreaks.Pop();
        correctedNumBreaks--;
    }
    // Leave the input iterator at the last boundary still recorded.
    if (!foundBreaks.IsEmpty)
    {
        inText.SetIndex(foundBreaks.Peek());
    }
    return(correctedNumBreaks);
}