/// <summary>
/// Builds a collator from tailoring rules made of combining-mark sequences, switches it to
/// canonical decomposition, and runs <c>CollationTest.BackAndForth</c> over every sample string.
/// </summary>
public void TestNormalization()
{
    String rules = "&a < \u0300\u0315 < A\u0300\u0315 < \u0316\u0315B < \u0316\u0300\u0315";
    String[] testdata =
    {
        "\u1ED9", "o\u0323\u0302",
        "\u0300\u0315", "\u0315\u0300",
        "A\u0300\u0315B", "A\u0315\u0300B",
        "A\u0316\u0315B", "A\u0315\u0316B",
        "\u0316\u0300\u0315", "\u0315\u0300\u0316",
        "A\u0316\u0300\u0315B", "A\u0315\u0300\u0316B",
        "\u0316\u0315\u0300", "A\u0316\u0315\u0300B"
    };

    RuleBasedCollator ruleCollator = null;
    try
    {
        ruleCollator = new RuleBasedCollator(rules);
        // Counterpart of Collator.CANONICAL_DECOMPOSITION in the Java original.
        ruleCollator.Decomposition = NormalizationMode.CanonicalDecomposition;
    }
    catch (Exception)
    {
        Warnln("ERROR: in creation of collator using rules " + rules);
        return;
    }

    // One iterator is created up front and re-targeted at each sample via SetText.
    CollationElementIterator elements = ruleCollator.GetCollationElementIterator("testing");
    foreach (String sample in testdata)
    {
        elements.SetText(sample);
        CollationTest.BackAndForth(this, elements);
    }
}
/// <summary>
/// Walks "abc" forwards recording every collation element, then walks backwards and checks
/// that the same elements come back in reverse order and that the offsets at both ends match.
/// </summary>
public void TestNext()
{
    String text = "abc";
    CollationElementIterator iterator = coll.GetCollationElementIterator(text);
    int[] seen = new int[text.Length];
    int filled = 0;

    // Forward pass: collect elements until the iterator reports NULLORDER.
    for (int ce = iterator.Next(); ce != CollationElementIterator.NULLORDER; ce = iterator.Next())
    {
        seen[filled++] = ce;
    }

    NUnit.Framework.Assert.AreEqual(text.Length, iterator.GetOffset());

    // Backward pass: each element must equal the one recorded at the mirrored position.
    for (int ce = iterator.Previous(); ce != CollationElementIterator.NULLORDER; ce = iterator.Previous())
    {
        NUnit.Framework.Assert.AreEqual(seen[--filled], ce);
    }

    NUnit.Framework.Assert.AreEqual(0, iterator.GetOffset());
}
/// <summary>
/// Runs <c>CollationTest.BackAndForth</c> with a Thai collator, first over U+FDFA alone and then
/// over successive batches of defined BMP characters, each batch ending at a multiple of 0xFF.
/// </summary>
public void TestNormalizedUnicodeChar()
{
    // thai should have normalization on
    RuleBasedCollator thai = null;
    try
    {
        thai = (RuleBasedCollator)Collator.GetInstance(new CultureInfo("th-TH"));
    }
    catch (Exception)
    {
        Warnln("Error creating Thai collator");
        return;
    }

    StringBuffer source = new StringBuffer();
    source.Append('\uFDFA');
    CollationElementIterator iter = thai.GetCollationElementIterator(source.ToString());
    CollationTest.BackAndForth(this, iter);

    char codepoint = (char)0x1;
    while (codepoint < 0xfffe)
    {
        source.Delete(0, source.Length);

        // Gather every defined character up to (not including) the next multiple of 0xFF.
        for (; codepoint % 0xFF != 0; codepoint++)
        {
            if (UChar.IsDefined(codepoint))
            {
                source.Append(codepoint);
            }
        }

        // The multiple of 0xFF itself closes the batch.
        if (UChar.IsDefined(codepoint))
        {
            source.Append(codepoint);
        }

        // Do not step past 0xFFFF: a char would wrap around to 0.
        if (codepoint != 0xFFFF)
        {
            codepoint++;
        }

        iter = thai.GetCollationElementIterator(source.ToString());
        // A basic test to see if it's working at all
        CollationTest.BackAndForth(this, iter);
    }
}
/// <summary>
/// Checks the offsets reported by an iterator built from a <c>StringCharacterIterator</c> for a
/// traditional Spanish and a German collator, then checks that a null character iterator is
/// rejected with a <c>NullReferenceException</c>.
/// </summary>
public void TestGetCollationElementIteratorCharacterIterator()
{
    {
        Locale locale = new Locale("es", "", "TRADITIONAL");
        RuleBasedCollator collator =
            (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(locale);
        String text = "cha";
        StringCharacterIterator source = new StringCharacterIterator(text);
        CollationElementIterator iterator = collator.GetCollationElementIterator(source);
        int[] expected = { 0, 1, 2, 3 };

        int step = 0;
        int position = iterator.GetOffset();
        NUnit.Framework.Assert.AreEqual(expected[step], position);
        while (position != text.Length)
        {
            iterator.Next();
            position = iterator.GetOffset();
            step++;
            NUnit.Framework.Assert.AreEqual(expected[step], position);
        }
    }

    {
        Locale locale = new Locale("de", "DE");
        RuleBasedCollator collator =
            (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(locale);
        String text = "\u00E6b";
        StringCharacterIterator source = new StringCharacterIterator(text);
        CollationElementIterator iterator = collator.GetCollationElementIterator(source);
        // The expected table lists offset 1 twice.
        int[] expected = { 0, 1, 1, 2 };

        int step = 0;
        int position = iterator.GetOffset();
        NUnit.Framework.Assert.AreEqual(expected[step], position);
        while (position != text.Length)
        {
            iterator.Next();
            position = iterator.GetOffset();
            step++;
            NUnit.Framework.Assert.AreEqual(expected[step], position);
        }
    }

    // Regression for HARMONY-1352
    try
    {
        RuleBasedCollator ruleCollator = new RuleBasedCollator("< a< b< c< d");
        ruleCollator.GetCollationElementIterator((CharacterIterator)null);
        NUnit.Framework.Assert.Fail("NullPointerException expected");
    }
    catch (NullReferenceException)
    {
        // expected
    }
}
/// <summary>
/// Sets the iterator offset to 1 inside "cha" under a traditional Spanish collator and
/// asserts that <c>GetOffset</c> then reports 1.
/// </summary>
public void TestSetOffset()
{
    // Failed in java too
    Locale traditionalSpanish = new Locale("es", "", "TRADITIONAL");
    RuleBasedCollator rbColl =
        (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(traditionalSpanish);

    CollationElementIterator iterator = rbColl.GetCollationElementIterator("cha");
    iterator.SetOffset(1);

    NUnit.Framework.Assert.AreEqual(1, iterator.GetOffset());
}
/// <summary>
/// Checks the offsets reported by an iterator built directly from a string for a traditional
/// Spanish and a German collator, then checks that a null string is rejected with a
/// <c>NullReferenceException</c>.
/// </summary>
public void TestGetCollationElementIteratorString()
{
    {
        Locale locale = new Locale("es", "", "TRADITIONAL");
        RuleBasedCollator collator =
            (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(locale);
        String source = "cha";
        CollationElementIterator iterator = collator.GetCollationElementIterator(source);
        int[] expected = { 0, 1, 2, 3 };

        int step = 0;
        int position = iterator.GetOffset();
        NUnit.Framework.Assert.AreEqual(expected[step], position);
        while (position != source.Length)
        {
            iterator.Next();
            position = iterator.GetOffset();
            step++;
            NUnit.Framework.Assert.AreEqual(expected[step], position);
        }
    }

    {
        Locale locale = new Locale("de", "DE");
        RuleBasedCollator collator =
            (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(locale);
        String source = "\u00E6b";
        CollationElementIterator iterator = collator.GetCollationElementIterator(source);
        // The expected table lists offset 1 twice.
        int[] expected = { 0, 1, 1, 2 };

        int step = 0;
        int position = iterator.GetOffset();
        NUnit.Framework.Assert.AreEqual(expected[step], position);
        while (position != source.Length)
        {
            iterator.Next();
            position = iterator.GetOffset();
            step++;
            NUnit.Framework.Assert.AreEqual(expected[step], position);
        }
    }

    // Regression for HARMONY-1352
    try
    {
        RuleBasedCollator ruleCollator = new RuleBasedCollator("< a< b< c< d");
        ruleCollator.GetCollationElementIterator((String)null);
        NUnit.Framework.Assert.Fail("NullPointerException expected");
    }
    catch (NullReferenceException)
    {
        // expected
    }
}
/// <summary>
/// Reads the first two collation elements of "a\u00e0" from a French collator and asserts
/// that their secondary orders are equal.
/// </summary>
public void TestSecondaryOrder()
{
    RuleBasedCollator rbColl =
        (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(new Locale("fr", "FR"));
    CollationElementIterator iterator = rbColl.GetCollationElementIterator("a\u00e0");

    int firstSecondary = CollationElementIterator.SecondaryOrder(iterator.Next());
    int secondSecondary = CollationElementIterator.SecondaryOrder(iterator.Next());

    NUnit.Framework.Assert.AreEqual(firstSecondary, secondSecondary);
}
/// <summary>
/// Iterates over "cha" with a traditional Spanish collator and asserts that
/// <c>GetMaxExpansion</c> returns 1 for every collation element produced.
/// </summary>
public void TestGetMaxExpansion()
{
    String text = "cha";
    Locale traditionalSpanish = new Locale("es", "", "TRADITIONAL");
    RuleBasedCollator rbColl =
        (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(traditionalSpanish);
    CollationElementIterator iterator = rbColl.GetCollationElementIterator(text);

    for (int ce = iterator.Next(); ce != CollationElementIterator.NULLORDER; ce = iterator.Next())
    {
        NUnit.Framework.Assert.AreEqual(1, iterator.GetMaxExpansion(ce));
    }
}
/// <summary>
/// Asserts that, under a German collator, the primary order of the first collation element of
/// "\u00e6" equals the primary order of the first collation element of "ae".
/// </summary>
public void TestPrimaryOrder()
{
    RuleBasedCollator rbColl =
        (RuleBasedCollator)ILOG.J2CsMapping.Text.Collator.GetInstance(new Locale("de", "DE"));

    CollationElementIterator ligatureElements = rbColl.GetCollationElementIterator("\u00e6");
    int ligaturePrimary = CollationElementIterator.PrimaryOrder(ligatureElements.Next());

    CollationElementIterator plainElements = rbColl.GetCollationElementIterator("ae");
    int plainPrimary = CollationElementIterator.PrimaryOrder(plainElements.Next());

    NUnit.Framework.Assert.AreEqual(ligaturePrimary, plainPrimary);
}
/// <summary>
/// Steps forward through "abc" and asserts that the offset is 0 initially and then
/// 1, 2, 3 after each successive call to <c>Next</c>.
/// </summary>
public void TestGetOffset()
{
    String text = "abc";
    CollationElementIterator iterator = coll.GetCollationElementIterator(text);
    int[] expected = { 0, 1, 2, 3 };

    int step = 0;
    int position = iterator.GetOffset();
    NUnit.Framework.Assert.AreEqual(expected[step], position);

    while (position != text.Length)
    {
        iterator.Next();
        position = iterator.GetOffset();
        step++;
        NUnit.Framework.Assert.AreEqual(expected[step], position);
    }
}
/// <summary>
/// Advances both iterators in lock-step and reports the first position at which they yield
/// different collation elements; afterwards runs <c>CollationTest.BackAndForth</c> on each.
/// </summary>
/// <param name="i1">First iterator; consumed by this call.</param>
/// <param name="i2">Second iterator; consumed by this call.</param>
void assertEqual(CollationElementIterator i1, CollationElementIterator i2)
{
    int position = 0;
    while (true)
    {
        int left = i1.Next();
        int right = i2.Next();

        if (left != right)
        {
            Errln("    " + position + ": strength(0x" + (left).ToHexString()
                + ") != strength(0x" + (right).ToHexString() + ")");
            break;
        }

        position += 1;

        // Both sides are equal here, so checking one for NULLORDER covers both.
        if (left == CollationElementIterator.NULLORDER)
        {
            break;
        }
    }

    CollationTest.BackAndForth(this, i1);
    CollationTest.BackAndForth(this, i2);
}
/// <summary>
/// Sorts a set of Thai strings with <c>StrCmp</c>, then checks that the Thai collator never
/// orders an earlier string after a later one, and runs <c>CollationTest.BackAndForth</c> on each.
/// </summary>
public void TestInvalidThai()
{
    String[] tests =
    {
        "\u0E44\u0E01\u0E44\u0E01",
        "\u0E44\u0E01\u0E01\u0E44",
        "\u0E01\u0E44\u0E01\u0E44",
        "\u0E01\u0E01\u0E44\u0E44",
        "\u0E44\u0E44\u0E01\u0E01",
        "\u0E01\u0E44\u0E44\u0E01",
    };

    RuleBasedCollator collator;
    StrCmp comparator;
    try
    {
        collator = GetThaiCollator();
        comparator = new StrCmp();
    }
    catch (Exception)
    {
        Warnln("could not construct Thai collator");
        return;
    }

    Array.Sort(tests, comparator);

    for (int earlier = 0; earlier < tests.Length; earlier++)
    {
        // After sorting, no later string may compare below an earlier one.
        for (int later = earlier + 1; later < tests.Length; later++)
        {
            bool outOfOrder = collator.Compare(tests[earlier], tests[later]) > 0;
            if (outOfOrder)
            {
                // inconsistency ordering found!
                Errln("Inconsistent ordering between strings " + earlier + " and " + later);
            }
        }

        CollationElementIterator iterator = collator.GetCollationElementIterator(tests[earlier]);
        CollationTest.BackAndForth(this, iterator);
    }
}
/// <summary>
/// Exercises the three <c>SetText</c> overloads (string, <c>CharacterIterator</c>,
/// <c>UCharacterIterator</c>): after re-targeting a partly consumed iterator at <c>test1</c>,
/// it must yield the same elements as a fresh iterator over <c>test1</c>.
/// </summary>
public void TestSetText(/* char* par */)
{
    RuleBasedCollator english = (RuleBasedCollator)Collator.GetInstance(new CultureInfo("en-US"));
    CollationElementIterator reference = english.GetCollationElementIterator(test1);
    CollationElementIterator retargeted = english.GetCollationElementIterator(test2);

    // Run through the second iterator just to exercise it
    int ce = retargeted.Next();
    for (int step = 1; step < 10 && ce != CollationElementIterator.NULLORDER; step++)
    {
        try
        {
            ce = retargeted.Next();
        }
        catch (Exception)
        {
            Errln("iter2.Next() returned an error.");
            break;
        }
    }

    // Now set it to point to the same string as the first iterator
    try
    {
        retargeted.SetText(test1);
    }
    catch (Exception)
    {
        Errln("call to iter2->setText(test1) failed.");
        return;
    }
    assertEqual(reference, retargeted);

    // Same check through the CharacterIterator overload of SetText.
    reference.Reset();
    CharacterIterator characters = new StringCharacterIterator(test1);
    try
    {
        retargeted.SetText(characters);
    }
    catch (Exception)
    {
        Errln("call to iter2->setText(chariter(test1)) failed.");
        return;
    }
    assertEqual(reference, retargeted);

    // Same check through the UCharacterIterator overload of SetText.
    reference.Reset();
    UCharacterIterator ucharacters = UCharacterIterator.GetInstance(test1);
    try
    {
        retargeted.SetText(ucharacters);
    }
    catch (Exception)
    {
        Errln("call to iter2->setText(uchariter(test1)) failed.");
        return;
    }
    assertEqual(reference, retargeted);
}
/// <summary>
/// Runs <c>CollationTest.BackAndForth</c> over iterators from several collators: English,
/// rule-based collators with contracting, expanding and mixed rules, Thai, and Japanese.
/// </summary>
public void TestPrevious(/* char* par */)
{
    RuleBasedCollator english = (RuleBasedCollator)Collator.GetInstance(new CultureInfo("en-US"));
    CollationElementIterator iter = english.GetCollationElementIterator(test1);

    // A basic test to see if it's working at all
    CollationTest.BackAndForth(this, iter);

    // Test with a contracting character sequence
    RuleBasedCollator contracting;
    try
    {
        contracting = new RuleBasedCollator("&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH");
    }
    catch (Exception)
    {
        Errln("Couldn't create a RuleBasedCollator with a contracting sequence.");
        return;
    }
    iter = contracting.GetCollationElementIterator("abchdcba");
    CollationTest.BackAndForth(this, iter);

    // Test with an expanding character sequence
    RuleBasedCollator expanding;
    try
    {
        expanding = new RuleBasedCollator("&a < b < c/abd < d");
    }
    catch (Exception)
    {
        Errln("Couldn't create a RuleBasedCollator with an expanding sequence.");
        return;
    }
    iter = expanding.GetCollationElementIterator("abcd");
    CollationTest.BackAndForth(this, iter);

    // Now try both
    RuleBasedCollator mixed;
    try
    {
        mixed = new RuleBasedCollator("&a < b < c/aba < d < z < ch");
    }
    catch (Exception)
    {
        Errln("Couldn't create a RuleBasedCollator with both an expanding and a contracting sequence.");
        return;
    }
    iter = mixed.GetCollationElementIterator("abcdbchdc");
    CollationTest.BackAndForth(this, iter);

    // Thai collator over Thai text followed by "abc".
    Collator thai;
    try
    {
        thai = Collator.GetInstance(new CultureInfo("th-TH"));
    }
    catch (Exception)
    {
        Errln("Couldn't create a collator");
        return;
    }
    iter = ((RuleBasedCollator)thai).GetCollationElementIterator("\u0e41\u0e02\u0e41\u0e02\u0e27abc");
    CollationTest.BackAndForth(this, iter);

    // Japanese collator over 'a' followed by three code points from the U+30xx range.
    Collator japanese;
    try
    {
        japanese = Collator.GetInstance(new CultureInfo("ja-JP"));
    }
    catch (Exception)
    {
        Errln("Couldn't create Japanese collator\n");
        return;
    }
    iter = ((RuleBasedCollator)japanese).GetCollationElementIterator("\u0061\u30CF\u3099\u30FC");
    CollationTest.BackAndForth(this, iter);
}
/// <summary>
/// Checks that moving the offset discards any partly consumed expansion: after reading one
/// element at offset 3 and resetting to offset 0, <c>Next</c> must return the same element
/// that was first read from the start of the string.
/// </summary>
public void TestClearBuffers(/* char* par */)
{
    RuleBasedCollator collator;
    try
    {
        collator = new RuleBasedCollator("&a < b < c & ab = d");
    }
    catch (Exception)
    {
        Warnln("Couldn't create a RuleBasedCollator.");
        return;
    }

    CollationElementIterator elements = collator.GetCollationElementIterator("abcd");

    int firstElement;
    try
    {
        firstElement = elements.Next(); // save the first collation element
    }
    catch (Exception)
    {
        Errln("call to i.Next() failed.");
        return;
    }

    try
    {
        elements.SetOffset(3); // go to the expanding character
    }
    catch (Exception)
    {
        Errln("call to i.setOffset(3) failed.");
        return;
    }

    try
    {
        elements.Next(); // but only use up half of it
    }
    catch (Exception)
    {
        Errln("call to i.Next() failed.");
        return;
    }

    try
    {
        elements.SetOffset(0); // go back to the beginning
    }
    catch (Exception)
    {
        // Unlike the other steps, a failure here is reported without returning.
        Errln("call to i.setOffset(0) failed. ");
    }

    int reread;
    try
    {
        reread = elements.Next(); // and get this one again
    }
    catch (Exception)
    {
        Errln("call to i.Next() failed. ");
        return;
    }

    if (reread != firstElement)
    {
        Errln("got 0x" + (reread).ToHexString() + ", expected 0x" + (firstElement).ToHexString());
    }
}
/// <summary>
/// Verifies <c>CollationElementIterator.GetMaxExpansion</c> for a tailored collator:
/// first that the reported maximum is never smaller than the number of elements actually
/// produced for each BMP character, then exact values for selected code points, a
/// supplementary unassigned code point, and a jamo (with and without a tailored expansion).
/// </summary>
public void TestMaxExpansion(/* char* par */)
{
    int unassigned = 0xEFFFD;
    String rule = "&a < ab < c/aba < d < z < ch";
    RuleBasedCollator coll = null;
    try
    {
        coll = new RuleBasedCollator(rule);
    }
    catch (Exception)
    {
        Warnln("Fail to create RuleBasedCollator");
        return;
    }

    char ch = (char)0;
    String str = ch + "";
    CollationElementIterator iter = coll.GetCollationElementIterator(str);

    // Lower-bound check over U+0001..U+FFFF: iterate each single character backwards and
    // count its elements; GetMaxExpansion of the last element must be at least that count.
    while (ch < 0xFFFF)
    {
        int count = 1;
        ch++;
        str = ch + "";
        iter.SetText(str);
        int order = iter.Previous();

        // thai management
        if (order == 0)
        {
            order = iter.Previous();
        }

        while (iter.Previous() != CollationElementIterator.NULLORDER)
        {
            count++;
        }

        if (iter.GetMaxExpansion(order) < count)
        {
            Errln("Failure at codepoint " + ch + ", maximum expansion count < " + count);
        }
    }

    // testing for exact max expansion
    ch = (char)0;
    while (ch < 0x61)
    {
        str = ch + "";
        iter.SetText(str);
        int order = iter.Previous();

        if (iter.GetMaxExpansion(order) != 1)
        {
            Errln("Failure at codepoint 0x" + (ch).ToHexString()
                + " maximum expansion count == 1");
        }
        ch++;
    }

    // 'c' is tailored as "c/aba", and the test expects a maximum expansion of 3.
    ch = (char)0x63;
    str = ch + "";
    iter.SetText(str);
    int temporder = iter.Previous();
    if (iter.GetMaxExpansion(temporder) != 3)
    {
        Errln("Failure at codepoint 0x" + (ch).ToHexString()
            + " maximum expansion count == 3");
    }

    ch = (char)0x64;
    str = ch + "";
    iter.SetText(str);
    temporder = iter.Previous();
    if (iter.GetMaxExpansion(temporder) != 1)
    {
        Errln("Failure at codepoint 0x" + (ch).ToHexString()
            + " maximum expansion count == 1");
    }

    // Supplementary unassigned code point: the test expects a maximum expansion of 2.
    str = UChar.ToString(unassigned);
    iter.SetText(str);
    temporder = iter.Previous();
    if (iter.GetMaxExpansion(temporder) != 2)
    {
        // Report the code point under test; 'ch' still holds 0x64 from the previous check.
        Errln("Failure at codepoint 0x" + (unassigned).ToHexString()
            + " maximum expansion count == 2");
    }

    // testing jamo
    ch = (char)0x1165;
    str = ch + "";
    iter.SetText(str);
    temporder = iter.Previous();
    if (iter.GetMaxExpansion(temporder) > 3)
    {
        Errln("Failure at codepoint 0x" + (ch).ToHexString()
            + " maximum expansion count < 3");
    }

    // testing special jamo: the escaped rule below spells out "&q<\u1165/qqqq"
    rule = "\u0026\u0071\u003c\u1165\u002f\u0071\u0071\u0071\u0071";

    try
    {
        coll = new RuleBasedCollator(rule);
    }
    catch (Exception)
    {
        Errln("Fail to create RuleBasedCollator");
        return;
    }

    // str still holds U+1165 from the jamo check above.
    iter = coll.GetCollationElementIterator(str);
    temporder = iter.Previous();
    if (iter.GetMaxExpansion(temporder) != 6)
    {
        Errln("Failure at codepoint 0x" + (ch).ToHexString()
            + " maximum expansion count == 6");
    }
}
/// <summary>
/// Exercises <c>SetOffset</c>/<c>GetOffset</c>: boundary offsets, the offset after a full
/// forward pass, resetting to the start, and offsets that fall in the middle of a contraction
/// or a surrogate pair (which must behave like the offset at the start of that unit).
/// </summary>
public void TestOffset(/* char* par */)
{
    const String contractionMessage =
        "Error: setting offset in the middle of a contraction should be the same as setting it to the start of the contraction";
    const String surrogateMessage =
        "Error: setting offset in the middle of a surrogate pair should be the same as setting it to the start of the surrogate pair";

    RuleBasedCollator english;
    try
    {
        english = (RuleBasedCollator)Collator.GetInstance(new CultureInfo("en-US"));
    }
    catch (Exception)
    {
        Warnln("ERROR: in creation of collator of ENGLISH locale");
        return;
    }

    CollationElementIterator iter = english.GetCollationElementIterator(test1);

    // testing boundaries
    iter.SetOffset(0);
    if (iter.Previous() != CollationElementIterator.NULLORDER)
    {
        Errln("Error: After setting offset to 0, we should be at the end "
            + "of the backwards iteration");
    }

    iter.SetOffset(test1.Length);
    if (iter.Next() != CollationElementIterator.NULLORDER)
    {
        Errln("Error: After setting offset to the end of the string, we "
            + "should be at the end of the forwards iteration");
    }

    // Run all the way through the iterator, then get the offset
    int[] orders = CollationTest.GetOrders(iter);
    Logln("orders.Length = " + orders.Length);

    int endOffset = iter.GetOffset();
    if (endOffset != test1.Length)
    {
        Errln("offset at end != length: " + endOffset + " vs " + test1.Length);
    }

    // Now set the offset back to the beginning and see if it works
    CollationElementIterator pristine = english.GetCollationElementIterator(test1);
    try
    {
        iter.SetOffset(0);
    }
    catch (Exception)
    {
        Errln("setOffset failed.");
    }
    assertEqual(iter, pristine);

    // setting offset in the middle of a contraction
    RuleBasedCollator tailored;
    try
    {
        tailored = new RuleBasedCollator("& a < ch");
    }
    catch (Exception)
    {
        Errln("Error: in creation of Spanish collator");
        return;
    }

    iter = tailored.GetCollationElementIterator("change");
    int[] fromStart = CollationTest.GetOrders(iter);
    iter.SetOffset(1); // sets offset in the middle of ch
    int[] fromMiddle = CollationTest.GetOrders(iter);
    if (!Arrays.Equals(fromStart, fromMiddle))
    {
        Errln(contractionMessage);
    }

    iter = tailored.GetCollationElementIterator("peache");
    iter.SetOffset(3);
    fromStart = CollationTest.GetOrders(iter);
    iter.SetOffset(4); // sets offset in the middle of ch
    fromMiddle = CollationTest.GetOrders(iter);
    if (!Arrays.Equals(fromStart, fromMiddle))
    {
        Errln(contractionMessage);
    }

    // setting offset in the middle of a surrogate pair
    iter = tailored.GetCollationElementIterator("\ud800\udc00str");
    fromStart = CollationTest.GetOrders(iter);
    iter.SetOffset(1); // sets offset in the middle of surrogate
    fromMiddle = CollationTest.GetOrders(iter);
    if (!Arrays.Equals(fromStart, fromMiddle))
    {
        Errln(surrogateMessage);
    }

    iter = tailored.GetCollationElementIterator("simple\ud800\udc00str");
    iter.SetOffset(6);
    fromStart = CollationTest.GetOrders(iter);
    iter.SetOffset(7); // sets offset in the middle of surrogate
    fromMiddle = CollationTest.GetOrders(iter);
    if (!Arrays.Equals(fromStart, fromMiddle))
    {
        Errln(surrogateMessage);
    }

    // TODO: try iterating halfway through a messy string.
}
/// <summary>
/// For each source string, checks that its collation elements equal the concatenation of the
/// elements of the space-separated pieces listed in the matching target entry, then runs
/// <c>CollationTest.BackAndForth</c> on the source iterator.
/// </summary>
public void TestDiscontiguous()
{
    String rulestr = "&z < AB < X\u0300 < ABC < X\u0300\u0315";
    String[] src =
    {
        "ADB", "ADBC", "A\u0315B", "A\u0315BC",
        // base character blocked
        "XD\u0300", "XD\u0300\u0315",
        // non blocking combining character
        "X\u0319\u0300", "X\u0319\u0300\u0315",
        // blocking combining character
        "X\u0314\u0300", "X\u0314\u0300\u0315",
        // contraction prefix
        "ABDC", "AB\u0315C", "X\u0300D\u0315",
        "X\u0300\u0319\u0315", "X\u0300\u031A\u0315",
        // ends not with a contraction character
        "X\u0319\u0300D", "X\u0319\u0300\u0315D",
        "X\u0300D\u0315D", "X\u0300\u0319\u0315D",
        "X\u0300\u031A\u0315D"
    };
    String[] tgt =
    {
        // non blocking combining character
        "A D B", "A D BC", "A \u0315 B", "A \u0315 BC",
        // base character blocked
        "X D \u0300", "X D \u0300\u0315",
        // non blocking combining character
        "X\u0300 \u0319", "X\u0300\u0315 \u0319",
        // blocking combining character
        "X \u0314 \u0300", "X \u0314 \u0300\u0315",
        // contraction prefix
        "AB DC", "AB \u0315 C", "X\u0300 D \u0315",
        "X\u0300\u0315 \u0319", "X\u0300 \u031A \u0315",
        // ends not with a contraction character
        "X\u0300 \u0319D", "X\u0300\u0315 \u0319D",
        "X\u0300 D\u0315D", "X\u0300\u0315 \u0319D",
        "X\u0300 \u031A\u0315D"
    };

    try
    {
        RuleBasedCollator coll = new RuleBasedCollator(rulestr);
        CollationElementIterator sourceElements = coll.GetCollationElementIterator("");
        CollationElementIterator pieceElements = coll.GetCollationElementIterator("");

        for (int index = 0; index < src.Length; index++)
        {
            String expected = tgt[index];
            sourceElements.SetText(src[index]);

            int start = 0;
            while (start < expected.Length)
            {
                // A piece runs up to the next space, or to the end of the string.
                int end = expected.IndexOf(' ', start);
                if (end < 0)
                {
                    end = expected.Length;
                }

                // ICU4N: Corrected 2nd parameter (Substring takes a length, not an end index)
                String piece = expected.Substring(start, end - start);
                pieceElements.SetText(piece);

                for (int ce = pieceElements.Next();
                     ce != CollationElementIterator.NULLORDER;
                     ce = pieceElements.Next())
                {
                    if (ce != sourceElements.Next())
                    {
                        Errln("Discontiguos contraction test mismatch at" + index);
                        return;
                    }
                }

                start = end + 1;
            }

            sourceElements.Reset();
            CollationTest.BackAndForth(this, sourceElements);
        }
    }
    catch (Exception ex)
    {
        Warnln("Error running discontiguous tests " + ex.ToString());
    }
}
/// <summary>
/// Compares the character data stored in two different strings based on the
/// collation rules. Returns information about whether a string is less
/// than, greater than or equal to another string in a language.
/// This can be overriden in a subclass.
/// </summary>
/// <param name="source">The left-hand string of the comparison.</param>
/// <param name="target">The right-hand string of the comparison.</param>
/// <returns>
/// <c>Collator.LESS</c>, <c>Collator.EQUAL</c> or <c>Collator.GREATER</c> when the collation
/// elements decide the result. When they tie and the strength is IDENTICAL, the sign of an
/// ordinal (UTF-16 code unit) comparison of the strings, normalized first when a
/// decomposition mode is set.
/// </returns>
/// <exception cref="NullPointerException"> if <code>source</code> or <code>target</code> is null. </exception>
public override int Compare(String source, String target)
{
    lock (this)
    {
        if (source == null || target == null)
        {
            throw new NullPointerException();
        }

        // The basic algorithm here is that we use CollationElementIterators
        // to step through both the source and target strings.  We compare each
        // collation element in the source string against the corresponding one
        // in the target, checking for differences.
        //
        // If a difference is found, we set <result> to LESS or GREATER to
        // indicate whether the source string is less or greater than the target.
        //
        // However, it's not that simple.  If we find a tertiary difference
        // (e.g. 'A' vs. 'a') near the beginning of a string, it can be
        // overridden by a primary difference (e.g. "A" vs. "B") later in
        // the string.  For example, "AA" < "aB", even though 'A' > 'a'.
        //
        // To keep track of this, checkSecTer and checkTertiary record whether a
        // secondary / tertiary difference may still be recorded. A primary
        // difference returns immediately and so overrides anything recorded earlier.

        int result = Collator.EQUAL;

        // The cursors are cached on the instance and re-targeted on every call.
        if (SourceCursor == null)
        {
            SourceCursor = GetCollationElementIterator(source);
        }
        else
        {
            SourceCursor.Text = source;
        }
        if (TargetCursor == null)
        {
            TargetCursor = GetCollationElementIterator(target);
        }
        else
        {
            TargetCursor.Text = target;
        }

        int sOrder = 0, tOrder = 0;

        bool initialCheckSecTer = Strength >= Collator.SECONDARY;
        bool checkSecTer = initialCheckSecTer;
        bool checkTertiary = Strength >= Collator.TERTIARY;

        // gets/gett == false means "keep the current element on that side for one round".
        bool gets = true, gett = true;

        while (true)
        {
            // Get the next collation element in each of the strings, unless
            // we've been requested to skip it.
            if (gets)
            {
                sOrder = SourceCursor.Next();
            }
            else
            {
                gets = true;
            }
            if (gett)
            {
                tOrder = TargetCursor.Next();
            }
            else
            {
                gett = true;
            }

            // If we've hit the end of one of the strings, jump out of the loop
            if ((sOrder == CollationElementIterator.NULLORDER)
                || (tOrder == CollationElementIterator.NULLORDER))
            {
                break;
            }

            int pSOrder = CollationElementIterator.PrimaryOrder(sOrder);
            int pTOrder = CollationElementIterator.PrimaryOrder(tOrder);

            // If there's no difference at this position, we can skip it
            if (sOrder == tOrder)
            {
                if (Tables_Renamed.FrenchSec && pSOrder != 0)
                {
                    if (!checkSecTer)
                    {
                        // in french, a secondary difference more to the right is stronger,
                        // so accents have to be checked with each base element
                        checkSecTer = initialCheckSecTer;
                        // but tertiary differences are less important than the first
                        // secondary difference, so checking tertiary remains disabled
                        checkTertiary = false;
                    }
                }
                continue;
            }

            // Compare primary differences first.
            if (pSOrder != pTOrder)
            {
                if (sOrder == 0)
                {
                    // The entire source element is ignorable.
                    // Skip to the next source element, but don't fetch another target element.
                    gett = false;
                    continue;
                }
                if (tOrder == 0)
                {
                    // The entire target element is ignorable.
                    // Skip to the next target element, but don't fetch another source element.
                    gets = false;
                    continue;
                }

                // The source and target elements aren't ignorable, but it's still possible
                // for the primary component of one of the elements to be ignorable....
                if (pSOrder == 0) // primary order in source is ignorable
                {
                    // The source's primary is ignorable, but the target's isn't.  We treat ignorables
                    // as a secondary difference, so remember that we found one.
                    if (checkSecTer)
                    {
                        result = Collator.GREATER; // (strength is SECONDARY)
                        checkSecTer = false;
                    }
                    // Skip to the next source element, but don't fetch another target element.
                    gett = false;
                }
                else if (pTOrder == 0)
                {
                    // record differences - see the comment above.
                    if (checkSecTer)
                    {
                        result = Collator.LESS; // (strength is SECONDARY)
                        checkSecTer = false;
                    }
                    // Skip to the next target element, but don't fetch another source element.
                    gets = false;
                }
                else
                {
                    // Neither of the orders is ignorable, and we already know that the primary
                    // orders are different because of the (pSOrder != pTOrder) test above.
                    // Record the difference and stop the comparison.
                    if (pSOrder < pTOrder)
                    {
                        return (Collator.LESS); // (strength is PRIMARY)
                    }
                    else
                    {
                        return (Collator.GREATER); // (strength is PRIMARY)
                    }
                }
            }
            else
            {
                // primary order is the same, but complete order is different. So there
                // are no base elements at this point, only ignorables (Since the strings are
                // normalized)
                if (checkSecTer)
                {
                    // a secondary or tertiary difference may still matter
                    short secSOrder = CollationElementIterator.SecondaryOrder(sOrder);
                    short secTOrder = CollationElementIterator.SecondaryOrder(tOrder);
                    if (secSOrder != secTOrder)
                    {
                        // there is a secondary difference
                        result = (secSOrder < secTOrder) ? Collator.LESS : Collator.GREATER; // (strength is SECONDARY)
                        checkSecTer = false;
                        // (even in french, only the first secondary difference within
                        //  a base character matters)
                    }
                    else
                    {
                        if (checkTertiary)
                        {
                            // a tertiary difference may still matter
                            short terSOrder = CollationElementIterator.TertiaryOrder(sOrder);
                            short terTOrder = CollationElementIterator.TertiaryOrder(tOrder);
                            if (terSOrder != terTOrder)
                            {
                                // there is a tertiary difference
                                result = (terSOrder < terTOrder) ? Collator.LESS : Collator.GREATER; // (strength is TERTIARY)
                                checkTertiary = false;
                            }
                        }
                    }
                } // if (checkSecTer)
            } // if ( pSOrder != pTOrder )
        } // while()

        if (sOrder != CollationElementIterator.NULLORDER)
        {
            // (tOrder must be CollationElementIterator::NULLORDER,
            //  since this point is only reached when sOrder or tOrder is NULLORDER.)
            // The source string has more elements, but the target string hasn't.
            do
            {
                if (CollationElementIterator.PrimaryOrder(sOrder) != 0)
                {
                    // We found an additional non-ignorable base character in the source string.
                    // This is a primary difference, so the source is greater
                    return (Collator.GREATER); // (strength is PRIMARY)
                }
                else if (CollationElementIterator.SecondaryOrder(sOrder) != 0)
                {
                    // Additional secondary elements mean the source string is greater
                    if (checkSecTer)
                    {
                        result = Collator.GREATER; // (strength is SECONDARY)
                        checkSecTer = false;
                    }
                }
            } while ((sOrder = SourceCursor.Next()) != CollationElementIterator.NULLORDER);
        }
        else if (tOrder != CollationElementIterator.NULLORDER)
        {
            // The target string has more elements, but the source string hasn't.
            do
            {
                if (CollationElementIterator.PrimaryOrder(tOrder) != 0)
                {
                    // We found an additional non-ignorable base character in the target string.
                    // This is a primary difference, so the source is less
                    return (Collator.LESS); // (strength is PRIMARY)
                }
                else if (CollationElementIterator.SecondaryOrder(tOrder) != 0)
                {
                    // Additional secondary elements in the target mean the source string is less
                    if (checkSecTer)
                    {
                        result = Collator.LESS; // (strength is SECONDARY)
                        checkSecTer = false;
                    }
                }
            } while ((tOrder = TargetCursor.Next()) != CollationElementIterator.NULLORDER);
        }

        // For IDENTICAL comparisons, we use a bitwise character comparison
        // as a tiebreaker if all else is equal.
        // String.CompareTo is culture-sensitive in .NET, so the tiebreaker uses
        // String.CompareOrdinal to compare UTF-16 code units as the comment requires.
        if (result == 0 && Strength == IDENTICAL)
        {
            int mode = Decomposition;
            Normalizer.Form form;
            if (mode == CANONICAL_DECOMPOSITION)
            {
                form = Normalizer.Form.NFD;
            }
            else if (mode == FULL_DECOMPOSITION)
            {
                form = Normalizer.Form.NFKD;
            }
            else
            {
                // No decomposition requested: compare the raw strings.
                return (String.CompareOrdinal(source, target));
            }

            String sourceDecomposition = Normalizer.Normalize(source, form);
            String targetDecomposition = Normalizer.Normalize(target, form);
            return (String.CompareOrdinal(sourceDecomposition, targetDecomposition));
        }
        return (result);
    }
}
/// <summary>
/// For the "root" and "root@collation=search" collators, iterates forwards and then backwards
/// over a fixed text of Hangul, jamo and Latin samples and checks the offset reported before
/// each element against a per-locale table of expected offsets.
/// </summary>
public void TestSearchCollatorElements()
{
    String tsceText =
        " \uAC00" +              // simple LV Hangul
        " \uAC01" +              // simple LVT Hangul
        " \uAC0F" +              // LVTT, last jamo expands for search
        " \uAFFF" +              // LLVVVTT, every jamo expands for search
        " \u1100\u1161\u11A8" +  // 0xAC01 as conjoining jamo
        " \u3131\u314F\u3131" +  // 0xAC01 as compatibility jamo
        " \u1100\u1161\u11B6" +  // 0xAC0F as conjoining jamo; last expands for search
        " \u1101\u1170\u11B6" +  // 0xAFFF as conjoining jamo; all expand for search
        " \u00E6" +              // small letter ae, expands
        " \u1E4D" +              // small letter o with tilde and acute, decomposes
        " ";

    int[] rootStandardOffsets =
    {
        0, 1, 2,
        2, 3, 4, 4,
        4, 5, 6, 6,
        6, 7, 8, 8,
        8, 9, 10, 11,
        12, 13, 14, 15,
        16, 17, 18, 19,
        20, 21, 22, 23,
        24, 25, 26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */
        26, 27, 28, 28,
        28,
        29
    };

    int[] rootSearchOffsets =
    {
        0, 1, 2,
        2, 3, 4, 4,
        4, 5, 6, 6, 6,
        6, 7, 8, 8, 8, 8, 8, 8,
        8, 9, 10, 11,
        12, 13, 14, 15,
        16, 17, 18, 19, 20,
        20, 21, 22, 22, 23, 23, 23, 24,
        24, 25, 26, /* plus another 1-2 offset=26 if ae-ligature maps to three CEs */
        26, 27, 28, 28,
        28,
        29
    };

    TSCEItem[] tsceItems =
    {
        new TSCEItem("root", rootStandardOffsets),
        new TSCEItem("root@collation=search", rootSearchOffsets),
    };

    foreach (TSCEItem tsceItem in tsceItems)
    {
        String localeString = tsceItem.LocaleString;
        ULocale uloc = new ULocale(localeString);
        RuleBasedCollator col;
        try
        {
            col = (RuleBasedCollator)Collator.GetInstance(uloc);
        }
        catch (Exception)
        {
            Errln("Error: in locale " + localeString + ", err in Collator.getInstance");
            continue;
        }

        CollationElementIterator uce = col.GetCollationElementIterator(tsceText);
        int[] offsets = tsceItem.GetOffsets();
        int expectedCount = offsets.Length;

        // forwards test: the offset is sampled before each call to Next().
        int index = 0;
        while (true)
        {
            int offset = uce.GetOffset();
            int element = uce.Next();
            Logln(String.Format("({0}) offset={1:d2}  ce={2:x8}\n", tsceItem.LocaleString, offset, element));

            if (element == 0)
            {
                Errln("Error: in locale " + localeString + ", CEIterator next() returned element 0");
            }

            if (index < expectedCount)
            {
                if (offset != offsets[index])
                {
                    Errln("Error: in locale " + localeString + ", expected CEIterator next()->getOffset "
                        + offsets[index] + ", got " + offset);
                    //ioff = noff;
                    //break;
                }
                index++;
            }
            else
            {
                Errln("Error: in locale " + localeString + ", CEIterator next() returned more elements than expected");
            }

            if (element == CollationElementIterator.NULLORDER)
            {
                break;
            }
        }
        if (index < expectedCount)
        {
            Errln("Error: in locale " + localeString + ", CEIterator next() returned fewer elements than expected");
        }

        // backwards test: start from the end of the text and walk the table in reverse.
        uce.SetOffset(tsceText.Length);
        index = expectedCount;
        while (true)
        {
            int offset = uce.GetOffset();
            int element = uce.Previous();

            if (element == 0)
            {
                Errln("Error: in locale " + localeString + ", CEIterator previous() returned element 0");
            }

            if (index > 0)
            {
                index--;
                if (offset != offsets[index])
                {
                    Errln("Error: in locale " + localeString + ", expected CEIterator previous()->getOffset "
                        + offsets[index] + ", got " + offset);
                    //ioff = 0;
                    //break;
                }
            }
            else
            {
                Errln("Error: in locale " + localeString + ", CEIterator previous() returned more elements than expected");
            }

            if (element == CollationElementIterator.NULLORDER)
            {
                break;
            }
        }
        if (index > 0)
        {
            Errln("Error: in locale " + localeString + ", CEIterator previous() returned fewer elements than expected");
        }
    }
}
/// <summary>
/// Builds a collation key for <paramref name="source"/>: a string made of the primary
/// orders of its collation elements, followed by the secondary and tertiary orders,
/// with a null character separating the three sections.
/// </summary>
/// <param name="source">The string to produce a key for; may be <c>null</c>.</param>
/// <returns>
/// A <c>RuleBasedCollationKey</c> created from <paramref name="source"/> and the
/// generated key string, or <c>null</c> when <paramref name="source"/> is <c>null</c>.
/// </returns>
public override CollationKey GetCollationKey(String source)
{
    // The scratch buffers and the element cursor are members that get reused between
    // calls, so the whole computation runs while holding the lock on this instance.
    lock (this)
    {
        //
        // Outline of the algorithm: fetch the collation element of every character in
        // the source, turn each component into a char and write it into the key.
        // Every element has a primary (A vs B), a secondary (A vs A-acute) and a
        // tertiary (A vs a) component, and a primary difference late in the string
        // outranks a secondary or tertiary difference that occurs earlier.
        //
        // For that reason all primary orders are written first, then all secondary
        // orders, then all tertiary orders, separated by nulls.
        //
        // Hypothetical example, writing each collation element as three digits
        // (primary, secondary, tertiary):
        //
        //   String:              A     a     B   \u00e9 <--(e-acute)
        //   Collation Elements: 101   100   201  510
        //
        //   Collation Key:      1125<null>0001<null>1010
        //
        // With French secondary ordering, accents are compared starting from the *end*
        // of the string, while the accents attached to one base character are still
        // compared from their beginning. This is achieved by reversing the accents of
        // each base character and then reversing the complete secondary section.
        // For the example above a French collator might produce:
        //
        //   Collation Key:      1125<null>1000<null>1010
        //
        if (source == null)
        {
            return null;
        }

        // Clear the scratch buffers from a previous call, or create them on first use.
        if (PrimResult != null)
        {
            PrimResult.Length = 0;
            SecResult.Length = 0;
            TerResult.Length = 0;
        }
        else
        {
            PrimResult = new StringBuffer();
            SecResult = new StringBuffer();
            TerResult = new StringBuffer();
        }

        bool includeSecondary = Strength >= Collator.SECONDARY;
        bool includeTertiary = Strength >= Collator.TERTIARY;

        // Index in SecResult just past the secondary order of the most recent base
        // (non-ignorable) element; everything after it is a run of accents that may
        // still have to be reversed.
        int accentRunStart = 0;

        if (SourceCursor != null)
        {
            SourceCursor.Text = source;
        }
        else
        {
            SourceCursor = GetCollationElementIterator(source);
        }

        // Visit every collation element of the source.
        for (int element = SourceCursor.Next();
             element != CollationElementIterator.NULLORDER;
             element = SourceCursor.Next())
        {
            int secondary = CollationElementIterator.SecondaryOrder(element);
            int tertiary = CollationElementIterator.TertiaryOrder(element);

            if (CollationElementIterator.IsIgnorable(element))
            {
                // Ignorable element: it adds nothing to the primary section, and its
                // non-zero secondary/tertiary orders are shifted by the table maxima.
                if (includeSecondary && secondary != 0)
                {
                    SecResult.Append((char)(secondary + Tables_Renamed.MaxSecOrder + COLLATIONKEYOFFSET));
                }
                if (includeTertiary && tertiary != 0)
                {
                    TerResult.Append((char)(tertiary + Tables_Renamed.MaxTerOrder + COLLATIONKEYOFFSET));
                }
                continue;
            }

            PrimResult.Append((char)(CollationElementIterator.PrimaryOrder(element) + COLLATIONKEYOFFSET));

            if (includeSecondary)
            {
                // A new base character was reached: with French secondary ordering,
                // reverse the accents collected for the previous base character.
                if (Tables_Renamed.FrenchSec && accentRunStart < SecResult.Length())
                {
                    RBCollationTables.Reverse(SecResult, accentRunStart, SecResult.Length());
                }

                // Record the position after this base character's secondary order so
                // that a later reversal knows how far back to go.
                SecResult.Append((char)(secondary + COLLATIONKEYOFFSET));
                accentRunStart = SecResult.Length();
            }

            if (includeTertiary)
            {
                TerResult.Append((char)(tertiary + COLLATIONKEYOFFSET));
            }
        }

        if (Tables_Renamed.FrenchSec)
        {
            // Reverse any accents collected after the last base character ...
            if (accentRunStart < SecResult.Length())
            {
                RBCollationTables.Reverse(SecResult, accentRunStart, SecResult.Length());
            }

            // ... then reverse the whole secondary section to obtain French ordering.
            RBCollationTables.Reverse(SecResult, 0, SecResult.Length());
        }

        // Assemble: primary <null> secondary <null> tertiary.
        PrimResult.Append((char)0);
        SecResult.Append((char)0);
        SecResult.Append(TerResult.ToString());
        PrimResult.Append(SecResult.ToString());

        if (Strength == IDENTICAL)
        {
            // At IDENTICAL strength the source text itself, normalized according to
            // the decomposition mode, is appended after one more null separator.
            PrimResult.Append((char)0);
            int mode = Decomposition;
            if (mode == CANONICAL_DECOMPOSITION)
            {
                PrimResult.Append(Normalizer.Normalize(source, Normalizer.Form.NFD));
            }
            else if (mode == FULL_DECOMPOSITION)
            {
                PrimResult.Append(Normalizer.Normalize(source, Normalizer.Form.NFKD));
            }
            else
            {
                PrimResult.Append(source);
            }
        }

        return new RuleBasedCollationKey(source, PrimResult.ToString());
    }
}