/// <summary>
/// Two strings that are canonically equivalent must test equal under a
/// canonical caseless match. The <c>Normalizer.Compare</c> overload that is
/// exercised is selected from the UTF-16 lengths of the two inputs; a
/// non-zero comparison result is reported through <c>Errln</c>.
/// </summary>
/// <remarks>
/// See UAX #21 Case Mappings and Jitterbug 2021 and
/// Unicode Technical Committee meeting consensus 92-C31.
/// Combinations not listed below (an empty string on either side, or a
/// longer <paramref name="s1"/> with a one-unit <paramref name="s2"/>) are
/// not compared at all.
/// </remarks>
/// <param name="s1">First string of the pair.</param>
/// <param name="s2">Second string of the pair.</param>
private void compare(String s1, String s2)
{
    if (s1.Length == 1)
    {
        if (s2.Length == 1)
        {
            // one UTF-16 unit on each side: (int, int) overload
            int left = UTF16.CharAt(s1, 0);
            int right = UTF16.CharAt(s2, 0);
            if (Normalizer.Compare(left, right, Normalizer.COMPARE_IGNORE_CASE) != 0)
            {
                Errln("Normalizer.compare(int,int) failed for s1: " + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
            }
        }
        else if (s2.Length > 1)
        {
            // one UTF-16 unit against a longer string: (int, string) overload
            int left = UTF16.CharAt(s1, 0);
            if (Normalizer.Compare(left, s2, Normalizer.COMPARE_IGNORE_CASE) != 0)
            {
                Errln("Normalizer.compare(int,String) failed for s1: " + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
            }
        }
    }
    else if (s1.Length > 1 && s2.Length > 1)
    {
        // TODO: Re-enable this tests after UTC fixes UAX 21
        char[] leftChars = s1.ToCharArray();
        char[] rightChars = s2.ToCharArray();
        if (Normalizer.Compare(leftChars, rightChars, Normalizer.COMPARE_IGNORE_CASE) != 0)
        {
            Errln("Normalizer.compare(char[],char[]) failed for s1: " + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
        }
    }
}
/// <summary>
/// We have a segment, in NFD. Find all the strings that are canonically
/// equivalent to it.
/// </summary>
/// <param name="segment">The segment whose equivalents are collected.</param>
/// <returns>
/// A freshly allocated array holding every permutation for which
/// <c>Normalizer.Compare(possible, segment, 0)</c> returned 0.
/// </returns>
private string[] GetEquivalents(string segment)
{
    ISet<string> accepted = new HashSet<string>();
    ISet<string> seeds = GetEquivalents2(segment);
    ISet<string> candidates = new HashSet<string>(); // scratch set, refilled for every seed

    // now get all the permutations
    // add only the ones that are canonically equivalent
    // TODO: optimize by not permuting any class zero.
    foreach (string seed in seeds)
    {
        candidates.Clear();
#pragma warning disable 612, 618
        Permute(seed, SKIP_ZEROS, candidates);
#pragma warning restore 612, 618

        foreach (string possible in candidates)
        {
            /*
             * String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
             * if (attempt.equals(segment)) {
             */
            if (Normalizer.Compare(possible, segment, 0) != 0)
            {
                if (PROGRESS)
                {
                    Console.Out.WriteLine("-Skipping Permutation: " + Utility.Hex(possible));
                }
                continue;
            }

            if (PROGRESS)
            {
                Console.Out.WriteLine("Adding Permutation: " + Utility.Hex(possible));
            }
            accepted.Add(possible);
        }
    }

    // convert into a String[] to clean up storage
    string[] equivalents = new string[accepted.Count];
    accepted.CopyTo(equivalents, 0);
    return equivalents;
}
/// <summary>
/// See if the decomposition of <paramref name="comp"/> is at
/// <paramref name="segment"/> starting at <paramref name="segmentPos"/>
/// (with canonical rearrangement!).
/// If so, take the remainder, and return the equivalents.
/// </summary>
/// <param name="comp">Code point whose decomposition is matched; when no decomposition is available the code point itself is used.</param>
/// <param name="segment">The segment being searched.</param>
/// <param name="segmentPos">Index in <paramref name="segment"/> at which matching starts.</param>
/// <param name="buf">Working buffer shared among callees; it is reset to length 0 before use and receives the unmatched remainder.</param>
/// <returns>
/// <c>null</c> if the decomposition could not be fully matched or the
/// recombined string does not compare equal to the rest of the segment;
/// <c>SET_WITH_NULL_STRING</c> if it matched with no remainder; otherwise
/// the result of <c>GetEquivalents2</c> on the remainder.
/// </returns>
private ISet<string> Extract(int comp, string segment, int segmentPos, StringBuffer buf)
{
    if (PROGRESS)
    {
        Console.Out.WriteLine(" extract: " + Utility.Hex(UTF16.ValueOf(comp)) + ", " + Utility.Hex(segment.Substring(segmentPos)));
    }

    // fall back to the code point itself when there is no decomposition
    string decomp = nfcImpl.GetDecomposition(comp) ?? UTF16.ValueOf(comp);

    // See if it matches the start of segment (at segmentPos)
    int wanted = UTF16.CharAt(decomp, 0);
    int decompIndex = UTF16.GetCharCount(wanted); // position just past the first decomp char
    bool matchedAll = false;

    buf.Length = 0; // initialize working buffer, shared among callees

    int segIndex = segmentPos;
    while (segIndex < segment.Length)
    {
        int current = UTF16.CharAt(segment, segIndex);
        int nextIndex = segIndex + UTF16.GetCharCount(current);

        if (current != wanted)
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" buffer: " + Utility.Hex(UTF16.ValueOf(current)));
            }
            // brute force approach
            UTF16.Append(buf, current);
            /* TODO: optimize
             * // since we know that the classes are monotonically increasing, after zero
             * // e.g. 0 5 7 9 0 3
             * // we can do an optimization
             * // there are only a few cases that work: zero, less, same, greater
             * // if both classes are the same, we fail
             * // if the decomp class < the segment class, we fail
             *
             * segClass = getClass(cp);
             * if (decompClass <= segClass) return null;
             */
        }
        else
        {
            // if equal, eat another cp from decomp
            if (PROGRESS)
            {
                Console.Out.WriteLine(" matches: " + Utility.Hex(UTF16.ValueOf(current)));
            }
            if (decompIndex == decomp.Length)
            {
                // done, have all decomp characters!
                buf.Append(segment.Substring(nextIndex)); // add remaining segment chars
                matchedAll = true;
                break;
            }
            wanted = UTF16.CharAt(decomp, decompIndex);
            decompIndex += UTF16.GetCharCount(wanted);
        }

        segIndex = nextIndex;
    }

    if (!matchedAll)
    {
        return null; // we failed, characters left over
    }
    if (PROGRESS)
    {
        Console.Out.WriteLine("Matches");
    }
    if (buf.Length == 0)
    {
        return SET_WITH_NULL_STRING; // succeed, but no remainder
    }

    string remainder = buf.ToString();

    // brute force approach
    // to check to make sure result is canonically equivalent
    /*
     * String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
     * if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
     */
    string recombined = UTF16.ValueOf(comp) + remainder;
    if (Normalizer.Compare(recombined, segment.Substring(segmentPos), 0) != 0)
    {
        return null;
    }

    // get the remaining combinations
    return GetEquivalents2(remainder);
}
/// <summary>
/// Verify the conformance of the given line of the Unicode
/// normalization (UTR 15) test suite file. For each line,
/// there are five columns, corresponding to field[0]..field[4].
/// <para/>
/// The following invariants must be true for all conformant implementations:
/// <code>
/// c2 == NFC(c1) == NFC(c2) == NFC(c3)
/// c3 == NFD(c1) == NFD(c2) == NFD(c3)
/// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
/// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
/// </code>
/// </summary>
/// <param name="field">The 5 columns.</param>
/// <param name="line">The source line from the test suite file; only used in the failure report.</param>
/// <param name="options">
/// Options forwarded to <c>checkNorm</c>, <c>QuickCheck</c>, <c>IsNormalized</c> and
/// <c>iterativeNorm</c>, and shifted by <c>COMPARE_NORM_OPTIONS_SHIFT</c> into the
/// options of the final caseless <c>Normalizer.Compare</c>.
/// </param>
/// <returns><c>true</c> if the test passes.</returns>
private bool checkConformance(String[] field, String line, int options)
{
    bool pass = true;
    StringBuffer buf = new StringBuffer(); // scratch
    String @out, fcd;

    for (int i = 0; i < 5; ++i)
    {
        int fieldNum = i + 1;
        if (i < 3)
        {
            // canonical forms are only checked for the first three columns
            pass &= checkNorm(Normalizer.NFC, options, field[i], field[1], fieldNum);
            pass &= checkNorm(Normalizer.NFD, options, field[i], field[2], fieldNum);
        }
        pass &= checkNorm(Normalizer.NFKC, options, field[i], field[3], fieldNum);
        pass &= checkNorm(Normalizer.NFKD, options, field[i], field[4], fieldNum);
        cross(field[4] /*NFKD String*/, field[3] /*NFKC String*/, Normalizer.NFKC);
        cross(field[3] /*NFKC String*/, field[4] /*NFKD String*/, Normalizer.NFKD);
    }

    compare(field[1], field[2]);
    compare(field[0], field[1]);
    compare(field[0], field[2]);

    // test quick checks
    if (NormalizerQuickCheckResult.No == Normalizer.QuickCheck(field[1], Normalizer.NFC, options))
    {
        Errln("Normalizer error: quickCheck(NFC(s), Normalizer.NFC) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[2], Normalizer.NFD, options))
    {
        Errln("Normalizer error: quickCheck(NFD(s), Normalizer.NFD) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[3], Normalizer.NFKC, options))
    {
        Errln("Normalizer error: quickCheck(NFKC(s), Normalizer.NFKC) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[4], Normalizer.NFKD, options))
    {
        Errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
        pass = false;
    }

    if (!Normalizer.IsNormalized(field[1], Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
        pass = false;
    }
    if (!field[0].Equals(field[1]) && Normalizer.IsNormalized(field[0], Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(s, Normalizer.NFC) is TRUE");
        pass = false;
    }
    if (!Normalizer.IsNormalized(field[3], Normalizer.NFKC, options))
    {
        Errln("Normalizer error: isNormalized(NFKC(s), Normalizer.NFKC) is false");
        pass = false;
    }
    if (!field[0].Equals(field[3]) && Normalizer.IsNormalized(field[0], Normalizer.NFKC, options))
    {
        Errln("Normalizer error: isNormalized(s, Normalizer.NFKC) is TRUE");
        pass = false;
    }

    // test api that takes a char[]
    if (!Normalizer.IsNormalized(field[1].ToCharArray(), 0, field[1].Length, Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
        pass = false;
    }
    // test api that takes a codepoint
    if (!Normalizer.IsNormalized(UTF16.CharAt(field[1], 0), Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
        pass = false;
    }

    // test FCD quick check and "makeFCD"
    fcd = Normalizer.Normalize(field[0], Normalizer.FCD);
    if (Normalizer.NO == Normalizer.QuickCheck(fcd, Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }

    // check FCD return length
    {
        char[] fcd2 = new char[fcd.Length * 2];
        char[] src = field[0].ToCharArray();
        int fcdLen = Normalizer.Normalize(src, 0, src.Length, fcd2, fcd.Length, fcd2.Length, Normalizer.FCD, 0);
        if (fcdLen != fcd.Length)
        {
            Errln("makeFCD did not return the correct length");
        }
    }

    if (Normalizer.NO == Normalizer.QuickCheck(fcd, Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[2], Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(NFD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[4], Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }

    // Run the iterative normalizer forwards (+1) and backwards (-1); the
    // returned strings are not inspected here.
    @out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1, options);

    // NFD of the FCD form must equal the NFD column.
    @out = Normalizer.Normalize(fcd, Normalizer.NFD);
    if (!@out.Equals(field[2]))
    {
        Errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
        pass = false;
    }

    if (field[0] != field[2])
    {
        // two strings that are canonically equivalent must test
        // equal under a canonical caseless match
        // see UAX #21 Case Mappings and Jitterbug 2021 and
        // Unicode Technical Committee meeting consensus 92-C31
        int rc = Normalizer.Compare(field[0], field[2], (options << Normalizer.COMPARE_NORM_OPTIONS_SHIFT) | Normalizer.COMPARE_IGNORE_CASE);
        if (rc != 0)
        {
            Errln("Normalizer.compare(original, NFD, case-insensitive) returned " + rc + " instead of 0 for equal");
            pass = false;
        }
    }

    // Report the offending line last, so a line that fails only the caseless
    // comparison above is named as well.
    if (!pass)
    {
        Errln("FAIL: " + line);
    }

    return pass;
}