/// <summary>
/// Derives a rule status for the text between two break positions by scanning
/// its code points in order: the first digit found yields <c>WordNumber</c>,
/// the first letter found yields <c>WordLetter</c>, and a span containing
/// neither yields <c>WordNone</c>.
/// </summary>
/// <param name="current">Start of the span, relative to <c>m_start</c>, or <c>BreakIterator.Done</c>.</param>
/// <param name="next">Exclusive end of the span, relative to <c>m_start</c>, or <c>BreakIterator.Done</c>.</param>
/// <returns>
/// <c>WordNone</c> if either position is <c>BreakIterator.Done</c> or no digit/letter
/// is found; otherwise <c>WordNumber</c> or <c>WordLetter</c>.
/// </returns>
private RuleStatus CalcStatus(int current, int next)
{
    if (current == BreakIterator.Done || next == BreakIterator.Done)
    {
        return ICU4N.Text.RuleStatus.WordNone;
    }

    int begin = m_start + current;
    int end = m_start + next;

    int codepoint;
    for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint))
    {
        // Read the code point at the current loop position. Reading at 'begin'
        // on every pass would classify only the first code point of the span,
        // so a span such as "-5" or " a" would never be seen as number/letter.
        codepoint = UTF16.CharAt(m_text, 0, end, i);

        if (UChar.IsDigit(codepoint))
        {
            return ICU4N.Text.RuleStatus.WordNumber;
        }
        else if (UChar.IsLetter(codepoint))
        {
            // TODO: try to separately specify ideographic, kana?
            // [currently all bundled as letter for this case]
            return ICU4N.Text.RuleStatus.WordLetter;
        }
    }

    return ICU4N.Text.RuleStatus.WordNone;
}
// Two strings that are canonically equivalent must test equal under a
// canonical caseless match.
// See UAX #21 Case Mappings, Jitterbug 2021, and Unicode Technical Committee
// meeting consensus 92-C31.
//
// Picks the Normalizer.Compare overload according to the UTF-16 lengths of the
// two arguments and reports a failure through Errln when the result is non-zero.
// The combination (s1 longer than one unit, s2 exactly one unit) and empty
// strings are not checked by any branch.
private void compare(String s1, String s2)
{
    bool firstIsSingleUnit = s1.Length == 1;
    bool firstIsMultiUnit = s1.Length > 1;
    bool secondIsSingleUnit = s2.Length == 1;
    bool secondIsMultiUnit = s2.Length > 1;

    if (firstIsSingleUnit && secondIsSingleUnit)
    {
        // (int, int) overload
        int result = Normalizer.Compare(UTF16.CharAt(s1, 0), UTF16.CharAt(s2, 0), Normalizer.COMPARE_IGNORE_CASE);
        if (result != 0)
        {
            Errln("Normalizer.compare(int,int) failed for s1: " + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
        }
    }
    else if (firstIsSingleUnit && secondIsMultiUnit)
    {
        // (int, String) overload
        int result = Normalizer.Compare(UTF16.CharAt(s1, 0), s2, Normalizer.COMPARE_IGNORE_CASE);
        if (result != 0)
        {
            Errln("Normalizer.compare(int,String) failed for s1: " + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
        }
    }
    else if (firstIsMultiUnit && secondIsMultiUnit)
    {
        // (char[], char[]) overload
        // TODO (carried over): "Re-enable this tests after UTC fixes UAX 21" -
        // note that the check below is currently active.
        int result = Normalizer.Compare(s1.ToCharArray(), s2.ToCharArray(), Normalizer.COMPARE_IGNORE_CASE);
        if (result != 0)
        {
            Errln("Normalizer.compare(char[],char[]) failed for s1: " + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
        }
    }
}
/// <summary>
/// Returns the current 32-bit code point without parsing escapes, parsing
/// variables, or skipping whitespace.
/// </summary>
/// <remarks>
/// When <c>buf</c> is non-null the code point is read from it at <c>bufPos</c>;
/// otherwise it is read from <c>text</c> at <c>pos.Index</c>.
/// </remarks>
/// <returns>
/// The current 32-bit code point, or <c>DONE</c> when reading from <c>text</c>
/// and <c>pos.Index</c> is at or past its end.
/// </returns>
private int Current()
{
    if (buf == null)
    {
        int index = pos.Index;
        if (index < text.Length)
        {
            return UTF16.CharAt(text, index);
        }

        return DONE;
    }

    return UTF16.CharAt(buf, 0, buf.Length, bufPos);
}
/// <summary>
/// Iterates to the next script run, returning true if one exists.
/// </summary>
/// <remarks>
/// On success <c>scriptStart</c> is the previous <c>scriptLimit</c>,
/// <c>scriptLimit</c> is the position where the run stopped, and
/// <c>scriptCode</c> is the script chosen for the run.
/// </remarks>
/// <returns>true if there is another script run, false otherwise.</returns>
public bool Next()
{
    if (scriptLimit >= limit)
    {
        return false;
    }

    scriptCode = UScript.Common;
    scriptStart = scriptLimit;

    while (index < limit)
    {
        int codePoint = UTF16.CharAt(text, start, limit, index - start);
        int script = GetScript(codePoint);

        // From UTR #24: implementations that determine the boundaries between
        // characters of given scripts should never break between a non-spacing
        // mark and its base character. For boundary determination a non-spacing
        // mark - whatever its script value - inherits the script value of its
        // base character.
        bool extendsRun = IsSameScript(scriptCode, script)
            || UChar.GetUnicodeCategory(codePoint) == UUnicodeCategory.NonSpacingMark;
        if (!extendsRun)
        {
            break;
        }

        index += UTF16.GetCharCount(codePoint);

        // Inherited or Common becomes the script code of the surrounding text.
        if (scriptCode <= UScript.Inherited && script > UScript.Inherited)
        {
            scriptCode = script;
        }
    }

    scriptLimit = index;
    return true;
}
/// <summary>
/// Builds a copy of <paramref name="source"/> in which every code point
/// contained in <paramref name="set"/> is replaced by
/// <paramref name="replacement"/>; all other code points are copied unchanged.
/// </summary>
/// <param name="source">Text to scan, one code point at a time.</param>
/// <param name="set">Set of code points to replace.</param>
/// <param name="replacement">Text appended in place of each matching code point.</param>
/// <returns>The resulting string.</returns>
public static String ReplaceAll(String source, UnicodeSet set, String replacement)
{
    StringBuffer output = new StringBuffer();

    int offset = 0;
    while (offset < source.Length)
    {
        int codePoint = UTF16.CharAt(source, offset);
        if (!set.Contains(codePoint))
        {
            UTF16.Append(output, codePoint);
        }
        else
        {
            output.Append(replacement);
        }

        // Advance by one or two UTF-16 units depending on the code point.
        offset += UTF16.GetCharCount(codePoint);
    }

    return output.ToString();
}
/// <summary>Returns <c>true</c> if the current text represents emoji character or sequence.</summary>
/// <param name="current">Start of the span, relative to <c>start</c>.</param>
/// <param name="next">Exclusive end of the span, relative to <c>start</c>.</param>
private bool IsEmoji(int current, int next)
{
    int begin = start + current;
    int end = start + next;
    int codepoint = UTF16.CharAt(text, 0, end, begin);

    if (!EMOJI.Contains(codepoint))
    {
        return false;
    }

    if (!EMOJI_RK.Contains(codepoint))
    {
        return true;
    }

    // The code point is in EMOJI_RK: it is not treated as emoji unless there is
    // evidence that it forms an emoji sequence, i.e. an emoji presentation
    // selector (U+FE0F) or keycap (U+20E3) follows within the span.
    int trailer = begin + Character.CharCount(codepoint);
    if (trailer >= end)
    {
        return false;
    }

    return text[trailer] == 0xFE0F || text[trailer] == 0x20E3;
}
// Builds a string from the last code point of each check range (Limit - 1,
// skipping ranges whose Limit is 0), then walks that string forward and
// verifies that the trie returns the value recorded for each range, both via
// the code point lookup and, for supplementary code points, via the
// lead/trail surrogate lookups. Failures are reported through Errln.
private void _testTrieIteration(Int32Trie trie, CheckRange[] checkRanges, int countCheckRanges)
{
    // Write the string and remember the expected value per code point.
    StringBuffer s = new StringBuffer();
    int[] values = new int[30];
    int countValues = 0;
    for (int rangeIndex = 0; rangeIndex < countCheckRanges; ++rangeIndex)
    {
        int rangeLimit = checkRanges[rangeIndex].Limit;
        if (rangeLimit == 0)
        {
            continue;
        }

        UTF16.Append(s, rangeLimit - 1);
        values[countValues++] = checkRanges[rangeIndex].Value;
    }

    // Try forward iteration.
    int limit = s.Length;
    int p = 0;
    int i = 0;
    while (p < limit)
    {
        int c = UTF16.CharAt(s, p);
        p += UTF16.GetCharCount(c);

        int value = trie.GetCodePointValue(c);
        if (value != values[i])
        {
            Errln("wrong value from UTRIE_NEXT(U+" + (c).ToHexString() + "): 0x" + (value).ToHexString() + " instead of 0x" + (values[i]).ToHexString());
        }

        // Unlike the C version, lead is 0 if c is non-supplementary.
        char lead = UTF16.GetLeadSurrogate(c);
        char trail = UTF16.GetTrailSurrogate(c);

        bool surrogatesWrong;
        if (lead == 0)
        {
            surrogatesWrong = trail != s[p - 1];
        }
        else
        {
            surrogatesWrong = !(UTF16.IsLeadSurrogate(lead)
                && UTF16.IsTrailSurrogate(trail)
                && lead == s[p - 2]
                && trail == s[p - 1]);
        }

        if (surrogatesWrong)
        {
            Errln("wrong (lead, trail) from UTRIE_NEXT(U+" + (c).ToHexString());
            // The expected-value index is intentionally not advanced here,
            // matching the original control flow.
            continue;
        }

        if (lead != 0)
        {
            value = trie.GetLeadValue(lead);
            value = trie.GetTrailValue(value, trail);
            if (value != trie.GetSurrogateValue(lead, trail) && value != values[i])
            {
                Errln("wrong value from getting supplementary " + "values (U+" + (c).ToHexString() + "): 0x" + (value).ToHexString() + " instead of 0x" + (values[i]).ToHexString());
            }
        }

        ++i;
    }
}
// Exercises code-point-wise navigation of UCharacterIterator over a string
// that mixes BMP characters and surrogate pairs: CurrentCodePoint,
// MoveCodePointIndex (forward and backward), Next, NextCodePoint, Index,
// SetToStart and SetToLimit. The reference value at each position is taken
// from UTF16.CharAt on the same string.
public void TestIterationUChar32()
{
    String text = "\u0061\u0062\ud841\udc02\u20ac\ud7ff\ud842\udc06\ud801\udc00\u0061";
    int c;
    int i;

    UCharacterIterator iter = UCharacterIterator.GetInstance(text);

    String iterText = iter.GetText();
    if (!iterText.Equals(text))
    {
        Errln("iter.getText() failed");
    }

    iter.Index = 1;
    if (iter.CurrentCodePoint != UTF16.CharAt(text, 1))
    {
        Errln("Iterator didn't start out in the right place.");
    }

    iter.SetToStart();
    c = iter.CurrentCodePoint;

    // In the messages below "expected" is the reference value from the string
    // and "got" is what the iterator produced.
    i = iter.MoveCodePointIndex(1);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, 1) || i != 1)
    {
        Errln("moveCodePointIndex(1) didn't work correctly expected " + Hex(UTF16.CharAt(text, 1)) + " got " + Hex(c) + " i= " + i);
    }

    i = iter.MoveCodePointIndex(2);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, 4) || i != 4)
    {
        Errln("moveCodePointIndex(2) didn't work correctly expected " + Hex(UTF16.CharAt(text, 4)) + " got " + Hex(c) + " i= " + i);
    }

    i = iter.MoveCodePointIndex(-2);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, 1) || i != 1)
    {
        Errln("moveCodePointIndex(-2) didn't work correctly expected " + Hex(UTF16.CharAt(text, 1)) + " got " + Hex(c) + " i= " + i);
    }

    iter.SetToLimit();
    i = iter.MoveCodePointIndex(-2);
    c = iter.CurrentCodePoint;
    if (c != UTF16.CharAt(text, (text.Length - 3)) || i != (text.Length - 3))
    {
        Errln("moveCodePointIndex(-2) didn't work correctly expected " + Hex(UTF16.CharAt(text, (text.Length - 3))) + " got " + Hex(c) + " i= " + i);
    }

    iter.SetToStart();
    c = iter.CurrentCodePoint;

    // testing first32PostInc, nextCodePointPostInc, setToStart
    i = 0;
    iter.SetToStart();
    c = iter.Next();
    if (c != UTF16.CharAt(text, i))
    {
        Errln("first32PostInc failed.  Expected->" + Hex(UTF16.CharAt(text, i)) + " Got-> " + Hex(c));
    }
    if (iter.Index != UTF16.GetCharCount(c) + i)
    {
        Errln("getIndex() after first32PostInc() failed");
    }

    iter.SetToStart();
    i = 0;
    if (iter.Index != 0)
    {
        Errln("setToStart failed");
    }

    Logln("Testing forward iteration...");
    do
    {
        if (c != UCharacterIterator.DONE)
        {
            c = iter.NextCodePoint();
        }

        if (c != UTF16.CharAt(text, i))
        {
            Errln("Character mismatch at position " + i + ", iterator has " + Hex(c) + ", string has " + Hex(UTF16.CharAt(text, i)));
        }

        i += UTF16.GetCharCount(c);
        if (iter.Index != i)
        {
            Errln("getIndex() aftr nextCodePointPostInc() isn't working right");
        }

        c = iter.CurrentCodePoint;
        if (c != UCharacterIterator.DONE && c != UTF16.CharAt(text, i))
        {
            Errln("current() after nextCodePointPostInc() isn't working right");
        }
    } while (c != UCharacterIterator.DONE);

    // The iterator is now at the end of the text; another step must report DONE.
    c = iter.NextCodePoint();
    if (c != UCharacterIterator.DONE)
    {
        Errln("nextCodePointPostInc() didn't return DONE at the end");
    }
}
/// <summary>
/// Returns the code point at <paramref name="pos"/>, read from <c>buffer</c>
/// with the readable range bounded to <c>[0, length)</c>.
/// </summary>
/// <param name="pos">UTF-16 offset into <c>buffer</c>.</param>
/// <returns>The value produced by <c>UTF16.CharAt</c> for that offset.</returns>
public int Char32At(int pos) => UTF16.CharAt(buffer, 0, length, pos);
/// <summary>
/// Returns the code point at <paramref name="pos"/>, read from <c>buffer</c>
/// with the readable range bounded to <c>[0, length)</c>.
/// </summary>
/// <param name="pos">UTF-16 offset into <c>buffer</c>.</param>
/// <returns>The value produced by <c>UTF16.CharAt</c> for that offset.</returns>
public int Char32At(int pos)
{
    int codePoint = UTF16.CharAt(buffer, 0, length, pos);
    return codePoint;
}
// Runs the normalization conformance checks over every data line of the given
// test file and then over the entries of moreCases, counting passes and
// failures. Each data line is expected to have 5 ';'-separated columns, e.g.
//   1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
// Empty lines and lines starting with '#' or '@' are skipped.
//
// fileName: name handed to TestUtil.GetDataReader.
// options:  passed through unchanged to checkConformance.
//
// Throws ArgumentException (wrapping the original IOException) when reading fails.
public void runConformance(String fileName, int options)
{
    String line = null;
    String[] fields = new String[5];
    StringBuffer buf = new StringBuffer();
    int passCount = 0;
    int failCount = 0;
    UnicodeSet other = new UnicodeSet(0, 0x10ffff);
    int c = 0;
    TextReader input = null;
    try
    {
        input = TestUtil.GetDataReader(fileName);
        for (int count = 0; ; ++count)
        {
            line = input.ReadLine();
            if (line == null)
            {
                // The reader is exhausted: read the extra test cases, reusing
                // 'count' as the index into moreCases.
                // NOTE(review): 'count' is advanced both by the 'count++' below
                // and by the loop's '++count', so consecutive entries of
                // moreCases are not visited one by one - confirm whether that
                // is intended before changing it.
                if (count > moreCases.Length)
                {
                    count = 0;
                }
                else if (count == moreCases.Length)
                {
                    // all done
                    break;
                }
                line = moreCases[count++];
            }

            if (line.Length == 0)
            {
                continue;
            }

            // Skip comments
            if (line[0] == '#' || line[0] == '@')
            {
                continue;
            }

            // Parse out the fields
            hexsplit(line, ';', fields, buf);

            // Remove a single code point from the "other" UnicodeSet
            if (fields[0].Length == UTF16.MoveCodePointOffset(fields[0], 0, 1))
            {
                c = UTF16.CharAt(fields[0], 0);
                if (0xac20 <= c && c <= 0xd73f)
                {
                    // not an exhaustive test run: skip most Hangul syllables
                    if (c == 0xac20)
                    {
                        other.Remove(0xac20, 0xd73f);
                    }
                    continue;
                }
                other.Remove(c);
            }

            if (checkConformance(fields, line, options))
            {
                ++passCount;
            }
            else
            {
                ++failCount;
            }

            if ((count % 1000) == 999)
            {
                Logln("Line " + (count + 1));
            }
        }
    }
    catch (IOException ex)
    {
        ex.PrintStackTrace();
        // Keep the original exception as InnerException so the cause and its
        // stack trace are not lost to callers.
        throw new ArgumentException("Couldn't read file "
            + ex.GetType().Name + " " + ex.ToString()
            + " line = " + line,
            ex);
    }
    finally
    {
        if (input != null)
        {
            try
            {
                input.Dispose();
            }
            catch (IOException)
            {
                // Best effort: a failure while closing the reader must not
                // mask the test outcome.
            }
        }
    }

    if (failCount != 0)
    {
        Errln("Total: " + failCount + " lines failed, " + passCount + " lines passed");
    }
    else
    {
        Logln("Total: " + passCount + " lines passed");
    }
}
/// <summary>
/// Verify the conformance of the given line of the Unicode
/// normalization (UTR 15) test suite file. For each line,
/// there are five columns, corresponding to field[0]..field[4].
/// </summary>
/// <remarks>
/// The following invariants must be true for all conformant implementations:
/// <code>
/// c2 == NFC(c1) == NFC(c2) == NFC(c3)
/// c3 == NFD(c1) == NFD(c2) == NFD(c3)
/// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
/// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
/// </code>
/// </remarks>
/// <param name="field">The 5 columns.</param>
/// <param name="line">The source line from the test suite file (used in the failure message).</param>
/// <param name="options">Options value passed through to the Normalizer calls.</param>
/// <returns>true if the test passes.</returns>
private bool checkConformance(String[] field, String line, int options)
{
    bool pass = true;
    StringBuffer buf = new StringBuffer(); // scratch
    String @out, fcd;

    for (int i = 0; i < 5; ++i)
    {
        int fieldNum = i + 1;
        if (i < 3)
        {
            pass &= checkNorm(Normalizer.NFC, options, field[i], field[1], fieldNum);
            pass &= checkNorm(Normalizer.NFD, options, field[i], field[2], fieldNum);
        }
        pass &= checkNorm(Normalizer.NFKC, options, field[i], field[3], fieldNum);
        pass &= checkNorm(Normalizer.NFKD, options, field[i], field[4], fieldNum);

        cross(field[4] /*NFKD String*/, field[3] /*NFKC String*/, Normalizer.NFKC);
        cross(field[3] /*NFKC String*/, field[4] /*NFKD String*/, Normalizer.NFKD);
    }

    compare(field[1], field[2]);
    compare(field[0], field[1]);
    compare(field[0], field[2]);

    // test quick checks
    if (NormalizerQuickCheckResult.No == Normalizer.QuickCheck(field[1], Normalizer.NFC, options))
    {
        Errln("Normalizer error: quickCheck(NFC(s), Normalizer.NFC) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[2], Normalizer.NFD, options))
    {
        Errln("Normalizer error: quickCheck(NFD(s), Normalizer.NFD) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[3], Normalizer.NFKC, options))
    {
        Errln("Normalizer error: quickCheck(NFKC(s), Normalizer.NFKC) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[4], Normalizer.NFKD, options))
    {
        Errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
        pass = false;
    }

    // test isNormalized on strings
    if (!Normalizer.IsNormalized(field[1], Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
        pass = false;
    }
    if (!field[0].Equals(field[1]) && Normalizer.IsNormalized(field[0], Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(s, Normalizer.NFC) is TRUE");
        pass = false;
    }
    if (!Normalizer.IsNormalized(field[3], Normalizer.NFKC, options))
    {
        Errln("Normalizer error: isNormalized(NFKC(s), Normalizer.NFKC) is false");
        pass = false;
    }
    if (!field[0].Equals(field[3]) && Normalizer.IsNormalized(field[0], Normalizer.NFKC, options))
    {
        Errln("Normalizer error: isNormalized(s, Normalizer.NFKC) is TRUE");
        pass = false;
    }

    // test api that takes a char[]
    if (!Normalizer.IsNormalized(field[1].ToCharArray(), 0, field[1].Length, Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
        pass = false;
    }

    // test api that takes a codepoint
    if (!Normalizer.IsNormalized(UTF16.CharAt(field[1], 0), Normalizer.NFC, options))
    {
        Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
        pass = false;
    }

    // test FCD quick check and "makeFCD"
    fcd = Normalizer.Normalize(field[0], Normalizer.FCD);
    if (Normalizer.NO == Normalizer.QuickCheck(fcd, Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }

    // check FCD return length (reported, but does not affect 'pass')
    {
        char[] fcd2 = new char[fcd.Length * 2];
        char[] src = field[0].ToCharArray();
        int fcdLen = Normalizer.Normalize(src, 0, src.Length, fcd2, fcd.Length, fcd2.Length, Normalizer.FCD, 0);
        if (fcdLen != fcd.Length)
        {
            Errln("makeFCD did not return the correct length");
        }
    }

    if (Normalizer.NO == Normalizer.QuickCheck(fcd, Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[2], Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(NFD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }
    if (Normalizer.NO == Normalizer.QuickCheck(field[4], Normalizer.FCD, options))
    {
        Errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
        pass = false;
    }

    // Run iterative FCD normalization in both directions over the original,
    // NFD and NFKD columns; the results are not compared here.
    @out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1, options);

    @out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1, options);

    @out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1, options);
    @out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1, options);

    // NFD applied to the FCD form must equal the NFD column.
    @out = Normalizer.Normalize(fcd, Normalizer.NFD);
    if (!@out.Equals(field[2]))
    {
        Errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
        pass = false;
    }

    if (!pass)
    {
        Errln("FAIL: " + line);
    }

    if (field[0] != field[2])
    {
        // two strings that are canonically equivalent must test
        // equal under a canonical caseless match
        // see UAX #21 Case Mappings and Jitterbug 2021 and
        // Unicode Technical Committee meeting consensus 92-C31
        int rc;
        if ((rc = Normalizer.Compare(field[0], field[2], (options << Normalizer.COMPARE_NORM_OPTIONS_SHIFT) | Normalizer.COMPARE_IGNORE_CASE)) != 0)
        {
            Errln("Normalizer.compare(original, NFD, case-insensitive) returned " + rc + " instead of 0 for equal");
            pass = false;
        }
    }

    return pass;
}
/// <summary>
/// Maps the string to single code points and adds the associated case closure
/// mappings.
/// </summary>
/// <remarks>
/// The string is mapped to code points if it is their full case folding string.
/// In other words, this performs a reverse full case folding and then
/// adds the case closure items of the resulting code points.
/// If the string is found and its closure applied, then
/// the string itself is added as well as part of its code points' closure.
/// </remarks>
/// <param name="s">The string to look up in the <c>unfold</c> table.</param>
/// <param name="set">The set that receives the code points and their case closure.</param>
/// <returns>true if the string was found.</returns>
public bool AddStringCaseClosure(string s, UnicodeSet set)
{
    if (unfold == null || s == null)
    {
        return false; /* no reverse case folding data, or no string */
    }

    int length = s.Length;
    if (length <= 1)
    {
        /* the string is too short to find any match */
        /*
         * more precise would be:
         * if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found
         */
        return false;
    }

    // Header values of the unfold table.
    int unfoldRows = unfold[UNFOLD_ROWS];
    int unfoldRowWidth = unfold[UNFOLD_ROW_WIDTH];
    int unfoldStringWidth = unfold[UNFOLD_STRING_WIDTH];

    if (length > unfoldStringWidth)
    {
        /* the string is too long to find any match */
        return false;
    }

    /* do a binary search for the string */
    int low = 0;
    int high = unfoldRows;
    while (low < high)
    {
        int mid = (low + high) / 2;
        int rowOffset = (mid + 1) * unfoldRowWidth; // +1 to skip the header values above
        int comparison = StrCmpMax(s, rowOffset, unfoldStringWidth);

        if (comparison < 0)
        {
            high = mid;
        }
        else if (comparison > 0)
        {
            low = mid + 1;
        }
        else
        {
            /* found the string: add each code point, and its case closure */
            int column = unfoldStringWidth;
            while (column < unfoldRowWidth && unfold[rowOffset + column] != 0)
            {
                int c = UTF16.CharAt(unfold, rowOffset, unfold.Length, column);
                set.Add(c);
                AddCaseClosure(c, set);
                column += UTF16.GetCharCount(c);
            }
            return true;
        }
    }

    return false; /* string not found */
}