/// <summary>
/// Derives a rule status for the text segment between two break positions by
/// scanning the segment's code points in order.
/// </summary>
/// <param name="current">Start of the segment, relative to <c>m_start</c>, or <c>BreakIterator.Done</c>.</param>
/// <param name="next">Exclusive end of the segment, relative to <c>m_start</c>, or <c>BreakIterator.Done</c>.</param>
/// <returns>
/// <c>WordNumber</c> if the first digit-or-letter code point found is a digit,
/// <c>WordLetter</c> if it is a letter, and <c>WordNone</c> when either position
/// is <c>BreakIterator.Done</c> or the segment contains neither.
/// </returns>
private RuleStatus CalcStatus(int current, int next)
{
    if (current == BreakIterator.Done || next == BreakIterator.Done)
    {
        return ICU4N.Text.RuleStatus.WordNone;
    }

    int begin = m_start + current;
    int end = m_start + next;

    int codepoint;

    for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint))
    {
        // Read the code point at the loop cursor. The previous code always read
        // at 'begin', so only the first code point of the segment was ever
        // examined no matter how far the loop advanced.
        codepoint = UTF16.CharAt(m_text, 0, end, i);

        if (UChar.IsDigit(codepoint))
        {
            return ICU4N.Text.RuleStatus.WordNumber;
        }
        else if (UChar.IsLetter(codepoint))
        {
            // TODO: try to separately specify ideographic, kana?
            // [currently all bundled as letter for this case]
            return ICU4N.Text.RuleStatus.WordLetter;
        }
    }

    return ICU4N.Text.RuleStatus.WordNone;
}
示例#2
0
 // Canonically equivalent strings must compare equal under a canonical
 // caseless match (see UAX #21 Case Mappings, Jitterbug 2021 and Unicode
 // Technical Committee meeting consensus 92-C31).
 //
 // Picks the Normalizer.Compare overload to exercise from the lengths of the
 // two inputs and reports an error through Errln when the result is non-zero.
 // Inputs where s1 is longer than one char and s2 is exactly one char (or
 // either string is empty) are not checked.
 private void compare(String s1, String s2)
 {
     int len1 = s1.Length;
     int len2 = s2.Length;

     if (len1 == 1 && len2 == 1)
     {
         // code point vs. code point
         int rc = Normalizer.Compare(UTF16.CharAt(s1, 0), UTF16.CharAt(s2, 0), Normalizer.COMPARE_IGNORE_CASE);
         if (rc != 0)
         {
             Errln("Normalizer.compare(int,int) failed for s1: "
                   + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
         }
     }
     else if (len1 == 1 && len2 > 1)
     {
         // code point vs. string
         int rc = Normalizer.Compare(UTF16.CharAt(s1, 0), s2, Normalizer.COMPARE_IGNORE_CASE);
         if (rc != 0)
         {
             Errln("Normalizer.compare(int,String) failed for s1: "
                   + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
         }
     }
     else if (len1 > 1 && len2 > 1)
     {
         // char[] vs. char[]
         // TODO: Re-enable this test after UTC fixes UAX 21
         int rc = Normalizer.Compare(s1.ToCharArray(), s2.ToCharArray(), Normalizer.COMPARE_IGNORE_CASE);
         if (rc != 0)
         {
             Errln("Normalizer.compare(char[],char[]) failed for s1: "
                   + Utility.Hex(s1) + " s2: " + Utility.Hex(s2));
         }
     }
 }
示例#3
0
 /// <summary>
 /// Peeks at the code point under the cursor. No escape parsing, variable
 /// expansion, or whitespace skipping is performed.
 /// </summary>
 /// <returns>
 /// The code point at <c>bufPos</c> in <c>buf</c> when a buffer is active;
 /// otherwise the code point at <c>pos.Index</c> in <c>text</c>, or
 /// <c>DONE</c> once that index has reached the end of <c>text</c>.
 /// </returns>
 private int Current()
 {
     // An active buffer takes priority over the main text.
     if (buf != null)
     {
         return UTF16.CharAt(buf, 0, buf.Length, bufPos);
     }

     int textIndex = pos.Index;
     if (textIndex >= text.Length)
     {
         return DONE;
     }
     return UTF16.CharAt(text, textIndex);
 }
示例#4
0
        /// <summary>
        /// Advances to the next script run, if there is one.
        /// </summary>
        /// <remarks>
        /// On success <c>scriptStart</c>, <c>scriptLimit</c> and <c>scriptCode</c>
        /// describe the run that was just found.
        /// </remarks>
        /// <returns>true if a run was found, false when the previous run already reached <c>limit</c>.</returns>
        public bool Next()
        {
            if (scriptLimit >= limit)
            {
                return false;
            }

            // Each run starts where the previous one ended, with an undetermined (Common) script.
            scriptStart = scriptLimit;
            scriptCode = UScript.Common;

            while (index < limit)
            {
                int codePoint = UTF16.CharAt(text, start, limit, index - start);
                int script = GetScript(codePoint);

                // From UTR #24: implementations that determine the boundaries
                // between characters of given scripts should never break between
                // a non-spacing mark and its base character, so a non-spacing
                // mark, whatever its own script value, stays in the current run.
                bool staysInRun = IsSameScript(scriptCode, script)
                                  || UChar.GetUnicodeCategory(codePoint) == UUnicodeCategory.NonSpacingMark;
                if (!staysInRun)
                {
                    break;
                }

                index += UTF16.GetCharCount(codePoint);

                // Inherited or Common becomes the script code of the surrounding text.
                if (scriptCode <= UScript.Inherited && script > UScript.Inherited)
                {
                    scriptCode = script;
                }
            }

            scriptLimit = index;
            return true;
        }
示例#5
0
        /// <summary>
        /// Builds a copy of <paramref name="source"/> in which every code point
        /// contained in <paramref name="set"/> is replaced by <paramref name="replacement"/>.
        /// </summary>
        /// <param name="source">Text to scan, one code point at a time.</param>
        /// <param name="set">Code points to replace.</param>
        /// <param name="replacement">Text appended in place of each matching code point.</param>
        /// <returns>The rewritten text.</returns>
        public static String ReplaceAll(String source, UnicodeSet set, String replacement)
        {
            StringBuffer output = new StringBuffer();
            int offset = 0;

            while (offset < source.Length)
            {
                int codePoint = UTF16.CharAt(source, offset);
                if (!set.Contains(codePoint))
                {
                    // Not in the set: copy the code point through unchanged.
                    UTF16.Append(output, codePoint);
                }
                else
                {
                    output.Append(replacement);
                }

                // Step by the code point's UTF-16 length so surrogate pairs are consumed whole.
                offset += UTF16.GetCharCount(codePoint);
            }

            return output.ToString();
        }
示例#6
0
        /// <summary>
        /// Decides whether the segment between two break positions starts with an
        /// emoji character or emoji sequence.
        /// </summary>
        /// <param name="current">Start of the segment, relative to <c>start</c>.</param>
        /// <param name="next">Exclusive end of the segment, relative to <c>start</c>.</param>
        /// <returns><c>true</c> if the segment is treated as emoji; otherwise <c>false</c>.</returns>
        private bool IsEmoji(int current, int next)
        {
            int begin = start + current;
            int end = start + next;
            int codepoint = UTF16.CharAt(text, 0, end, begin);

            if (!EMOJI.Contains(codepoint))
            {
                return false;
            }

            if (!EMOJI_RK.Contains(codepoint))
            {
                return true;
            }

            // Members of EMOJI_RK only count as emoji when there is evidence of an
            // emoji sequence: the next code unit must be U+FE0F (emoji presentation
            // selector) or U+20E3 (keycap).
            int trailer = begin + Character.CharCount(codepoint);
            if (trailer >= end)
            {
                return false;
            }

            var following = text[trailer];
            return following == 0xFE0F || following == 0x20E3;
        }
示例#7
0
        /// <summary>
        /// Checks trie lookups by iterating over a string built from the last code
        /// point of every non-empty check range.
        /// </summary>
        /// <param name="trie">Trie under test.</param>
        /// <param name="checkRanges">Ranges supplying the code points (<c>Limit - 1</c>) and expected values.</param>
        /// <param name="countCheckRanges">Number of entries of <paramref name="checkRanges"/> to use.</param>
        private void _testTrieIteration(Int32Trie trie, CheckRange[] checkRanges,
                                        int countCheckRanges)
        {
            // Write a string: one code point (Limit - 1) per range whose Limit is
            // non-zero, recording the expected value for each in parallel.
            StringBuffer s = new StringBuffer();
            int[] values = new int[30];
            int countValues = 0;

            for (int rangeIndex = 0; rangeIndex < countCheckRanges; ++rangeIndex)
            {
                int rangeLimit = checkRanges[rangeIndex].Limit;
                if (rangeLimit == 0)
                {
                    continue;
                }
                UTF16.Append(s, rangeLimit - 1);
                values[countValues++] = checkRanges[rangeIndex].Value;
            }

            // Try forward iteration over the string.
            int length = s.Length;
            int offset = 0;
            int expectedIndex = 0;

            while (offset < length)
            {
                int codePoint = UTF16.CharAt(s, offset);
                offset += UTF16.GetCharCount(codePoint);

                int actual = trie.GetCodePointValue(codePoint);
                if (actual != values[expectedIndex])
                {
                    Errln("wrong value from UTRIE_NEXT(U+"
                          + (codePoint).ToHexString() + "): 0x"
                          + (actual).ToHexString() + " instead of 0x"
                          + (values[expectedIndex]).ToHexString());
                }

                // Unlike the C version, lead is 0 if the code point is non-supplementary.
                char lead = UTF16.GetLeadSurrogate(codePoint);
                char trail = UTF16.GetTrailSurrogate(codePoint);

                bool unitsMismatch;
                if (lead == 0)
                {
                    // Single unit: it must be the unit just consumed.
                    unitsMismatch = trail != s[offset - 1];
                }
                else
                {
                    // Surrogate pair: both halves must be well-formed and match the two units just consumed.
                    unitsMismatch = !UTF16.IsLeadSurrogate(lead)
                                    || !UTF16.IsTrailSurrogate(trail)
                                    || lead != s[offset - 2]
                                    || trail != s[offset - 1];
                }

                if (unitsMismatch)
                {
                    Errln("wrong (lead, trail) from UTRIE_NEXT(U+"
                          + (codePoint).ToHexString());
                    // Note: skips the expectedIndex increment below, as the original does.
                    continue;
                }

                if (lead != 0)
                {
                    // Look the supplementary value up again in two steps (lead, then trail).
                    actual = trie.GetLeadValue(lead);
                    actual = trie.GetTrailValue(actual, trail);
                    if (actual != trie.GetSurrogateValue(lead, trail) &&
                        actual != values[expectedIndex])
                    {
                        Errln("wrong value from getting supplementary "
                              + "values (U+"
                              + (codePoint).ToHexString() + "): 0x"
                              + (actual).ToHexString() + " instead of 0x"
                              + (values[expectedIndex]).ToHexString());
                    }
                }

                ++expectedIndex;
            }
        }
示例#8
0
        /// <summary>
        /// Exercises code-point-wise navigation of <c>UCharacterIterator</c> over a
        /// string that mixes single UTF-16 units with surrogate pairs, comparing the
        /// iterator's results against <c>UTF16.CharAt</c> on the same string.
        /// Failures are reported through <c>Errln</c>.
        /// </summary>
        public void TestIterationUChar32()
        {
            // 11 UTF-16 units: a, b, <pair>, U+20AC, U+D7FF, <pair>, <pair>, a
            // (surrogate pairs start at indices 2, 6 and 8).
            String text = "\u0061\u0062\ud841\udc02\u20ac\ud7ff\ud842\udc06\ud801\udc00\u0061";
            int    c;
            int    i;
            {
                UCharacterIterator iter = UCharacterIterator.GetInstance(text);

                // The iterator must hand back the text it was created from.
                String iterText = iter.GetText();
                if (!iterText.Equals(text))
                {
                    Errln("iter.getText() failed");
                }

                // Setting the index directly must position the iterator on that unit.
                iter.Index = (1);
                if (iter.CurrentCodePoint != UTF16.CharAt(text, 1))
                {
                    Errln("Iterator didn't start out in the right place.");
                }

                // Move forward by one code point from the start: expect index 1.
                // NOTE(review): in the messages below the iterator's value is printed
                // under "expected" and the string's value under "got"; the labels look swapped.
                iter.SetToStart();
                c = iter.CurrentCodePoint;
                i = 0;
                i = iter.MoveCodePointIndex(1);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, 1) || i != 1)
                {
                    Errln("moveCodePointIndex(1) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, 1)) + " i= " + i);
                }

                // Two more code points from index 1 ('b' plus one surrogate pair): expect index 4.
                i = iter.MoveCodePointIndex(2);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, 4) || i != 4)
                {
                    Errln("moveCodePointIndex(2) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, 4)) + " i= " + i);
                }

                // And back by the same two code points: expect index 1 again.
                i = iter.MoveCodePointIndex(-2);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, 1) || i != 1)
                {
                    Errln("moveCodePointIndex(-2) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, 1)) + " i= " + i);
                }

                // From the limit, two code points back (final 'a' plus one surrogate
                // pair) is three UTF-16 units before the end.
                iter.SetToLimit();
                i = iter.MoveCodePointIndex(-2);
                c = iter.CurrentCodePoint;
                if (c != UTF16.CharAt(text, (text.Length - 3)) || i != (text.Length - 3))
                {
                    Errln("moveCodePointIndex(-2) didn't work correctly expected " + Hex(c) + " got " + Hex(UTF16.CharAt(text, (text.Length - 3))) + " i= " + i);
                }

                iter.SetToStart();
                c = iter.CurrentCodePoint;
                i = 0;

                // Testing first32PostInc, nextCodePointPostInc, setToStart:
                // the first Next() from the start must return the value at index 0
                // and leave the index just past it.
                i = 0;
                iter.SetToStart();
                c = iter.Next();
                if (c != UTF16.CharAt(text, i))
                {
                    Errln("first32PostInc failed.  Expected->" + Hex(UTF16.CharAt(text, i)) + " Got-> " + Hex(c));
                }
                if (iter.Index != UTF16.GetCharCount(c) + i)
                {
                    Errln("getIndex() after first32PostInc() failed");
                }

                iter.SetToStart();
                i = 0;
                if (iter.Index != 0)
                {
                    Errln("setToStart failed");
                }

                // Walk the whole text forward; 'i' tracks the expected UTF-16 index
                // and is advanced by the width of each code point returned.
                Logln("Testing forward iteration...");
                do
                {
                    // On entry 'c' still holds the value read before the loop; on later
                    // passes it holds the current code point fetched at the end of the
                    // previous pass.
                    if (c != UCharacterIterator.DONE)
                    {
                        c = iter.NextCodePoint();
                    }

                    if (c != UTF16.CharAt(text, i))
                    {
                        Errln("Character mismatch at position " + i + ", iterator has " + Hex(c) + ", string has " + Hex(UTF16.CharAt(text, i)));
                    }

                    i += UTF16.GetCharCount(c);
                    if (iter.Index != i)
                    {
                        Errln("getIndex() aftr nextCodePointPostInc() isn't working right");
                    }
                    // The DONE check comes first so UTF16.CharAt is not called with
                    // i == text.Length once the end has been reached.
                    c = iter.CurrentCodePoint;
                    if (c != UCharacterIterator.DONE && c != UTF16.CharAt(text, i))
                    {
                        Errln("current() after nextCodePointPostInc() isn't working right");
                    }
                } while (c != UCharacterIterator.DONE);
                // Past the end, NextCodePoint() must keep returning DONE.
                // NOTE(review): the message says "at the beginning" although the
                // iterator is at the end here; confirm the intended wording.
                c = iter.NextCodePoint();
                if (c != UCharacterIterator.DONE)
                {
                    Errln("nextCodePointPostInc() didn't return DONE at the beginning");
                }
            }
        }
示例#9
0
 /// <summary>
 /// Returns the code point found at <paramref name="pos"/>, looked up via
 /// <c>UTF16.CharAt</c> over the first <c>length</c> units of <c>buffer</c>.
 /// </summary>
 /// <param name="pos">UTF-16 offset into <c>buffer</c>.</param>
 /// <returns>The value returned by <c>UTF16.CharAt</c> for that offset.</returns>
 public int Char32At(int pos) => UTF16.CharAt(buffer, 0, length, pos);
示例#10
0
 /// <summary>
 /// Returns the code point found at <paramref name="pos"/>, looked up via
 /// <c>UTF16.CharAt</c> over the first <c>length</c> units of <c>buffer</c>.
 /// </summary>
 /// <param name="pos">UTF-16 offset into <c>buffer</c>.</param>
 /// <returns>The value returned by <c>UTF16.CharAt</c> for that offset.</returns>
 public int Char32At(int pos)
 {
     int codePoint = UTF16.CharAt(buffer, 0, length, pos);
     return codePoint;
 }
示例#11
0
        /// <summary>
        /// Runs <c>checkConformance</c> over every data line of the given file and
        /// then over every entry of <c>moreCases</c>, and reports the pass/fail totals.
        /// </summary>
        /// <param name="fileName">Name of the data file passed to <c>TestUtil.GetDataReader</c>.</param>
        /// <param name="options">Normalizer options forwarded to <c>checkConformance</c>.</param>
        /// <exception cref="ArgumentException">
        /// Reading the file failed; the underlying <see cref="IOException"/> is
        /// attached as the inner exception.
        /// </exception>
        public void runConformance(String fileName, int options)
        {
            String line = null;

            String[] fields = new String[5];
            StringBuffer buf = new StringBuffer();
            int passCount = 0;
            int failCount = 0;
            UnicodeSet other = new UnicodeSet(0, 0x10ffff);
            int c = 0;
            TextReader input = null;

            // Dedicated cursor into moreCases. The loop counter used to double as
            // this cursor, so each extra case advanced it twice (count++ here plus
            // the loop's ++count): every other extra case was skipped, and some
            // array lengths could never hit the exit condition.
            int extraIndex = 0;

            try
            {
                input = TestUtil.GetDataReader(fileName);
                for (int count = 0; ; ++count)
                {
                    line = input.ReadLine();
                    if (line == null)
                    {
                        // File exhausted: read the extra test cases, then stop.
                        if (extraIndex >= moreCases.Length)
                        {
                            // all done
                            break;
                        }
                        line = moreCases[extraIndex++];
                    }
                    if (line.Length == 0)
                    {
                        continue;
                    }

                    // Expect 5 columns of this format:
                    // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>

                    // Skip comments
                    if (line[0] == '#' || line[0] == '@')
                    {
                        continue;
                    }

                    // Parse out the fields
                    hexsplit(line, ';', fields, buf);

                    // Remove a single code point from the "other" UnicodeSet
                    if (fields[0].Length == UTF16.MoveCodePointOffset(fields[0], 0, 1))
                    {
                        c = UTF16.CharAt(fields[0], 0);
                        if (0xac20 <= c && c <= 0xd73f)
                        {
                            // not an exhaustive test run: skip most Hangul syllables
                            if (c == 0xac20)
                            {
                                other.Remove(0xac20, 0xd73f);
                            }
                            continue;
                        }
                        other.Remove(c);
                    }
                    if (checkConformance(fields, line, options))
                    {
                        ++passCount;
                    }
                    else
                    {
                        ++failCount;
                    }
                    // Progress marker every 1000 iterations.
                    if ((count % 1000) == 999)
                    {
                        Logln("Line " + (count + 1));
                    }
                }
            }
            catch (IOException ex)
            {
                ex.PrintStackTrace();
                // Keep the original exception reachable through InnerException.
                throw new ArgumentException("Couldn't read file "
                                            + ex.GetType().Name + " " + ex.ToString()
                                            + " line = " + line,
                                            ex);
            }
            finally
            {
                if (input != null)
                {
                    try
                    {
                        input.Dispose();
                    }
                    catch (IOException)
                    {
                        // Best-effort close: a failure here must not mask the test outcome.
                    }
                }
            }

            if (failCount != 0)
            {
                Errln("Total: " + failCount + " lines failed, " +
                      passCount + " lines passed");
            }
            else
            {
                Logln("Total: " + passCount + " lines passed");
            }
        }
示例#12
0
        /// <summary>
        /// Verify the conformance of the given line of the Unicode
        /// normalization (UTR 15) test suite file.  For each line,
        /// there are five columns, corresponding to field[0]..field[4].
        /// </summary>
        /// <remarks>
        /// The following invariants must be true for all conformant implementations
        /// <code>
        ///  c2 == NFC(c1) == NFC(c2) == NFC(c3)
        ///  c3 == NFD(c1) == NFD(c2) == NFD(c3)
        ///  c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
        ///  c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
        /// </code>
        /// Beyond those, the quick-check, isNormalized, FCD and case-insensitive
        /// compare APIs are exercised on the same columns.
        /// </remarks>
        /// <param name="field">the 5 columns</param>
        /// <param name="line">the source line from the test suite file</param>
        /// <param name="options">Normalizer options passed through to the APIs under test</param>
        /// <returns>true if the test passes</returns>
        private bool checkConformance(String[] field, String line, int options)
        {
            bool         pass = true;
            StringBuffer buf = new StringBuffer(); // scratch
            String       @out, fcd;
            int          i = 0;

            for (i = 0; i < 5; ++i)
            {
                int fieldNum = i + 1;
                // Only the first three columns must produce c2/c3 under NFC/NFD;
                // all five must produce c4/c5 under NFKC/NFKD.
                if (i < 3)
                {
                    pass &= checkNorm(Normalizer.NFC, options, field[i], field[1], fieldNum);
                    pass &= checkNorm(Normalizer.NFD, options, field[i], field[2], fieldNum);
                }
                pass &= checkNorm(Normalizer.NFKC, options, field[i], field[3], fieldNum);
                pass &= checkNorm(Normalizer.NFKD, options, field[i], field[4], fieldNum);
                cross(field[4] /*NFKD String*/, field[3] /*NFKC String*/, Normalizer.NFKC);
                cross(field[3] /*NFKC String*/, field[4] /*NFKD String*/, Normalizer.NFKD);
            }
            compare(field[1], field[2]);
            compare(field[0], field[1]);
            compare(field[0], field[2]);
            // test quick checks
            if (NormalizerQuickCheckResult.No == Normalizer.QuickCheck(field[1], Normalizer.NFC, options))
            {
                Errln("Normalizer error: quickCheck(NFC(s), Normalizer.NFC) is Normalizer.NO");
                pass = false;
            }
            if (Normalizer.NO == Normalizer.QuickCheck(field[2], Normalizer.NFD, options))
            {
                Errln("Normalizer error: quickCheck(NFD(s), Normalizer.NFD) is Normalizer.NO");
                pass = false;
            }
            if (Normalizer.NO == Normalizer.QuickCheck(field[3], Normalizer.NFKC, options))
            {
                Errln("Normalizer error: quickCheck(NFKC(s), Normalizer.NFKC) is Normalizer.NO");
                pass = false;
            }
            if (Normalizer.NO == Normalizer.QuickCheck(field[4], Normalizer.NFKD, options))
            {
                Errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
                pass = false;
            }

            // test isNormalized: normalized columns must report true, and the
            // source column must report false whenever it differs from them
            if (!Normalizer.IsNormalized(field[1], Normalizer.NFC, options))
            {
                Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
                pass = false;
            }
            if (!field[0].Equals(field[1]) && Normalizer.IsNormalized(field[0], Normalizer.NFC, options))
            {
                Errln("Normalizer error: isNormalized(s, Normalizer.NFC) is TRUE");
                pass = false;
            }
            if (!Normalizer.IsNormalized(field[3], Normalizer.NFKC, options))
            {
                Errln("Normalizer error: isNormalized(NFKC(s), Normalizer.NFKC) is false");
                pass = false;
            }
            if (!field[0].Equals(field[3]) && Normalizer.IsNormalized(field[0], Normalizer.NFKC, options))
            {
                Errln("Normalizer error: isNormalized(s, Normalizer.NFKC) is TRUE");
                pass = false;
            }
            // test api that takes a char[]
            if (!Normalizer.IsNormalized(field[1].ToCharArray(), 0, field[1].Length, Normalizer.NFC, options))
            {
                Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
                pass = false;
            }
            // test api that takes a codepoint
            if (!Normalizer.IsNormalized(UTF16.CharAt(field[1], 0), Normalizer.NFC, options))
            {
                Errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
                pass = false;
            }
            // test FCD quick check and "makeFCD"
            fcd = Normalizer.Normalize(field[0], Normalizer.FCD);
            if (Normalizer.NO == Normalizer.QuickCheck(fcd, Normalizer.FCD, options))
            {
                Errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
                pass = false;
            }
            // check FCD return length
            {
                char[] fcd2   = new char[fcd.Length * 2];
                char[] src    = field[0].ToCharArray();
                int    fcdLen = Normalizer.Normalize(src, 0, src.Length, fcd2, fcd.Length, fcd2.Length, Normalizer.FCD, 0);
                if (fcdLen != fcd.Length)
                {
                    Errln("makeFCD did not return the correct length");
                }
            }
            if (Normalizer.NO == Normalizer.QuickCheck(fcd, Normalizer.FCD, options))
            {
                Errln("Normalizer error: quickCheck(FCD(s), Normalizer.FCD) is Normalizer.NO");
                pass = false;
            }
            if (Normalizer.NO == Normalizer.QuickCheck(field[2], Normalizer.FCD, options))
            {
                Errln("Normalizer error: quickCheck(NFD(s), Normalizer.FCD) is Normalizer.NO");
                pass = false;
            }

            if (Normalizer.NO == Normalizer.QuickCheck(field[4], Normalizer.FCD, options))
            {
                Errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
                pass = false;
            }

            // Drive the iterative FCD normalizer forwards (+1) and backwards (-1)
            // over the source, NFD and NFKD columns; the returned strings are not
            // inspected here.
            @out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1, options);
            @out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1, options);

            @out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1, options);
            @out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1, options);

            @out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1, options);
            @out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1, options);

            // NFD applied to the FCD form must equal the NFD column.
            @out = Normalizer.Normalize(fcd, Normalizer.NFD);
            if (!@out.Equals(field[2]))
            {
                Errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
                pass = false;
            }
            if (!pass)
            {
                Errln("FAIL: " + line);
            }
            if (field[0] != field[2])
            {
                // two strings that are canonically equivalent must test
                // equal under a canonical caseless match
                // see UAX #21 Case Mappings and Jitterbug 2021 and
                // Unicode Technical Committee meeting consensus 92-C31
                int rc;
                if ((rc = Normalizer.Compare(field[0], field[2], (options << Normalizer.COMPARE_NORM_OPTIONS_SHIFT) | Normalizer.COMPARE_IGNORE_CASE)) != 0)
                {
                    Errln("Normalizer.compare(original, NFD, case-insensitive) returned " + rc + " instead of 0 for equal");
                    pass = false;
                }
            }

            return(pass);
        }
示例#13
0
        /// <summary>
        /// Looks the string up in the reverse case-folding ("unfold") table and, if
        /// it is present, adds the code points stored with it, plus their case
        /// closures, to the set.
        /// </summary>
        /// <remarks>
        /// The string is mapped to code points if it is their full case folding
        /// string; in other words this performs a reverse full case folding and
        /// then adds the case closure items of the resulting code points. Strings
        /// of length 0 or 1, and strings longer than the table's string width, are
        /// never looked up.
        /// </remarks>
        /// <param name="s">The string to look up; may be null.</param>
        /// <param name="set">Receives the matching code points and their closures.</param>
        /// <returns>true if the string was found.</returns>
        public bool AddStringCaseClosure(string s, UnicodeSet set)
        {
            // No reverse case folding data, or no string.
            if (unfold == null || s == null)
            {
                return false;
            }

            // The string is too short to find any match. (More precise would be a
            // "has more than one code point" check, but that makes little practical
            // difference: a single supplementary code point would just not be found.)
            int length = s.Length;
            if (length <= 1)
            {
                return false;
            }

            int rowCount = unfold[UNFOLD_ROWS];
            int rowWidth = unfold[UNFOLD_ROW_WIDTH];
            int stringWidth = unfold[UNFOLD_STRING_WIDTH];

            // The string is too long to find any match.
            if (length > stringWidth)
            {
                return false;
            }

            // Binary search for the string over rows [low, high).
            int low = 0;
            int high = rowCount;
            while (low < high)
            {
                int mid = (low + high) / 2;
                int rowOffset = (mid + 1) * rowWidth; // +1 to skip the header values above
                int comparison = StrCmpMax(s, rowOffset, stringWidth);

                if (comparison < 0)
                {
                    high = mid;
                    continue;
                }
                if (comparison > 0)
                {
                    low = mid + 1;
                    continue;
                }

                // Found the string: the rest of the row, up to a 0 unit or the row
                // end, holds code points; add each one and its case closure.
                int column = stringWidth;
                while (column < rowWidth && unfold[rowOffset + column] != 0)
                {
                    int c = UTF16.CharAt(unfold, rowOffset, unfold.Length, column);
                    set.Add(c);
                    AddCaseClosure(c, set);
                    column += UTF16.GetCharCount(c);
                }
                return true;
            }

            return false; // string not found
        }