Beispiel #1
0
        /// <summary>
        /// For every non-empty test string, checks that starting a traversal with
        /// <c>First</c> / <c>FirstForCodePoint</c> gives the same result, current state,
        /// value and follow-up step as calling <c>Reset()</c> and then
        /// <c>Next</c> / <c>NextForCodePoint</c>. Any disagreement is reported through
        /// <c>Errln</c>. The trie is reset before returning.
        /// </summary>
        /// <param name="trie">The trie under test; its traversal state is modified.</param>
        /// <param name="data">The strings (and values) the trie was built from.</param>
        /// <param name="dataLength">Number of leading entries of <paramref name="data"/> to check.</param>
        private void checkFirst(CharsTrie trie, StringAndValue[] data, int dataLength)
        {
            for (int index = 0; index < dataLength; ++index)
            {
                if (data[index].s.Length == 0)
                {
                    // An empty string has no first unit to feed to the trie.
                    continue;
                }
                String sample = data[index].s;

                // Pass 1: drive the trie with UTF-16 code units.
                int unit = sample[0];
                int followUp = sample.Length > 1 ? sample[1] : 0;
                Result viaFirst = trie.First(unit);
                int valueAfterFirst = -1;
                if (viaFirst.HasValue())
                {
                    valueAfterFirst = trie.GetValue();
                }
                Result viaNext = trie.Next(followUp);

                // The checks run strictly left to right and stop at the first mismatch,
                // because each one depends on the trie state left by the previous call.
                bool unitsAgree =
                    viaFirst == trie.Reset().Next(unit) &&
                    viaFirst == trie.Current &&
                    valueAfterFirst == (viaFirst.HasValue() ? trie.GetValue() : -1) &&
                    viaNext == trie.Next(followUp);
                if (!unitsAgree)
                {
                    Errln(String.Format("trie.first(U+{0:X4})!=trie.Reset().Next(same) for {1}",
                                        unit, sample));
                }

                // Pass 2: the same comparison, driven by whole code points.
                int codePoint = sample.CodePointAt(0);
                int codePointUnits = Character.CharCount(codePoint);
                int followUpCp = sample.Length > codePointUnits ? sample.CodePointAt(codePointUnits) : 0;

                Result viaFirstCp = trie.FirstForCodePoint(codePoint);
                int valueAfterFirstCp = -1;
                if (viaFirstCp.HasValue())
                {
                    valueAfterFirstCp = trie.GetValue();
                }
                Result viaNextCp = trie.NextForCodePoint(followUpCp);

                bool codePointsAgree =
                    viaFirstCp == trie.Reset().NextForCodePoint(codePoint) &&
                    viaFirstCp == trie.Current &&
                    valueAfterFirstCp == (viaFirstCp.HasValue() ? trie.GetValue() : -1) &&
                    viaNextCp == trie.NextForCodePoint(followUpCp);
                if (!codePointsAgree)
                {
                    Errln(String.Format("trie.firstForCodePoint(U+{0:X4})!=trie.Reset().NextForCodePoint(same) for {1}",
                                        codePoint, sample));
                }
            }
            trie.Reset();
        }
Beispiel #2
0
        /// <summary>
        /// Walks the dictionary trie along the code points read from <paramref name="text_"/>
        /// and records every position at which the trie holds a value.
        /// </summary>
        /// <param name="text_">Text to read from, starting at its current position; it is
        /// wrapped in a <c>UCharacterIterator</c> and advanced as code points are consumed.</param>
        /// <param name="maxLength">Upper bound on the number of code points to consume.</param>
        /// <param name="lengths">Receives, for each recorded match, the number of code points
        /// consumed when that match was found.</param>
        /// <param name="count_">One-element output; <c>count_[0]</c> is always set to the number
        /// of matches recorded, including 0 when the text is already exhausted.</param>
        /// <param name="limit">Maximum number of matches to record in the output arrays.</param>
        /// <param name="values">Optional; when non-null receives the trie value of each
        /// recorded match.</param>
        /// <returns>The number of code points consumed from the text (0 if there were none).</returns>
        public override int Matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values)
        {
            UCharacterIterator text = UCharacterIterator.GetInstance(text_);
            CharsTrie          uct  = new CharsTrie(characters, 0);
            int c = text.NextCodePoint();

            if (c == UCharacterIterator.Done)
            {
                // Nothing to read: report zero matches explicitly so that a caller which
                // reuses the count_ array never sees a stale count from an earlier call.
                count_[0] = 0;
                return 0;
            }
            Result result = uct.FirstForCodePoint(c);
            // TODO: should numChars count Character.charCount?
            int numChars = 1;
            int count    = 0;

            for (; ;)
            {
                if (result.HasValue())
                {
                    // The trie has a value here: record it while there is still room.
                    if (count < limit)
                    {
                        if (values != null)
                        {
                            values[count] = uct.GetValue();
                        }
                        lengths[count] = numChars;
                        count++;
                    }

                    if (result == Result.FinalValue)
                    {
                        // No longer key can continue from this node.
                        break;
                    }
                }
                else if (result == Result.NoMatch)
                {
                    break;
                }

                if (numChars >= maxLength)
                {
                    break;
                }
                c = text.NextCodePoint();
                if (c == UCharacterIterator.Done)
                {
                    break;
                }
                ++numChars;
                result = uct.NextForCodePoint(c);
            }
            count_[0] = count;
            return numChars;
        }
Beispiel #3
0
        /// <summary>
        /// Is there an exception at this point?
        /// <para>
        /// Reads the text backwards from <paramref name="n"/> through <c>backwardsTrie</c>,
        /// remembering the last position at which the trie reported a value. If that value is
        /// <c>SimpleFilteredSentenceBreakIteratorBuilder.Match</c> the break is an exception.
        /// If it is <c>SimpleFilteredSentenceBreakIteratorBuilder.Partial</c> and
        /// <c>forwardsPartialTrie</c> is available, the text is re-read forwards from the
        /// remembered position through <c>forwardsPartialTrie</c>, and the break is an
        /// exception if that traversal ends on a match.
        /// </para>
        /// <para>
        /// Side effects: changes <c>text.Index</c> and resets/advances the tries.
        /// </para>
        /// </summary>
        /// <param name="n">The location of the possible break.</param>
        /// <returns><c>true</c> if an exception applies at <paramref name="n"/>; otherwise <c>false</c>.</returns>
        private bool BreakExceptionAt(int n)
        {
            // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()

            // Best backwards match seen so far: the text index after consuming it, and its trie value.
            int bestPosn  = -1;
            int bestValue = -1;

            // loops while 'n' points to an exception
            text.Index = n;
            backwardsTrie.Reset();
            int uch;



            // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
            // If the code point before n is a space it stays consumed (skipped).
            if ((uch = text.PreviousCodePoint()) == ' ')
            { // TODO: skip a class of chars here??
              // TODO only do this the 1st time?
            }
            else
            {
                // Not a space: step forward again, presumably restoring the position to n.
                uch = text.NextCodePoint();
            }

            // NOTE(review): if the loop below exits before any trie step (text exhausted),
            // r keeps this initial value; confirm what r.Matches() yields in that case.
            Result r = Result.IntermediateValue;

            while ((uch = text.PreviousCodePoint()) != UCharacterIterator.Done && // more to consume backwards and..
                   ((r = backwardsTrie.NextForCodePoint(uch)).HasNext()))
            {                                                                     // more in the trie
                if (r.HasValue())
                {                                                                 // remember the best match so far
                    bestPosn  = text.Index;
                    bestValue = backwardsTrie.GetValue();
                }
            }

            // The loop does not record a result for which HasNext() is false, so check the last step here.
            if (r.Matches())
            { // exact match?
                bestValue = backwardsTrie.GetValue();
                bestPosn  = text.Index;
            }

            if (bestPosn >= 0)
            {
                if (bestValue == SimpleFilteredSentenceBreakIteratorBuilder.Match)
                {                 // exact match!
                    return(true); // Exception here.
                }
                else if (bestValue == SimpleFilteredSentenceBreakIteratorBuilder.Partial && forwardsPartialTrie != null)
                {
                    // make sure there's a forward trie
                    // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
                    // to see if it matches something going forward.
                    forwardsPartialTrie.Reset();

                    Result rfwd = Result.IntermediateValue;
                    text.Index = bestPosn; // hope that's close ..
                    // NOTE(review): this loop compares against BreakIterator.Done while the backwards
                    // loop uses UCharacterIterator.Done - confirm the two constants are equal.
                    // The loop body is intentionally empty: all work happens in the condition.
                    while ((uch = text.NextCodePoint()) != BreakIterator.Done &&
                           ((rfwd = forwardsPartialTrie.NextForCodePoint(uch)).HasNext()))
                    {
                    }
                    if (rfwd.Matches())
                    {
                        // Exception here
                        return(true);
                    } // else fall through
                }     // else fall through
            }         // else fall through
            return(false); // No exception here.
        }
Beispiel #4
0
        /// <summary>
        /// Exercises <c>NextForCodePoint</c> / <c>FirstForCodePoint</c> on a trie whose keys mix
        /// BMP characters, supplementary code points (surrogate pairs) and an unpaired trail
        /// surrogate. After every step the returned result must also equal <c>trie.Current</c>.
        /// </summary>
        public void Test32NextForCodePoint()
        {
            StringAndValue[] data =
            {
                // "\u4dff\\U00010000\u9999\\U00020000\udfff\\U0010ffff"
                new StringAndValue("\u4dff\ud800\udc00\u9999\ud840\udc00\udfff\udbff\udfff", 2000000000),
                // "\u4dff\\U00010000\u9999\\U00020002"
                new StringAndValue("\u4dff\ud800\udc00\u9999\ud840\udc02",                        44444),
                // "\u4dff\\U000103ff"
                new StringAndValue("\u4dff\ud800\udfff", 99999)
            };
            CharsTrie trie = buildTrie(data, data.Length, TrieBuilderOption.Fast);
            Result    step;

            // Each chain is evaluated left to right and stops at the first failed check,
            // because every step advances the trie state the next step relies on.

            // Full walk of the first key, starting from the freshly built trie.
            bool firstKeyOk =
                (step = trie.NextForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x10000)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x9999)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x20000)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0xdfff)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x10ffff)) == Result.FinalValue && step == trie.Current &&
                trie.GetValue() == 2000000000;
            if (!firstKeyOk)
            {
                Errln("CharsTrie.NextForCodePoint() fails for " + data[0].s);
            }

            // Second key, restarted with FirstForCodePoint.
            bool secondKeyOk =
                (step = trie.FirstForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x10000)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x9999)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x20002)) == Result.FinalValue && step == trie.Current &&
                trie.GetValue() == 44444;
            if (!secondKeyOk)
            {
                Errln("CharsTrie.NextForCodePoint() fails for " + data[1].s);
            }

            // Shared prefix followed by a code point that is in no key.
            bool mismatchOk =
                (step = trie.Reset().NextForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x10000)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x9999)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x20222)) == Result.NoMatch && step == trie.Current;  // no match for trail surrogate
            if (!mismatchOk)
            {
                Errln("CharsTrie.NextForCodePoint() fails for \u4dff\\U00010000\u9999\\U00020222");
            }

            // Third key, restarted with Reset().
            bool thirdKeyOk =
                (step = trie.Reset().NextForCodePoint(0x4dff)) == Result.NoValue && step == trie.Current &&
                (step = trie.NextForCodePoint(0x103ff)) == Result.FinalValue && step == trie.Current &&
                trie.GetValue() == 99999;
            if (!thirdKeyOk)
            {
                Errln("CharsTrie.NextForCodePoint() fails for " + data[2].s);
            }
        }