Exemplo n.º 1
0
        /// <summary>
        /// Moves the iteration position to the first boundary located after
        /// <paramref name="offset"/> and returns that boundary.
        /// </summary>
        ///
        /// <param name="offset">The position from which the forward search starts</param>
        /// <returns>The position of the first boundary after "offset"</returns>
        /// @stable ICU 2.0
        public override int Following(int offset)
        {
            ICharacterIterator text = GetText();

            IBM.ICU.Text.RuleBasedBreakIterator.CheckOffset(offset, text);

            // The cache can answer the query only when it exists and "offset"
            // lies in [first cached position, last cached position).
            bool cacheCoversOffset =
                cachedBreakPositions != null &&
                offset >= cachedBreakPositions[0] &&
                offset < cachedBreakPositions[cachedBreakPositions.Length - 1];

            // Otherwise drop the cache and defer to the inherited
            // implementation, which may call back into this class and
            // rebuild the cache.
            if (!cacheCoversOffset)
            {
                cachedBreakPositions = null;
                return base.Following(offset);
            }

            // Walk the cache until we reach the first entry greater than
            // "offset"; that entry is the boundary we are looking for.
            int slot = 0;
            while (slot < cachedBreakPositions.Length &&
                   cachedBreakPositions[slot] <= offset)
            {
                slot++;
            }
            positionInCache = slot;

            text.SetIndex(cachedBreakPositions[positionInCache]);
            return text.GetIndex();
        }
Exemplo n.º 2
0
        /// <exclude/>
        /// <summary>
        /// Implementation function behind next(): returns the next break
        /// position, serving it from the cached break positions when possible
        /// and regenerating the cache when a dictionary range is entered.
        /// </summary>
        ///
        /// <returns>The next break position</returns>
        internal override int HandleNext()
        {
            ICharacterIterator text = GetText();

            // The cache is unusable when it does not exist or when we are
            // already sitting on its last entry.
            bool cacheExhausted =
                cachedBreakPositions == null ||
                positionInCache == cachedBreakPositions.Length - 1;

            if (cacheExhausted)
            {
                // Ask the inherited HandleNext() for a tentative boundary.
                // fDictionaryCharCount is reset first so that afterwards it
                // reflects the dictionary characters passed over by this call.
                int rangeStart = text.GetIndex();
                fDictionaryCharCount = 0;
                int tentative = base.HandleNext();

                bool spansDictionaryText =
                    fDictionaryCharCount > 1 && tentative - rangeStart > 1;

                // Nothing to subdivide: the inherited result is the answer
                // and the cache can be dropped.
                if (!spansDictionaryText)
                {
                    cachedBreakPositions = null;
                    return tentative;
                }

                // More than one dictionary character was crossed, so rebuild
                // the cached break positions for the new range.
                DivideUpDictionaryRange(rangeStart, tentative);
            }

            if (cachedBreakPositions == null)
            {
                IBM.ICU.Impl.Assert.Assrt(false);
                return -9999;    // SHOULD NEVER GET HERE!
            }

            // The cache was regenerated (or existed all along): step to the
            // next cached break position and return it.
            ++positionInCache;
            text.SetIndex(cachedBreakPositions[positionInCache]);
            return cachedBreakPositions[positionInCache];
        }
Exemplo n.º 3
0
        /// <summary>
        /// Set the target text to be searched. Text iteration will then begin at the
        /// start of the text string. This method is useful if you want to reuse an
        /// iterator to search within a different body of text.
        /// </summary>
        ///
        /// <param name="text">new text iterator to look for match,</param>
        /// <exception cref="T:System.ArgumentException">thrown when text is null or has 0 length</exception>
        /// <seealso cref="M:IBM.ICU.Text.SearchIterator.GetTarget"/>
        /// @stable ICU 2.4
        public virtual void SetTarget(ICharacterIterator text)
        {
            // "Empty" means the iterator's range has zero length, i.e. its
            // begin and end indices coincide. The iterator's current position
            // is deliberately not consulted: it is reset to the begin index
            // below, so a non-empty iterator that happens to be positioned at
            // its end is still a valid target.
            if (text == null || text.GetEndIndex() == text.GetBeginIndex())
            {
                throw new ArgumentException("Illegal null or empty text");
            }

            targetText = text;
            targetText.SetIndex(targetText.GetBeginIndex());

            // Reset the search state so iteration restarts from the beginning
            // of the new text, searching forward.
            matchLength           = 0;
            m_reset_              = true;
            m_isForwardSearching_ = true;

            // Keep the break iterator, when one is set, looking at the same text.
            if (breakIterator != null)
            {
                breakIterator.SetText(targetText);
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Moves the iteration position to the last boundary located before
        /// <paramref name="offset"/> and returns that boundary.
        /// </summary>
        ///
        /// <param name="offset">The position from which the backward search starts</param>
        /// <returns>The position of the last boundary before "offset"</returns>
        /// @stable ICU 2.0
        public override int Preceding(int offset)
        {
            ICharacterIterator text = GetText();

            IBM.ICU.Text.RuleBasedBreakIterator.CheckOffset(offset, text);

            // The cache can answer the query only when it exists and "offset"
            // lies in (first cached position, last cached position].
            bool cacheCoversOffset =
                cachedBreakPositions != null &&
                offset > cachedBreakPositions[0] &&
                offset <= cachedBreakPositions[cachedBreakPositions.Length - 1];

            // Otherwise drop the cache and defer to the inherited routine,
            // which will eventually call other routines in this class that
            // may refresh the cache.
            if (!cacheCoversOffset)
            {
                cachedBreakPositions = null;
                return base.Preceding(offset);
            }

            // Find the first cached entry that is not below "offset"; the
            // entry just before it is the last boundary preceding "offset".
            int slot = 0;
            while (slot < cachedBreakPositions.Length &&
                   cachedBreakPositions[slot] < offset)
            {
                slot++;
            }
            positionInCache = slot - 1;

            text.SetIndex(cachedBreakPositions[positionInCache]);
            return text.GetIndex();
        }
Exemplo n.º 5
0
 /// <summary>
 /// Reports the current index by delegating to the wrapped <c>iterator</c>.
 /// </summary>
 ///
 /// <returns>The value returned by <c>iterator.GetIndex()</c></returns>
 public override int GetIndex()
 {
     int currentIndex = iterator.GetIndex();
     return currentIndex;
 }
Exemplo n.º 6
0
        /// <summary>
        /// This is the function that actually implements the dictionary-based
        /// algorithm. Given the endpoints of a range of text, it uses the dictionary
        /// to determine the positions of any boundaries in this range. It stores all
        /// the boundary positions it discovers in cachedBreakPositions so that we
        /// only have to do this work once for each time we enter the range.
        /// On return, cachedBreakPositions starts with startPos and ends with
        /// endPos, and positionInCache is reset to 0.
        /// </summary>
        ///
        /// <param name="startPos">Start of the range to divide; stored as the first cache entry</param>
        /// <param name="endPos">End of the range to divide; stored as the last cache entry</param>
        private void DivideUpDictionaryRange(int startPos, int endPos)
        {
            ICharacterIterator text = GetText();

            // the range we're dividing may begin or end with non-dictionary
            // characters (i.e., for line breaking, we may have leading or
            // trailing punctuation that needs to be kept with the word). Seek
            // from the beginning of the range to the first dictionary character
            text.SetIndex(startPos);
            int c = IBM.ICU.Text.RuleBasedBreakIterator.CICurrent32(text);

            while (!IsDictionaryChar(c))
            {
                c = IBM.ICU.Text.RuleBasedBreakIterator.CINext32(text);
            }

            // initialize. We maintain two stacks: currentBreakPositions contains
            // the list of break positions that will be returned if we successfully
            // finish traversing the whole range now. possibleBreakPositions lists
            // all other possible word ends we've passed along the way. (Whenever
            // we reach an error [a sequence of characters that can't begin any word
            // in the dictionary], we back up, possibly delete some breaks from
            // currentBreakPositions, move a break from possibleBreakPositions
            // to currentBreakPositions, and start over from there. This process
            // continues in this way until we either successfully make it all the
            // way across the range, or exhaust all of our combinations of break
            // positions.)
            Stack     currentBreakPositions  = new Stack();
            Stack     possibleBreakPositions = new Stack();
            ArrayList wrongBreakPositions    = new ArrayList();

            // the dictionary is implemented as a trie, which is treated as a state
            // machine. -1 represents the end of a legal word. Every word in the
            // dictionary is represented by a path from the root node to -1. A path
            // that ends in state 0 is an illegal combination of characters.
            int state = 0;

            // these two variables are used for error handling. We keep track of the
            // farthest we've gotten through the range being divided, and the
            // combination of breaks that got us that far. If we use up all possible
            // break combinations, the text contains an error or a word that's not
            // in the dictionary. In this case, we "bless" the break positions that
            // got us the farthest as real break positions, and then start over from
            // scratch with the character where the error occurred.
            int   farthestEndPoint   = text.GetIndex();
            Stack bestBreakPositions = null;

            // initialize (we always exit the loop with a break statement)
            c = IBM.ICU.Text.RuleBasedBreakIterator.CICurrent32(text);
            while (true)
            {
                // if we can transition to state "-1" from our current state, we're
                // on the last character of a legal word. Push that position onto
                // the possible-break-positions stack
                if (dictionary.At(state, 0) == -1)
                {
                    possibleBreakPositions.Push(((int)(text.GetIndex())));
                }

                // look up the new state to transition to in the dictionary
                // There will be no supplementaries here because the Thai dictionary
                // does not include any. This code is going away soon, not worth
                // fixing.
                state = (dictionary.At(state, (char)c)) & 0xFFFF;      // TODO: fix
                                                                       // supplementaries

                // if the character we're sitting on causes us to transition to
                // the "end of word" state, then it was a non-dictionary character
                // and we've successfully traversed the whole range. Drop out
                // of the loop. (The state was masked to 16 bits above, so -1
                // shows up here as 0xFFFF.)
                if (state == /*-1*/ 0xFFFF)
                {
                    currentBreakPositions.Push(((int)(text.GetIndex())));
                    break;
                }

                // if the character we're sitting on causes us to transition to
                // the error state, or if we've gone off the end of the range
                // without transitioning to the "end of word" state, we've hit
                // an error...
                else if (state == 0 || text.GetIndex() >= endPos)
                {
                    // if this is the farthest we've gotten, take note of it in
                    // case there's an error in the text
                    if (text.GetIndex() > farthestEndPoint)
                    {
                        farthestEndPoint   = text.GetIndex();
                        bestBreakPositions = (Stack)(currentBreakPositions.Clone());
                    }

                    // wrongBreakPositions is a list of all break positions we've
                    // tried starting that didn't allow us to traverse all the way
                    // through the text. Every time we pop a break position off of
                    // currentBreakPositions, we put it into wrongBreakPositions to
                    // avoid trying it again later. If we make it to this spot,
                    // we're either going to back up to a break in
                    // possibleBreakPositions and try starting over from there, or
                    // we've exhausted all possible break positions and are going
                    // to do the fallback procedure. This loop prevents us from
                    // messing with anything in possibleBreakPositions that didn't
                    // work as a starting point the last time we tried it (this is
                    // to prevent a bunch of repetitive checks from slowing down
                    // some extreme cases)
                    while (possibleBreakPositions.Count != 0 &&
                           wrongBreakPositions.Contains(possibleBreakPositions.Peek()))
                    {
                        possibleBreakPositions.Pop();
                    }

                    // if we've used up all possible break-position combinations,
                    // there's an error or an unknown word in the text. In this
                    // case, we start over, treating the farthest character we've
                    // reached as the beginning of the range, and "blessing" the
                    // break positions that got us that far as real break positions
                    if (possibleBreakPositions.Count == 0)
                    {
                        if (bestBreakPositions != null)
                        {
                            currentBreakPositions = bestBreakPositions;
                            if (farthestEndPoint < endPos)
                            {
                                text.SetIndex(farthestEndPoint + 1);
                            }
                            else
                            {
                                break;
                            }
                        }
                        else
                        {
                            if ((currentBreakPositions.Count == 0 ||
                                 ((Int32)(currentBreakPositions.Peek())) != text.GetIndex()) &&
                                text.GetIndex() != startPos)
                            {
                                currentBreakPositions.Push(((int)(text.GetIndex())));
                            }
                            IBM.ICU.Text.RuleBasedBreakIterator.CINext32(text);
                            currentBreakPositions.Push(((int)(text.GetIndex())));
                        }
                    }

                    // if we still have more break positions we can try, then
                    // promote the last break in possibleBreakPositions into
                    // currentBreakPositions, and get rid of all entries in
                    // currentBreakPositions that come after it. Then back up to
                    // that position and start over from there (i.e., treat that
                    // position as the beginning of a new word)
                    else
                    {
                        Int32 temp = (Int32)possibleBreakPositions.Pop();
                        while (currentBreakPositions.Count != 0 &&
                               temp < ((Int32)currentBreakPositions.Peek()))
                        {
                            wrongBreakPositions.Add(currentBreakPositions.Pop());
                        }
                        currentBreakPositions.Push(temp);
                        text.SetIndex(((Int32)currentBreakPositions.Peek()));
                    }

                    // re-sync "c" for the next go-round, and drop out of the loop
                    // if we've made it off the end of the range
                    c     = IBM.ICU.Text.RuleBasedBreakIterator.CICurrent32(text);
                    state = 0;
                    if (text.GetIndex() >= endPos)
                    {
                        break;
                    }
                }

                // if we didn't hit any exceptional conditions on this last
                // iteration, just advance to the next character and loop
                else
                {
                    c = IBM.ICU.Text.RuleBasedBreakIterator.CINext32(text);
                }
            }

            // dump the last break position in the list, and replace it with the
            // actual end of the range (which may be the same character, or may be
            // further on because the range actually ended with non-dictionary
            // characters we want to keep with the word)
            if (currentBreakPositions.Count != 0)
            {
                currentBreakPositions.Pop();
            }
            currentBreakPositions.Push(((int)(endPos)));

            // create a regular array to hold the break positions and copy
            // the break positions from the stack to the array (in addition,
            // our starting position goes into this array as a break position).
            // This array becomes the cache of break positions used by next()
            // and previous(), so this is where we actually refresh the cache.
            //
            // The stack is drained with Pop() while the array is filled from
            // the back, so the cache holds the positions in push order (oldest
            // break first, endPos last). Indexing ToArray() is avoided on
            // purpose: on System.Collections.Stack, ToArray() yields the
            // top-of-stack element first, which would reverse the cache, and
            // calling it inside the loop allocates a new array per iteration.
            int breakCount = currentBreakPositions.Count;
            cachedBreakPositions    = new int[breakCount + 1];
            cachedBreakPositions[0] = startPos;

            for (int i = breakCount; i > 0; i--)
            {
                cachedBreakPositions[i] = (Int32)currentBreakPositions.Pop();
            }
            positionInCache = 0;
        }