Esempio n. 1
0
        /// <summary>
        /// Moves the iteration position to the first boundary located after the
        /// given position and returns that boundary.
        /// </summary>
        ///
        /// <param name="offset">The position to begin searching forward from</param>
        /// <returns>The position of the first boundary after "offset"</returns>
        /// @stable ICU 2.0
        public override int Following(int offset)
        {
            ICharacterIterator text = GetText();

            IBM.ICU.Text.RuleBasedBreakIterator.CheckOffset(offset, text);

            // The cache can answer the query only when it exists and "offset"
            // lies at or after its first entry and before its last entry.
            bool cacheCoversOffset =
                cachedBreakPositions != null &&
                offset >= cachedBreakPositions[0] &&
                offset < cachedBreakPositions[cachedBreakPositions.Length - 1];

            if (!cacheCoversOffset)
            {
                // Drop the cache and defer to the inherited implementation,
                // which may call back into this class and rebuild the cache.
                cachedBreakPositions = null;
                return base.Following(offset);
            }

            // Walk the cache from its start until we reach the first entry
            // that is greater than "offset".
            positionInCache = 0;
            while (positionInCache < cachedBreakPositions.Length &&
                   cachedBreakPositions[positionInCache] <= offset)
            {
                ++positionInCache;
            }

            text.SetIndex(cachedBreakPositions[positionInCache]);
            return text.GetIndex();
        }
Esempio n. 2
0
 /// <summary>
 /// Hands the new text to the base class and then clears the cached break
 /// positions, the dictionary character count and the cache position.
 /// </summary>
 ///
 /// <param name="newText">The text iterator to analyze from now on</param>
 /// @stable ICU 2.0
 public override void SetText(ICharacterIterator newText)
 {
     base.SetText(newText);

     // Reset the per-text state only after the base call has completed,
     // so nothing the base call does can leave stale values behind.
     this.cachedBreakPositions = null;
     this.fDictionaryCharCount = 0;
     this.positionInCache = 0;
 }
Esempio n. 3
0
        /// <summary>
        /// Moves the iterator one boundary backwards.
        /// </summary>
        ///
        /// <returns>The position of the last boundary position before the current
        /// iteration position</returns>
        /// @stable ICU 2.0
        public override int Previous()
        {
            ICharacterIterator text = GetText();

            // Fast path: a cache exists and we are not sitting on its first
            // entry, so simply step back one slot.
            if (cachedBreakPositions != null && positionInCache > 0)
            {
                --positionInCache;
                text.SetIndex(cachedBreakPositions[positionInCache]);
                return cachedBreakPositions[positionInCache];
            }

            // Slow path: throw the cache away and let the inherited Previous()
            // do the move. That call may itself repopulate the cache.
            cachedBreakPositions = null;
            int startingPosition = Current();
            int boundary = base.Previous();

            if (cachedBreakPositions != null)
            {
                // The cache was rebuilt while moving backwards; record our
                // position in it as the second-to-last entry.
                positionInCache = cachedBreakPositions.Length - 2;
                return boundary;
            }

            // Otherwise step forward with Next() until we reach or pass the
            // position we started from, remembering the last boundary that is
            // still before it.
            while (boundary < startingPosition)
            {
                int candidate = Next();

                if (candidate >= startingPosition)
                {
                    break;
                }

                boundary = candidate;
            }

            if (cachedBreakPositions != null)
            {
                positionInCache = cachedBreakPositions.Length - 2;
            }

            if (result_IsValid(boundary))
            {
                text.SetIndex(boundary);
            }

            return boundary;
        }

        /// <summary>
        /// Tells whether a boundary value is a real position rather than the
        /// DONE sentinel.
        /// </summary>
        private static bool result_IsValid(int boundary)
        {
            return boundary != IBM.ICU.Text.BreakIterator.DONE;
        }
Esempio n. 4
0
 /// <summary>
 /// Creates a wrapper around the given character iterator.
 /// </summary>
 ///
 /// <param name="iter">The iterator to wrap; must not be null</param>
 /// <exception cref="ArgumentNullException">thrown when <paramref name="iter"/> is null.
 /// ArgumentNullException derives from ArgumentException, so callers that
 /// catch ArgumentException keep working.</exception>
 public CharacterIteratorWrapper(ICharacterIterator iter)
 {
     if (iter == null)
     {
         // Name the offending parameter so the failure is diagnosable.
         throw new ArgumentNullException("iter");
     }
     iterator = iter;
 }
Esempio n. 5
0
        /// <exclude/>
        /// <summary>
        /// Implementation function behind next(). Serves the next boundary from
        /// the cache of break positions when possible, and otherwise falls back
        /// to the inherited HandleNext(), rebuilding the cache if the range it
        /// skipped contains more than one dictionary character.
        /// </summary>
        ///
        internal override int HandleNext()
        {
            ICharacterIterator text = GetText();

            // The cache cannot serve this call if it does not exist or if we
            // are already sitting on its last entry.
            bool cacheUnusable =
                cachedBreakPositions == null ||
                positionInCache == cachedBreakPositions.Length - 1;

            if (cacheUnusable)
            {
                // Ask the inherited implementation for a tentative boundary.
                // fDictionaryCharCount is zeroed first so that afterwards it
                // reflects only the characters passed over by this call.
                int rangeStart = text.GetIndex();
                fDictionaryCharCount = 0;
                int tentative = base.HandleNext();

                bool spansDictionaryRun =
                    fDictionaryCharCount > 1 && tentative - rangeStart > 1;

                if (!spansDictionaryRun)
                {
                    // Nothing to subdivide: the inherited answer stands and
                    // the cache is dropped.
                    cachedBreakPositions = null;
                    return tentative;
                }

                // More than one dictionary character was skipped, so split
                // the range and regenerate the cache for it.
                DivideUpDictionaryRange(rangeStart, tentative);
            }

            if (cachedBreakPositions == null)
            {
                IBM.ICU.Impl.Assert.Assrt(false);
                return -9999;    // SHOULD NEVER GET HERE!
            }

            // The cache is valid (freshly built or pre-existing): advance one
            // slot and report that boundary.
            ++positionInCache;
            text.SetIndex(cachedBreakPositions[positionInCache]);
            return cachedBreakPositions[positionInCache];
        }
Esempio n. 6
0
        /// <summary>
        /// Set the target text to be searched. Text iteration will then begin at the
        /// start of the text string. This method is useful if you want to reuse an
        /// iterator to search within a different body of text.
        /// </summary>
        ///
        /// <param name="text">new text iterator to look for match,</param>
        /// <exception cref="ArgumentException">thrown when text is null or has 0 length</exception>
        /// <seealso cref="M:IBM.ICU.Text.SearchIterator.GetTarget"/>
        /// @stable ICU 2.4
        public virtual void SetTarget(ICharacterIterator text)
        {
            // "Empty" means begin index == end index. The iterator's current
            // index is irrelevant here because it is reset to the begin index
            // below; comparing against it would wrongly reject a non-empty
            // iterator that happens to be positioned at its end.
            if (text == null || text.GetEndIndex() == text.GetBeginIndex())
            {
                throw new ArgumentException("Illegal null or empty text");
            }

            targetText = text;
            targetText.SetIndex(targetText.GetBeginIndex());

            // Reset the search state for the new text.
            matchLength           = 0;
            m_reset_              = true;
            m_isForwardSearching_ = true;

            // Keep the break iterator, if any, in sync with the new target.
            if (breakIterator != null)
            {
                breakIterator.SetText(targetText);
            }
        }
Esempio n. 7
0
        // protected constructor ----------------------------------------------

        /// <summary>
        /// Protected constructor for use by subclasses. Stores the target text to
        /// search and the optional BreakIterator, then puts the iterator into its
        /// initial state (forward searching, no match, non-overlapping).
        /// </summary>
        ///
        /// <param name="target">The target text to be searched.</param>
        /// <param name="breaker">A <see cref="T:IBM.ICU.Text.BreakIterator"/> that is used to determine the boundaries of a logical match. This argument can be null.</param>
        /// <exception cref="ArgumentException">thrown when argument target is null, or of length 0</exception>
        /// <seealso cref="T:IBM.ICU.Text.BreakIterator"/>
        /// @stable ICU 2.0
        protected internal SearchIterator(ICharacterIterator target, BreakIterator breaker)
        {
            // A null target is treated like a zero-length one: both are rejected.
            int targetLength = (target == null)
                ? 0
                : target.GetEndIndex() - target.GetBeginIndex();

            if (targetLength == 0)
            {
                throw new ArgumentException("Illegal argument target. "
                                            + " Argument can not be null or of length 0");
            }

            targetText    = target;
            breakIterator = breaker;

            // Point the break iterator, when one was supplied, at the same text.
            if (breakIterator != null)
            {
                breakIterator.SetText(target);
            }

            // Initial search state.
            m_reset_              = true;
            m_isForwardSearching_ = true;
            m_isOverlap_          = false;
            matchLength           = 0;
            m_lastMatchStart_     = DONE;
            m_setOffset_          = DONE;
        }
Esempio n. 8
0
        /// <summary>
        /// Moves the iteration position to the last boundary located before the
        /// given position and returns that boundary.
        /// </summary>
        ///
        /// <param name="offset">The position to begin searching from</param>
        /// <returns>The position of the last boundary before "offset"</returns>
        /// @stable ICU 2.0
        public override int Preceding(int offset)
        {
            ICharacterIterator text = GetText();

            IBM.ICU.Text.RuleBasedBreakIterator.CheckOffset(offset, text);

            // The cache can answer the query only when it exists and "offset"
            // lies after its first entry and at or before its last entry.
            bool cacheCoversOffset =
                cachedBreakPositions != null &&
                offset > cachedBreakPositions[0] &&
                offset <= cachedBreakPositions[cachedBreakPositions.Length - 1];

            if (!cacheCoversOffset)
            {
                // Drop the cache and defer to the inherited routine, which may
                // call back into this class and rebuild the cache.
                cachedBreakPositions = null;
                return base.Preceding(offset);
            }

            // Scan forward to the first entry that is not below "offset" ...
            positionInCache = 0;
            while (positionInCache < cachedBreakPositions.Length &&
                   cachedBreakPositions[positionInCache] < offset)
            {
                ++positionInCache;
            }

            // ... then step back one slot to land on the last entry before it.
            --positionInCache;
            text.SetIndex(cachedBreakPositions[positionInCache]);
            return text.GetIndex();
        }
Esempio n. 9
0
 /// <summary>
 /// Sets the iterator to analyze a new piece of text. The BreakIterator is
 /// passed a CharacterIterator through which it will access the text itself.
 /// The current iteration position is reset to the CharacterIterator's start
 /// index. (The old iterator is dropped.)
 /// </summary>
 ///
 /// <param name="newText">A CharacterIterator referring to the text to analyze with this
 /// BreakIterator (the iterator's current position is ignored, but its other
 /// state is significant).</param>
 /// @stable ICU 2.0
 public abstract void SetText(ICharacterIterator newText);
Esempio n. 10
0
 /// <summary>
 /// Creates a <c>UCharacterIterator</c> that wraps the supplied
 /// CharacterIterator.
 /// </summary>
 ///
 /// <param name="source">a valid CharacterIterator object.</param>
 /// <returns>UCharacterIterator object</returns>
 /// <exception cref="ArgumentException">if the argument is null (raised by the wrapper's constructor)</exception>
 /// @stable ICU 2.4
 public static UCharacterIterator GetInstance(ICharacterIterator source)
 {
     // All validation is delegated to the wrapper's constructor.
     UCharacterIterator wrapper = new CharacterIteratorWrapper(source);
     return wrapper;
 }
Esempio n. 11
0
        /// <summary>
        /// Matches the current regular expression program against the given
        /// character iterator, starting at a given index.
        /// </summary>
        /// <param name="search">Character iterator over the text to match against</param>
        /// <param name="i">Index to start searching at</param>
        /// <returns>True if the program matched somewhere at or after <paramref name="i"/></returns>
        bool IsMatch(ICharacterIterator search, int i)
        {
            // There is no compiled program to search with!
            if (program == null)
            {
                // This should be uncommon enough to be an error case rather
                // than an exception (which would have to be handled everywhere)
                InternalError("No RE program to run!");
            }

            // Save the text to search so the matching routines can reach it
            this.search = search;

            // Can we optimize the search by looking for new lines?
            if ((program.Flags & ProgramOptions.HasBeginOfLine) == ProgramOptions.HasBeginOfLine)
            {
                // Non multi-line matching with BOL: Must match at '0' index
                if ((matchFlags & RegexOptions.Multiline) == 0)
                {
                    return i == 0 && MatchAt(i);
                }

                // Multi-line matching with BOL: try once at the start of every line
                for (; !search.IsEnd(i); ++i)
                {
                    char currentChar = search.CharAt(i);

                    // Skip over newline characters; a line starts right after them
                    if (CharacterClass.IsNewline(ref currentChar))
                    {
                        continue;
                    }

                    // Match at the beginning of the line
                    if (MatchAt(i))
                    {
                        return true;
                    }

                    // Skip to the end of this line. The character must be
                    // re-read at every index: testing the character captured
                    // above (already known not to be a newline) would never
                    // stop, so only the first line would ever be tried.
                    for (; !search.IsEnd(i); ++i)
                    {
                        currentChar = search.CharAt(i);
                        if (CharacterClass.IsNewline(ref currentChar))
                        {
                            break;
                        }
                    }
                }

                return false;
            }

            // Can we optimize the search by looking for a prefix string?
            if (program.Prefix == null)
            {
                // Unprefixed matching must try for a match at each character.
                // The loop tests IsEnd(i - 1), so one attempt is also made at
                // the index just past the last character.
                for (; !search.IsEnd(i - 1); ++i)
                {
                    // Try a match at index i
                    if (MatchAt(i))
                    {
                        return true;
                    }
                }
                return false;
            }
            else
            {
                // Prefix-anchored matching is possible
                bool caseIndependent = (matchFlags & RegexOptions.IgnoreCase) != 0;
                char[] prefix = program.Prefix;
                int prefixLength = prefix.Length;

                // Stop once fewer than prefixLength characters remain
                for (; !search.IsEnd(i + prefixLength - 1); ++i)
                {
                    int j = i;
                    int k = 0;

                    bool match;
                    do
                    {
                        char currentChar = search.CharAt(j++);
                        char nextChar = prefix[k++];
                        // If there's a mismatch of any character in the prefix, give up
                        match = (CharacterClass.CompareChars(ref currentChar, ref nextChar, caseIndependent) == 0);
                    } while (match && k < prefixLength);

                    // The comparison loop reached the end of the prefix, so try
                    // a full match at i. (k also reaches prefixLength when only
                    // the final prefix character differed; MatchAt decides.)
                    if (k == prefixLength)
                    {
                        if (MatchAt(i))
                        {
                            return true;
                        }
                    }
                }
                return false;
            }
        }
Esempio n. 12
0
        /// <summary>
        /// This is the function that actually implements the dictionary-based
        /// algorithm. Given the endpoints of a range of text, it uses the dictionary
        /// to determine the positions of any boundaries in this range. It stores all
        /// the boundary positions it discovers in cachedBreakPositions so that we
        /// only have to do this work once for each time we enter the range.
        /// </summary>
        ///
        /// <param name="startPos">Start of the range; stored as the first cached break position</param>
        /// <param name="endPos">End of the range; stored as the last cached break position</param>
        private void DivideUpDictionaryRange(int startPos, int endPos)
        {
            ICharacterIterator text = GetText();

            // the range we're dividing may begin or end with non-dictionary
            // characters (i.e., for line breaking, we may have leading or
            // trailing punctuation that needs to be kept with the word). Seek
            // from the beginning of the range to the first dictionary character
            text.SetIndex(startPos);
            int c = IBM.ICU.Text.RuleBasedBreakIterator.CICurrent32(text);

            while (IsDictionaryChar(c) == false)
            {
                c = IBM.ICU.Text.RuleBasedBreakIterator.CINext32(text);
            }

            // initialize. We maintain two stacks: currentBreakPositions contains
            // the list of break positions that will be returned if we successfully
            // finish traversing the whole range now. possibleBreakPositions lists
            // all other possible word ends we've passed along the way. (Whenever
            // we reach an error [a sequence of characters that can't begin any word
            // in the dictionary], we back up, possibly delete some breaks from
            // currentBreakPositions, move a break from possibleBreakPositions
            // to currentBreakPositions, and start over from there. This process
            // continues in this way until we either successfully make it all the
            // way across the range, or exhaust all of our combinations of break
            // positions.)
            Stack     currentBreakPositions  = new Stack();
            Stack     possibleBreakPositions = new Stack();
            ArrayList wrongBreakPositions    = new ArrayList();

            // the dictionary is implemented as a trie, which is treated as a state
            // machine. -1 represents the end of a legal word. Every word in the
            // dictionary is represented by a path from the root node to -1. A path
            // that ends in state 0 is an illegal combination of characters.
            int state = 0;

            // these two variables are used for error handling. We keep track of the
            // farthest we've gotten through the range being divided, and the
            // combination of breaks that got us that far. If we use up all possible
            // break combinations, the text contains an error or a word that's not in
            // the dictionary. In this case, we "bless" the break positions that got
            // us the farthest as real break positions, and then start over from
            // scratch with the character where the error occurred.
            int   farthestEndPoint   = text.GetIndex();
            Stack bestBreakPositions = null;

            // initialize (we always exit the loop with a break statement)
            c = IBM.ICU.Text.RuleBasedBreakIterator.CICurrent32(text);
            while (true)
            {
                // if we can transition to state "-1" from our current state, we're
                // on the last character of a legal word. Push that position onto
                // the possible-break-positions stack
                if (dictionary.At(state, 0) == -1)
                {
                    possibleBreakPositions.Push(((int)(text.GetIndex())));
                }

                // look up the new state to transition to in the dictionary
                // There will be no supplementaries here because the Thai dictionary
                // does not include any. This code is going away soon, not worth
                // fixing.
                state = (dictionary.At(state, (char)c)) & 0xFFFF;      // TODO: fix
                                                                       // supplementaries

                // if the character we're sitting on causes us to transition to
                // the "end of word" state, then it was a non-dictionary character
                // and we've successfully traversed the whole range. Drop out
                // of the loop.
                if (state == /*-1*/ 0xFFFF)
                {
                    currentBreakPositions.Push(((int)(text.GetIndex())));
                    break;
                }

                // if the character we're sitting on causes us to transition to
                // the error state, or if we've gone off the end of the range
                // without transitioning to the "end of word" state, we've hit
                // an error...
                else if (state == 0 || text.GetIndex() >= endPos)
                {
                    // if this is the farthest we've gotten, take note of it in
                    // case there's an error in the text
                    if (text.GetIndex() > farthestEndPoint)
                    {
                        farthestEndPoint   = text.GetIndex();
                        bestBreakPositions = (Stack)(currentBreakPositions.Clone());
                    }

                    // wrongBreakPositions is a list of all break positions we've
                    // tried starting that didn't allow us to traverse all the way
                    // through the text. Every time we pop a break position off of
                    // currentBreakPositions, we put it into wrongBreakPositions to
                    // avoid trying it again later. If we make it to this spot,
                    // we're either going to back up to a break in
                    // possibleBreakPositions and try starting over from there, or
                    // we've exhausted all possible break positions and are going
                    // to do the fallback procedure. This loop prevents us from
                    // messing with anything in possibleBreakPositions that didn't
                    // work as a starting point the last time we tried it (this is
                    // to prevent a bunch of repetitive checks from slowing down
                    // some extreme cases)
                    while (!(possibleBreakPositions.Count == 0) &&
                           wrongBreakPositions.Contains(possibleBreakPositions
                                                        .Peek()))
                    {
                        possibleBreakPositions.Pop();
                    }

                    // if we've used up all possible break-position combinations,
                    // there's an error or an unknown word in the text. In this
                    // case, we start over, treating the farthest character we've
                    // reached as the beginning of the range, and "blessing" the
                    // break positions that got us that far as real break positions
                    if ((possibleBreakPositions.Count == 0))
                    {
                        if (bestBreakPositions != null)
                        {
                            currentBreakPositions = bestBreakPositions;
                            if (farthestEndPoint < endPos)
                            {
                                text.SetIndex(farthestEndPoint + 1);
                            }
                            else
                            {
                                break;
                            }
                        }
                        else
                        {
                            if ((currentBreakPositions.Count == 0 || ((Int32)(currentBreakPositions
                                                                              .Peek())) != text.GetIndex()) &&
                                text.GetIndex() != startPos)
                            {
                                currentBreakPositions.Push(((int)(text
                                                                  .GetIndex())));
                            }
                            IBM.ICU.Text.RuleBasedBreakIterator.CINext32(text);
                            currentBreakPositions
                            .Push(((int)(text.GetIndex())));
                        }
                    }

                    // if we still have more break positions we can try, then
                    // promote the last break in possibleBreakPositions into
                    // currentBreakPositions, and get rid of all entries in
                    // currentBreakPositions that come after it. Then back up to
                    // that position and start over from there (i.e., treat that
                    // position as the beginning of a new word)
                    else
                    {
                        Int32  temp  = (Int32)possibleBreakPositions.Pop();
                        Object temp2 = null;
                        while (!(currentBreakPositions.Count == 0) &&
                               temp < ((Int32)currentBreakPositions
                                       .Peek()))
                        {
                            temp2 = currentBreakPositions.Pop();
                            wrongBreakPositions.Add(temp2);
                        }
                        currentBreakPositions.Push(temp);
                        text.SetIndex(((Int32)currentBreakPositions.Peek()));
                    }

                    // re-sync "c" for the next go-round, and drop out of the loop
                    // if we've made it off the end of the range
                    c     = IBM.ICU.Text.RuleBasedBreakIterator.CICurrent32(text);
                    state = 0;
                    if (text.GetIndex() >= endPos)
                    {
                        break;
                    }
                }

                // if we didn't hit any exceptional conditions on this last
                // iteration, just advance to the next character and loop
                else
                {
                    c = IBM.ICU.Text.RuleBasedBreakIterator.CINext32(text);
                }
            }

            // dump the last break position in the list, and replace it with the
            // actual end of the range (which may be the same character, or may be
            // further on because the range actually ended with non-dictionary
            // characters we want to keep with the word)
            if (!(currentBreakPositions.Count == 0))
            {
                currentBreakPositions.Pop();
            }
            currentBreakPositions.Push(((int)(endPos)));

            // Snapshot the stack once. Stack.ToArray() returns the elements in
            // pop order (most recently pushed first), so endPos is at index 0
            // and the earliest break is at the end of the array.
            object[] newestFirst = currentBreakPositions.ToArray();
            int      breakCount  = newestFirst.Length;

            // create a regular array to hold the break positions and copy
            // the break positions from the stack to the array (in addition,
            // our starting position goes into this array as a break position).
            // This array becomes the cache of break positions used by next()
            // and previous(), so this is where we actually refresh the cache.
            cachedBreakPositions    = new int[breakCount + 1];
            cachedBreakPositions[0] = startPos;

            // Fill the cache in push order (oldest first) by reading the
            // snapshot backwards, so that the cache starts at startPos and
            // ends at endPos.
            for (int i = 0; i < breakCount; i++)
            {
                cachedBreakPositions[i + 1] = ((Int32)newestFirst[breakCount - 1 - i]);
            }
            positionInCache = 0;
        }