Exemple #1
0
        /// <summary>
        /// Adds the characters this transliterator may read to <paramref name="sourceSet"/> and the
        /// characters it may produce to <paramref name="targetSet"/>, restricted by the effective filter.
        /// Nothing is added unless the filter admits both the opening and the closing delimiter.
        /// </summary>
        /// <param name="inputFilter">Filter supplied by the caller; combined via <c>GetFilterAsUnicodeSet</c>.</param>
        /// <param name="sourceSet">Receives the filtered source characters.</param>
        /// <param name="targetSet">Receives the full code point range when any source character survives.</param>
        /// <seealso cref="Transliterator.AddSourceTargetSet(UnicodeSet, UnicodeSet, UnicodeSet)"/>
        public override void AddSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)
        {
            UnicodeSet effectiveFilter = GetFilterAsUnicodeSet(inputFilter);

            // Both the prefix and the suffix delimiter have to pass the filter.
            bool delimitersAllowed = effectiveFilter.ContainsAll(UnicodeNameTransliterator.OPEN_DELIM)
                                     && effectiveFilter.Contains(CLOSE_DELIM);
            if (!delimitersAllowed)
            {
                return;
            }

            // Characters that can appear between the delimiters, plus the delimiters themselves.
            UnicodeSet candidates = new UnicodeSet()
                .AddAll('0', '9')
                .AddAll('A', 'F')
                .AddAll('a', 'z')  // for controls
                .Add('<').Add('>') // for controls
                .Add('(').Add(')') // for controls
                .Add('-')
                .Add(' ')
                .AddAll(UnicodeNameTransliterator.OPEN_DELIM)
                .Add(CLOSE_DELIM);

            candidates.RetainAll(effectiveFilter);
            if (candidates.Count <= 0)
            {
                return;
            }

            sourceSet.AddAll(candidates);
            // A name can resolve to any code point, so the target is the whole range.
            targetSet.AddAll(0, 0x10FFFF);
        }
Exemple #2
0
 /// <summary>
 /// Checks whether every element enumerated from this instance is contained in <paramref name="other"/>.
 /// </summary>
 /// <param name="other">The set to test membership against.</param>
 /// <returns><c>true</c> if no enumerated element is missing from <paramref name="other"/>; otherwise <c>false</c>.</returns>
 private bool IsSubsetOfInternal(UnicodeSet other)
 {
     bool everyElementFound = true;
     foreach (var element in this)
     {
         if (other.Contains(element))
         {
             continue;
         }

         // First missing element settles the answer; stop enumerating.
         everyElementFound = false;
         break;
     }
     return everyElementFound;
 }
Exemple #3
0
 /// <summary>
 /// Extends the set of unhandled characters recorded for <paramref name="breakType"/> so that it
 /// also covers every character sharing the script of <paramref name="c"/>.
 /// May be called concurrently with <see cref="Handles(int, int)"/> or <see cref="FindBreaks(CharacterIterator, int, int, int, DictionaryBreakEngine.DequeI)"/>.
 /// Must not be called concurrently with itself.
 /// </summary>
 /// <param name="c">The code point to record; ignored when equal to <c>CharacterIteration.Done32</c>.</param>
 /// <param name="breakType">Index into <c>fHandled</c>; out-of-range values are ignored.</param>
 public void HandleChar(int c, int breakType)
 {
     bool breakTypeInRange = breakType >= 0 && breakType < fHandled.Length;
     if (!breakTypeInRange || c == CharacterIteration.Done32)
     {
         return;
     }

     UnicodeSet existing = fHandled[breakType];
     if (existing.Contains(c))
     {
         // Already recorded; nothing to do.
         return;
     }

     // Build the enlarged set separately and swap it in with a single assignment,
     // so the set already stored in fHandled is never modified in place.
     int scriptCode = UChar.GetIntPropertyValue(c, UProperty.Script);
     UnicodeSet enlarged = new UnicodeSet();
     enlarged.ApplyInt32PropertyValue(UProperty.Script, scriptCode);
     enlarged.AddAll(existing);
     fHandled[breakType] = enlarged;
 }
Exemple #4
0
 /// <summary>
 /// Advances <paramref name="text"/> past the run of characters recorded as unhandled for
 /// <paramref name="breakType"/>, stopping at <paramref name="endPos"/>. No breaks are reported.
 /// </summary>
 /// <param name="text">Iterator positioned at the first character to examine; it is advanced in place.</param>
 /// <param name="startPos">Not used by this implementation.</param>
 /// <param name="endPos">Index at which scanning stops.</param>
 /// <param name="breakType">Index into <c>fHandled</c>; out-of-range values leave the iterator untouched.</param>
 /// <param name="foundBreaks">Not used by this implementation.</param>
 /// <returns>Always 0.</returns>
 public int FindBreaks(CharacterIterator text, int startPos, int endPos,
                       int breakType, DictionaryBreakEngine.DequeI foundBreaks)
 {
     if (breakType < 0 || breakType >= fHandled.Length)
     {
         return 0;
     }

     UnicodeSet unhandled = fHandled[breakType];
     for (int codePoint = CharacterIteration.Current32(text);
          text.Index < endPos && unhandled.Contains(codePoint);
          codePoint = CharacterIteration.Current32(text))
     {
         CharacterIteration.Next32(text);
     }
     return 0;
 }
Exemple #5
0
        /// <summary>
        /// Find the source and target sets, subject to the input filter.
        /// There is a known issue with filters containing multiple characters.
        /// </summary>
        /// <remarks>
        /// The key portion of the pattern (from <c>anteContextLength</c> up to
        /// <c>anteContextLength + keyLength</c>) is walked one code point at a time. If any
        /// position cannot be matched by <paramref name="filter"/>, the method returns without
        /// modifying <paramref name="sourceSet"/> or <paramref name="targetSet"/>.
        /// </remarks>
        /// <param name="filter">Filter that every key position must be able to match.</param>
        /// <param name="sourceSet">Receives the characters collected from the key positions.</param>
        /// <param name="targetSet">Receives the replacement set of the rule's output.</param>
        /// <param name="revisiting">Not used by this method.</param>
        // TODO: Problem: the rule is [{ab}]c > x
        // The filter is [a{bc}].
        // If the input is abc, then the rule will work.
        // However, following code applying the filter won't catch that case.
        internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
        {
            int        limit      = anteContextLength + keyLength;
            UnicodeSet tempSource = new UnicodeSet();
            UnicodeSet temp       = new UnicodeSet();

            // We need to walk through the pattern.
            // Iff some of the characters at ALL of the positions are matched by the filter,
            // then we add tempSource to sourceSet.
            for (int i = anteContextLength; i < limit;)
            {
                int ch = UTF16.CharAt(pattern, i);
                i += UTF16.GetCharCount(ch);
                IUnicodeMatcher matcher = data.LookupMatcher(ch);
                if (matcher == null)
                {
                    // Literal character: it must itself pass the filter.
                    if (!filter.Contains(ch))
                    {
                        return;
                    }
                    tempSource.Add(ch);
                    continue;
                }

                // Test the runtime type instead of casting and catching InvalidCastException:
                // the exception path is costly and would also swallow an InvalidCastException
                // raised from inside ContainsSome/AddMatchSetTo.
                UnicodeSet matcherSet = matcher as UnicodeSet;
                if (matcherSet != null)
                {
                    if (!filter.ContainsSome(matcherSet))
                    {
                        return;
                    }
                    matcher.AddMatchSetTo(tempSource);
                }
                else
                {
                    // The matcher is not a UnicodeSet: materialize its match set first.
                    temp.Clear();
                    matcher.AddMatchSetTo(temp);
                    if (!filter.ContainsSome(temp))
                    {
                        return;
                    }
                    tempSource.AddAll(temp);
                }
            }
            // if we made our way through the gauntlet, add to source/target
            sourceSet.AddAll(tempSource);
            output.AddReplacementSetTo(targetSet);
        }
Exemple #6
0
        /// <summary>
        /// Builds the list of non-overlapping character ranges from the Unicode sets
        /// referenced by the rule builder (<c>fRB.fUSetNodes</c>), assigns a group number to
        /// each distinct combination of sets, and writes the range-to-group mapping into
        /// <c>fTrie</c>.
        /// </summary>
        /// <remarks>
        /// The work is done in four passes: split the full code point range so that every
        /// range lies entirely inside or outside each input set; number the ranges so that
        /// ranges belonging to the same sets share a number; register the special
        /// <c>eof</c>/<c>bof</c> strings; and finally populate the trie.
        /// Debug output is printed when <c>fRB.fDebugEnv</c> contains the corresponding keyword.
        /// </remarks>
        internal virtual void Build()
        {
            RangeDescriptor rlRange;

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("usets", StringComparison.Ordinal) >= 0)
            {
                PrintSets();
            }

            //  Initialize the process by creating a single range encompassing all characters;
            //  at this point it belongs to no sets.
            //
            fRangeList            = new RangeDescriptor();
            fRangeList.fStartChar = 0;
            fRangeList.fEndChar   = 0x10ffff;

            //
            //  Find the set of non-overlapping ranges of characters
            //
            foreach (RBBINode usetNode in fRB.fUSetNodes)
            {
                UnicodeSet inputSet           = usetNode.fInputSet;
                int        inputSetRangeCount = inputSet.RangeCount;
                int        inputSetRangeIndex = 0;
                // Each input set is merged starting again from the head of the range list.
                rlRange = fRangeList;

                for (; ;)
                {
                    // All ranges of this input set have been processed.
                    if (inputSetRangeIndex >= inputSetRangeCount)
                    {
                        break;
                    }
                    int inputSetRangeBegin = inputSet.GetRangeStart(inputSetRangeIndex);
                    int inputSetRangeEnd   = inputSet.GetRangeEnd(inputSetRangeIndex);

                    // Skip over ranges from the range list that are completely
                    //   below the current range from the input Unicode set.
                    while (rlRange.fEndChar < inputSetRangeBegin)
                    {
                        rlRange = rlRange.fNext;
                    }

                    // If the start of the range from the range list is before
                    //   the start of the range from the Unicode set, split the range list range
                    //   in two, with one part being before (wholly outside of) the Unicode set
                    //   and the other containing the rest.
                    //   Then continue the loop; the post-split current range will then be skipped
                    //     over by the while loop above.
                    if (rlRange.fStartChar < inputSetRangeBegin)
                    {
                        rlRange.Split(inputSetRangeBegin);
                        continue;
                    }

                    // Same thing at the end of the ranges...
                    // If the end of the range from the range list doesn't coincide with
                    //   the end of the range from the Unicode set, split the range list
                    //   range in two.  The first part of the split range will be
                    //   wholly inside the Unicode set.
                    if (rlRange.fEndChar > inputSetRangeEnd)
                    {
                        rlRange.Split(inputSetRangeEnd + 1);
                    }

                    // The current rlRange is now entirely within the UnicodeSet range.
                    // Add this Unicode set to the list of sets for this rlRange
                    // (unless it is already listed).
                    if (rlRange.fIncludesSets.IndexOf(usetNode) == -1)
                    {
                        rlRange.fIncludesSets.Add(usetNode);
                    }

                    // Advance over ranges that we are finished with.
                    // The input set range is only finished once a range-list range ends
                    // exactly where it ends; otherwise the next range-list range still
                    // overlaps the same input range.
                    if (inputSetRangeEnd == rlRange.fEndChar)
                    {
                        inputSetRangeIndex++;
                    }
                    rlRange = rlRange.fNext;
                }
            }

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("range", StringComparison.Ordinal) >= 0)
            {
                PrintRanges();
            }

            //
            //  Group the above ranges, with each group consisting of one or more
            //    ranges that are in exactly the same set of original UnicodeSets.
            //    The groups are numbered, and these group numbers are the set of
            //    input symbols recognized by the run-time state machine.
            //
            //    Numbering: # 0  (state table column 0) is unused.
            //               # 1  is reserved - table column 1 is for end-of-input
            //               # 2  is reserved - table column 2 is for beginning-of-input
            //               # 3  is the first range list.
            //
            RangeDescriptor rlSearchRange;

            for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
            {
                // Look among the ranges that precede rlRange for one with an equal set list
                // and reuse its group number.
                for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext)
                {
                    if (ListEqualityComparer <RBBINode> .Default.Equals(rlRange.fIncludesSets, rlSearchRange.fIncludesSets))
                    {
                        rlRange.fNum = rlSearchRange.fNum;
                        break;
                    }
                }
                // fNum still 0 means no earlier range supplied a number, so start a new group.
                // NOTE(review): relies on fNum being 0 for a fresh RangeDescriptor - confirm.
                if (rlRange.fNum == 0)
                {
                    fGroupCount++;
                    rlRange.fNum = fGroupCount + 2;
                    rlRange.SetDictionaryFlag();
                    AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
                }
            }

            // Handle input sets that contain the special strings {eof} and {bof}.
            //   Column 1 of the state table is reserved for EOF on input.
            //   Column 2 is reserved for before-the-start-input.
            //            (This column can be optimized away later if there are no rule
            //             references to {bof}.)
            //   Add this column value (1 or 2) to the equivalent expression
            //     subtree for each UnicodeSet that contains the string {eof} or {bof}.
            //   Because {bof} and {eof} are not characters in the normal sense,
            //   they don't affect the computation of ranges or the trie.

            string eofString = "eof";
            string bofString = "bof";

            foreach (RBBINode usetNode in fRB.fUSetNodes)
            {
                UnicodeSet inputSet = usetNode.fInputSet;
                if (inputSet.Contains(eofString))
                {
                    AddValToSet(usetNode, 1);
                }
                if (inputSet.Contains(bofString))
                {
                    AddValToSet(usetNode, 2);
                    fSawBOF = true;
                }
            }


            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rgroup", StringComparison.Ordinal) >= 0)
            {
                PrintRangeGroups();
            }
            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("esets", StringComparison.Ordinal) >= 0)
            {
                PrintSets();
            }

            fTrie = new Trie2Writable(0,       //   Initial value for all code points.
                                      0);      //   Error value for out-of-range input.

            // Record the group number of every range in the trie.
            for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
            {
                fTrie.SetRange(
                    rlRange.fStartChar,         // Range start
                    rlRange.fEndChar,           // Range end (inclusive)
                    rlRange.fNum,               // value for range
                    true                        // Overwrite previously written values
                    );
            }
        }
Exemple #7
0
        /// <summary>
        /// Divides the text between <paramref name="rangeStart"/> and <paramref name="rangeEnd"/>
        /// into words using dictionary lookups, and pushes the end position of each word found
        /// onto <paramref name="foundBreaks"/>.
        /// </summary>
        /// <param name="fIter">Iterator over the text; it is repositioned while scanning.</param>
        /// <param name="rangeStart">Index of the first character of the range.</param>
        /// <param name="rangeEnd">Index at which the range ends (scanning stops once the iterator reaches it).</param>
        /// <param name="foundBreaks">Receives the break positions, in the order they are found.</param>
        /// <returns>
        /// The number of words found. A final break at or beyond <paramref name="rangeEnd"/> is popped
        /// again and not counted. Returns 0 when the range is shorter than <c>BURMESE_MIN_WORD</c>.
        /// </returns>
        public override int DivideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
                                                    DequeI foundBreaks)
        {
            if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD)
            {
                return(0);  // Not enough characters for word
            }
            int wordsFound = 0;
            int wordLength;
            int current;

            // Fixed-size buffer of candidate words, indexed modulo BURMESE_LOOKAHEAD:
            // slot wordsFound is the current word, wordsFound + 1 and + 2 are look-ahead words.
            PossibleWord[] words = new PossibleWord[BURMESE_LOOKAHEAD];
            for (int i = 0; i < BURMESE_LOOKAHEAD; i++)
            {
                words[i] = new PossibleWord();
            }
            int uc;

            fIter.SetIndex(rangeStart);
            // 'current' is the start of the word being determined in this iteration.
            while ((current = fIter.Index) < rangeEnd)
            {
                wordLength = 0;

                //Look for candidate words at the current position
                int candidates = words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);

                // If we found exactly one, use that
                if (candidates == 1)
                {
                    wordLength  = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
                    wordsFound += 1;
                }

                // If there was more than one, see which one can take us forward the most words
                else if (candidates > 1)
                {
                    bool foundBest = false;
                    // If we're already at the end of the range, we're done
                    if (fIter.Index < rangeEnd)
                    {
                        // Outer loop: try each candidate for the first word, backing up
                        // to the next candidate until a three-word chain is found.
                        do
                        {
                            int wordsMatched = 1;
                            if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                            {
                                if (wordsMatched < 2)
                                {
                                    // Followed by another dictionary word; mark first word as a good candidate
                                    words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                                    wordsMatched = 2;
                                }

                                // If we're already at the end of the range, we're done
                                if (fIter.Index >= rangeEnd)
                                {
                                    break;
                                }

                                // See if any of the possible second words is followed by a third word
                                do
                                {
                                    // If we find a third word, stop right away
                                    if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                                    {
                                        words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                                        foundBest = true;
                                        break;
                                    }
                                } while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].BackUp(fIter));
                            }
                        } while (words[wordsFound % BURMESE_LOOKAHEAD].BackUp(fIter) && !foundBest);
                    }
                    // Commit to whichever first-word candidate was marked above.
                    wordLength  = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
                    wordsFound += 1;
                }

                // We come here after having either found a word or not. We look ahead to the
                // next word. If it's not a dictionary word, we will combine it with the word we
                // just found (if there is one), but only if the preceding word does not exceed
                // the threshold.
                // The text iterator should now be positioned at the end of the word we found.
                if (fIter.Index < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD)
                {
                    // If it is a dictionary word, do nothing. If it isn't, then if there is
                    // no preceding word, or the non-word shares less than the minimum threshold
                    // of characters with a dictionary word, then scan to resynchronize
                    if (words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) <= 0 &&
                        (wordLength == 0 ||
                         words[wordsFound % BURMESE_LOOKAHEAD].LongestPrefix < BURMESE_PREFIX_COMBINE_THRESHOLD))
                    {
                        // Look for a plausible word boundary
                        int remaining = rangeEnd - (current + wordLength);
                        int pc        = fIter.Current;   // previous character
                        int chars     = 0;               // characters passed over while resynchronizing
                        for (; ;)
                        {
                            fIter.Next();
                            uc     = fIter.Current;
                            chars += 1;
                            // Stop once the rest of the range has been consumed.
                            if (--remaining <= 0)
                            {
                                break;
                            }
                            if (fEndWordSet.Contains(pc) && fBeginWordSet.Contains(uc))
                            {
                                // Maybe. See if it's in the dictionary.
                                int candidate = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);
                                // Restore the iterator to the position being tested.
                                fIter.SetIndex(current + wordLength + chars);
                                if (candidate > 0)
                                {
                                    break;
                                }
                            }
                            pc = uc;
                        }

                        // Bump the word count if there wasn't already one
                        if (wordLength <= 0)
                        {
                            wordsFound += 1;
                        }

                        // Update the length with the passed-over characters
                        wordLength += chars;
                    }
                    else
                    {
                        // Backup to where we were for next iteration
                        fIter.SetIndex(current + wordLength);
                    }
                }

                // Never stop before a combining mark.
                int currPos;
                while ((currPos = fIter.Index) < rangeEnd && fMarkSet.Contains(fIter.Current))
                {
                    fIter.Next();
                    wordLength += fIter.Index - currPos;
                }

                // Look ahead for possible suffixes if a dictionary word does not follow.
                // We do this in code rather than using a rule so that the heuristic
                // resynch continues to function. For example, one of the suffix characters
                // could be a typo in the middle of a word.
                // NOT CURRENTLY APPLICABLE TO BURMESE

                // Did we find a word on this iteration? If so, push it on the break stack
                if (wordLength > 0)
                {
                    foundBreaks.Push(current + wordLength);
                }
            }

            // Don't return a break for the end of the dictionary range if there is one there
            if (foundBreaks.Peek() >= rangeEnd)
            {
                foundBreaks.Pop();
                wordsFound -= 1;
            }

            return(wordsFound);
        }
Exemple #8
0
 /// <summary>
 /// Reports whether this engine applies to code point <paramref name="c"/> for the given break type.
 /// </summary>
 /// <param name="c">The code point to test against <c>fSet</c>.</param>
 /// <param name="breakType">The break type looked up in <c>fTypes</c>.</param>
 /// <returns>
 /// <c>true</c> only when <c>fTypes.SafeGet(breakType)</c> is <c>true</c> and <c>fSet</c>
 /// contains <paramref name="c"/>.
 /// </returns>
 public virtual bool Handles(int c, int breakType)
 {
     if (!fTypes.SafeGet(breakType))
     {
         // this break type cannot use us
         return false;
     }

     // we must also recognize the character
     return fSet.Contains(c);
 }
Exemple #9
0
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// Scans <paramref name="text"/> between <c>offsets.Start</c> and <c>offsets.Limit</c> for
        /// a character name enclosed by the open pattern and the close delimiter, and replaces
        /// each name that resolves via <c>UCharacter.GetCharFromExtendedName</c> with the
        /// corresponding character.
        /// </summary>
        /// <param name="text">The text to modify in place.</param>
        /// <param name="offsets">
        /// Supplies the start and limit of the region to process. On return, <c>Limit</c> and
        /// <c>ContextLimit</c> are adjusted for length changes and <c>Start</c> is advanced.
        /// </param>
        /// <param name="isIncremental">
        /// When <c>true</c> and an open delimiter candidate is still pending, <c>offsets.Start</c>
        /// is only advanced up to that candidate.
        /// </param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position offsets, bool isIncremental)
        {
            int maxLen = UCharacterName.Instance.MaxCharNameLength + 1; // allow for temporary trailing space

            // Accumulates the candidate name seen since the last open delimiter.
            StringBuffer name = new StringBuffer(maxLen);

            // Get the legal character set
            UnicodeSet legal = new UnicodeSet();

            UCharacterName.Instance.GetCharNameCharacters(legal);

            int cursor = offsets.Start;
            int limit  = offsets.Limit;

            // Modes:
            // 0 - looking for open delimiter
            // 1 - after open delimiter
            int mode    = 0;
            int openPos = -1; // open delim candidate pos

            int c;

            while (cursor < limit)
            {
                c = text.Char32At(cursor);

                switch (mode)
                {
                case 0:   // looking for open delimiter
                    if (c == OPEN_DELIM)
                    {     // quick check first
                        openPos = cursor;
                        int i = Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                        // Only enter mode 1 if the pattern matched and text remains after it.
                        if (i >= 0 && i < limit)
                        {
                            mode        = 1;
                            name.Length = 0;
                            cursor      = i;
                            continue;     // *** reprocess char32At(cursor)
                        }
                    }
                    break;

                case 1:     // after open delimiter
                    // Look for legal chars.  If \s+ is found, convert it
                    // to a single space.  If closeDelimiter is found, exit
                    // the loop.  If any other character is found, exit the
                    // loop.  If the limit is reached, exit the loop.

                    // Convert \s+ => SPACE.  This assumes there are no
                    // runs of >1 space characters in names.
                    if (PatternProps.IsWhiteSpace(c))
                    {
                        // Ignore leading whitespace
                        if (name.Length > 0 &&
                            name[name.Length - 1] != SPACE)
                        {
                            name.Append(SPACE);
                            // If we are too long then abort.  maxLen includes
                            // temporary trailing space, so use '>'.
                            if (name.Length > maxLen)
                            {
                                mode = 0;
                            }
                        }
                        break;
                    }

                    if (c == CLOSE_DELIM)
                    {
                        int len = name.Length;

                        // Delete trailing space, if any
                        if (len > 0 &&
                            name[len - 1] == SPACE)
                        {
                            name.Length = --len;
                        }

                        // c is reused here to hold the looked-up code point (-1 on failure).
                        c = UCharacter.GetCharFromExtendedName(name.ToString());
                        if (c != -1)
                        {
                            // Lookup succeeded

                            // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                            cursor++;     // advance over CLOSE_DELIM

                            string str = UTF16.ValueOf(c);
                            text.Replace(openPos, cursor, str);

                            // Adjust indices for the change in the length of
                            // the string.  Do not assume that str.length() ==
                            // 1, in case of surrogates.
                            int delta = cursor - openPos - str.Length;
                            cursor -= delta;
                            limit  -= delta;
                            // assert(cursor == openPos + str.length());
                        }
                        // If the lookup failed, we leave things as-is and
                        // still switch to mode 0 and continue.
                        mode    = 0;
                        openPos = -1; // close off candidate
                        continue;     // *** reprocess char32At(cursor)
                    }

                    if (legal.Contains(c))
                    {
                        UTF16.Append(name, c);
                        // If we go past the longest possible name then abort.
                        // maxLen includes temporary trailing space, so use '>='.
                        if (name.Length >= maxLen)
                        {
                            mode = 0;
                        }
                    }

                    // Invalid character
                    else
                    {
                        --cursor;     // Backup and reprocess this character
                        mode = 0;
                    }

                    break;
                }

                // Step past the character just examined.
                cursor += UTF16.GetCharCount(c);
            }

            // Propagate the net length change of the text to the caller's offsets.
            offsets.ContextLimit += limit - offsets.Limit;
            offsets.Limit         = limit;
            // In incremental mode, only advance the cursor up to the last
            // open delimiter candidate.
            offsets.Start = (isIncremental && openPos >= 0) ? openPos : cursor;
        }
Exemple #10
0
        /// <summary>
        /// Segments the text between <paramref name="startPos"/> and <paramref name="endPos"/> into
        /// words. The text is NFKC-normalized if necessary, a dynamic-programming pass picks the
        /// lowest-cost segmentation from dictionary matches plus a Katakana-run heuristic, and the
        /// resulting boundaries are mapped back through <c>charPositions</c> and pushed onto
        /// <paramref name="foundBreaks"/>.
        /// </summary>
        /// <param name="inText">Iterator over the original text; repositioned by this method.</param>
        /// <param name="startPos">Index of the first character of the range.</param>
        /// <param name="endPos">Index at which the range ends.</param>
        /// <param name="foundBreaks">Receives the new break positions; existing entries are not duplicated.</param>
        /// <returns>
        /// The number of breaks pushed by this call, less one if a break equal to
        /// <paramref name="endPos"/> was popped again. Returns 0 for an empty range.
        /// </returns>
        public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
                                                    DequeI foundBreaks)
        {
            if (startPos >= endPos)
            {
                return(0);
            }

            inText.SetIndex(startPos);

            int inputLength = endPos - startPos;

            // charPositions[k] is the offset, relative to startPos, at which the k-th
            // unit of the (normalized) text ends in the original text.
            int[]        charPositions = new int[inputLength + 1];
            StringBuffer s             = new StringBuffer("");

            // Copy the range into a string so it can be checked and normalized.
            inText.SetIndex(startPos);
            while (inText.Index < endPos)
            {
                s.Append(inText.Current);
                inText.Next();
            }
            string prenormstr = s.ToString();

#pragma warning disable 612, 618
            bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes ||
                                Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
            CharacterIterator text;
            int numChars = 0;
            if (isNormalized)
            {
                // Already NFKC: walk the string by code point and record where each one ends.
                text = new StringCharacterIterator(prenormstr);
                int index = 0;
                charPositions[0] = 0;
                while (index < prenormstr.Length)
                {
                    int codepoint = prenormstr.CodePointAt(index);
                    index += Character.CharCount(codepoint);
                    numChars++;
                    charPositions[numChars] = index;
                }
            }
            else
            {
#pragma warning disable 612, 618
                // Not normalized: segment the normalized text instead, and step a Normalizer
                // over the original string to record its index after each step.
                string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
                text          = new StringCharacterIterator(normStr);
                charPositions = new int[normStr.Length + 1];
                Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
                int        index      = 0;
                charPositions[0] = 0;
                while (index < normalizer.EndIndex)
                {
                    normalizer.Next();
                    numChars++;
                    index = normalizer.Index;
                    charPositions[numChars] = index;
                }
#pragma warning restore 612, 618
            }

            // From here on out, do the algorithm. Note that our indices
            // refer to indices within the normalized string.

            // bestSnlp[i] is the lowest accumulated cost found for a segmentation ending at i;
            // kint32max marks positions not reached so far.
            int[] bestSnlp = new int[numChars + 1];
            bestSnlp[0] = 0;
            for (int i = 1; i <= numChars; i++)
            {
                bestSnlp[i] = kint32max;
            }

            // prev[i] is the start of the last word in the best segmentation ending at i.
            int[] prev = new int[numChars + 1];
            for (int i = 0; i <= numChars; i++)
            {
                prev[i] = -1;
            }

            int   maxWordSize = 20;
            int[] values      = new int[numChars];
            int[] lengths     = new int[numChars];
            // dynamic programming to find the best segmentation
            bool is_prev_katakana = false;
            for (int i = 0; i < numChars; i++)
            {
                text.SetIndex(i);
                // Position i is not the end of any segmentation found so far; skip it.
                if (bestSnlp[i] == kint32max)
                {
                    continue;
                }

                int   maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
                // Single-element array used as an out parameter for the match count.
                int[] count_          = new int[1];
                fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
                int count = count_[0];

                // if there are no single character matches found in the dictionary
                // starting with this character, treat character as a 1-character word
                // with the highest value possible (i.e. the least likely to occur).
                // Exclude Korean characters from this treatment, as they should be
                // left together by default.
                text.SetIndex(i);  // fDictionary.matches() advances the text position; undo that.
                if ((count == 0 || lengths[0] != 1) && CharacterIteration.Current32(text) != CharacterIteration.Done32 && !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
                {
                    values[count]  = maxSnlp;
                    lengths[count] = 1;
                    count++;
                }

                // Relax every position reachable from i by one candidate word.
                for (int j = 0; j < count; j++)
                {
                    int newSnlp = bestSnlp[i] + values[j];
                    if (newSnlp < bestSnlp[lengths[j] + i])
                    {
                        bestSnlp[lengths[j] + i] = newSnlp;
                        prev[lengths[j] + i]     = i;
                    }
                }

                // In Japanese, single-character Katakana words are pretty rare.
                // So we apply the following heuristic to Katakana: any continuous
                // run of Katakana characters is considered a candidate word with
                // a default cost specified in the katakanaCost table according
                // to its length.
                bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
                // Only evaluate a run from its first character.
                if (!is_prev_katakana && is_katakana)
                {
                    int j = i + 1;
                    CharacterIteration.Next32(text);
                    while (j < numChars && (j - i) < kMaxKatakanaGroupLength && IsKatakana(CharacterIteration.Current32(text)))
                    {
                        CharacterIteration.Next32(text);
                        ++j;
                    }

                    // Runs that reached kMaxKatakanaGroupLength are not offered as a candidate.
                    if ((j - i) < kMaxKatakanaGroupLength)
                    {
                        int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                        if (newSnlp < bestSnlp[j])
                        {
                            bestSnlp[j] = newSnlp;
                            prev[j]     = i;
                        }
                    }
                }
                is_prev_katakana = is_katakana;
            }

            // Collect the boundaries, from the end of the text back toward the start.
            int[] t_boundary = new int[numChars + 1];
            int   numBreaks  = 0;
            if (bestSnlp[numChars] == kint32max)
            {
                // No segmentation reached the end: report only the end of the text.
                t_boundary[numBreaks] = numChars;
                numBreaks++;
            }
            else
            {
                // Follow the prev[] chain from the end back to position 0.
                for (int i = numChars; i > 0; i = prev[i])
                {
                    t_boundary[numBreaks] = i;
                    numBreaks++;
                }
                Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
            }

            // Also consider a boundary at the start of the range when no earlier break
            // at or after startPos has been recorded.
            if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
            {
                t_boundary[numBreaks++] = 0;
            }

            // Push the boundaries in ascending order, translated back to positions in the
            // original text; skip startPos itself and positions already present.
            int correctedNumBreaks = 0;
            for (int i = numBreaks - 1; i >= 0; i--)
            {
                int pos = charPositions[t_boundary[i]] + startPos;
                if (!(foundBreaks.Contains(pos) || pos == startPos))
                {
                    foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
                    correctedNumBreaks++;
                }
            }

            // Do not report a break at the very end of the range.
            if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
            {
                foundBreaks.Pop();
                correctedNumBreaks--;
            }
            // Leave the caller's iterator at the last break found, if any.
            if (!foundBreaks.IsEmpty)
            {
                inText.SetIndex(foundBreaks.Peek());
            }
            return(correctedNumBreaks);
        }