コード例 #1
0
        public override int DivideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
                                                    DequeI foundBreaks)
        {
            if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD)
            {
                return(0);  // Not enough characters for word
            }
            int wordsFound = 0;
            int wordLength;
            int current;

            PossibleWord[] words = new PossibleWord[BURMESE_LOOKAHEAD];
            for (int i = 0; i < BURMESE_LOOKAHEAD; i++)
            {
                words[i] = new PossibleWord();
            }
            int uc;

            fIter.SetIndex(rangeStart);
            while ((current = fIter.Index) < rangeEnd)
            {
                wordLength = 0;

                //Look for candidate words at the current position
                int candidates = words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);

                // If we found exactly one, use that
                if (candidates == 1)
                {
                    wordLength  = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
                    wordsFound += 1;
                }

                // If there was more than one, see which one can take us forward the most words
                else if (candidates > 1)
                {
                    bool foundBest = false;
                    // If we're already at the end of the range, we're done
                    if (fIter.Index < rangeEnd)
                    {
                        do
                        {
                            int wordsMatched = 1;
                            if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                            {
                                if (wordsMatched < 2)
                                {
                                    // Followed by another dictionary word; mark first word as a good candidate
                                    words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                                    wordsMatched = 2;
                                }

                                // If we're already at the end of the range, we're done
                                if (fIter.Index >= rangeEnd)
                                {
                                    break;
                                }

                                // See if any of the possible second words is followed by a third word
                                do
                                {
                                    // If we find a third word, stop right away
                                    if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                                    {
                                        words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                                        foundBest = true;
                                        break;
                                    }
                                } while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].BackUp(fIter));
                            }
                        } while (words[wordsFound % BURMESE_LOOKAHEAD].BackUp(fIter) && !foundBest);
                    }
                    wordLength  = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
                    wordsFound += 1;
                }

                // We come here after having either found a word or not. We look ahead to the
                // next word. If it's not a dictionary word, we will combine it with the word we
                // just found (if there is one), but only if the preceding word does not exceed
                // the threshold.
                // The text iterator should now be positioned at the end of the word we found.
                if (fIter.Index < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD)
                {
                    // If it is a dictionary word, do nothing. If it isn't, then if there is
                    // no preceding word, or the non-word shares less than the minimum threshold
                    // of characters with a dictionary word, then scan to resynchronize
                    if (words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) <= 0 &&
                        (wordLength == 0 ||
                         words[wordsFound % BURMESE_LOOKAHEAD].LongestPrefix < BURMESE_PREFIX_COMBINE_THRESHOLD))
                    {
                        // Look for a plausible word boundary
                        int remaining = rangeEnd - (current + wordLength);
                        int pc        = fIter.Current;
                        int chars     = 0;
                        for (; ;)
                        {
                            fIter.Next();
                            uc     = fIter.Current;
                            chars += 1;
                            if (--remaining <= 0)
                            {
                                break;
                            }
                            if (fEndWordSet.Contains(pc) && fBeginWordSet.Contains(uc))
                            {
                                // Maybe. See if it's in the dictionary.
                                int candidate = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);
                                fIter.SetIndex(current + wordLength + chars);
                                if (candidate > 0)
                                {
                                    break;
                                }
                            }
                            pc = uc;
                        }

                        // Bump the word count if there wasn't already one
                        if (wordLength <= 0)
                        {
                            wordsFound += 1;
                        }

                        // Update the length with the passed-over characters
                        wordLength += chars;
                    }
                    else
                    {
                        // Backup to where we were for next iteration
                        fIter.SetIndex(current + wordLength);
                    }
                }

                // Never stop before a combining mark.
                int currPos;
                while ((currPos = fIter.Index) < rangeEnd && fMarkSet.Contains(fIter.Current))
                {
                    fIter.Next();
                    wordLength += fIter.Index - currPos;
                }

                // Look ahead for possible suffixes if a dictionary word does not follow.
                // We do this in code rather than using a rule so that the heuristic
                // resynch continues to function. For example, one of the suffix characters
                // could be a typo in the middle of a word.
                // NOT CURRENTLY APPLICABLE TO BURMESE

                // Did we find a word on this iteration? If so, push it on the break stack
                if (wordLength > 0)
                {
                    foundBreaks.Push(current + wordLength);
                }
            }

            // Don't return a break for the end of the dictionary range if there is one there
            if (foundBreaks.Peek() >= rangeEnd)
            {
                foundBreaks.Pop();
                wordsFound -= 1;
            }

            return(wordsFound);
        }
コード例 #2
0
        public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
                                                    DequeI foundBreaks)
        {
            if (startPos >= endPos)
            {
                return(0);
            }

            inText.SetIndex(startPos);

            int inputLength = endPos - startPos;

            int[]        charPositions = new int[inputLength + 1];
            StringBuffer s             = new StringBuffer("");

            inText.SetIndex(startPos);
            while (inText.Index < endPos)
            {
                s.Append(inText.Current);
                inText.Next();
            }
            string prenormstr = s.ToString();

#pragma warning disable 612, 618
            bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes ||
                                Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
            CharacterIterator text;
            int numChars = 0;
            if (isNormalized)
            {
                text = new StringCharacterIterator(prenormstr);
                int index = 0;
                charPositions[0] = 0;
                while (index < prenormstr.Length)
                {
                    int codepoint = prenormstr.CodePointAt(index);
                    index += Character.CharCount(codepoint);
                    numChars++;
                    charPositions[numChars] = index;
                }
            }
            else
            {
#pragma warning disable 612, 618
                string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
                text          = new StringCharacterIterator(normStr);
                charPositions = new int[normStr.Length + 1];
                Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
                int        index      = 0;
                charPositions[0] = 0;
                while (index < normalizer.EndIndex)
                {
                    normalizer.Next();
                    numChars++;
                    index = normalizer.Index;
                    charPositions[numChars] = index;
                }
#pragma warning restore 612, 618
            }

            // From here on out, do the algorithm. Note that our indices
            // refer to indices within the normalized string.
            int[] bestSnlp = new int[numChars + 1];
            bestSnlp[0] = 0;
            for (int i = 1; i <= numChars; i++)
            {
                bestSnlp[i] = kint32max;
            }

            int[] prev = new int[numChars + 1];
            for (int i = 0; i <= numChars; i++)
            {
                prev[i] = -1;
            }

            int   maxWordSize = 20;
            int[] values      = new int[numChars];
            int[] lengths     = new int[numChars];
            // dynamic programming to find the best segmentation
            bool is_prev_katakana = false;
            for (int i = 0; i < numChars; i++)
            {
                text.SetIndex(i);
                if (bestSnlp[i] == kint32max)
                {
                    continue;
                }

                int   maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
                int[] count_          = new int[1];
                fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
                int count = count_[0];

                // if there are no single character matches found in the dictionary
                // starting with this character, treat character as a 1-character word
                // with the highest value possible (i.e. the least likely to occur).
                // Exclude Korean characters from this treatment, as they should be
                // left together by default.
                text.SetIndex(i);  // fDictionary.matches() advances the text position; undo that.
                if ((count == 0 || lengths[0] != 1) && CharacterIteration.Current32(text) != CharacterIteration.Done32 && !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
                {
                    values[count]  = maxSnlp;
                    lengths[count] = 1;
                    count++;
                }

                for (int j = 0; j < count; j++)
                {
                    int newSnlp = bestSnlp[i] + values[j];
                    if (newSnlp < bestSnlp[lengths[j] + i])
                    {
                        bestSnlp[lengths[j] + i] = newSnlp;
                        prev[lengths[j] + i]     = i;
                    }
                }

                // In Japanese, single-character Katakana words are pretty rare.
                // So we apply the following heuristic to Katakana: any continuous
                // run of Katakana characters is considered a candidate word with
                // a default cost specified in the katakanaCost table according
                // to its length.
                bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
                if (!is_prev_katakana && is_katakana)
                {
                    int j = i + 1;
                    CharacterIteration.Next32(text);
                    while (j < numChars && (j - i) < kMaxKatakanaGroupLength && IsKatakana(CharacterIteration.Current32(text)))
                    {
                        CharacterIteration.Next32(text);
                        ++j;
                    }

                    if ((j - i) < kMaxKatakanaGroupLength)
                    {
                        int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                        if (newSnlp < bestSnlp[j])
                        {
                            bestSnlp[j] = newSnlp;
                            prev[j]     = i;
                        }
                    }
                }
                is_prev_katakana = is_katakana;
            }

            int[] t_boundary = new int[numChars + 1];
            int   numBreaks  = 0;
            if (bestSnlp[numChars] == kint32max)
            {
                t_boundary[numBreaks] = numChars;
                numBreaks++;
            }
            else
            {
                for (int i = numChars; i > 0; i = prev[i])
                {
                    t_boundary[numBreaks] = i;
                    numBreaks++;
                }
                Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
            }

            if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
            {
                t_boundary[numBreaks++] = 0;
            }

            int correctedNumBreaks = 0;
            for (int i = numBreaks - 1; i >= 0; i--)
            {
                int pos = charPositions[t_boundary[i]] + startPos;
                if (!(foundBreaks.Contains(pos) || pos == startPos))
                {
                    foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
                    correctedNumBreaks++;
                }
            }

            if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
            {
                foundBreaks.Pop();
                correctedNumBreaks--;
            }
            if (!foundBreaks.IsEmpty)
            {
                inText.SetIndex(foundBreaks.Peek());
            }
            return(correctedNumBreaks);
        }