public override int DivideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, DequeI foundBreaks)
{
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD)
    {
        return 0;  // Not enough characters for word
    }
    int wordsFound = 0;
    int wordLength;
    int current;
    PossibleWord[] words = new PossibleWord[BURMESE_LOOKAHEAD];
    for (int i = 0; i < BURMESE_LOOKAHEAD; i++)
    {
        words[i] = new PossibleWord();
    }
    int uc;

    fIter.SetIndex(rangeStart);
    while ((current = fIter.Index) < rangeEnd)
    {
        wordLength = 0;

        // Look for candidate words at the current position
        int candidates = words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);

        // If we found exactly one, use that
        if (candidates == 1)
        {
            wordLength = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
            wordsFound += 1;
        }
        // If there was more than one, see which one can take us forward the most words
        else if (candidates > 1)
        {
            bool foundBest = false;
            // If we're already at the end of the range, we're done
            if (fIter.Index < rangeEnd)
            {
                do
                {
                    int wordsMatched = 1;
                    if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                    {
                        if (wordsMatched < 2)
                        {
                            // Followed by another dictionary word; mark first word as a good candidate
                            words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                            wordsMatched = 2;
                        }

                        // If we're already at the end of the range, we're done
                        if (fIter.Index >= rangeEnd)
                        {
                            break;
                        }

                        // See if any of the possible second words is followed by a third word
                        do
                        {
                            // If we find a third word, stop right away
                            if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) > 0)
                            {
                                words[wordsFound % BURMESE_LOOKAHEAD].MarkCurrent();
                                foundBest = true;
                                break;
                            }
                        } while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].BackUp(fIter));
                    }
                } while (words[wordsFound % BURMESE_LOOKAHEAD].BackUp(fIter) && !foundBest);
            }
            wordLength = words[wordsFound % BURMESE_LOOKAHEAD].AcceptMarked(fIter);
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if (fIter.Index < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD)
        {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd) <= 0 &&
                    (wordLength == 0 ||
                     words[wordsFound % BURMESE_LOOKAHEAD].LongestPrefix < BURMESE_PREFIX_COMBINE_THRESHOLD))
            {
                // Look for a plausible word boundary
                int remaining = rangeEnd - (current + wordLength);
                int pc = fIter.Current;
                int chars = 0;
                for (; ; )
                {
                    fIter.Next();
                    uc = fIter.Current;
                    chars += 1;
                    if (--remaining <= 0)
                    {
                        break;
                    }
                    if (fEndWordSet.Contains(pc) && fBeginWordSet.Contains(uc))
                    {
                        // Maybe. See if it's in the dictionary.
                        int candidate = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].Candidates(fIter, fDictionary, rangeEnd);
                        fIter.SetIndex(current + wordLength + chars);
                        if (candidate > 0)
                        {
                            break;
                        }
                    }
                    pc = uc;
                }

                // Bump the word count if there wasn't already one
                if (wordLength <= 0)
                {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                wordLength += chars;
            }
            else
            {
                // Backup to where we were for next iteration
                fIter.SetIndex(current + wordLength);
            }
        }

        // Never stop before a combining mark.
        int currPos;
        while ((currPos = fIter.Index) < rangeEnd && fMarkSet.Contains(fIter.Current))
        {
            fIter.Next();
            wordLength += fIter.Index - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // NOT CURRENTLY APPLICABLE TO BURMESE

        // Did we find a word on this iteration? If so, push it on the break stack
        if (wordLength > 0)
        {
            foundBreaks.Push(current + wordLength);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there
    if (foundBreaks.Peek() >= rangeEnd)
    {
        foundBreaks.Pop();
        wordsFound -= 1;
    }

    return wordsFound;
}
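/// <summary>
/// Divides up a range of CJK text handled by this break engine: the range is copied
/// and NFKC-normalized, a dynamic-programming search finds the least-cost dictionary
/// segmentation, and a heuristic cost is applied to runs of Katakana. The resulting
/// break positions, expressed as original-text indices, are pushed onto
/// <paramref name="foundBreaks"/>.
/// </summary>
/// <param name="inText">A <see cref="CharacterIterator"/> over the text; on return it is
/// positioned at the last break found, if any.</param>
/// <param name="startPos">The start of the range of dictionary characters.</param>
/// <param name="endPos">The end of the range of dictionary characters.</param>
/// <param name="foundBreaks">Output stack of break positions.</param>
/// <returns>The number of breaks added to <paramref name="foundBreaks"/>.</returns>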
public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)
{
    if (startPos >= endPos)
    {
        return 0;
    }

    inText.SetIndex(startPos);

    int inputLength = endPos - startPos;
    int[] charPositions = new int[inputLength + 1];
    StringBuffer s = new StringBuffer("");
    inText.SetIndex(startPos);
    while (inText.Index < endPos)
    {
        s.Append(inText.Current);
        inText.Next();
    }
    string prenormstr = s.ToString();
#pragma warning disable 612, 618
    bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes ||
                        Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
    CharacterIterator text;
    int numChars = 0;
    if (isNormalized)
    {
        text = new StringCharacterIterator(prenormstr);
        int index = 0;
        charPositions[0] = 0;
        while (index < prenormstr.Length)
        {
            int codepoint = prenormstr.CodePointAt(index);
            index += Character.CharCount(codepoint);
            numChars++;
            charPositions[numChars] = index;
        }
    }
    else
    {
#pragma warning disable 612, 618
        string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
        text = new StringCharacterIterator(normStr);
        charPositions = new int[normStr.Length + 1];
        Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
        int index = 0;
        charPositions[0] = 0;
        while (index < normalizer.EndIndex)
        {
            normalizer.Next();
            numChars++;
            index = normalizer.Index;
            charPositions[numChars] = index;
        }
#pragma warning restore 612, 618
    }

    // From here on out, do the algorithm. Note that our indices
    // refer to indices within the normalized string.
    int[] bestSnlp = new int[numChars + 1];
    bestSnlp[0] = 0;
    for (int i = 1; i <= numChars; i++)
    {
        bestSnlp[i] = kint32max;
    }

    int[] prev = new int[numChars + 1];
    for (int i = 0; i <= numChars; i++)
    {
        prev[i] = -1;
    }

    int maxWordSize = 20;
    int[] values = new int[numChars];
    int[] lengths = new int[numChars];

    // Dynamic programming to find the best segmentation:
    // bestSnlp[j] holds the lowest accumulated cost of any segmentation of the
    // first j characters, and prev[j] records where the last word of that
    // segmentation begins.
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; i++)
    {
        text.SetIndex(i);
        if (bestSnlp[i] == kint32max)
        {
            continue;
        }

        int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
        int[] count_ = new int[1];
        fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
        int count = count_[0];

        // If there are no single-character matches found in the dictionary
        // starting with this character, treat the character as a 1-character word
        // with the highest value possible (i.e. the least likely to occur).
        // Exclude Korean characters from this treatment, as they should be
        // left together by default.
        text.SetIndex(i);  // fDictionary.Matches() advances the text position; undo that.
        if ((count == 0 || lengths[0] != 1) &&
                CharacterIteration.Current32(text) != CharacterIteration.Done32 &&
                !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
        {
            values[count] = maxSnlp;
            lengths[count] = 1;
            count++;
        }

        for (int j = 0; j < count; j++)
        {
            int newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i])
            {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, single-character Katakana words are pretty rare.
        // So we apply the following heuristic to Katakana: any continuous
        // run of Katakana characters is considered a candidate word with
        // a default cost specified in the katakanaCost table according
        // to its length.
        bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
        if (!is_prev_katakana && is_katakana)
        {
            int j = i + 1;
            CharacterIteration.Next32(text);
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
                    IsKatakana(CharacterIteration.Current32(text)))
            {
                CharacterIteration.Next32(text);
                ++j;
            }
            if ((j - i) < kMaxKatakanaGroupLength)
            {
                int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j])
                {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Walk the prev[] chain backwards from the end of the range to recover the
    // tentative boundaries of the best segmentation, in normalized character indices.
    int[] t_boundary = new int[numChars + 1];
    int numBreaks = 0;
    if (bestSnlp[numChars] == kint32max)
    {
        t_boundary[numBreaks] = numChars;
        numBreaks++;
    }
    else
    {
        for (int i = numChars; i > 0; i = prev[i])
        {
            t_boundary[numBreaks] = i;
            numBreaks++;
        }
        Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
    }

    if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
    {
        t_boundary[numBreaks++] = 0;
    }

    // Convert the normalized-text boundaries back to original-text positions and
    // push them onto foundBreaks, skipping duplicates and the start of the range.
    int correctedNumBreaks = 0;
    for (int i = numBreaks - 1; i >= 0; i--)
    {
        int pos = charPositions[t_boundary[i]] + startPos;
        if (!(foundBreaks.Contains(pos) || pos == startPos))
        {
            foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
            correctedNumBreaks++;
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
    if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
    {
        foundBreaks.Pop();
        correctedNumBreaks--;
    }
    if (!foundBreaks.IsEmpty)
    {
        inText.SetIndex(foundBreaks.Peek());
    }
    return correctedNumBreaks;
}
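// Usage sketch (illustrative; not part of either class). Applications do not call
// DivideUpDictionaryRange directly: RuleBasedBreakIterator hands each run of
// dictionary characters to the matching engine and merges the breaks pushed onto
// the deque into its own boundary cache. Assuming the ICU4N public surface mirrors
// ICU4J's (BreakIterator.GetWordInstance, SetText, First/Next and the Done
// sentinel), a caller that ultimately exercises this code looks roughly like:
//
//     BreakIterator bi = BreakIterator.GetWordInstance(new CultureInfo("ja"));
//     bi.SetText(japaneseOrBurmeseText);
//     for (int b = bi.First(); b != BreakIterator.Done; b = bi.Next())
//     {
//         // b is the next word boundary, produced in part by the engines above.
//     }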