/// <summary>
/// Fill the candidate list from the dictionary if the iterator has moved,
/// advance the iterator past the longest candidate, and return how many
/// candidate words start at the current position.
/// </summary>
public virtual int Candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd)
{
    int begin = fIter.Index;

    // Only re-query the dictionary when the iterator has moved since the
    // previous call; otherwise the cached results in lengths/count are reused.
    if (begin != offset)
    {
        offset = begin;
        prefix = dict.Matches(fIter, rangeEnd - begin, lengths, count, lengths.Length);

        // The dictionary leaves the iterator after the longest *prefix*, which
        // need not be an actual word. Rewind when nothing matched.
        if (count[0] <= 0)
        {
            fIter.SetIndex(begin);
        }
    }

    int found = count[0];
    if (found > 0)
    {
        // Position the iterator just past the longest candidate word.
        fIter.SetIndex(begin + lengths[found - 1]);
    }

    // Select the longest candidate and remember it as the marked fallback.
    current = found - 1;
    mark = current;
    return found;
}
/// <summary>
/// Divide up a range of text handled by this break engine, placing the
/// discovered word boundaries into <paramref name="foundBreaks"/>.
/// Works on an NFKC-normalized copy of the input and uses dynamic
/// programming over dictionary matches (minimizing the sum of negative
/// log probabilities) to pick the best segmentation, with a special
/// heuristic for runs of Katakana.
/// </summary>
/// <param name="inText">Source text; its index is repositioned to the last break found (if any).</param>
/// <param name="startPos">Start of the range to divide, inclusive.</param>
/// <param name="endPos">End of the range to divide, exclusive.</param>
/// <param name="foundBreaks">Output stack of break positions, in original-text coordinates.</param>
/// <returns>The number of breaks added to <paramref name="foundBreaks"/>.</returns>
public override int DivideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, DequeI foundBreaks)
{
    // Empty or inverted range: nothing to do.
    if (startPos >= endPos)
    {
        return (0);
    }
    inText.SetIndex(startPos);
    int inputLength = endPos - startPos;
    // charPositions[k] maps the k-th normalized character back to a UTF-16
    // offset (reallocated below if normalization changes the length).
    int[] charPositions = new int[inputLength + 1];
    StringBuffer s = new StringBuffer("");
    inText.SetIndex(startPos);
    // Copy the requested range out of the iterator.
    while (inText.Index < endPos)
    {
        s.Append(inText.Current);
        inText.Next();
    }
    string prenormstr = s.ToString();
#pragma warning disable 612, 618
    // Uses the deprecated Normalizer API (hence the pragma): quick-check
    // first, falling back to the full IsNormalized test on "maybe".
    bool isNormalized = Normalizer.QuickCheck(prenormstr, NormalizerMode.NFKC) == QuickCheckResult.Yes || Normalizer.IsNormalized(prenormstr, NormalizerMode.NFKC, 0);
#pragma warning restore 612, 618
    CharacterIterator text;
    int numChars = 0;
    if (isNormalized)
    {
        // Already NFKC: iterate code points directly to build the
        // normalized-index -> UTF-16-offset map.
        text = new StringCharacterIterator(prenormstr);
        int index = 0;
        charPositions[0] = 0;
        while (index < prenormstr.Length)
        {
            int codepoint = prenormstr.CodePointAt(index);
            index += Character.CharCount(codepoint);
            numChars++;
            charPositions[numChars] = index;
        }
    }
    else
    {
#pragma warning disable 612, 618
        // Not normalized: normalize a copy, then walk the incremental
        // Normalizer over the ORIGINAL string so charPositions maps each
        // normalized character back to a position in the original text.
        string normStr = Normalizer.Normalize(prenormstr, NormalizerMode.NFKC);
        text = new StringCharacterIterator(normStr);
        charPositions = new int[normStr.Length + 1];
        Normalizer normalizer = new Normalizer(prenormstr, NormalizerMode.NFKC, 0);
        int index = 0;
        charPositions[0] = 0;
        while (index < normalizer.EndIndex)
        {
            normalizer.Next();
            numChars++;
            index = normalizer.Index;
            charPositions[numChars] = index;
        }
#pragma warning restore 612, 618
    }

    // From here on out, do the algorithm. Note that our indices
    // refer to indices within the normalized string.
    // bestSnlp[i] = best total cost (sum of negative log probabilities) of a
    // segmentation of the first i normalized characters; kint32max = unreachable.
    int[] bestSnlp = new int[numChars + 1];
    bestSnlp[0] = 0;
    for (int i = 1; i <= numChars; i++)
    {
        bestSnlp[i] = kint32max;
    }
    // prev[i] = start index of the last word in the best segmentation ending at i.
    int[] prev = new int[numChars + 1];
    for (int i = 0; i <= numChars; i++)
    {
        prev[i] = -1;
    }
    // Longest dictionary word considered, in characters.
    int maxWordSize = 20;
    int[] values = new int[numChars];
    int[] lengths = new int[numChars];
    // dynamic programming to find the best segmentation
    bool is_prev_katakana = false;
    for (int i = 0; i < numChars; i++)
    {
        text.SetIndex(i);
        // Skip positions not reachable by any segmentation so far.
        if (bestSnlp[i] == kint32max)
        {
            continue;
        }
        int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
        int[] count_ = new int[1];
        // Fills lengths[]/values[] with all dictionary words starting at i.
        fDictionary.Matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
        int count = count_[0];

        // if there are no single character matches found in the dictionary
        // starting with this character, treat character as a 1-character word
        // with the highest value possible (i.e. the least likely to occur).
        // Exclude Korean characters from this treatment, as they should be
        // left together by default.
        text.SetIndex(i); // fDictionary.matches() advances the text position; undo that.
        if ((count == 0 || lengths[0] != 1) && CharacterIteration.Current32(text) != CharacterIteration.Done32 && !fHangulWordSet.Contains(CharacterIteration.Current32(text)))
        {
            values[count] = maxSnlp;
            lengths[count] = 1;
            count++;
        }

        // Relax: extend the best path ending at i by each candidate word.
        for (int j = 0; j < count; j++)
        {
            int newSnlp = bestSnlp[i] + values[j];
            if (newSnlp < bestSnlp[lengths[j] + i])
            {
                bestSnlp[lengths[j] + i] = newSnlp;
                prev[lengths[j] + i] = i;
            }
        }

        // In Japanese, single-character Katakana words are pretty rare.
        // So we apply the following heuristic to Katakana: any continuous
        // run of Katakana characters is considered a candidate word with
        // a default cost specified in the katakanaCost table according
        // to its length.
        bool is_katakana = IsKatakana(CharacterIteration.Current32(text));
        if (!is_prev_katakana && is_katakana)
        {
            // Only at the START of a Katakana run: measure its length.
            int j = i + 1;
            CharacterIteration.Next32(text);
            while (j < numChars && (j - i) < kMaxKatakanaGroupLength && IsKatakana(CharacterIteration.Current32(text)))
            {
                CharacterIteration.Next32(text);
                ++j;
            }
            // Runs at/over the max group length get no special treatment.
            if ((j - i) < kMaxKatakanaGroupLength)
            {
                int newSnlp = bestSnlp[i] + GetKatakanaCost(j - i);
                if (newSnlp < bestSnlp[j])
                {
                    bestSnlp[j] = newSnlp;
                    prev[j] = i;
                }
            }
        }
        is_prev_katakana = is_katakana;
    }

    // Back-trace the best path into t_boundary (collected in reverse order).
    int[] t_boundary = new int[numChars + 1];
    int numBreaks = 0;
    if (bestSnlp[numChars] == kint32max)
    {
        // No segmentation reached the end: fall back to one break at the end.
        t_boundary[numBreaks] = numChars;
        numBreaks++;
    }
    else
    {
        for (int i = numChars; i > 0; i = prev[i])
        {
            t_boundary[numBreaks] = i;
            numBreaks++;
        }
        Assert.Assrt(prev[t_boundary[numBreaks - 1]] == 0);
    }
    // Add a break for the start of the range unless a prior engine already
    // emitted one at or after startPos.
    if (foundBreaks.Count == 0 || foundBreaks.Peek() < startPos)
    {
        t_boundary[numBreaks++] = 0;
    }
    // Emit the breaks in forward order, mapped back to original-text offsets,
    // skipping duplicates and the range start itself.
    int correctedNumBreaks = 0;
    for (int i = numBreaks - 1; i >= 0; i--)
    {
        int pos = charPositions[t_boundary[i]] + startPos;
        if (!(foundBreaks.Contains(pos) || pos == startPos))
        {
            foundBreaks.Push(charPositions[t_boundary[i]] + startPos);
            correctedNumBreaks++;
        }
    }
    // The caller supplies the break at endPos; drop ours to avoid a duplicate.
    if (!foundBreaks.IsEmpty && foundBreaks.Peek() == endPos)
    {
        foundBreaks.Pop();
        correctedNumBreaks--;
    }
    // Leave the input iterator positioned on the last break found.
    if (!foundBreaks.IsEmpty)
    {
        inText.SetIndex(foundBreaks.Peek());
    }
    return (correctedNumBreaks);
}