/// <summary>
/// Creates the Burmese break engine: forwards the word and line break kinds
/// to the base constructor, registers <c>fBurmeseWordSet</c> as the handled
/// character set, and loads the dictionary stored under the "Mymr" key.
/// </summary>
public BurmeseBreakEngine()
    : base(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE)
{
    SetCharacters(fBurmeseWordSet);

    // Dictionary lookup key; presumably the script code for Myanmar -- confirm against the resource bundle.
    fDictionary = DictionaryData.LoadDictionaryFor("Mymr");
}
/// <summary>
/// Creates the Khmer break engine: forwards the word and line break kinds
/// to the base constructor, registers <c>fKhmerWordSet</c> as the handled
/// character set, and loads the dictionary stored under the "Khmr" key.
/// </summary>
public KhmerBreakEngine()
    : base(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE)
{
    SetCharacters(fKhmerWordSet);

    // Dictionary lookup key; presumably the script code for Khmer -- confirm against the resource bundle.
    fDictionary = DictionaryData.LoadDictionaryFor("Khmr");
}
/// <summary>
/// Creates the Lao break engine: forwards the word and line break kinds
/// to the base constructor, registers <c>fLaoWordSet</c> as the handled
/// character set, and loads the dictionary stored under the "Laoo" key.
/// </summary>
public LaoBreakEngine()
    : base(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE)
{
    SetCharacters(fLaoWordSet);

    // Dictionary lookup key; presumably the script code for Lao -- confirm against the resource bundle.
    fDictionary = DictionaryData.LoadDictionaryFor("Laoo");
}
/// <summary>
/// Loads the dictionary registered under <paramref name="dictType"/> in the
/// break iterator resource bundle and wraps its trie data in a matcher.
/// </summary>
/// <param name="dictType">
/// Key appended to <c>"dictionaries/"</c> when looking up the dictionary
/// file name in the bundle.
/// </param>
/// <returns>
/// A <c>BytesDictionaryMatcher</c> when the trie type is <c>TRIE_TYPE_BYTES</c>,
/// a <c>CharsDictionaryMatcher</c> when it is <c>TRIE_TYPE_UCHARS</c>,
/// and <c>null</c> for any other trie type.
/// </returns>
public static DictionaryMatcher LoadDictionaryFor(string dictType)
{
    // Break iterator bundle, i.e. com/ibm/icu/impl/data/icudt60b/brkitr
    ICUResourceBundle bundle = (ICUResourceBundle)UResourceBundle.GetBundleInstance(ICUData.IcuBreakIteratorBaseName);
    string fileName = bundle.GetStringWithFallback("dictionaries/" + dictType);

    // ICU4N TODO: Possibly rename the above and use a ResourceManager lookup instead...?
    //   new ResourceManager(ICUData.ICU_BRKITR_BASE_NAME, typeof(DictionaryData).GetTypeInfo().Assembly)
    //   keyed by "dictionaries_" + dictType

    string resourcePath = ICUData.IcuBreakIteratorName + '/' + fileName;

    ByteBuffer buffer = ICUBinary.GetRequiredData(resourcePath);
    ICUBinary.ReadHeader(buffer, DATA_FORMAT_ID, null);

    // Read the fixed-size index table that follows the header, one 32-bit value per slot.
    // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[]
    int[] table = new int[IX_COUNT];
    for (int slot = 0; slot < IX_COUNT; slot++)
    {
        table[slot] = buffer.GetInt32();
    }

    // The trie may begin past the end of the index table; skip whatever lies in between.
    int indexTableBytes = 4 * IX_COUNT;
    int trieStart = table[IX_STRING_TRIE_OFFSET];
    Assert.Assrt(trieStart >= indexTableBytes);
    if (trieStart > indexTableBytes)
    {
        ICUBinary.SkipBytes(buffer, trieStart - indexTableBytes);
    }

    int trieKind = table[IX_TRIE_TYPE] & TRIE_TYPE_MASK;
    int payloadSize = table[IX_TOTAL_SIZE] - trieStart;

    if (trieKind == TRIE_TYPE_BYTES)
    {
        int transform = table[IX_TRANSFORM];
        byte[] payload = new byte[payloadSize];
        buffer.Get(payload);
        return new BytesDictionaryMatcher(payload, transform);
    }

    if (trieKind == TRIE_TYPE_UCHARS)
    {
        // The payload size is in bytes; it is halved for the string read, so it has to be even.
        Assert.Assrt(payloadSize % 2 == 0);
        string text = ICUBinary.GetString(buffer, payloadSize / 2, payloadSize & 1);
        return new CharsDictionaryMatcher(text);
    }

    // Unrecognised trie type: no matcher.
    return null;
}
/// <summary>
/// Creates the CJK break engine, forwarding the word break kind to the base
/// constructor and loading the dictionary stored under the "Hira" key.
/// </summary>
/// <param name="korean">
/// When <c>true</c>, the engine's character set is <c>fHangulWordSet</c>.
/// Otherwise it is the union of the Han, Katakana and Hiragana word sets
/// plus the two prolonged sound marks U+FF70 and U+30FC.
/// </param>
public CjkBreakEngine(bool korean)
    : base(BreakIterator.KIND_WORD)
{
    fDictionary = DictionaryData.LoadDictionaryFor("Hira");

    if (korean)
    {
        SetCharacters(fHangulWordSet);
        return;
    }

    // Chinese and Japanese
    UnicodeSet hanAndKana = new UnicodeSet();
    hanAndKana.AddAll(fHanWordSet);
    hanAndKana.AddAll(fKatakanaWordSet);
    hanAndKana.AddAll(fHiraganaWordSet);
    hanAndKana.Add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
    hanAndKana.Add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
    SetCharacters(hanAndKana);
}
/// <summary>
/// Fills the list of candidates if needed, selects the longest, and returns
/// the number found.
/// </summary>
/// <param name="fIter">
/// Text iterator; its current index is the position being examined. On
/// return it is moved past the longest candidate when at least one exists.
/// </param>
/// <param name="dict">Dictionary queried when the cached position is stale.</param>
/// <param name="rangeEnd">
/// End of the range; <c>rangeEnd</c> minus the current index is the length
/// limit passed to the dictionary.
/// </param>
/// <returns>The candidate count, i.e. <c>count[0]</c>.</returns>
public virtual int Candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd)
{
    int origin = fIter.Index;

    // Only consult the dictionary when this position differs from the cached one.
    bool rescanned = origin != offset;
    if (rescanned)
    {
        offset = origin;
        prefix = dict.Matches(fIter, rangeEnd - origin, lengths, count, lengths.Length);
    }

    int found = count[0];
    if (found > 0)
    {
        // Position the iterator just past the last (longest) recorded candidate.
        fIter.SetIndex(origin + lengths[found - 1]);
    }
    else if (rescanned)
    {
        // Dictionary leaves text after longest prefix, not longest word. Back up.
        fIter.SetIndex(origin);
    }

    current = found - 1;
    mark = current;
    return found;
}