Beispiel #1
0
 /// <summary>
 /// Creates the engine: forwards the word and line break kinds to the base
 /// constructor, passes <c>fBurmeseWordSet</c> to <c>SetCharacters</c>, and
 /// loads the dictionary looked up under the <c>"Mymr"</c> key.
 /// </summary>
 public BurmeseBreakEngine()
     : base(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE)
 {
     // Key handed to DictionaryData.LoadDictionaryFor as its dictType argument.
     const string dictionaryType = "Mymr";

     SetCharacters(fBurmeseWordSet);
     fDictionary = DictionaryData.LoadDictionaryFor(dictionaryType);
 }
Beispiel #2
0
 /// <summary>
 /// Creates the engine: forwards the word and line break kinds to the base
 /// constructor, passes <c>fKhmerWordSet</c> to <c>SetCharacters</c>, and
 /// loads the dictionary looked up under the <c>"Khmr"</c> key.
 /// </summary>
 public KhmerBreakEngine()
     : base(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE)
 {
     // Key handed to DictionaryData.LoadDictionaryFor as its dictType argument.
     const string dictionaryType = "Khmr";

     SetCharacters(fKhmerWordSet);
     fDictionary = DictionaryData.LoadDictionaryFor(dictionaryType);
 }
Beispiel #3
0
 /// <summary>
 /// Creates the engine: forwards the word and line break kinds to the base
 /// constructor, passes <c>fLaoWordSet</c> to <c>SetCharacters</c>, and
 /// loads the dictionary looked up under the <c>"Laoo"</c> key.
 /// </summary>
 public LaoBreakEngine()
     : base(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE)
 {
     // Key handed to DictionaryData.LoadDictionaryFor as its dictType argument.
     const string dictionaryType = "Laoo";

     SetCharacters(fLaoWordSet);
     fDictionary = DictionaryData.LoadDictionaryFor(dictionaryType);
 }
Beispiel #4
0
        /// <summary>
        /// Loads the dictionary data registered for <paramref name="dictType"/> and
        /// wraps it in a matcher chosen by the trie type stored in the data's index block.
        /// </summary>
        /// <param name="dictType">
        /// Key appended to <c>"dictionaries/"</c> when looking up the dictionary file
        /// name in the break-iterator resource bundle.
        /// </param>
        /// <returns>
        /// A <c>BytesDictionaryMatcher</c> for byte tries, a <c>CharsDictionaryMatcher</c>
        /// for UChar tries, or <c>null</c> when the trie type is neither.
        /// </returns>
        public static DictionaryMatcher LoadDictionaryFor(string dictType)
        {
            // Bundle base name, e.g. com/ibm/icu/impl/data/icudt60b/brkitr
            var bundle = (ICUResourceBundle)UResourceBundle.GetBundleInstance(ICUData.IcuBreakIteratorBaseName);
            string dictFileName = bundle.GetStringWithFallback("dictionaries/" + dictType);

            // ICU4N TODO: Possibly switch to a ResourceManager lookup instead, e.g.
            // new ResourceManager(ICUData.ICU_BRKITR_BASE_NAME, typeof(DictionaryData).GetTypeInfo().Assembly)
            // with the key "dictionaries_" + dictType.

            dictFileName = ICUData.IcuBreakIteratorName + '/' + dictFileName;
            ByteBuffer buffer = ICUBinary.GetRequiredData(dictFileName);
            ICUBinary.ReadHeader(buffer, DATA_FORMAT_ID, null);

            // TODO: read indexes[IX_STRING_TRIE_OFFSET] first, then read a variable-length indexes[]
            int[] indexes = new int[IX_COUNT];
            for (int slot = 0; slot < indexes.Length; slot++)
            {
                indexes[slot] = buffer.GetInt32();
            }

            // The index block just read occupies IX_COUNT 32-bit values; the trie
            // may start later than that, in which case the gap is skipped.
            int indexBlockSize = 4 * IX_COUNT;
            int trieOffset = indexes[IX_STRING_TRIE_OFFSET];
            Assert.Assrt(trieOffset >= indexBlockSize);
            if (trieOffset > indexBlockSize)
            {
                ICUBinary.SkipBytes(buffer, trieOffset - indexBlockSize);
            }

            int trieType = indexes[IX_TRIE_TYPE] & TRIE_TYPE_MASK;
            int trieSize = indexes[IX_TOTAL_SIZE] - trieOffset;

            if (trieType == TRIE_TYPE_BYTES)
            {
                byte[] trieBytes = new byte[trieSize];
                buffer.Get(trieBytes);
                return new BytesDictionaryMatcher(trieBytes, indexes[IX_TRANSFORM]);
            }

            if (trieType == TRIE_TYPE_UCHARS)
            {
                // The trie is read as trieSize / 2 chars, so its byte size must be even.
                Assert.Assrt(trieSize % 2 == 0);
                string trieChars = ICUBinary.GetString(buffer, trieSize / 2, trieSize & 1);
                return new CharsDictionaryMatcher(trieChars);
            }

            // Unrecognized trie type: no matcher is produced.
            return null;
        }
Beispiel #5
0
 /// <summary>
 /// Creates the engine: forwards the word break kind to the base constructor,
 /// loads the dictionary looked up under the <c>"Hira"</c> key, and selects the
 /// character set the engine handles.
 /// </summary>
 /// <param name="korean">
 /// When <c>true</c>, <c>fHangulWordSet</c> is passed to <c>SetCharacters</c>.
 /// Otherwise the set is the union of <c>fHanWordSet</c>, <c>fKatakanaWordSet</c>
 /// and <c>fHiraganaWordSet</c> plus the two prolonged sound marks.
 /// </param>
 public CjkBreakEngine(bool korean)
     : base(BreakIterator.KIND_WORD)
 {
     // The same dictionary is loaded for both modes.
     fDictionary = DictionaryData.LoadDictionaryFor("Hira");

     if (korean)
     {
         SetCharacters(fHangulWordSet);
         return;
     }

     // Chinese and Japanese
     UnicodeSet chineseJapaneseSet = new UnicodeSet();
     chineseJapaneseSet.AddAll(fHanWordSet);
     chineseJapaneseSet.AddAll(fKatakanaWordSet);
     chineseJapaneseSet.AddAll(fHiraganaWordSet);
     chineseJapaneseSet.Add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
     chineseJapaneseSet.Add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
     SetCharacters(chineseJapaneseSet);
 }
Beispiel #6
0
            /// <summary>
            /// Refreshes the candidate list when the iterator has moved since the last
            /// call, positions the iterator after the last recorded candidate, and
            /// reports how many candidates there are.
            /// </summary>
            /// <param name="fIter">Text iterator; its index is read and may be moved.</param>
            /// <param name="dict">Dictionary queried via <c>Matches</c> when the cache is stale.</param>
            /// <param name="rangeEnd">End of the range; <c>rangeEnd - start</c> is passed to <c>Matches</c> as the limit.</param>
            /// <returns>The candidate count, <c>count[0]</c>.</returns>
            public virtual int Candidates(CharacterIterator fIter, DictionaryMatcher dict, int rangeEnd)
            {
                int start = fIter.Index;

                // Only query the dictionary when the cached results belong to another position.
                if (offset != start)
                {
                    offset = start;
                    prefix = dict.Matches(fIter, rangeEnd - start, lengths, count, lengths.Length);

                    // Dictionary leaves text after longest prefix, not longest word. Back up.
                    if (count[0] <= 0)
                    {
                        fIter.SetIndex(start);
                    }
                }

                int found = count[0];
                if (found > 0)
                {
                    // Last entry in lengths; presumably the longest match, assuming
                    // Matches records lengths in increasing order.
                    fIter.SetIndex(start + lengths[found - 1]);
                }

                // Both cursors start at the last candidate (or -1 when there is none).
                current = found - 1;
                mark = current;
                return found;
            }