Exemplo n.º 1
0
        static LaoBreakEngine()
        {
            // Lao-script characters that carry the LineBreak=SA property.
            UnicodeSet laoWords = new UnicodeSet();
            laoWords.ApplyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
            laoWords.Compact();

            // End-of-word candidates: every word character except the prefix vowels.
            // The copy is taken while the word set is still unfrozen.
            UnicodeSet endWords = new UnicodeSet(laoWords);
            endWords.Remove(0x0EC0, 0x0EC4); // prefix vowels
            endWords.Compact();

            // Marks: the Lao LineBreak=SA characters in general category M, plus U+0020.
            UnicodeSet marks = new UnicodeSet();
            marks.ApplyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
            marks.Add(0x0020);
            marks.Compact();

            // Start-of-word candidates.
            UnicodeSet beginWords = new UnicodeSet();
            beginWords.Add(0x0E81, 0x0EAE);  // basic consonants (including holes for corresponding Thai characters)
            beginWords.Add(0x0EDC, 0x0EDD);  // digraph consonants (no Thai equivalent)
            beginWords.Add(0x0EC0, 0x0EC4);  // prefix vowels
            beginWords.Compact();

            // The sets are shared statics, so freeze each one before publishing it.
            laoWords.Freeze();
            marks.Freeze();
            endWords.Freeze();
            beginWords.Freeze();

            fLaoWordSet   = laoWords;
            fMarkSet      = marks;
            fEndWordSet   = endWords;
            fBeginWordSet = beginWords;
        }
Exemplo n.º 2
0
        static ThaiBreakEngine()
        {
            // Thai-script characters that carry the LineBreak=SA property.
            UnicodeSet thaiWords = new UnicodeSet();
            thaiWords.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]]");
            thaiWords.Compact();

            // End-of-word candidates: start from every word character, then drop the
            // ones listed below. The copy is taken while the word set is still unfrozen.
            UnicodeSet endWords = new UnicodeSet(thaiWords);
            endWords.Remove(0x0E31);         // MAI HAN-AKAT
            endWords.Remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
            endWords.Compact();

            // Marks: the Thai LineBreak=SA characters in general category M, plus U+0020.
            UnicodeSet marks = new UnicodeSet();
            marks.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
            marks.Add(0x0020);
            marks.Compact();

            // Start-of-word candidates.
            UnicodeSet beginWords = new UnicodeSet();
            beginWords.Add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
            beginWords.Add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
            beginWords.Compact();

            // Suffix characters.
            UnicodeSet suffixes = new UnicodeSet();
            suffixes.Add(THAI_PAIYANNOI);
            suffixes.Add(THAI_MAIYAMOK);
            suffixes.Compact();

            // The sets are shared statics, so freeze each one before publishing it.
            thaiWords.Freeze();
            marks.Freeze();
            endWords.Freeze();
            beginWords.Freeze();
            suffixes.Freeze();

            fThaiWordSet  = thaiWords;
            fMarkSet      = marks;
            fEndWordSet   = endWords;
            fBeginWordSet = beginWords;
            fSuffixSet    = suffixes;
        }
Exemplo n.º 3
0
        static BurmeseBreakEngine()
        {
            // Myanmar-script characters that carry the LineBreak=SA property.
            UnicodeSet burmeseWords = new UnicodeSet();
            burmeseWords.ApplyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
            burmeseWords.Compact();

            // End-of-word candidates: an unmodified copy of the word set, taken
            // while the word set is still unfrozen.
            UnicodeSet endWords = new UnicodeSet(burmeseWords);
            endWords.Compact();

            // Marks: the Myanmar LineBreak=SA characters in general category M, plus U+0020.
            UnicodeSet marks = new UnicodeSet();
            marks.ApplyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
            marks.Add(0x0020);
            marks.Compact();

            // Start-of-word candidates.
            UnicodeSet beginWords = new UnicodeSet();
            beginWords.Add(0x1000, 0x102A); // basic consonants and independent vowels
            beginWords.Compact();

            // The sets are shared statics, so freeze each one before publishing it.
            burmeseWords.Freeze();
            marks.Freeze();
            endWords.Freeze();
            beginWords.Freeze();

            fBurmeseWordSet = burmeseWords;
            fMarkSet        = marks;
            fEndWordSet     = endWords;
            fBeginWordSet   = beginWords;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Collects the characters found in the <c>spec</c> table, together with the digit
        /// characters of every radix it declares, intersects them with the effective filter
        /// and, if anything remains, adds them to <paramref name="sourceSet"/> and marks the
        /// whole code point range as possible output in <paramref name="targetSet"/>.
        /// </summary>
        /// <param name="inputFilter">Filter passed to <c>GetFilterAsUnicodeSet</c> to obtain the effective filter.</param>
        /// <param name="sourceSet">Receives the surviving source characters.</param>
        /// <param name="targetSet">Receives the range 0..0x10FFFF when any source character survives.</param>
        /// <seealso cref="Transliterator.AddSourceTargetSet(UnicodeSet, UnicodeSet, UnicodeSet)"/>
        public override void AddSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)
        {
            // Each form consists of a prefix, suffix, radix, minimum digit count and
            // maximum digit count. These values are stored as a five character header.
            UnicodeSet    effectiveFilter = GetFilterAsUnicodeSet(inputFilter);
            UnicodeSet    candidates      = new UnicodeSet();
            StringBuilder digitText       = new StringBuilder();

            int blockStart = 0;
            while (spec[blockStart] != END)
            {
                // The block is the 5-entry header followed by as many characters as the
                // first two header entries add up to.
                int firstChar = blockStart + 5;
                int blockEnd  = blockStart + spec[blockStart] + spec[blockStart + 1] + 5;
                int radix     = spec[blockStart + 2];

                // Every digit value of this radix is rendered into the shared buffer.
                for (int digit = 0; digit < radix; ++digit)
                {
                    Utility.AppendNumber(digitText, digit, radix, 0);
                }

                // The characters that follow the header are candidates as well.
                for (int k = firstChar; k < blockEnd; ++k)
                {
                    candidates.Add(spec[k]);
                }

                blockStart = blockEnd;
            }

            candidates.AddAll(digitText.ToString());
            candidates.RetainAll(effectiveFilter);

            if (candidates.Count > 0)
            {
                sourceSet.AddAll(candidates);
                targetSet.AddAll(0, 0x10FFFF); // assume we can produce any character
            }
        }
Exemplo n.º 5
0
        static KhmerBreakEngine()
        {
            // Khmer-script characters that carry the LineBreak=SA property.
            UnicodeSet khmerWords = new UnicodeSet();
            khmerWords.ApplyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
            khmerWords.Compact();

            // End-of-word candidates: every word character except COENG. The copy is
            // taken while the word set is still unfrozen.
            UnicodeSet endWords = new UnicodeSet(khmerWords);
            endWords.Remove(0x17D2); // KHMER SIGN COENG that combines some following characters
            endWords.Compact();

            // Marks: the Khmer LineBreak=SA characters in general category M, plus U+0020.
            UnicodeSet marks = new UnicodeSet();
            marks.ApplyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
            marks.Add(0x0020);
            marks.Compact();

            // Start-of-word candidates.
            UnicodeSet beginWords = new UnicodeSet();
            beginWords.Add(0x1780, 0x17B3);
            beginWords.Compact();

            // The sets are shared statics, so freeze each one before publishing it.
            khmerWords.Freeze();
            marks.Freeze();
            endWords.Freeze();
            beginWords.Freeze();

            fKhmerWordSet = khmerWords;
            fMarkSet      = marks;
            fEndWordSet   = endWords;
            fBeginWordSet = beginWords;
        }
Exemplo n.º 6
0
 /// <summary>
 /// Creates a word-kind break engine, loads the dictionary registered under "Hira",
 /// and selects the character set the engine handles.
 /// </summary>
 /// <param name="korean">
 /// <c>true</c> to handle the Hangul word set; <c>false</c> to handle the union of the
 /// Han, Katakana and Hiragana word sets plus the two prolonged sound marks.
 /// </param>
 public CjkBreakEngine(bool korean)
     : base(BreakIterator.KIND_WORD)
 {
     fDictionary = DictionaryData.LoadDictionaryFor("Hira");

     if (!korean)
     {
         // Chinese and Japanese
         UnicodeSet chineseJapanese = new UnicodeSet();
         chineseJapanese.AddAll(fHanWordSet);
         chineseJapanese.AddAll(fKatakanaWordSet);
         chineseJapanese.AddAll(fHiraganaWordSet);
         chineseJapanese.Add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
         chineseJapanese.Add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
         SetCharacters(chineseJapanese);
         return;
     }

     SetCharacters(fHangulWordSet);
 }
Exemplo n.º 7
0
        /// <summary>
        /// Find the source and target sets, subject to the input filter.
        /// There is a known issue with filters containing multiple characters.
        /// </summary>
        /// <param name="filter">Filter that every position of the key must be able to match.</param>
        /// <param name="sourceSet">Receives the characters the key can match, if every position passes the filter.</param>
        /// <param name="targetSet">Receives the replacement characters of the output, if every position passes the filter.</param>
        /// <param name="revisiting">Not used by this method.</param>
        // TODO: Problem: the rule is [{ab}]c > x
        // The filter is [a{bc}].
        // If the input is abc, then the rule will work.
        // However, following code applying the filter won't catch that case.
        internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
        {
            int        limit      = anteContextLength + keyLength;
            UnicodeSet tempSource = new UnicodeSet();
            UnicodeSet temp       = new UnicodeSet();

            // We need to walk through the key portion of the pattern.
            // Iff the filter matches some character at ALL of the positions, then we add
            // the collected characters to sourceSet (and the output's characters to targetSet).
            for (int i = anteContextLength; i < limit;)
            {
                int ch = UTF16.CharAt(pattern, i);
                i += UTF16.GetCharCount(ch);
                IUnicodeMatcher matcher = data.LookupMatcher(ch);
                if (matcher == null)
                {
                    // Literal character: it must itself pass the filter.
                    if (!filter.Contains(ch))
                    {
                        return;
                    }
                    tempSource.Add(ch);
                }
                else if (matcher is UnicodeSet)
                {
                    // Type test instead of cast-and-catch: no exception is thrown for
                    // other matcher kinds, and an InvalidCastException raised inside a
                    // callee is no longer mistaken for "not a UnicodeSet".
                    if (!filter.ContainsSome((UnicodeSet)matcher))
                    {
                        return;
                    }
                    matcher.AddMatchSetTo(tempSource);
                }
                else
                {
                    // The matcher is not a UnicodeSet: materialize its match set first.
                    temp.Clear();
                    matcher.AddMatchSetTo(temp);
                    if (!filter.ContainsSome(temp))
                    {
                        return;
                    }
                    tempSource.AddAll(temp);
                }
            }
            // if we made our way through the gauntlet, add to source/target
            sourceSet.AddAll(tempSource);
            output.AddReplacementSetTo(targetSet);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Union the set of all characters that may be output by this object
        /// into the given set.
        /// </summary>
        /// <remarks>
        /// Each code point of <c>output</c> is looked up via <c>data.LookupReplacer</c>;
        /// when a replacer is found the request is forwarded to it, otherwise the
        /// code point itself is added.
        /// </remarks>
        /// <param name="toUnionTo">The set into which to union the output characters.</param>
        public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
        {
            int index = 0;
            while (index < output.Length)
            {
                int codePoint = UTF16.CharAt(output, index);
                IUnicodeReplacer replacer = data.LookupReplacer(codePoint);
                if (replacer != null)
                {
                    replacer.AddReplacementSetTo(toUnionTo);
                }
                else
                {
                    toUnionTo.Add(codePoint);
                }

                // Advance by the number of UTF-16 units this code point occupies.
                index += UTF16.GetCharCount(codePoint);
            }
        }
Exemplo n.º 9
0
        /// <summary>
        /// Implementation of <see cref="IUnicodeMatcher"/> API. Unions the set of all
        /// characters that may be matched by this object into the given set.
        /// </summary>
        /// <remarks>
        /// Each code point of <c>pattern</c> is looked up via <c>data.LookupMatcher</c>;
        /// when a matcher is found the request is forwarded to it, otherwise the
        /// code point itself is added.
        /// </remarks>
        /// <param name="toUnionTo">The set into which to union the source characters.</param>
        public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
        {
            int index = 0;
            while (index < pattern.Length)
            {
                int codePoint = UTF16.CharAt(pattern, index);
                IUnicodeMatcher nested = data.LookupMatcher(codePoint);
                if (nested != null)
                {
                    nested.AddMatchSetTo(toUnionTo);
                }
                else
                {
                    toUnionTo.Add(codePoint);
                }

                // Advance by the number of UTF-16 units this code point occupies.
                index += UTF16.GetCharCount(codePoint);
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Builds the source cache and source-string set for <paramref name="transform"/>
        /// by running it over every code point from 0 through 0x10FFFF.
        /// </summary>
        /// <remarks>
        /// Without a normalizer, the cache holds exactly the code points the transform changes.
        /// With a normalizer, the cache starts from <c>[:^ccc=0:]</c> and additionally
        /// receives every code point that has an NFC decomposition and is not inert under
        /// the normalizer; decompositions that the transform changes are recorded as source strings.
        /// </remarks>
        /// <param name="transform">The transform to probe.</param>
        /// <param name="normalizer">Optional normalizer; <c>null</c> disables the decomposition checks.</param>
        public SourceTargetUtility(ITransform <string, string> transform, Normalizer2 normalizer)
        {
            this.transform = transform;

            bool useNormalizer = normalizer != null;
            sourceCache   = useNormalizer ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
            sourceStrings = new HashSet <string>();

            for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
            {
                // Does the transform change the code point on its own?
                string transformed    = transform.Transform(UTF16.ValueOf(codePoint));
                bool   alreadyInCache = !CharSequences.Equals(codePoint, transformed);
                if (alreadyInCache)
                {
                    sourceCache.Add(codePoint);
                }

                if (!useNormalizer)
                {
                    continue;
                }

                string decomposition = NFC.GetDecomposition(codePoint);
                if (decomposition == null)
                {
                    continue;
                }

                // Record decompositions that the transform modifies.
                transformed = transform.Transform(decomposition);
                if (!decomposition.Equals(transformed))
                {
                    sourceStrings.Add(decomposition);
                }

                // Code points that are not inert under the normalizer are cached too;
                // the inertness check is skipped when the code point is already cached.
                if (!alreadyInCache && !normalizer.IsInert(codePoint))
                {
                    sourceCache.Add(codePoint);
                }

                // NOTE: further checks (appending trailing combining characters to the
                // transformed text, and adding initial substrings of the decomposition)
                // existed only as commented-out code here and are not performed.
            }

            sourceCache.Freeze();
        }