Example #1
0
        /// <summary>
        /// Builds the shared, frozen character sets used when segmenting Burmese text.
        /// </summary>
        static BurmeseBreakEngine()
        {
            // Word characters: Myanmar-script code points whose LineBreak property is SA.
            UnicodeSet wordChars = new UnicodeSet();
            wordChars.ApplyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
            wordChars.Compact();

            // Marks: the subset of the above in general category M, plus U+0020.
            UnicodeSet markChars = new UnicodeSet();
            markChars.ApplyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
            markChars.Add(0x0020);
            markChars.Compact();

            // End-of-word characters start out as a copy of the word set; nothing is removed.
            UnicodeSet endChars = new UnicodeSet(wordChars);
            endChars.Compact();

            // Begin-of-word characters: basic consonants and independent vowels.
            UnicodeSet beginChars = new UnicodeSet();
            beginChars.Add(0x1000, 0x102A);
            beginChars.Compact();

            // Freeze everything before publishing; the sets are shared statics.
            wordChars.Freeze();
            markChars.Freeze();
            endChars.Freeze();
            beginChars.Freeze();

            fBurmeseWordSet = wordChars;
            fMarkSet = markChars;
            fEndWordSet = endChars;
            fBeginWordSet = beginChars;
        }
Example #2
0
        /// <summary>
        /// Builds the shared, frozen character sets used when segmenting Lao text.
        /// </summary>
        static LaoBreakEngine()
        {
            // Word characters: Lao-script code points whose LineBreak property is SA.
            UnicodeSet wordChars = new UnicodeSet();
            wordChars.ApplyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
            wordChars.Compact();

            // Marks: the subset of the above in general category M, plus U+0020.
            UnicodeSet markChars = new UnicodeSet();
            markChars.ApplyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
            markChars.Add(0x0020);
            markChars.Compact();

            // A word may end on any word character except the prefix vowels.
            UnicodeSet endChars = new UnicodeSet(wordChars);
            endChars.Remove(0x0EC0, 0x0EC4);
            endChars.Compact();

            // A word may begin with:
            UnicodeSet beginChars = new UnicodeSet();
            beginChars.Add(0x0E81, 0x0EAE);  // basic consonants (including holes for corresponding Thai characters)
            beginChars.Add(0x0EDC, 0x0EDD);  // digraph consonants (no Thai equivalent)
            beginChars.Add(0x0EC0, 0x0EC4);  // prefix vowels
            beginChars.Compact();

            // Freeze everything before publishing; the sets are shared statics.
            wordChars.Freeze();
            markChars.Freeze();
            endChars.Freeze();
            beginChars.Freeze();

            fLaoWordSet = wordChars;
            fMarkSet = markChars;
            fEndWordSet = endChars;
            fBeginWordSet = beginChars;
        }
Example #3
0
        /// <summary>
        /// Builds the shared, frozen character sets used when segmenting Thai text.
        /// </summary>
        static ThaiBreakEngine()
        {
            // Word characters: Thai-script code points whose LineBreak property is SA.
            UnicodeSet wordChars = new UnicodeSet();
            wordChars.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]]");
            wordChars.Compact();

            // Marks: the subset of the above in general category M, plus U+0020.
            UnicodeSet markChars = new UnicodeSet();
            markChars.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
            markChars.Add(0x0020);
            markChars.Compact();

            // A word may end on any word character except these.
            UnicodeSet endChars = new UnicodeSet(wordChars);
            endChars.Remove(0x0E31);          // MAI HAN-AKAT
            endChars.Remove(0x0E40, 0x0E44);  // SARA E through SARA AI MAIMALAI
            endChars.Compact();

            // A word may begin with:
            UnicodeSet beginChars = new UnicodeSet();
            beginChars.Add(0x0E01, 0x0E2E);   // KO KAI through HO NOKHUK
            beginChars.Add(0x0E40, 0x0E44);   // SARA E through SARA AI MAIMALAI
            beginChars.Compact();

            // Suffix characters: PAIYANNOI and MAIYAMOK.
            UnicodeSet suffixChars = new UnicodeSet();
            suffixChars.Add(THAI_PAIYANNOI);
            suffixChars.Add(THAI_MAIYAMOK);
            suffixChars.Compact();

            // Freeze everything before publishing; the sets are shared statics.
            wordChars.Freeze();
            markChars.Freeze();
            endChars.Freeze();
            beginChars.Freeze();
            suffixChars.Freeze();

            fThaiWordSet = wordChars;
            fMarkSet = markChars;
            fEndWordSet = endChars;
            fBeginWordSet = beginChars;
            fSuffixSet = suffixChars;
        }
Example #4
0
        /// <summary>
        /// Builds the shared, frozen character sets used when segmenting Khmer text.
        /// </summary>
        static KhmerBreakEngine()
        {
            // Word characters: Khmer-script code points whose LineBreak property is SA.
            UnicodeSet wordChars = new UnicodeSet();
            wordChars.ApplyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
            wordChars.Compact();

            // Marks: the subset of the above in general category M, plus U+0020.
            UnicodeSet markChars = new UnicodeSet();
            markChars.ApplyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
            markChars.Add(0x0020);
            markChars.Compact();

            // A word may end on any word character except
            // KHMER SIGN COENG, which combines some following characters.
            UnicodeSet endChars = new UnicodeSet(wordChars);
            endChars.Remove(0x17D2);
            endChars.Compact();

            // A word may begin with any code point in U+1780..U+17B3.
            UnicodeSet beginChars = new UnicodeSet();
            beginChars.Add(0x1780, 0x17B3);
            beginChars.Compact();

            // Freeze everything before publishing; the sets are shared statics.
            wordChars.Freeze();
            markChars.Freeze();
            endChars.Freeze();
            beginChars.Freeze();

            fKhmerWordSet = wordChars;
            fMarkSet = markChars;
            fEndWordSet = endChars;
            fBeginWordSet = beginChars;
        }
Example #5
0
        /// <summary>
        /// Populates and freezes the per-script character sets used by the CJK break engine.
        /// The set instances themselves are created outside this constructor.
        /// </summary>
        static CjkBreakEngine()
        {
            // Hangul: the code point range U+AC00..U+D7A3.
            fHangulWordSet.ApplyPattern("[\\uac00-\\ud7a3]");
            fHangulWordSet.Freeze();

            // Han: every code point matching [:Han:].
            fHanWordSet.ApplyPattern("[:Han:]");
            fHanWordSet.Freeze();

            // Katakana: every code point matching [:Katakana:], plus U+FF9E and U+FF9F.
            fKatakanaWordSet.ApplyPattern("[[:Katakana:]\\uff9e\\uff9f]");
            fKatakanaWordSet.Freeze();

            // Hiragana: every code point matching [:Hiragana:].
            fHiraganaWordSet.ApplyPattern("[:Hiragana:]");
            fHiraganaWordSet.Freeze();
        }
Example #6
0
        /// <summary>
        /// Scans every code point (U+0000..U+10FFFF) and records which ones, and which of
        /// their decompositions, are altered by <paramref name="transform"/>.
        /// </summary>
        /// <param name="transform">Transform applied to each code point and to each decomposition.</param>
        /// <param name="normalizer">
        /// Optional normalizer. When non-null, the code point set is seeded with
        /// <c>[:^ccc=0:]</c>, decompositions are examined, and any code point the normalizer
        /// does not report as inert is also recorded. When null, only the direct
        /// per-code-point transform check is performed.
        /// </param>
        public SourceTargetUtility(ITransform <string, string> transform, Normalizer2 normalizer)
        {
            this.transform = transform;

            bool useNormalizer = normalizer != null;
            sourceCache = useNormalizer ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
            sourceStrings = new HashSet<string>();

            // Note: the original source carried disabled logic that probed trailing combining
            // characters and initial substrings of the decomposition; none of it is executed.
            for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
            {
                // A code point belongs in the cache if transforming it alone changes it.
                string transformed = transform.Transform(UTF16.ValueOf(codePoint));
                bool changedAlone = !CharSequences.Equals(codePoint, transformed);
                if (changedAlone)
                {
                    sourceCache.Add(codePoint);
                }

                if (useNormalizer)
                {
                    // The decomposition is taken from the shared NFC instance,
                    // not from the normalizer passed in.
                    string decomposition = NFC.GetDecomposition(codePoint);
                    if (decomposition != null)
                    {
                        // Remember decompositions that the transform alters.
                        string transformedDecomposition = transform.Transform(decomposition);
                        if (!decomposition.Equals(transformedDecomposition))
                        {
                            sourceStrings.Add(decomposition);
                        }

                        // Code points not already cached are added when the normalizer
                        // does not consider them inert.
                        if (!changedAlone && !normalizer.IsInert(codePoint))
                        {
                            sourceCache.Add(codePoint);
                        }
                    }
                }
            }

            sourceCache.Freeze();
        }