/// <summary>
/// Populates and freezes the static <c>UnicodeSet</c> fields used by this break engine.
/// </summary>
static BurmeseBreakEngine()
{
    // Word set: characters matching both [:Mymr:] and [:LineBreak=SA:].
    fBurmeseWordSet = new UnicodeSet();
    fBurmeseWordSet.ApplyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
    fBurmeseWordSet.Compact();

    // Mark set: the word-set pattern further restricted by [:M:], plus U+0020.
    fMarkSet = new UnicodeSet();
    fMarkSet.ApplyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // End-of-word set: an unmodified copy of the word set.
    fEndWordSet = new UnicodeSet(fBurmeseWordSet);
    fEndWordSet.Compact();

    // Begin-of-word set.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x1000, 0x102A); // basic consonants and independent vowels
    fBeginWordSet.Compact();

    // Freeze every static set once it is fully built.
    fBurmeseWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
}
/// <summary>
/// Populates and freezes the static <c>UnicodeSet</c> fields used by this break engine.
/// </summary>
static LaoBreakEngine()
{
    // Word set: characters matching both [:Laoo:] and [:LineBreak=SA:].
    fLaoWordSet = new UnicodeSet();
    fLaoWordSet.ApplyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
    fLaoWordSet.Compact();

    // Mark set: the word-set pattern further restricted by [:M:], plus U+0020.
    fMarkSet = new UnicodeSet();
    fMarkSet.ApplyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // End-of-word set: a copy of the word set without the prefix vowels.
    fEndWordSet = new UnicodeSet(fLaoWordSet);
    fEndWordSet.Remove(0x0EC0, 0x0EC4); // prefix vowels
    fEndWordSet.Compact();

    // Begin-of-word set.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.Add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
    fBeginWordSet.Add(0x0EC0, 0x0EC4); // prefix vowels
    fBeginWordSet.Compact();

    // Freeze every static set once it is fully built.
    fLaoWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
}
/// <summary>
/// Populates and freezes the static <c>UnicodeSet</c> fields used by this break engine.
/// </summary>
static ThaiBreakEngine()
{
    // Word set: characters matching both [:Thai:] and [:LineBreak=SA:].
    fThaiWordSet = new UnicodeSet();
    fThaiWordSet.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]]");
    fThaiWordSet.Compact();

    // Mark set: the word-set pattern further restricted by [:M:], plus U+0020.
    fMarkSet = new UnicodeSet();
    fMarkSet.ApplyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // End-of-word set: a copy of the word set minus the characters removed below.
    fEndWordSet = new UnicodeSet(fThaiWordSet);
    fEndWordSet.Remove(0x0E31);         // MAI HAN-AKAT
    fEndWordSet.Remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    fEndWordSet.Compact();

    // Begin-of-word set.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
    fBeginWordSet.Add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    fBeginWordSet.Compact();

    // Suffix set: the two suffix characters named by the class constants.
    fSuffixSet = new UnicodeSet();
    fSuffixSet.Add(THAI_PAIYANNOI);
    fSuffixSet.Add(THAI_MAIYAMOK);
    fSuffixSet.Compact();

    // Freeze every static set once it is fully built.
    fThaiWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
    fSuffixSet.Freeze();
}
/// <summary>
/// Populates and freezes the static <c>UnicodeSet</c> fields used by this break engine.
/// </summary>
static KhmerBreakEngine()
{
    // Word set: characters matching both [:Khmer:] and [:LineBreak=SA:].
    fKhmerWordSet = new UnicodeSet();
    fKhmerWordSet.ApplyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
    fKhmerWordSet.Compact();

    // Mark set: the word-set pattern further restricted by [:M:], plus U+0020.
    fMarkSet = new UnicodeSet();
    fMarkSet.ApplyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // End-of-word set: a copy of the word set without U+17D2.
    fEndWordSet = new UnicodeSet(fKhmerWordSet);
    fEndWordSet.Remove(0x17D2); // KHMER SIGN COENG that combines some following characters
    fEndWordSet.Compact();

    // Begin-of-word set.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x1780, 0x17B3);
    fBeginWordSet.Compact();

    // Freeze every static set once it is fully built.
    fKhmerWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
}
/// <summary>
/// Applies a pattern to each of the four static script sets and then freezes it.
/// The sets themselves are created outside this constructor.
/// </summary>
static CjkBreakEngine()
{
    // Hangul: the explicit range U+AC00..U+D7A3.
    fHangulWordSet.ApplyPattern("[\\uac00-\\ud7a3]");
    fHangulWordSet.Freeze();

    // Han: everything matching [:Han:].
    fHanWordSet.ApplyPattern("[:Han:]");
    fHanWordSet.Freeze();

    // Katakana: [:Katakana:] plus U+FF9E and U+FF9F.
    fKatakanaWordSet.ApplyPattern("[[:Katakana:]\\uff9e\\uff9f]");
    fKatakanaWordSet.Freeze();

    // Hiragana: everything matching [:Hiragana:].
    fHiraganaWordSet.ApplyPattern("[:Hiragana:]");
    fHiraganaWordSet.Freeze();
}
/// <summary>
/// Builds the source caches for <paramref name="transform"/> by probing every code point
/// from 0 through 0x10FFFF.
/// </summary>
/// <param name="transform">
/// The transform to probe. It is stored on the instance and invoked once per code point,
/// and once more for each code point that has a decomposition.
/// </param>
/// <param name="normalizer">
/// When non-null, <c>sourceCache</c> starts as <c>[:^ccc=0:]</c> and decompositions are also
/// examined. When null, <c>sourceCache</c> starts empty and only code points whose direct
/// transform differs from the input are recorded.
/// </param>
public SourceTargetUtility(ITransform<string, string> transform, Normalizer2 normalizer)
{
    this.transform = transform;

    bool useNormalizer = normalizer != null;
    sourceCache = useNormalizer ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
    sourceStrings = new HashSet<string>();

    for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
    {
        // Record the code point if transforming it alone changes it.
        string transformed = transform.Transform(UTF16.ValueOf(codePoint));
        bool changedDirectly = !CharSequences.Equals(codePoint, transformed);
        if (changedDirectly)
        {
            sourceCache.Add(codePoint);
        }

        if (!useNormalizer)
        {
            continue;
        }

        // Decompositions come from the NFC member declared outside this constructor,
        // not from the normalizer argument.
        string decomposition = NFC.GetDecomposition(codePoint);
        if (decomposition == null)
        {
            continue;
        }

        // Record the decomposition string if transforming it changes it.
        if (!decomposition.Equals(transform.Transform(decomposition)))
        {
            sourceStrings.Add(decomposition);
        }

        // Code points not already recorded are added when the normalizer
        // does not report them as inert.
        if (!changedDirectly && !normalizer.IsInert(codePoint))
        {
            sourceCache.Add(codePoint);
        }

        // Additional upstream checks (non-starters appended to the transformed text,
        // and initial substrings of the decomposition) are disabled and not performed.
    }

    sourceCache.Freeze();
}