static LaoBreakEngine()
{
    // Word characters: Lao-script code points that line-break as SA (South-East Asian).
    fLaoWordSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]]");
    fLaoWordSet.Compact();

    // Combining marks (plus SPACE) that may follow a word character.
    fMarkSet = new UnicodeSet("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // A word may end on any word character except the prefix vowels.
    fEndWordSet = new UnicodeSet(fLaoWordSet);
    fEndWordSet.Remove(0x0EC0, 0x0EC4); // prefix vowels
    fEndWordSet.Compact();

    // Characters that may start a word.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
    fBeginWordSet.Add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
    fBeginWordSet.Add(0x0EC0, 0x0EC4); // prefix vowels
    fBeginWordSet.Compact();

    // Freeze the shared static sets so concurrent readers are safe.
    fLaoWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
}
static ThaiBreakEngine()
{
    // Word characters: Thai-script code points that line-break as SA (South-East Asian).
    fThaiWordSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]]");
    fThaiWordSet.Compact();

    // Combining marks (plus SPACE) that may follow a word character.
    fMarkSet = new UnicodeSet("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // A word may not end on MAI HAN-AKAT or a prefix vowel.
    fEndWordSet = new UnicodeSet(fThaiWordSet);
    fEndWordSet.Remove(0x0E31);         // MAI HAN-AKAT
    fEndWordSet.Remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
    fEndWordSet.Compact();

    // Characters that may start a word: consonants and prefix vowels.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x0E01, 0x0E2E);  // KO KAI through HO NOKHUK
    fBeginWordSet.Add(0x0E40, 0x0E44);  // SARA E through SARA AI MAIMALAI
    fBeginWordSet.Compact();

    // Characters that may be suffixed to a word.
    fSuffixSet = new UnicodeSet();
    fSuffixSet.Add(THAI_PAIYANNOI);
    fSuffixSet.Add(THAI_MAIYAMOK);
    fSuffixSet.Compact();

    // Freeze the shared static sets so concurrent readers are safe.
    fThaiWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
    fSuffixSet.Freeze();
}
static BurmeseBreakEngine()
{
    // Word characters: Myanmar-script code points that line-break as SA (South-East Asian).
    fBurmeseWordSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]]");
    fBurmeseWordSet.Compact();

    // Combining marks (plus SPACE) that may follow a word character.
    fMarkSet = new UnicodeSet("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // Any word character may end a word.
    fEndWordSet = new UnicodeSet(fBurmeseWordSet);
    fEndWordSet.Compact();

    // Characters that may start a word.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x1000, 0x102A); // basic consonants and independent vowels
    fBeginWordSet.Compact();

    // Freeze the shared static sets so concurrent readers are safe.
    fBurmeseWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
}
/// <summary>
/// Computes the set of characters this transliterator can read
/// (restricted by <paramref name="inputFilter"/>) and the set it can write.
/// </summary>
/// <seealso cref="Transliterator.AddSourceTargetSet(UnicodeSet, UnicodeSet, UnicodeSet)"/>
public override void AddSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)
{
    UnicodeSet filter = GetFilterAsUnicodeSet(inputFilter);
    UnicodeSet candidates = new UnicodeSet();
    StringBuilder digits = new StringBuilder();

    // Each spec block is a five-value header (prefix length, suffix length,
    // radix, minimum digit count, maximum digit count) followed by the
    // prefix/suffix characters themselves; the list is terminated by END.
    int idx = 0;
    while (spec[idx] != END)
    {
        int blockEnd = idx + spec[idx] + spec[idx + 1] + 5;
        int radix = spec[idx + 2];

        // Every digit representable in this radix may appear in an escape.
        for (int digit = 0; digit < radix; ++digit)
        {
            Utility.AppendNumber(digits, digit, radix, 0);
        }

        // The literal prefix/suffix characters follow the header.
        for (int k = idx + 5; k < blockEnd; ++k)
        {
            candidates.Add(spec[k]);
        }

        // Advance to the next block.
        idx = blockEnd;
    }
    candidates.AddAll(digits.ToString());
    candidates.RetainAll(filter);

    if (candidates.Count > 0)
    {
        sourceSet.AddAll(candidates);
        targetSet.AddAll(0, 0x10FFFF); // assume we can produce any character
    }
}
static KhmerBreakEngine()
{
    // Word characters: Khmer-script code points that line-break as SA (South-East Asian).
    fKhmerWordSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]]");
    fKhmerWordSet.Compact();

    // Combining marks (plus SPACE) that may follow a word character.
    fMarkSet = new UnicodeSet("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
    fMarkSet.Add(0x0020);
    fMarkSet.Compact();

    // A word may end on any word character except the coeng sign.
    fEndWordSet = new UnicodeSet(fKhmerWordSet);
    fEndWordSet.Remove(0x17D2); // KHMER SIGN COENG that combines some following characters
    fEndWordSet.Compact();

    // Characters that may start a word.
    fBeginWordSet = new UnicodeSet();
    fBeginWordSet.Add(0x1780, 0x17B3);
    fBeginWordSet.Compact();

    // Freeze the shared static sets so concurrent readers are safe.
    fKhmerWordSet.Freeze();
    fMarkSet.Freeze();
    fEndWordSet.Freeze();
    fBeginWordSet.Freeze();
}
/// <summary>
/// Constructs a dictionary-based break engine for CJK text.
/// </summary>
/// <param name="korean">If true the engine handles the Hangul repertoire;
/// otherwise it handles Han plus the kana syllabaries.</param>
public CjkBreakEngine(bool korean)
    : base(BreakIterator.KIND_WORD)
{
    fDictionary = DictionaryData.LoadDictionaryFor("Hira");
    if (!korean)
    {
        // Chinese and Japanese: Han, Katakana, and Hiragana, plus the
        // prolonged-sound marks used with either width of kana.
        UnicodeSet chineseJapanese = new UnicodeSet();
        chineseJapanese.AddAll(fHanWordSet);
        chineseJapanese.AddAll(fKatakanaWordSet);
        chineseJapanese.AddAll(fHiraganaWordSet);
        chineseJapanese.Add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
        chineseJapanese.Add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
        SetCharacters(chineseJapanese);
    }
    else
    {
        SetCharacters(fHangulWordSet);
    }
}
/// <summary>
/// Find the source and target sets, subject to the input filter.
/// There is a known issue with filters containing multiple characters.
/// </summary>
// TODO: Problem: the rule is [{ab}]c > x
// The filter is [a{bc}].
// If the input is abc, then the rule will work.
// However, following code applying the filter won't catch that case.
internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
{
    // NOTE: 'revisiting' is currently unused here; it is kept for
    // signature compatibility with callers.
    int limit = anteContextLength + keyLength;
    UnicodeSet tempSource = new UnicodeSet();
    UnicodeSet temp = new UnicodeSet();

    // Walk through the key portion of the pattern. Only if some of the
    // characters at ALL of the positions are matched by the filter do we
    // contribute anything to the source set.
    for (int i = anteContextLength; i < limit;)
    {
        int ch = UTF16.CharAt(pattern, i);
        i += UTF16.GetCharCount(ch);
        IUnicodeMatcher matcher = data.LookupMatcher(ch);
        if (matcher == null)
        {
            // Literal character: it must itself pass the filter.
            if (!filter.Contains(ch))
            {
                return;
            }
            tempSource.Add(ch);
        }
        else if (matcher is UnicodeSet matcherSet)
        {
            // Set-valued matcher: test its set directly against the filter.
            // (Replaces the original cast-and-catch-InvalidCastException,
            // which used exceptions for control flow; behavior is identical.)
            if (!filter.ContainsSome(matcherSet))
            {
                return;
            }
            matcher.AddMatchSetTo(tempSource);
        }
        else
        {
            // Non-set matcher: materialize its match set, then test it.
            temp.Clear();
            matcher.AddMatchSetTo(temp);
            if (!filter.ContainsSome(temp))
            {
                return;
            }
            tempSource.AddAll(temp);
        }
    }

    // If we made our way through the gauntlet, add to source/target.
    sourceSet.AddAll(tempSource);
    output.AddReplacementSetTo(targetSet);
}
/// <summary>
/// Union the set of all characters that may output by this object
/// into the given set.
/// </summary>
/// <param name="toUnionTo">The set into which to union the output characters.</param>
public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
{
    int i = 0;
    while (i < output.Length)
    {
        int ch = UTF16.CharAt(output, i);
        i += UTF16.GetCharCount(ch);
        IUnicodeReplacer r = data.LookupReplacer(ch);
        if (r != null)
        {
            // Stand-in for a nested replacer: delegate to its output set.
            r.AddReplacementSetTo(toUnionTo);
        }
        else
        {
            // Literal output character.
            toUnionTo.Add(ch);
        }
    }
}
/// <summary>
/// Implementation of <see cref="IUnicodeMatcher"/> API. Union the set of all
/// characters that may be matched by this object into the given
/// set.
/// </summary>
/// <param name="toUnionTo">The set into which to union the source characters.</param>
public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
{
    int i = 0;
    while (i < pattern.Length)
    {
        int ch = UTF16.CharAt(pattern, i);
        i += UTF16.GetCharCount(ch);
        IUnicodeMatcher matcher = data.LookupMatcher(ch);
        if (matcher != null)
        {
            // Stand-in for a nested matcher: delegate to its match set.
            matcher.AddMatchSetTo(toUnionTo);
        }
        else
        {
            // Literal pattern character.
            toUnionTo.Add(ch);
        }
    }
}
/// <summary>
/// Precomputes, for the given transform, the set of code points and strings
/// that the transform may change, optionally taking normalization into account.
/// </summary>
/// <param name="transform">The string transform whose source coverage is computed.</param>
/// <param name="normalizer">Optional normalizer; when non-null, code points that
/// are not inert under it are also treated as potential sources.</param>
public SourceTargetUtility(ITransform<string, string> transform, Normalizer2 normalizer)
{
    this.transform = transform;

    // With a normalizer, pre-seed the cache with every non-starter
    // (ccc != 0); otherwise start empty.
    // NOTE(review): the upstream ICU4J version additionally built a
    // TRAILING_COMBINING set and probed initial substrings of each
    // decomposition; that logic was left commented out in the original
    // and is intentionally not reproduced here.
    sourceCache = normalizer != null
        ? new UnicodeSet("[:^ccc=0:]")
        : new UnicodeSet();
    sourceStrings = new HashSet<string>();

    for (int cp = 0; cp <= 0x10FFFF; ++cp)
    {
        string transformed = transform.Transform(UTF16.ValueOf(cp));
        bool cached = false;
        if (!CharSequences.Equals(cp, transformed))
        {
            // The transform changes this code point on its own.
            sourceCache.Add(cp);
            cached = true;
        }
        if (normalizer == null)
        {
            continue;
        }

        string decomp = NFC.GetDecomposition(cp);
        if (decomp == null)
        {
            continue;
        }

        // The transform may also change the decomposed form.
        transformed = transform.Transform(decomp);
        if (!decomp.Equals(transformed))
        {
            sourceStrings.Add(decomp);
        }

        // A non-inert code point may interact with neighbors, so treat it
        // as a potential source even if it is unchanged in isolation.
        if (!cached && !normalizer.IsInert(cp))
        {
            sourceCache.Add(cp);
        }
    }
    sourceCache.Freeze();
}