// set of property starts for UnicodeSet ------------------------------- ***
/// <summary>
/// Adds to <paramref name="set"/> the code points collected from the trie ranges,
/// the bidi mirroring table and the Joining_Group array.
/// </summary>
/// <param name="set">the set that receives the code points</param>
public void AddPropertyStarts(UnicodeSet set)
{
    // Trie: the start code point of every same-value range.
    TrieIterator trieRanges = new TrieIterator(trie);
    RangeValueIterator_Constants.Element range = new RangeValueIterator_Constants.Element();
    while (trieRanges.Next(range))
    {
        set.Add(range.start);
    }

    // Bidi mirroring table: each decoded code point together with its successor.
    int mirrorCount = indexes[IX_MIRROR_LENGTH];
    for (int m = 0; m < mirrorCount; m++)
    {
        int mirrored = GetMirrorCodePoint(mirrors[m]);
        set.Add(mirrored, mirrored + 1);
    }

    // Joining_Group array: every code point whose value differs from the
    // value of the code point before it (the value before the array is 0).
    int jgStart = indexes[IX_JG_START];
    int jgLimit = indexes[IX_JG_LIMIT];
    int jgCount = jgLimit - jgStart;
    sbyte previous = 0;
    for (int offset = 0; offset < jgCount; offset++)
    {
        sbyte current = jgArray[offset];
        if (current != previous)
        {
            set.Add(jgStart + offset);
            previous = current;
        }
    }

    // A non-zero last value ends at the limit, so the limit is a start too.
    if (previous != 0)
    {
        set.Add(jgLimit);
    }

    // Code points with hardcoded properties, plus the ones following them:
    // (none right now)
}
/// <summary>
/// Copies every item produced by <paramref name="source"/> into
/// <paramref name="result"/>: code point ranges are added as ranges and
/// string items are added as strings.
/// </summary>
/// <param name="source">iterator to drain with <c>NextRange()</c></param>
/// <param name="result">set that receives the items</param>
public static void AddAll(UnicodeSetIterator source, UnicodeSet result)
{
    while (source.NextRange())
    {
        bool isString = source.codepoint == IBM.ICU.Text.UnicodeSetIterator.IS_STRING;
        if (!isString)
        {
            result.Add(source.codepoint, source.codepointEnd);
            continue;
        }

        result.Add(source.str0);
    }
}
/// <summary>
/// Temporary test driver: prints the case iteration results for a couple of
/// sample strings, then computes and prints the set of "caseless" code points
/// and its differences with <c>[:^lc:]</c>.
/// </summary>
/// <param name="args">unused</param>
public static void Main(String[] args)
{
    String[] testCases = { "fiss", "h\u03a3" };
    CaseIterator ci = new CaseIterator();

    foreach (String item in testCases)
    {
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine("Testing: " + toName.Transliterate(item));
        System.Console.Out.WriteLine();

        ci.Reset(item);
        int variants = 0;
        String variant;
        while ((variant = ci.Next()) != null)
        {
            System.Console.Out.WriteLine(toName.Transliterate(variant));
            variants++;
        }
        System.Console.Out.WriteLine("Total: " + variants);
    }

    // generate a list of all caseless characters -- characters whose
    // case closure is themselves.
    UnicodeSet caseless = new UnicodeSet();
    for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
    {
        String cp = IBM.ICU.Text.UTF16.ValueOf(codePoint);
        ci.Reset(cp);

        // Only the first two results matter: stop as soon as a second one shows up.
        int seen = 0;
        String fold = null;
        String candidate;
        while ((candidate = ci.Next()) != null)
        {
            fold = candidate;
            seen++;
            if (seen > 1)
            {
                break;
            }
        }

        if (seen == 1 && fold.Equals(cp))
        {
            caseless.Add(codePoint);
        }
    }
    System.Console.Out.WriteLine("caseless = " + caseless.ToPattern(true));

    UnicodeSet not_lc = new UnicodeSet("[:^lc:]");
    UnicodeSet difference = new UnicodeSet();

    difference.Set(not_lc);
    difference.RemoveAll(caseless);
    System.Console.Out.WriteLine("[:^lc:] - caseless = " + difference.ToPattern(true));

    difference.Set(caseless);
    difference.RemoveAll(not_lc);
    System.Console.Out.WriteLine("caseless - [:^lc:] = " + difference.ToPattern(true));
}
// Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
// Also, ICU4C requires handleCE32() to be public because it is used by the callback
// function (enumTailoredRange()). This is not necessary for Java implementation.
/// <summary>
/// Checks each code point in <paramref name="start"/>..<paramref name="end"/>
/// against the base data and records or compares the ones whose CE32 differs.
/// </summary>
/// <param name="start">first code point of the range</param>
/// <param name="end">last code point of the range (inclusive)</param>
/// <param name="ce32">CE32 value shared by the range; must not be the fallback CE32</param>
private void HandleCE32(int start, int end, int ce32)
{
    Debug.Assert(ce32 != Collation.FALLBACK_CE32);

    if (Collation.IsSpecialCE32(ce32))
    {
        ce32 = data.GetIndirectCE32(ce32);
        if (ce32 == Collation.FALLBACK_CE32)
        {
            return;
        }
    }

    int c = start;
    do
    {
        int baseCE32 = baseData.GetFinalCE32(baseData.GetCE32(c));

        // Do not just continue if ce32 == baseCE32 because
        // contractions and expansions in different data objects
        // normally differ even if they have the same data offsets.
        bool bothSelfContained = Collation.IsSelfContainedCE32(ce32)
            && Collation.IsSelfContainedCE32(baseCE32);
        if (!bothSelfContained)
        {
            Compare(c, ce32, baseCE32);
        }
        else if (ce32 != baseCE32)
        {
            // fastpath
            tailored.Add(c);
        }

        c++;
    }
    while (c <= end);
}
/// <summary>
/// Adds the range <paramref name="start"/>..<paramref name="end"/> to the
/// expansions set: directly as code points when there is no prefix and no
/// suffix, otherwise through <c>AddStrings</c>.
/// </summary>
/// <param name="start">first code point of the range</param>
/// <param name="end">last code point of the range</param>
internal void AddExpansions(int start, int end)
{
    bool hasContext = unreversedPrefix.Length != 0 || suffix != null;
    if (hasContext)
    {
        AddStrings(start, end, expansions);
        return;
    }

    if (expansions != null)
    {
        expansions.Add(start, end);
    }
}
/*
 * Override of the test-boilerplate hook: appends the string form of one
 * randomly filled UnicodeSet to the list, and reports false once the list
 * already holds more than 32 entries.
 */
protected internal override bool _addTestObject(IList list)
{
    if (list.Count > 32)
    {
        return false;
    }

    UnicodeSet randomSet = new UnicodeSet();
    int remaining = 50;
    while (remaining > 0)
    {
        randomSet.Add(IBM.ICU.Charset.TestUtilities.random.Next(100));
        remaining--;
    }

    ILOG.J2CsMapping.Collections.Generics.Collections.Add(list, randomSet.ToString());
    return true;
}
/// <summary>
/// Builds the set of all keys of <paramref name="m"/> whose mapped value
/// equals <paramref name="value"/>.
/// </summary>
/// <param name="m">map from code point to value</param>
/// <param name="value">value to look for; may be null</param>
/// <returns>a new set holding every key mapped to an equal value</returns>
public static UnicodeSet GetSet(IDictionary <Integer, T> m, T value)
{
    UnicodeSet result = new UnicodeSet();

    // Walk the key/value pairs directly instead of looking every key up again.
    foreach (var pair in m)
    {
        // object.Equals is null-safe: a null stored value no longer throws,
        // and it matches only when the requested value is null as well.
        if (object.Equals(pair.Value, value))
        {
            result.Add(pair.Key.Value);
        }
    }

    return result;
}
/// <summary>
/// Builds the set of all keys of <paramref name="m"/> whose mapped value
/// equals <paramref name="value_ren"/>. Keys are cast to <c>Int32</c>.
/// </summary>
/// <param name="m">map from code point to value</param>
/// <param name="value_ren">value to look for; may be null</param>
/// <returns>a new set holding every key mapped to an equal value</returns>
public static UnicodeSet GetSet(IDictionary m, Object value_ren)
{
    UnicodeSet result = new UnicodeSet();

    IIterator it = new ILOG.J2CsMapping.Collections.IteratorAdapter(
        new ILOG.J2CsMapping.Collections.ListSet(m.Keys).GetEnumerator());
    while (it.HasNext())
    {
        Object key = it.Next();
        Object val = ILOG.J2CsMapping.Collections.Collections.Get(m, key);

        // object.Equals is null-safe: a null stored value no longer throws,
        // and it matches only when the requested value is null as well.
        if (object.Equals(val, value_ren))
        {
            result.Add(((Int32)key));
        }
    }

    return result;
}
/// <summary>
/// Builds the set of all keys of <paramref name="m"/> whose mapped value
/// equals <paramref name="value"/>.
/// </summary>
/// <param name="m">map from code point to value</param>
/// <param name="value">value to look for; may be null</param>
/// <returns>a new set holding every key mapped to an equal value</returns>
public static UnicodeSet GetSet(IDictionary <int, T> m, T value)
{
    UnicodeSet result = new UnicodeSet();

    // ICU4N: Optimized by looping over the pair instead of doing a separate
    // value lookup on each loop.
    foreach (var pair in m)
    {
        // object.Equals is null-safe: a null stored value no longer throws,
        // and it matches only when the requested value is null as well.
        if (object.Equals(pair.Value, value))
        {
            result.Add(pair.Key);
        }
    }

    return result;
}
/// <summary>
/// Union the set of all characters that may be output by this object into
/// the given set.
/// </summary>
///
/// <param name="toUnionTo">the set into which to union the output characters</param>
public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
{
    int pos = 0;
    while (pos < output.Length)
    {
        int codePoint = IBM.ICU.Text.UTF16.CharAt(output, pos);
        pos += IBM.ICU.Text.UTF16.GetCharCount(codePoint);

        // A code point that resolves to a replacer contributes that
        // replacer's output; any other code point stands for itself.
        UnicodeReplacer replacer = data.LookupReplacer(codePoint);
        if (replacer != null)
        {
            replacer.AddReplacementSetTo(toUnionTo);
        }
        else
        {
            toUnionTo.Add(codePoint);
        }
    }
}
/// <summary>
/// Implementation of UnicodeMatcher API. Union the set of all characters
/// that may be matched by this object into the given set.
/// </summary>
///
/// <param name="toUnionTo">the set into which to union the source characters</param>
public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
{
    int pos = 0;
    while (pos < pattern.Length)
    {
        int codePoint = IBM.ICU.Text.UTF16.CharAt(pattern, pos);
        pos += IBM.ICU.Text.UTF16.GetCharCount(codePoint);

        // A code point that resolves to a matcher contributes that
        // matcher's set; any other code point stands for itself.
        UnicodeMatcher nested = data.LookupMatcher(codePoint);
        if (nested != null)
        {
            nested.AddMatchSetTo(toUnionTo);
        }
        else
        {
            toUnionTo.Add(codePoint);
        }
    }
}
/// <summary>
/// For every code point in <paramref name="start"/>..<paramref name="end"/>,
/// adds to <paramref name="set"/> the string made of the unreversed prefix,
/// the code point and, when present, the suffix.
/// </summary>
/// <param name="start">first code point of the range</param>
/// <param name="end">last code point of the range</param>
/// <param name="set">destination set; nothing happens when it is null</param>
internal void AddStrings(int start, int end, UnicodeSet set)
{
    if (set == null)
    {
        return;
    }

    StringBuilder buffer = new StringBuilder(unreversedPrefix.ToString());
    int c = start;
    do
    {
        buffer.AppendCodePoint(c);
        if (suffix != null)
        {
            buffer.Append(suffix);
        }
        set.Add(buffer);

        // Cut the buffer back to the bare prefix for the next code point.
        buffer.Length = unreversedPrefix.Length;
        c++;
    }
    while (c <= end);
}
/// <summary>
/// Union the set of all characters that may be modified by this rule into
/// the given set. Only the key portion of the pattern (the
/// <c>keyLength</c> units after the ante context) is examined.
/// </summary>
///
/// <param name="toUnionTo">the set into which to union the source characters</param>
internal void AddSourceSetTo(UnicodeSet toUnionTo)
{
    int pos = anteContextLength;
    int keyEnd = anteContextLength + keyLength;
    while (pos < keyEnd)
    {
        int codePoint = IBM.ICU.Text.UTF16.CharAt(pattern, pos);
        pos += IBM.ICU.Text.UTF16.GetCharCount(codePoint);

        UnicodeMatcher nested = data.LookupMatcher(codePoint);
        if (nested != null)
        {
            nested.AddMatchSetTo(toUnionTo);
        }
        else
        {
            toUnionTo.Add(codePoint);
        }
    }
}
// set of property starts for UnicodeSet ------------------------------- ***
/// <summary>
/// Adds to <paramref name="set"/> the start code point of each range that
/// the trie enumerates, stopping at the first lead-surrogate range.
/// </summary>
/// <param name="set">the set that receives the code points</param>
public void AddPropertyStarts(UnicodeSet set)
{
    using (var ranges = trie.GetEnumerator())
    {
        while (ranges.MoveNext())
        {
            Trie2.Range trieRange = ranges.Current;
            if (trieRange.LeadSurrogate)
            {
                break;
            }
            set.Add(trieRange.StartCodePoint);
        }
    }

    /* add code points with hardcoded properties, plus the ones following them */

    /* (none right now, see comment below) */

    /*
     * Omit code points with hardcoded specialcasing properties
     * because we do not build property UnicodeSets for them right now.
     */
}
/// <summary>
/// Flattens the strings of a Unicode set in place, e.g. [abc{da}] => [abcd].
/// The set is only rewritten when it actually contained a string.
/// </summary>
///
/// <param name="exemplar1">the set to flatten</param>
/// <returns><paramref name="exemplar1"/>, for chaining</returns>
public static UnicodeSet Flatten(UnicodeSet exemplar1)
{
    UnicodeSet flattened = new UnicodeSet();
    bool sawString = false;

    UnicodeSetIterator items = new UnicodeSetIterator(exemplar1);
    while (items.NextRange())
    {
        if (items.codepoint != IBM.ICU.Text.UnicodeSetIterator.IS_STRING)
        {
            flattened.Add(items.codepoint, items.codepointEnd);
            continue;
        }

        flattened.AddAll(items.str0);
        sawString = true;
    }

    if (sawString)
    {
        exemplar1.Set(flattened);
    }
    return exemplar1;
}
// set of property starts for UnicodeSet ------------------------------- ***
/// <summary>
/// Adds to <paramref name="set"/> the code points collected from the trie
/// ranges, the bidi mirroring table and both Joining_Group arrays.
/// </summary>
/// <param name="set">the set that receives the code points</param>
public void AddPropertyStarts(UnicodeSet set)
{
    // Trie: start of each enumerated range, up to the first lead-surrogate range.
    using (var ranges = trie.GetEnumerator())
    {
        while (ranges.MoveNext())
        {
            Trie2Range trieRange = ranges.Current;
            if (trieRange.IsLeadSurrogate)
            {
                break;
            }
            set.Add(trieRange.StartCodePoint);
        }
    }

    // Bidi mirroring table: each decoded code point together with its successor.
    int mirrorCount = indexes[IX_MIRROR_LENGTH];
    for (int m = 0; m < mirrorCount; m++)
    {
        int mirrored = GetMirrorCodePoint(mirrors[m]);
        set.Add(mirrored, mirrored + 1);
    }

    // Joining_Group arrays: every code point whose value differs from the
    // value of the code point before it (the value before each array is 0).
    int rangeStart = indexes[IX_JG_START];
    int rangeLimit = indexes[IX_JG_LIMIT];
    byte[] joiningGroups = jgArray;
    while (true)
    {
        int count = rangeLimit - rangeStart;
        byte previous = 0;
        for (int offset = 0; offset < count; offset++)
        {
            byte joiningGroup = joiningGroups[offset];
            if (joiningGroup != previous)
            {
                set.Add(rangeStart + offset);
                previous = joiningGroup;
            }
        }

        // A non-zero last value ends at the limit, so the limit is a start too.
        if (previous != 0)
        {
            set.Add(rangeLimit);
        }

        if (rangeLimit != indexes[IX_JG_LIMIT])
        {
            break;
        }

        // switch to the second Joining_Group range
        rangeStart = indexes[IX_JG_START2];
        rangeLimit = indexes[IX_JG_LIMIT2];
        joiningGroups = jgArray2;
    }

    /* add code points with hardcoded properties, plus the ones following them */

    /* (none right now) */
}
/// <summary>
/// Quote a literal string, using the available settings. Thus syntax
/// characters, quote characters, and ignorable characters will be put into
/// quotes.
/// </summary>
/// <remarks>
/// Characters in <c>escapeCharacters</c> are always written through
/// <c>AppendEscaped</c>. Characters that need quoting are written with a
/// backslash when <c>usingSlash</c> is set, otherwise inside single quotes
/// when <c>usingQuote</c> is set, otherwise escaped.
/// </remarks>
///
/// <param name="str0">the literal text to quote</param>
/// <returns>the quoted form of <paramref name="str0"/></returns>
public String QuoteLiteral(String str0)
{
    // Build the set of characters that need quoting on first use; it is
    // kept in the needingQuoteCharacters field afterwards.
    if (needingQuoteCharacters == null)
    {
        needingQuoteCharacters = new UnicodeSet().AddAll(syntaxCharacters)
            .AddAll(ignorableCharacters).AddAll(extraQuotingCharacters); // .addAll(quoteCharacters)
        if (usingSlash)
        {
            needingQuoteCharacters.Add(BACK_SLASH);
        }
        if (usingQuote)
        {
            needingQuoteCharacters.Add(SINGLE_QUOTE);
        }
    }
    StringBuilder result = new StringBuilder();
    // Tracks whether an opening single quote is currently unclosed in result.
    int quotedChar = NO_QUOTE;
    int cp;
    // Walk the input one code point at a time (cp is assigned in the body
    // before the increment expression reads it).
    for (int i = 0; i < str0.Length; i += IBM.ICU.Text.UTF16.GetCharCount(cp))
    {
        cp = IBM.ICU.Text.UTF16.CharAt(str0, i);
        if (escapeCharacters.Contains(cp))
        {
            // we may have to fix up previous characters
            // (close a pending quote before emitting the escape)
            if (quotedChar == IN_QUOTE)
            {
                result.Append(SINGLE_QUOTE);
                quotedChar = NO_QUOTE;
            }
            AppendEscaped(result, cp);
            continue;
        }
        if (needingQuoteCharacters.Contains(cp))
        {
            // if we have already started a quote
            if (quotedChar == IN_QUOTE)
            {
                IBM.ICU.Text.UTF16.Append(result, cp);
                if (usingQuote && cp == SINGLE_QUOTE) // double it
                {
                    result.Append(SINGLE_QUOTE);
                }
                continue;
            }
            // otherwise not already in quote
            // (a backslash is tried before quoting)
            if (usingSlash)
            {
                result.Append(BACK_SLASH);
                IBM.ICU.Text.UTF16.Append(result, cp);
                continue;
            }
            if (usingQuote)
            {
                if (cp == SINGLE_QUOTE) // double it and continue
                {
                    result.Append(SINGLE_QUOTE);
                    result.Append(SINGLE_QUOTE);
                    continue;
                }
                // open a quoted run; it is closed later, when a character
                // that does not belong in it is seen or the input ends
                result.Append(SINGLE_QUOTE);
                IBM.ICU.Text.UTF16.Append(result, cp);
                quotedChar = IN_QUOTE;
                continue;
            }
            // we have no choice but to use \\u or \\U
            AppendEscaped(result, cp);
            continue;
        }
        // otherwise cp doesn't need quoting
        // we may have to fix up previous characters
        if (quotedChar == IN_QUOTE)
        {
            result.Append(SINGLE_QUOTE);
            quotedChar = NO_QUOTE;
        }
        IBM.ICU.Text.UTF16.Append(result, cp);
    }
    // all done.
    // we may have to fix up previous characters
    // (close a quote that is still open at the end of the input)
    if (quotedChar == IN_QUOTE)
    {
        result.Append(SINGLE_QUOTE);
    }
    return(result.ToString());
}
/// <summary>
/// Adds all simple case mappings and the full case folding for <paramref name="c"/> to
/// <paramref name="set"/>, and also adds special case closure mappings.
/// </summary>
/// <remarks>
/// <paramref name="c"/> itself is not added.
/// For example, the mappings
/// <list type="bullet">
/// <item><description>for s include long s</description></item>
/// <item><description>for sharp s include ss</description></item>
/// <item><description>for k include the Kelvin sign</description></item>
/// </list>
/// </remarks>
/// <param name="c">the code point whose case closure is added</param>
/// <param name="set">the set that receives the closure code points and strings</param>
public void AddCaseClosure(int c, UnicodeSet set)
{
    /*
     * Hardcode the case closure of i and its relatives and ignore the
     * data file data for these characters.
     * The Turkic dotless i and dotted I with their case mapping conditions
     * and case folding option make the related characters behave specially.
     * This code matches their closure behavior to their case folding behavior.
     */
    switch (c)
    {
        case 0x49:
            /* regular i and I are in one equivalence class */
            set.Add(0x69);
            return;
        case 0x69:
            set.Add(0x49);
            return;
        case 0x130:
            /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
            set.Add(iDot);
            return;
        case 0x131:
            /* dotless i is in a class by itself */
            return;
        default:
            /* otherwise use the data file data */
            break;
    }

    int props = trie.Get(c);
    if (!PropsHasException(props))
    {
        if (GetTypeFromProps(props) != NONE)
        {
            /* add the one simple case mapping, no matter what type it is */
            int delta = GetDelta(props);
            if (delta != 0)
            {
                set.Add(c + delta);
            }
        }
    }
    else
    {
        /*
         * c has exceptions, so there may be multiple simple and/or
         * full case mappings. Add them all.
         */
        int excOffset0, excOffset = GetExceptionsOffset(props);
        int closureOffset;
        /* the first unit of the exceptions entry is read as the word tested with HasSlot below */
        int excWord = exceptions[excOffset++];
        int index, closureLength, fullLength, length;

        /* remember the offset just past excWord; every slot lookup restarts from it */
        excOffset0 = excOffset;

        /* add all simple case mappings */
        /* NOTE: c is reused as scratch from here on; the argument value is no longer needed */
        for (index = EXC_LOWER; index <= EXC_TITLE; ++index)
        {
            if (HasSlot(excWord, index))
            {
                excOffset = excOffset0;
                c = GetSlotValue(excWord, index, excOffset);
                set.Add(c);
            }
        }

        /* get the closure string pointer & length */
        if (HasSlot(excWord, EXC_CLOSURE))
        {
            excOffset = excOffset0;
            /* the returned long is split below: low 32 bits are used as the slot value, high 32 bits as an offset */
            long value = GetSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
            closureLength = (int)value & CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closureOffset = (int)(value >> 32) + 1; /* behind this slot, unless there are full case mappings */
        }
        else
        {
            closureLength = 0;
            closureOffset = 0;
        }

        /* add the full case folding */
        if (HasSlot(excWord, EXC_FULL_MAPPINGS))
        {
            excOffset = excOffset0;
            long value = GetSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
            fullLength = (int)value;

            /* start of full case mapping strings */
            excOffset = (int)(value >> 32) + 1;

            fullLength &= 0xffff; /* bits 16 and higher are reserved */

            /* fullLength is consumed four bits at a time, one string length per step */

            /* skip the lowercase result string */
            excOffset += fullLength & FULL_LOWER;
            fullLength >>= 4;

            /* add the full case folding string */
            length = fullLength & 0xf;
            if (length != 0)
            {
                set.Add(exceptions.Substring(excOffset, length)); // ICU4N: excOffset + length - excOffset == length
                excOffset += length;
            }

            /* skip the uppercase and titlecase strings */
            fullLength >>= 4;
            excOffset += fullLength & 0xf;
            fullLength >>= 4;
            excOffset += fullLength;

            closureOffset = excOffset; /* behind full case mappings */
        }

        /* add each code point in the closure string */
        int limit = closureOffset + closureLength;
        for (index = closureOffset; index < limit; index += UTF16.GetCharCount(c))
        {
            c = exceptions.CodePointAt(index);
            set.Add(c);
        }
    }
}
/// <summary>
/// Maps the string to single code points and adds the associated case closure
/// mappings.
/// </summary>
/// <remarks>
/// The string is mapped to code points if it is their full case folding string.
/// In other words, this performs a reverse full case folding and then
/// adds the case closure items of the resulting code points.
/// If the string is found and its closure applied, then
/// the string itself is added as well as part of its code points' closure.
/// </remarks>
/// <param name="s">the string to look up in the unfold table; may be null</param>
/// <param name="set">the set that receives the code points and their case closures</param>
/// <returns>true if the string was found.</returns>
public bool AddStringCaseClosure(string s, UnicodeSet set)
{
    int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;

    if (unfold == null || s == null)
    {
        return(false); /* no reverse case folding data, or no string */
    }
    length = s.Length;
    if (length <= 1)
    {
        /* the string is too short to find any match */
        /*
         * more precise would be:
         * if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found
         */
        return(false);
    }

    /* the table dimensions are read from header values at the start of unfold */
    unfoldRows = unfold[UNFOLD_ROWS];
    unfoldRowWidth = unfold[UNFOLD_ROW_WIDTH];
    unfoldStringWidth = unfold[UNFOLD_STRING_WIDTH];
    //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;

    if (length > unfoldStringWidth)
    {
        /* the string is too long to find any match */
        return(false);
    }

    /* do a binary search for the string */
    start = 0;
    limit = unfoldRows;
    while (start < limit)
    {
        i = (start + limit) / 2;
        unfoldOffset = ((i + 1) * unfoldRowWidth); // +1 to skip the header values above
        result = StrCmpMax(s, unfoldOffset, unfoldStringWidth);

        if (result == 0)
        {
            /* found the string: add each code point, and its case closure */
            /* the code points follow the string part of the row and stop at the row end or at a 0 unit */
            int c;

            for (i = unfoldStringWidth; i < unfoldRowWidth && unfold[unfoldOffset + i] != 0; i += UTF16.GetCharCount(c))
            {
                c = UTF16.CharAt(unfold, unfoldOffset, unfold.Length, i);
                set.Add(c);
                AddCaseClosure(c, set);
            }
            return(true);
        }
        else if (result < 0)
        {
            limit = i;
        }
        else /* result>0 */
        {
            start = i + 1;
        }
    }

    return(false); /* string not found */
}