/// <summary>
/// Returns the iterator's current element as a string. Call this only after
/// <see cref="Next()"/>, never after <see cref="NextRange()"/>.
/// </summary>
/// <returns>
/// The current string element when <c>Codepoint</c> equals <c>IS_STRING</c>; otherwise the
/// result of <c>UTF16.ValueOf(Codepoint)</c>.
/// </returns>
/// <stable>ICU 4.0</stable>
public virtual string GetString() // ICU4N TODO: API String vs GetString() - confusing. This should be made into String property and the current string property made into a private field.
{
    // IS_STRING marks that the current element is held in String rather than in Codepoint.
    return Codepoint == IS_STRING
        ? String
        : UTF16.ValueOf(Codepoint);
}
/// <summary>
/// Adds every code-point permutation of <paramref name="source"/> to <paramref name="output"/>.
/// </summary>
/// <param name="source">The string whose code points are permuted.</param>
/// <param name="skipZeros">
/// When <c>true</c>, a code point that is not at index 0 and whose combining class
/// (per <c>UCharacter.GetCombiningClass</c>) is 0 is never chosen as the leading character.
/// </param>
/// <param name="output">Set that receives the generated strings.</param>
public static void Permute(string source, bool skipZeros, ISet<string> output)
{
    // TODO: optimize

    // Base case: at most one code point. The Length test is a cheap pre-filter so that
    // code points are only counted for very short strings.
    bool atMostOneCodePoint = source.Length <= 2 && UTF16.CountCodePoint(source) <= 1;
    if (atMostOneCodePoint)
    {
        output.Add(source);
        return;
    }

    // Scratch set reused for the permutations of "everything except the chosen code point".
    ISet<string> tails = new HashSet<string>();

    int index = 0;
    while (index < source.Length)
    {
        int current = UTF16.CharAt(source, index);
        int width = UTF16.GetCharCount(current);

        // With skipZeros, a combining-class-zero code point past the start is not permuted.
        bool keepInPlace = skipZeros && index != 0 && UCharacter.GetCombiningClass(current) == 0;
        if (!keepInPlace)
        {
            // Permute the text before and after the chosen code point...
            tails.Clear();
            string others = source.Substring(0, index) + source.Substring(index + width);
            Permute(others, skipZeros, tails);

            // ...and put the chosen code point in front of each result.
            string head = UTF16.ValueOf(source, index);
            foreach (string tail in tails)
            {
                output.Add(head + tail);
            }
        }

        index += width;
    }
}
/// <summary>
/// Builds the source cache for <paramref name="transform"/> by probing every code point
/// from 0 through 0x10FFFF.
/// </summary>
/// <param name="transform">The transform whose effect on each code point is probed.</param>
/// <param name="normalizer">
/// Optional normalizer. When non-null, the cache starts from <c>[:^ccc=0:]</c>, decompositions
/// are probed as well, and code points that are not inert for this normalizer are cached.
/// When null, only code points directly changed by the transform are cached.
/// </param>
public SourceTargetUtility(ITransform<string, string> transform, Normalizer2 normalizer)
{
    this.transform = transform;

    bool normalizing = normalizer != null;

    // Note: an experiment that gathered trailing combining characters (and probed initial
    // substrings of each decomposition) is disabled and is not part of this logic.
    sourceCache = normalizing ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
    sourceStrings = new HashSet<string>();

    for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
    {
        // Cache the code point if the transform changes it on its own.
        string mapped = transform.Transform(UTF16.ValueOf(codePoint));
        bool cached = !CharSequences.Equals(codePoint, mapped);
        if (cached)
        {
            sourceCache.Add(codePoint);
        }

        if (!normalizing)
        {
            continue;
        }

        string decomposition = NFC.GetDecomposition(codePoint);
        if (decomposition == null)
        {
            continue;
        }

        // Remember decompositions that the transform changes.
        mapped = transform.Transform(decomposition);
        if (!decomposition.Equals(mapped))
        {
            sourceStrings.Add(decomposition);
        }

        // Code points that interact with the normalizer are cached too (unless already added).
        if (!cached && !normalizer.IsInert(codePoint))
        {
            sourceCache.Add(codePoint);
        }
    }

    sourceCache.Freeze();
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, TransliterationPosition, bool)"/>
/// by replacing each code point in the range with its full case folding.
/// </summary>
/// <param name="text">The text that is modified in place.</param>
/// <param name="offsets">Range to process; its Start, Limit and ContextLimit are updated.</param>
/// <param name="isIncremental">
/// When <c>true</c>, processing stops early if a mapping needed to look past the context limit.
/// </param>
protected override void HandleTransliterate(IReplaceable text, TransliterationPosition offsets, bool isIncremental)
{
    lock (syncLock)
    {
        // Nothing to do without case properties or with an empty range.
        if (csp == null || offsets.Start >= offsets.Limit)
        {
            return;
        }

        iter.SetText(text);
        result.Length = 0;

        // Walk the original string; wherever folding changes a code point, patch the replaceable.
        iter.SetIndex(offsets.Start);
        iter.SetLimit(offsets.Limit);
        iter.SetContextLimits(offsets.ContextStart, offsets.ContextLimit);

        for (int codePoint = iter.NextCaseMapCP(); codePoint >= 0; codePoint = iter.NextCaseMapCP())
        {
            int folded = csp.ToFullFolding(codePoint, result, 0);

            if (iter.DidReachLimit && isIncremental)
            {
                // The mapping function tried to look beyond the context limit: wait for more input.
                offsets.Start = iter.CaseMapCPStart;
                return;
            }

            if (folded < 0)
            {
                // Mapped to itself, nothing to replace.
                continue;
            }

            int lengthChange;
            if (folded <= UCaseProperties.MaxStringLength)
            {
                // The mapping is the string accumulated in result.
                lengthChange = iter.Replace(result.ToString());
                result.Length = 0;
            }
            else
            {
                // The mapping is a single code point.
                lengthChange = iter.Replace(UTF16.ValueOf(folded));
            }

            if (lengthChange != 0)
            {
                offsets.Limit += lengthChange;
                offsets.ContextLimit += lengthChange;
            }
        }

        offsets.Start = offsets.Limit;
    }
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Scans <paramref name="text"/> from <c>pos.Start</c> to <c>pos.Limit</c> and, wherever one of
/// the forms described by <c>spec</c> (prefix, digits in a given radix, suffix) matches,
/// replaces the matched span with the string for the parsed numeric value.
/// </summary>
/// <param name="text">The text that is modified in place.</param>
/// <param name="pos">Range to process; its Start, Limit and ContextLimit are updated on exit.</param>
/// <param name="isIncremental">
/// When <c>true</c>, a partial match that runs into the limit stops processing so that the
/// caller can supply more input.
/// </param>
protected override void HandleTransliterate(IReplaceable text, Position pos, bool isIncremental)
{
    int start = pos.Start;
    int limit = pos.Limit;
    int i, ipat;

    // Outer loop over input positions; "goto loop_break" below leaves this loop entirely.
    //loop:
    while (start < limit)
    {
        // Loop over the forms in spec[].  Exit this loop when we
        // match one of the specs.  Exit the outer loop if a
        // partial match is detected and isIncremental is true.
        for (ipat = 0; spec[ipat] != END;)
        {
            // Read the header: five values per form, followed by the
            // prefixLen prefix characters and suffixLen suffix characters.
            int prefixLen = spec[ipat++];
            int suffixLen = spec[ipat++];
            int radix = spec[ipat++];
            int minDigits = spec[ipat++];
            int maxDigits = spec[ipat++];

            // s is a copy of start that is advanced over the
            // characters as we parse them.
            int s = start;
            bool match = true;

            // Match the literal prefix, one UTF-16 code unit at a time.
            for (i = 0; i < prefixLen; ++i)
            {
                if (s >= limit)
                {
                    if (i > 0)
                    {
                        // We've already matched a character.  This is
                        // a partial match, so we return if in
                        // incremental mode.  In non-incremental mode,
                        // go to the next spec.
                        if (isIncremental)
                        {
                            goto loop_break;
                        }
                        match = false;
                        break;
                    }
                }
                char c = text[s++];
                if (c != spec[ipat + i])
                {
                    match = false;
                    break;
                }
            }

            if (match)
            {
                // Accumulate up to maxDigits digits in the given radix into u.
                // NOTE(review): u is not range-checked here before being passed to
                // UTF16.ValueOf below - confirm how overflow / out-of-range values behave.
                int u = 0;
                int digitCount = 0;
                for (; ;)
                {
                    if (s >= limit)
                    {
                        // Check for partial match in incremental mode.
                        if (s > start && isIncremental)
                        {
                            goto loop_break;
                        }
                        break;
                    }
                    int ch = text.Char32At(s);
                    int digit = UCharacter.Digit(ch, radix);
                    if (digit < 0)
                    {
                        break;
                    }
                    s += UTF16.GetCharCount(ch);
                    u = (u * radix) + digit;
                    if (++digitCount == maxDigits)
                    {
                        break;
                    }
                }

                // Enough digits were read for this form?
                match = (digitCount >= minDigits);

                if (match)
                {
                    // Match the literal suffix; it is stored right after the prefix in spec[].
                    for (i = 0; i < suffixLen; ++i)
                    {
                        if (s >= limit)
                        {
                            // Check for partial match in incremental mode.
                            if (s > start && isIncremental)
                            {
                                goto loop_break;
                            }
                            match = false;
                            break;
                        }
                        char c = text[s++];
                        if (c != spec[ipat + prefixLen + i])
                        {
                            match = false;
                            break;
                        }
                    }

                    if (match)
                    {
                        // At this point, we have a match
                        string str = UTF16.ValueOf(u);
                        text.Replace(start, s, str);
                        // Shrink the limit by the number of code units removed by the replacement.
                        limit -= s - start - str.Length;
                        // The following break statement leaves the
                        // loop that is traversing the forms in
                        // spec[].  We then parse the next input
                        // character.
                        break;
                    }
                }
            }

            // No match for this form: skip its prefix and suffix characters to reach the next header.
            ipat += prefixLen + suffixLen;
        }

        // Advance by one code point (over the replacement if one was made).
        if (start < limit)
        {
            start += UTF16.GetCharCount(text.Char32At(start));
        }
    }
    // Target of the early exits above; falls through to the position update.
    loop_break : { }

    // Publish the new positions; the context limit moves by the same amount as the limit.
    pos.ContextLimit += limit - pos.Limit;
    pos.Limit = limit;
    pos.Start = start;
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>
/// by title-casing the first cased character after an uncased, non-case-ignorable one and
/// lower-casing the characters that follow a cased one.
/// </summary>
/// <param name="text">The text that is modified in place.</param>
/// <param name="offsets">Range to process; its Start, Limit and ContextLimit are updated.</param>
/// <param name="isIncremental">
/// When <c>true</c>, processing stops early if a mapping needed to look past the context limit.
/// </param>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    // NOTE(review): locks on the instance itself; kept as-is so that locking behaviour is unchanged.
    lock (this)
    {
        // TODO reimplement, see ustrcase.c
        // using a real word break iterator
        //   instead of just looking for a transition between cased and uncased characters
        // call CaseMapTransliterator::handleTransliterate() for lowercasing? (set fMap)
        // needs to take isIncremental into account because case mappings are context-sensitive
        //   also detect when lowercasing function did not finish because of context

        if (offsets.Start >= offsets.Limit)
        {
            return;
        }

        // Case type convention used below: >0 cased, ==0 uncased, <0 case-ignorable.

        // Current mode: title-case the next letter (true) or lower-case it (false).
        // Look backwards through the preceding context: "cased case-ignorable*" means we
        // start in lower-case mode; anything else (including no context) means title mode.
        bool titleNext = true;
        int scan = offsets.Start - 1;
        while (scan >= offsets.ContextStart)
        {
            int previous = text.Char32At(scan);
            int previousType = csp.GetTypeOrIgnorable(previous);
            if (previousType >= 0)
            {
                // Cased => lower-case mode; uncased and not ignorable => title mode.
                titleNext = previousType == 0;
                break;
            }
            // Case-ignorable: keep scanning backwards.
            scan -= UTF16.GetCharCount(previous);
        }

        // Convert things after a cased character to lower case and things after an uncased,
        // non-case-ignorable character to title case. Case-ignorable characters are left
        // alone and do not change the mode.
        iter.SetText(text);
        iter.SetIndex(offsets.Start);
        iter.SetLimit(offsets.Limit);
        iter.SetContextLimits(offsets.ContextStart, offsets.ContextLimit);

        result.Length = 0;

        // Walk the original string; wherever the case changes, patch the replaceable.
        for (int codePoint = iter.NextCaseMapCP(); codePoint >= 0; codePoint = iter.NextCaseMapCP())
        {
            int codePointType = csp.GetTypeOrIgnorable(codePoint);
            if (codePointType < 0)
            {
                // Case-ignorable.
                continue;
            }

            int mapped = titleNext
                ? csp.ToFullTitle(codePoint, iter, result, caseLocale)
                : csp.ToFullLower(codePoint, iter, result, caseLocale);

            // Title-case next only if this character was uncased.
            titleNext = codePointType == 0;

            if (iter.DidReachLimit && isIncremental)
            {
                // The mapping function tried to look beyond the context limit: wait for more input.
                offsets.Start = iter.CaseMapCPStart;
                return;
            }

            if (mapped < 0)
            {
                // Mapped to itself, nothing to replace.
                continue;
            }

            int lengthChange;
            if (mapped <= UCaseProps.MAX_STRING_LENGTH)
            {
                // The mapping is the string accumulated in result.
                lengthChange = iter.Replace(result.ToString());
                result.Length = 0;
            }
            else
            {
                // The mapping is a single code point.
                lengthChange = iter.Replace(UTF16.ValueOf(mapped));
            }

            if (lengthChange != 0)
            {
                offsets.Limit += lengthChange;
                offsets.ContextLimit += lengthChange;
            }
        }

        offsets.Start = offsets.Limit;
    }
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Looks for an open pattern (<c>OPEN_PAT</c>), collects a character name up to
/// <c>CLOSE_DELIM</c>, and replaces the whole span with the character returned by
/// <c>UCharacter.GetCharFromExtendedName</c> when the lookup succeeds.
/// </summary>
/// <param name="text">The text that is modified in place.</param>
/// <param name="offsets">Range to process; its Start, Limit and ContextLimit are updated on exit.</param>
/// <param name="isIncremental">
/// When <c>true</c> and an open-delimiter candidate is still recorded, <c>offsets.Start</c> is
/// only advanced up to that candidate so it can be retried with more input.
/// </param>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    int maxLen = UCharacterName.Instance.MaxCharNameLength + 1; // allow for temporary trailing space

    StringBuffer name = new StringBuffer(maxLen);

    // Get the legal character set
    UnicodeSet legal = new UnicodeSet();
    UCharacterName.Instance.GetCharNameCharacters(legal);

    int cursor = offsets.Start;
    int limit = offsets.Limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int mode = 0;
    int openPos = -1; // open delim candidate pos

    int c;
    while (cursor < limit)
    {
        c = text.Char32At(cursor);

        switch (mode)
        {
            case 0: // looking for open delimiter
                if (c == OPEN_DELIM)
                { // quick check first
                    openPos = cursor;
                    int i = Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                    if (i >= 0 && i < limit)
                    {
                        mode = 1;
                        name.Length = 0;
                        cursor = i;
                        continue; // *** reprocess char32At(cursor)
                    }
                }
                break;

            case 1: // after open delimiter
                // Look for legal chars.  If \s+ is found, convert it
                // to a single space.  If closeDelimiter is found, exit
                // the loop.  If any other character is found, exit the
                // loop.  If the limit is reached, exit the loop.

                // Convert \s+ => SPACE.  This assumes there are no
                // runs of >1 space characters in names.
                if (PatternProps.IsWhiteSpace(c))
                {
                    // Ignore leading whitespace
                    if (name.Length > 0 && name[name.Length - 1] != SPACE)
                    {
                        name.Append(SPACE);
                        // If we are too long then abort.  maxLen includes
                        // temporary trailing space, so use '>'.
                        if (name.Length > maxLen)
                        {
                            mode = 0;
                        }
                    }
                    break;
                }

                if (c == CLOSE_DELIM)
                {
                    int len = name.Length;

                    // Delete trailing space, if any
                    if (len > 0 && name[len - 1] == SPACE)
                    {
                        name.Length = --len;
                    }

                    c = UCharacter.GetCharFromExtendedName(name.ToString());
                    if (c != -1)
                    {
                        // Lookup succeeded

                        // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        string str = UTF16.ValueOf(c);
                        text.Replace(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string.  Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int delta = cursor - openPos - str.Length;
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                    // If the lookup failed, we leave things as-is and
                    // still switch to mode 0 and continue.
                    mode = 0;
                    openPos = -1; // close off candidate
                    continue; // *** reprocess char32At(cursor)
                }

                if (legal.Contains(c))
                {
                    UTF16.Append(name, c);
                    // If we go past the longest possible name then abort.
                    // maxLen includes temporary trailing space, so use '>='.
                    if (name.Length >= maxLen)
                    {
                        mode = 0;
                    }
                }
                else
                {
                    // Invalid character: drop back to mode 0 and reprocess this same code
                    // point without advancing. Decrementing the cursor by one and then
                    // adding GetCharCount(c) only nets to zero for BMP characters; for a
                    // supplementary character it would leave the cursor on the trail
                    // surrogate, so the following code unit would be skipped and the cursor
                    // could run past the limit.
                    mode = 0;
                    continue; // *** reprocess char32At(cursor)
                }
                break;
        }

        cursor += UTF16.GetCharCount(c);
    }

    // Publish the new positions; the context limit moves by the same amount as the limit.
    offsets.ContextLimit += limit - offsets.Limit;
    offsets.Limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.Start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
/// <summary>
/// Checks whether the decomposition of <paramref name="comp"/> can be found in
/// <paramref name="segment"/> starting at <paramref name="segmentPos"/> (allowing for
/// canonical rearrangement). If so, collects the characters left over and returns the
/// equivalents of that remainder.
/// </summary>
/// <param name="comp">Code point whose decomposition is matched against the segment.</param>
/// <param name="segment">The segment being examined.</param>
/// <param name="segmentPos">Index in <paramref name="segment"/> where matching starts.</param>
/// <param name="buf">Working buffer shared among callees; it is cleared on entry.</param>
/// <returns>
/// <c>null</c> when the decomposition does not match or the result is not canonically
/// equivalent; <c>SET_WITH_NULL_STRING</c> when it matches with nothing left over; otherwise
/// the result of <c>GetEquivalents2</c> on the leftover characters.
/// </returns>
private ISet<string> Extract(int comp, string segment, int segmentPos, StringBuffer buf)
{
    if (PROGRESS)
    {
        Console.Out.WriteLine(" extract: " + Utility.Hex(UTF16.ValueOf(comp))
            + ", " + Utility.Hex(segment.Substring(segmentPos)));
    }

    // A code point without a decomposition stands for itself.
    string decomp = nfcImpl.GetDecomposition(comp) ?? UTF16.ValueOf(comp);

    // decompCp is the next decomposition code point we still need to find;
    // decompPos already points past it.
    int decompCp = UTF16.CharAt(decomp, 0);
    int decompPos = UTF16.GetCharCount(decompCp);
    //int decompClass = getClass(decompCp);

    buf.Length = 0; // initialize working buffer, shared among callees

    // See if the decomposition matches the segment from segmentPos on.
    bool matchedAll = false;
    int pos = segmentPos;
    while (pos < segment.Length)
    {
        int segmentCp = UTF16.CharAt(segment, pos);
        int segmentCpWidth = UTF16.GetCharCount(segmentCp);

        if (segmentCp != decompCp)
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" buffer: " + Utility.Hex(UTF16.ValueOf(segmentCp)));
            }
            // Brute force approach: anything that is not the expected code point is left over.
            // TODO: optimize using the combining classes, which increase monotonically
            // after a zero (e.g. 0 5 7 9 0 3): if decompClass <= segClass the match fails.
            UTF16.Append(buf, segmentCp);
        }
        else
        {
            // Equal: consume one more code point from the decomposition.
            if (PROGRESS)
            {
                Console.Out.WriteLine(" matches: " + Utility.Hex(UTF16.ValueOf(segmentCp)));
            }
            if (decompPos == decomp.Length)
            {
                // Done, all decomposition characters found: keep the rest of the segment.
                buf.Append(segment.Substring(pos + segmentCpWidth));
                matchedAll = true;
                break;
            }
            decompCp = UTF16.CharAt(decomp, decompPos);
            decompPos += UTF16.GetCharCount(decompCp);
            //decompClass = getClass(decompCp);
        }

        pos += segmentCpWidth;
    }

    if (!matchedAll)
    {
        return null; // we failed, characters left over
    }
    if (PROGRESS)
    {
        Console.Out.WriteLine("Matches");
    }
    if (buf.Length == 0)
    {
        return SET_WITH_NULL_STRING; // succeed, but no remainder
    }

    string remainder = buf.ToString();

    // Brute force check that the result is canonically equivalent to the original tail.
    string recomposed = UTF16.ValueOf(comp) + remainder;
    string originalTail = segment.Substring(segmentPos);
    if (Normalizer.Compare(recomposed, originalTail, 0) != 0)
    {
        return null;
    }

    // get the remaining combinations
    return GetEquivalents2(remainder);
}
/// <summary>
/// Collects <paramref name="segment"/> together with the variants obtained by replacing a
/// run that starts at one of its code points with a code point from that code point's
/// canonical start set, as confirmed by <c>Extract</c>.
/// </summary>
/// <param name="segment">The segment to expand.</param>
/// <returns>A new set that always contains <paramref name="segment"/> itself.</returns>
private ISet<string> GetEquivalents2(string segment)
{
    ISet<string> equivalents = new HashSet<string>();

    if (PROGRESS)
    {
        Console.Out.WriteLine("Adding: " + Utility.Hex(segment));
    }
    equivalents.Add(segment);

    StringBuffer workingBuffer = new StringBuffer();
    UnicodeSet starts = new UnicodeSet();

    // Cycle through all the code points of the segment.
    int offset = 0;
    while (offset < segment.Length)
    {
        int lead = segment.CodePointAt(offset);

        // Is this code point at the start of some decomposition?
        if (nfcImpl.GetCanonStartSet(lead, starts))
        {
            // If so, see which decompositions actually match here.
            UnicodeSetIterator candidates = new UnicodeSetIterator(starts);
            while (candidates.Next())
            {
                int composite = candidates.Codepoint;
                ISet<string> tails = Extract(composite, segment, offset, workingBuffer);
                if (tails == null)
                {
                    continue;
                }

                // There were matches: add every possibility to the result.
                string head = segment.Substring(0, offset) + UTF16.ValueOf(composite);
                foreach (string tail in tails)
                {
                    equivalents.Add(head + tail);
                }
            }
        }

        offset += Character.CharCount(lead);
    }

    return equivalents;
}