/// <summary>
/// Default implementation of UnicodeMatcher::matches() for Unicode filters.
/// Tests the code point returned by <c>text.Char32At(offset[0])</c> against
/// <c>Contains</c> and moves <c>offset[0]</c> past it when it is accepted.
/// </summary>
/// <param name="text">the text being matched</param>
/// <param name="offset">single-element in/out array holding the current position;
/// on a match it is advanced (when it started below <paramref name="limit"/>) or
/// backed up (when it started above <paramref name="limit"/>)</param>
/// <param name="limit">the position at which matching stops; its relation to
/// <c>offset[0]</c> selects the direction in which the offset is moved</param>
/// <param name="incremental">if true, reaching <paramref name="limit"/> exactly is
/// reported as a partial match rather than a mismatch</param>
/// <returns>U_MATCH, U_PARTIAL_MATCH or U_MISMATCH</returns>
///
/// @stable ICU 2.0
public virtual int Matches(Replaceable text, int[] offset, int limit, bool incremental)
{
    int position = offset[0];

    if (position < limit)
    {
        // Offset is below the limit: step over the whole code point on success.
        int codePoint = text.Char32At(position);
        if (Contains(codePoint))
        {
            offset[0] = position + IBM.ICU.Text.UTF16.GetCharCount(codePoint);
            return IBM.ICU.Text.UnicodeMatcher_Constants.U_MATCH;
        }
    }
    else if (position > limit)
    {
        // Offset is above the limit: on success back up instead of advancing.
        if (Contains(text.Char32At(position)))
        {
            // Back up by 1, unless the preceding character is a surrogate
            // pair -- then back up by 2 so that the offset keeps pointing
            // at the lead surrogate.
            int previous = position - 1;
            offset[0] = previous;
            if (previous >= 0)
            {
                offset[0] = previous
                    - (IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(previous)) - 1);
            }
            return IBM.ICU.Text.UnicodeMatcher_Constants.U_MATCH;
        }
    }
    else if (incremental)
    {
        // position == limit: more text could still arrive.
        return IBM.ICU.Text.UnicodeMatcher_Constants.U_PARTIAL_MATCH;
    }

    return IBM.ICU.Text.UnicodeMatcher_Constants.U_MISMATCH;
}
/// <summary>
/// Advances to the next script run. Returns TRUE if there are any more runs;
/// TRUE is always returned at least once. After a TRUE return the caller
/// should examine scriptCode, start, and limit.
/// </summary>
/// <returns>false once the previous run already ended at textLimit, otherwise true</returns>
public bool Next()
{
    // The script of the new run is not known yet.
    scriptCode = IBM.ICU.Lang.UScript.INVALID_CODE;
    start = limit;

    // Are we done?
    if (start == textLimit)
    {
        return false;
    }

    // Pull start backwards over adjacent COMMON or INHERITED characters.
    for (; start > textStart; --start)
    {
        int scriptBefore = IBM.ICU.Lang.UScript.GetScript(text.Char32At(start - 1));
        bool neutralBefore = scriptBefore == IBM.ICU.Lang.UScript.COMMON
            || scriptBefore == IBM.ICU.Lang.UScript.INHERITED;
        if (!neutralBefore)
        {
            break;
        }
    }

    // Push limit forwards over COMMON, INHERITED, and characters that
    // belong to the script of this run.
    for (; limit < textLimit; ++limit)
    {
        int scriptAhead = IBM.ICU.Lang.UScript.GetScript(text.Char32At(limit));
        if (scriptAhead == IBM.ICU.Lang.UScript.COMMON
            || scriptAhead == IBM.ICU.Lang.UScript.INHERITED)
        {
            continue;
        }

        if (scriptCode == IBM.ICU.Lang.UScript.INVALID_CODE)
        {
            // First character with a specific script decides the run's script.
            scriptCode = scriptAhead;
        }
        else if (scriptAhead != scriptCode)
        {
            // A different script starts here; the run ends before it.
            break;
        }
    }

    // TRUE is returned even if the entire text is COMMON / INHERITED, in
    // which case scriptCode is still UScript.INVALID_CODE.
    return true;
}
/// <summary>
/// Transliterate the given text with the given UTransPosition indices.
/// Return TRUE if the transliteration should continue or FALSE if it should
/// halt (because of a U_PARTIAL_MATCH match). Note that FALSE is only ever
/// returned if isIncremental is TRUE.
/// </summary>
///
/// <param name="text">the text to be transliterated</param>
/// <param name="pos">the position indices, which will be updated</param>
/// <param name="incremental">if TRUE, assume new text may be inserted at index.limit, and
/// return FALSE if there is a partial match.</param>
/// <returns>TRUE unless a U_PARTIAL_MATCH has been obtained, indicating that
/// transliteration should stop until more text arrives.</returns>
public bool Transliterate(Replaceable text, Transliterator.Position pos, bool incremental)
{
    // The low byte of the code point at pos.start selects the slice of
    // rules[] to try: index[b] up to (not including) index[b + 1].
    int bucket = text.Char32At(pos.start) & 0xFF;

    for (int ruleIndex = index[bucket]; ruleIndex < index[bucket + 1]; ++ruleIndex)
    {
        int outcome = rules[ruleIndex].MatchAndReplace(text, pos, incremental);

        if (outcome == IBM.ICU.Text.UnicodeMatcher_Constants.U_MATCH)
        {
            if (IBM.ICU.Text.Transliterator.DEBUG)
            {
                System.Console.Out.WriteLine(
                    ((incremental) ? "Rule.i: match " : "Rule: match ")
                    + rules[ruleIndex].ToRule(true)
                    + " => "
                    + IBM.ICU.Impl.UtilityExtensions.FormatInput(text, pos));
            }
            return true;
        }

        if (outcome == IBM.ICU.Text.UnicodeMatcher_Constants.U_PARTIAL_MATCH)
        {
            if (IBM.ICU.Text.Transliterator.DEBUG)
            {
                System.Console.Out.WriteLine(
                    ((incremental) ? "Rule.i: partial match " : "Rule: partial match ")
                    + rules[ruleIndex].ToRule(true)
                    + " => "
                    + IBM.ICU.Impl.UtilityExtensions.FormatInput(text, pos));
            }
            return false;
        }

        // Any other result: this rule did not apply, try the next one.
        if (IBM.ICU.Text.Transliterator.DEBUG)
        {
            System.Console.Out.WriteLine("Rule: no match " + rules[ruleIndex]);
        }
    }

    // No match or partial match from any rule: step over one code point.
    pos.start += IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(pos.start));
    if (IBM.ICU.Text.Transliterator.DEBUG)
    {
        System.Console.Out.WriteLine(
            ((incremental) ? "Rule.i: no match => " : "Rule: no match => ")
            + IBM.ICU.Impl.UtilityExtensions.FormatInput(text, pos));
    }
    return true;
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// Converts the text between offsets.start and offsets.limit piecewise: each
/// piece runs from the previous "safe" character up to just before the next one.
/// </summary>
/// <param name="text">the text to convert in place</param>
/// <param name="offsets">position indices; contextLimit and limit are shifted by the
/// accumulated length change and start is set to the end of the converted text</param>
/// <param name="isIncremental">if true, the trailing piece is only accepted when the
/// conversion restricted by <c>skippable</c> does not report Int32.MinValue</param>
protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position offsets, bool isIncremental)
{
    int start = offsets.start;
    int limit = offsets.limit;
    if (start >= limit)
    {
        return;
    }

    // Walk through the string looking for safe characters. Whenever one is
    // found, convert from the last safe character up to just before it.
    // At the end, when not in incremental mode, convert through to the limit.
    // TODO: fix for surrogates
    // TODO: add QuickCheck, so we rarely convert OK stuff
    int totalChange = 0;
    int lastSafe = start; // go back to start in any event
    int scan = start + 1;

    while (scan < limit)
    {
        int cp = text.Char32At(scan);
        bool isSafe = IBM.ICU.Lang.UCharacter.GetCombiningClass(cp) == 0
            && !unsafeStart.Contains(cp);
        if (isSafe)
        {
            int change = Convert(text, lastSafe, scan, null);
            // Every index at or after the converted piece shifts by its length change.
            scan += change;
            limit += change;
            totalChange += change;
            lastSafe = scan;
        }
        scan += IBM.ICU.Text.UTF16.GetCharCount(cp);
    }

    if (isIncremental)
    {
        // We are incremental, so accept the last characters IF they turn
        // into skippables.
        int tailChange = Convert(text, lastSafe, limit, skippable);
        if (tailChange != int.MinValue)
        {
            totalChange += tailChange;
            lastSafe = limit + tailChange;
        }
    }
    else
    {
        int tailChange = Convert(text, lastSafe, limit, null);
        totalChange += tailChange;
        lastSafe = limit + tailChange;
    }

    offsets.contextLimit += totalChange;
    offsets.limit += totalChange;
    offsets.start = lastSafe;
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// Replaces every character between pos.start and pos.limit with prefix +
/// number + suffix, where the number is written by Utility.AppendNumber with
/// this object's radix and minDigits (or those of supplementalHandler for
/// values above 0xFFFF when such a handler is set).
/// </summary>
/// <param name="text">the text to rewrite in place</param>
/// <param name="pos">position indices; contextLimit and limit are adjusted for the
/// length change and start is moved to the end of the rewritten text</param>
/// <param name="incremental">not consulted by this implementation</param>
protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position pos, bool incremental)
{
    // Bits above the low 16 (0xFFFF0000 as a signed int).
    const int AboveBmpMask = -65536;

    int cursor = pos.start;
    int end = pos.limit;

    // The buffer normally keeps this object's prefix at its front so that
    // only the part after it has to be rewritten per character.
    StringBuilder escaped = new StringBuilder(prefix);
    int prefixLen = prefix.Length;
    bool mustRestorePrefix = false;

    while (cursor < end)
    {
        int c;
        int charLen;
        if (grokSupplementals)
        {
            c = (int)(text.Char32At(cursor));
            charLen = IBM.ICU.Text.UTF16.GetCharCount(c);
        }
        else
        {
            c = (int)(text.CharAt(cursor));
            charLen = 1;
        }

        if ((c & AboveBmpMask) != 0 && supplementalHandler != null)
        {
            // Use the supplemental handler's format; this overwrites our
            // own prefix, which therefore has to be restored later.
            escaped.Length = 0;
            escaped.Append(supplementalHandler.prefix);
            IBM.ICU.Impl.Utility.AppendNumber(escaped, c, supplementalHandler.radix,
                supplementalHandler.minDigits);
            escaped.Append(supplementalHandler.suffix);
            mustRestorePrefix = true;
        }
        else
        {
            if (mustRestorePrefix)
            {
                escaped.Length = 0;
                escaped.Append(prefix);
                mustRestorePrefix = false;
            }
            else
            {
                // Keep the prefix, drop the previous number and suffix.
                escaped.Length = prefixLen;
            }
            IBM.ICU.Impl.Utility.AppendNumber(escaped, c, radix, minDigits);
            escaped.Append(suffix);
        }

        text.Replace(cursor, cursor + charLen, escaped.ToString());
        cursor += escaped.Length;
        end += escaped.Length - charLen;
    }

    pos.contextLimit += end - pos.limit;
    pos.limit = end;
    pos.start = cursor;
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// Replaces each code point between offsets.start and offsets.limit for which
/// UCharacter.GetExtendedName returns a name with OPEN_DELIM + name + CLOSE_DELIM.
/// </summary>
/// <param name="text">the text to rewrite in place</param>
/// <param name="offsets">position indices; contextLimit and limit are adjusted for the
/// length change and start is moved to the end of the processed text</param>
/// <param name="isIncremental">not consulted by this implementation</param>
protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position offsets, bool isIncremental)
{
    int position = offsets.start;
    int end = offsets.limit;

    // The opening delimiter stays at the front of the buffer; only the
    // name and closing delimiter are rewritten for each character.
    StringBuilder replacement = new StringBuilder();
    replacement.Append(OPEN_DELIM);

    while (position < end)
    {
        int codePoint = text.Char32At(position);
        string charName = IBM.ICU.Lang.UCharacter.GetExtendedName(codePoint);
        if (charName == null)
        {
            // No name available: leave the text alone and move on.
            ++position;
            continue;
        }

        replacement.Length = OPEN_DELIM_LEN;
        replacement.Append(charName).Append(CLOSE_DELIM);

        int replacedLength = IBM.ICU.Text.UTF16.GetCharCount(codePoint);
        text.Replace(position, position + replacedLength, replacement.ToString());

        int insertedLength = replacement.Length;
        position += insertedLength;            // skip over the inserted text
        end += insertedLength - replacedLength; // change in length
    }

    offsets.contextLimit += end - offsets.limit;
    offsets.limit = end;
    offsets.start = position;
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// Scans the text between offsets.start and offsets.limit for a name enclosed
/// by the open pattern (OPEN_PAT) and CLOSE_DELIM, looks the collected name up
/// with UCharacter.GetCharFromExtendedName and, when the lookup succeeds,
/// replaces the whole delimited span with that character.
/// </summary>
/// <param name="text">the text to scan and rewrite in place</param>
/// <param name="offsets">position indices; contextLimit and limit are adjusted for the
/// length change, and start is set to the scan position or, in incremental mode, to
/// the pending open-delimiter candidate if there is one</param>
/// <param name="isIncremental">if true, offsets.start is not advanced past an open
/// delimiter candidate that is still pending when the scan ends</param>
protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position offsets, bool isIncremental)
{
    int maxLen = IBM.ICU.Impl.UCharacterName.GetInstance().GetMaxCharNameLength() + 1; // allow for temporary trailing space
    StringBuilder name = new StringBuilder(maxLen);

    // Get the legal character set
    UnicodeSet legal = new UnicodeSet();
    IBM.ICU.Impl.UCharacterName.GetInstance().GetCharNameCharacters(legal);

    int cursor = offsets.start;
    int limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int mode = 0;
    int openPos = -1; // open delim candidate pos

    int c;
    while (cursor < limit)
    {
        c = text.Char32At(cursor);

        switch (mode)
        {
            case 0: // looking for open delimiter
                if (c == OPEN_DELIM) // quick check first
                {
                    openPos = cursor;
                    int i = IBM.ICU.Impl.Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                    // Only enter name mode when the pattern parsed and text
                    // remains after it (i < limit).
                    if (i >= 0 && i < limit)
                    {
                        mode = 1;
                        name.Length = 0;
                        cursor = i;
                        continue; // *** reprocess char32At(cursor)
                    }
                }
                break;

            case 1: // after open delimiter
                // Look for legal chars. If \s+ is found, convert it
                // to a single space. If closeDelimiter is found, exit
                // the loop. If any other character is found, exit the
                // loop. If the limit is reached, exit the loop.

                // Convert \s+ => SPACE. This assumes there are no
                // runs of >1 space characters in names.
                if (IBM.ICU.Impl.UCharacterProperty.IsRuleWhiteSpace(c))
                {
                    // Ignore leading whitespace
                    if (name.Length > 0 && name[name.Length - 1] != SPACE)
                    {
                        name.Append(SPACE);
                        // If we are too long then abort. maxLen includes
                        // temporary trailing space, so use '>'.
                        if (name.Length > maxLen)
                        {
                            mode = 0;
                        }
                    }
                    break;
                }

                if (c == CLOSE_DELIM)
                {
                    int len = name.Length;

                    // Delete trailing space, if any
                    if (len > 0 && name[len - 1] == SPACE)
                    {
                        name.Length = --len;
                    }

                    // c is reused here to hold the lookup result; -1 means
                    // the name was not found.
                    c = IBM.ICU.Lang.UCharacter.GetCharFromExtendedName(name.ToString());
                    if (c != -1)
                    {
                        // Lookup succeeded

                        // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        String str = IBM.ICU.Text.UTF16.ValueOf(c);
                        text.Replace(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string. Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int delta = cursor - openPos - str.Length;
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                    // If the lookup failed, we leave things as-is and
                    // still switch to mode 0 and continue.
                    mode = 0;
                    openPos = -1; // close off candidate
                    continue; // *** reprocess char32At(cursor)
                }

                if (legal.Contains(c))
                {
                    IBM.ICU.Text.UTF16.Append(name, c);
                    // If we go past the longest possible name then abort.
                    // maxLen includes temporary trailing space, so use '>='.
                    if (name.Length >= maxLen)
                    {
                        mode = 0;
                    }
                }
                // Invalid character
                else
                {
                    // NOTE(review): together with the "cursor += GetCharCount(c)"
                    // after the switch this nets to zero movement only when c
                    // occupies a single code unit -- confirm that is intended
                    // for supplementary code points.
                    --cursor; // Backup and reprocess this character
                    mode = 0;
                }

                break;
        }

        // Reached by the "break" paths above; the "continue" paths skip it.
        cursor += IBM.ICU.Text.UTF16.GetCharCount(c);
    }

    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
// = public static UnicodeReplacer valueOf(String output,
// =                                       int cursorPos,
// =                                       RuleBasedTransliterator.Data data) {
// =     if (output.length() == 1) {
// =         char c = output.charAt(0);
// =         UnicodeReplacer r = data.lookupReplacer(c);
// =         if (r != null) {
// =             return r;
// =         }
// =     }
// =     return new StringReplacer(output, cursorPos, data);
// = }

/// <summary>
/// UnicodeReplacer API. Replaces the text between <paramref name="start"/> and
/// <paramref name="limit"/> with this replacer's output. Characters of the
/// output for which data.LookupReplacer returns a replacer are expanded by
/// delegating to that replacer.
/// </summary>
/// <param name="text">the text to modify in place</param>
/// <param name="start">start index of the span (the key) being replaced</param>
/// <param name="limit">limit index of the span (the key) being replaced</param>
/// <param name="cursor">single-element output array; cursor[0] is written only when
/// hasCursor is set</param>
/// <returns>the length of the text that replaced the span</returns>
public virtual int Replace(Replaceable text, int start, int limit, int[] cursor)
{
    int outLen;
    int newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower. If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex)
    {
        text.Replace(start, limit, output);
        outLen = output.Length;

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }
    // Complex (nested replacers) Processing Code :
    else
    {
        /*
         * When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data. Copy everything to the
         * end of the string, then copy them back over the key. This
         * preserves the integrity of indices into the key and surrounding
         * context while generating the output text.
         */
        StringBuilder buf = new StringBuilder();
        int oOutput; // offset into 'output'
        // Cleared here and set again below only if a nested replacer is
        // actually found in 'output'.
        isComplex = false;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit + tempExtra. The start of the buffer has a single
        // character from before the key. This provides style
        // data when additional characters are filled into the
        // temporary buffer. If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int tempStart = text.Length(); // start of temp buffer
        int destStart = tempStart; // copy new text to here
        if (start > 0)
        {
            int len = IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(start - 1));
            text.Copy(start - len, start, tempStart);
            destStart += len;
        }
        else
        {
            text.Replace(tempStart, tempStart, "\uFFFF");
            destStart++;
        }
        int destLimit = destStart;
        int tempExtra = 0; // temp chars after destLimit

        for (oOutput = 0; oOutput < output.Length;)
        {
            if (oOutput == cursorPos)
            {
                // Record the position of the cursor
                newStart = buf.Length + destLimit - destStart; // relative to start
                // the buf.length() was inserted for bug 5789
                // the problem is that if we are accumulating into a buffer
                // (when r == null below)
                // then the actual length of the text at that point needs to
                // add the buf length.
                // there was an alternative suggested in #5789, but that
                // looks like it won't work
                // if we have accumulated some stuff in the dest part AND
                // have a non-zero buffer.
            }
            int c = IBM.ICU.Text.UTF16.CharAt(output, oOutput);

            // When we are at the last position copy the right style
            // context character into the temporary buffer. We don't
            // do this before because it will provide an incorrect
            // right context for previous replace() operations.
            int nextIndex = oOutput + IBM.ICU.Text.UTF16.GetCharCount(c);
            if (nextIndex == output.Length)
            {
                tempExtra = IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(limit));
                text.Copy(limit, limit + tempExtra, destLimit);
            }

            UnicodeReplacer r = data.LookupReplacer(c);
            if (r == null)
            {
                // Accumulate straight (non-segment) text.
                IBM.ICU.Text.UTF16.Append(buf, c);
            }
            else
            {
                isComplex = true;

                // Insert any accumulated straight text.
                if (buf.Length > 0)
                {
                    text.Replace(destLimit, destLimit, buf.ToString());
                    destLimit += buf.Length;
                    buf.Length = 0;
                }

                // Delegate output generation to replacer object
                int len_0 = r.Replace(text, destLimit, destLimit, cursor);
                destLimit += len_0;
            }
            oOutput = nextIndex;
        }
        // Insert any accumulated straight text.
        if (buf.Length > 0)
        {
            text.Replace(destLimit, destLimit, buf.ToString());
            destLimit += buf.Length;
        }
        // Handles a cursor positioned exactly at the end of 'output',
        // which the loop above never visits.
        if (oOutput == cursorPos)
        {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        text.Copy(destStart, destLimit, start);
        // The copy above inserted outLen characters before the temporary
        // buffer, so the buffer's indices are shifted by outLen here.
        text.Replace(tempStart + outLen, destLimit + tempExtra + outLen, "");

        // Delete the old text (the key)
        text.Replace(start + outLen, limit + outLen, "");
    }

    if (hasCursor)
    {
        // Adjust the cursor for positions outside the key. These
        // refer to code points rather than code units. If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0)
        {
            newStart = start;
            int n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0)
            {
                newStart -= IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(newStart - 1));
                ++n;
            }
            // Whatever could not be stepped over inside the text is
            // applied as a plain index offset.
            newStart += n;
        }
        else if (cursorPos > output.Length)
        {
            newStart = start + outLen;
            int n_1 = cursorPos - output.Length;
            // Outside the output string, cursorPos counts code points
            while (n_1 > 0 && newStart < text.Length())
            {
                newStart += IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(newStart));
                --n_1;
            }
            // Whatever could not be stepped over inside the text is
            // applied as a plain index offset.
            newStart += n_1;
        }
        else
        {
            // Cursor is within output string. It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor[0] = newStart;
    }

    return(outLen);
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// At each position between pos.start and pos.limit, tries every form in
/// spec[] (prefix, digits in the form's radix, suffix) and replaces the first
/// form that matches with the character whose value the digits encode.
/// </summary>
/// <remarks>
/// As read by this method, spec[] is a sequence of forms terminated by END.
/// Each form is a five-value header (prefix length, suffix length, radix,
/// minimum digits, maximum digits) followed by the prefix characters and then
/// the suffix characters.
/// </remarks>
/// <param name="text">the text to scan and rewrite in place</param>
/// <param name="pos">position indices; contextLimit and limit are adjusted for the
/// length change and start is set to where scanning stopped</param>
/// <param name="isIncremental">if true, scanning stops at a position where a form
/// has partly matched when the limit is reached, leaving pos.start there</param>
protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position pos, bool isIncremental)
{
    int start = pos.start;
    int limit = pos.limit;
    // NOTE(review): j is incremented by the spec loop but never read.
    int i, j, ipat;

    // NOTE(review): the 'loop' label is not referenced by any goto in this
    // method; every early exit jumps to 'gotoloop' below.
    loop :
    {
        while (start < limit)
        {
            // Loop over the forms in spec[]. Exit this loop when we
            // match one of the specs. Exit the outer loop if a
            // partial match is detected and isIncremental is true.
            for (j = 0, ipat = 0; spec[ipat] != END; ++j)
            {
                // Read the header
                int prefixLen = spec[ipat++];
                int suffixLen = spec[ipat++];
                int radix = spec[ipat++];
                int minDigits = spec[ipat++];
                int maxDigits = spec[ipat++];

                // s is a copy of start that is advanced over the
                // characters as we parse them.
                int s = start;
                bool match = true;

                for (i = 0; i < prefixLen; ++i)
                {
                    if (s >= limit)
                    {
                        if (i > 0)
                        {
                            // We've already matched a character. This is
                            // a partial match, so we return if in
                            // incremental mode. In non-incremental mode,
                            // go to the next spec.
                            if (isIncremental)
                            {
                                goto gotoloop;
                            }
                            match = false;
                            break;
                        }
                    }
                    char c = text.CharAt(s++);
                    if (c != spec[ipat + i])
                    {
                        match = false;
                        break;
                    }
                }

                if (match)
                {
                    // Accumulate up to maxDigits digits into u.
                    int u = 0;
                    int digitCount = 0;
                    for (;;)
                    {
                        if (s >= limit)
                        {
                            // Check for partial match in incremental mode.
                            if (s > start && isIncremental)
                            {
                                goto gotoloop;
                            }
                            break;
                        }
                        int ch = text.Char32At(s);
                        int digit = IBM.ICU.Lang.UCharacter.Digit(ch, radix);
                        if (digit < 0)
                        {
                            break;
                        }
                        s += IBM.ICU.Text.UTF16.GetCharCount(ch);
                        u = (u * radix) + digit;
                        if (++digitCount == maxDigits)
                        {
                            break;
                        }
                    }

                    match = (digitCount >= minDigits);

                    if (match)
                    {
                        for (i = 0; i < suffixLen; ++i)
                        {
                            if (s >= limit)
                            {
                                // Check for partial match in incremental mode.
                                if (s > start && isIncremental)
                                {
                                    goto gotoloop;
                                }
                                match = false;
                                break;
                            }
                            char c_0 = text.CharAt(s++);
                            // Suffix characters follow the prefix characters in spec[].
                            if (c_0 != spec[ipat + prefixLen + i])
                            {
                                match = false;
                                break;
                            }
                        }

                        if (match)
                        {
                            // At this point, we have a match
                            String str = IBM.ICU.Text.UTF16.ValueOf(u);
                            text.Replace(start, s, str);
                            limit -= s - start - str.Length;
                            // The following break statement leaves the
                            // loop that is traversing the forms in
                            // spec[]. We then parse the next input
                            // character.
                            break;
                        }
                    }
                }

                // Skip this form's prefix and suffix characters to reach
                // the next header.
                ipat += prefixLen + suffixLen;
            }

            // Step over one code point: either the replacement just
            // written or an input character that matched no form.
            if (start < limit)
            {
                start += IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(start));
            }
        }
    }
    // Target of the partial-match exits above.
    gotoloop:
    ;

    pos.contextLimit += limit - pos.limit;
    pos.limit = limit;
    pos.start = start;
}
/// <summary>
/// Returns the index that follows the code point at <paramref name="pos"/>.
/// When <paramref name="pos"/> lies outside the string (negative, or at or
/// beyond its length) the result is simply <c>pos + 1</c>.
/// </summary>
/// <param name="str">the text being indexed</param>
/// <param name="pos">the current index</param>
/// <returns>the index after the code point at pos</returns>
static internal int PosAfter(Replaceable str, int pos)
{
    bool insideText = pos >= 0 && pos < str.Length();
    if (!insideText)
    {
        return pos + 1;
    }

    int width = IBM.ICU.Text.UTF16.GetCharCount(str.Char32At(pos));
    return pos + width;
}
/// <summary>
/// Returns the index of the code point that ends just before
/// <paramref name="pos"/>. When <paramref name="pos"/> is zero or negative the
/// result is simply <c>pos - 1</c>.
/// </summary>
/// <param name="str">the text being indexed</param>
/// <param name="pos">the current index</param>
/// <returns>the index before the code point preceding pos</returns>
static internal int PosBefore(Replaceable str, int pos)
{
    if (pos <= 0)
    {
        return pos - 1;
    }

    int width = IBM.ICU.Text.UTF16.GetCharCount(str.Char32At(pos - 1));
    return pos - width;
}
/// <summary>
/// Implements <see cref="M:IBM.ICU.Text.Transliterator.HandleTransliterate(IBM.ICU.Text.Replaceable, null, System.Boolean)"/>.
/// Maps the text between offsets.start and offsets.limit so that a character
/// following an uncased, non-case-ignorable character is titlecased and a
/// character following a cased character is lowercased.
/// </summary>
/// <param name="text">the text to case-map in place</param>
/// <param name="offsets">position indices; limit and contextLimit are adjusted for
/// length changes and start is advanced to the limit, or to the start of the
/// current code point when more input is needed</param>
/// <param name="isIncremental">if true, stop and wait for more input when the case
/// mapping tried to look beyond the context limit</param>
protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position offsets, bool isIncremental)
{
    // TODO reimplement, see ustrcase.c
    // using a real word break iterator
    // instead of just looking for a transition between cased and uncased
    // characters
    // call CaseMapTransliterator::handleTransliterate() for lowercasing?
    // (set fMap)
    // needs to take isIncremental into account because case mappings are
    // context-sensitive
    // also detect when lowercasing function did not finish because of
    // context
    if (offsets.start >= offsets.limit)
    {
        return;
    }

    // Case type: >0 cased (UCaseProps.LOWER etc.), ==0 uncased, <0 case-ignorable.
    int type;
    int c;

    // Current mode: true converts the next letter toTitle, false toLower.
    bool doTitle = true;

    // Look backwards through the preceding context. If it ends in a cased
    // character followed only by case-ignorables, start in toLower mode.
    // Any other prior context (including none) starts in toTitle mode.
    int back = offsets.start - 1;
    while (back >= offsets.contextStart)
    {
        c = text.Char32At(back);
        type = csp.GetTypeOrIgnorable(c);
        if (type > 0)
        {
            // cased
            doTitle = false;
            break;
        }
        if (type == 0)
        {
            // uncased but not ignorable
            break;
        }
        // type < 0: case-ignorable, keep looking further back
        back -= IBM.ICU.Text.UTF16.GetCharCount(c);
    }

    // Things after a cased character go toLower; things after an uncased,
    // non-case-ignorable character go toTitle. Case-ignorable characters
    // are left as they are and do not change the mode.
    iter.SetText(text);
    iter.SetIndex(offsets.start);
    iter.SetLimit(offsets.limit);
    iter.SetContextLimits(offsets.contextStart, offsets.contextLimit);

    result.Length = 0;

    // Walk through the original string; wherever the case changes, modify
    // the corresponding position in the replaceable.
    while ((c = iter.NextCaseMapCP()) >= 0)
    {
        type = csp.GetTypeOrIgnorable(c);
        if (type < 0)
        {
            // case-ignorable: nothing to do
            continue;
        }

        c = doTitle
            ? csp.ToFullTitle(c, iter, result, locale, locCache)
            : csp.ToFullLower(c, iter, result, locale, locCache);
        doTitle = (type == 0); // doTitle = isUncased

        if (iter.DidReachLimit() && isIncremental)
        {
            // The case mapping function tried to look beyond the context
            // limit; wait for more input.
            offsets.start = iter.GetCaseMapCPStart();
            return;
        }

        // Decode the result.
        if (c < 0)
        {
            // c mapped to itself, no change
            continue;
        }

        int delta;
        if (c <= IBM.ICU.Impl.UCaseProps.MAX_STRING_LENGTH)
        {
            // replace by the mapping string
            delta = iter.Replace(result.ToString());
            result.Length = 0;
        }
        else
        {
            // replace by single-code point mapping
            delta = iter.Replace(IBM.ICU.Text.UTF16.ValueOf(c));
        }

        if (delta != 0)
        {
            offsets.limit += delta;
            offsets.contextLimit += delta;
        }
    }

    offsets.start = offsets.limit;
}