//= public static UnicodeReplacer valueOf(String output,
//=                                       int cursorPos,
//=                                       RuleBasedTransliterator.Data data) {
//=     if (output.length() == 1) {
//=         char c = output.charAt(0);
//=         UnicodeReplacer r = data.lookupReplacer(c);
//=         if (r != null) {
//=             return r;
//=         }
//=     }
//=     return new StringReplacer(output, cursorPos, data);
//= }

/// <summary>
/// <see cref="IUnicodeReplacer"/> API. Replaces the range
/// [<paramref name="start"/>, <paramref name="limit"/>) of <paramref name="text"/>
/// (the "key") with the text generated from <c>output</c>, and, when
/// <c>hasCursor</c> is set, stores the resulting cursor position in
/// <c>cursor[0]</c>.
/// </summary>
/// <remarks>
/// Two code paths exist. When <c>isComplex</c> is <c>false</c>, <c>output</c> is
/// written over the key with a single <c>text.Replace</c> call. Otherwise each code
/// point of <c>output</c> is looked up through <c>data.LookupReplacer</c>; code points
/// with a replacer delegate to it, all others are copied literally. The complex
/// path also re-evaluates <c>isComplex</c>: it is cleared first and set again
/// only if at least one nested replacer is found.
/// </remarks>
/// <param name="text">The text to modify in place.</param>
/// <param name="start">Start index (inclusive) of the range to replace.</param>
/// <param name="limit">Limit index (exclusive) of the range to replace.</param>
/// <param name="cursor">Array whose element 0 receives the new cursor position when
/// <c>hasCursor</c> is set; it is also handed on to nested replacers.</param>
/// <returns>The length of the text that replaced the key.</returns>
public virtual int Replace(IReplaceable text, int start, int limit, int[] cursor)
{
    int outLen;
    int newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower. If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex)
    {
        text.Replace(start, limit, output);
        outLen = output.Length;

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else
    {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data. Copy everything to the
         * end of the string, then copy them back over the key. This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        StringBuffer buf = new StringBuffer();
        int oOutput; // offset into 'output'
        isComplex = false; // re-detected below; set again if a nested replacer is found

        // The temporary buffer starts at tempStart, and extends
        // to destLimit + tempExtra. The start of the buffer has a single
        // character from before the key. This provides style
        // data when additional characters are filled into the
        // temporary buffer. If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int tempStart = text.Length; // start of temp buffer
        int destStart = tempStart; // copy new text to here
        if (start > 0)
        {
            // Copy the whole code point that precedes the key (1 or 2 code units).
            int len = UTF16.GetCharCount(text.Char32At(start - 1));
            text.Copy(start - len, start, tempStart);
            destStart += len;
        }
        else
        {
            text.Replace(tempStart, tempStart, "\uFFFF");
            destStart++;
        }
        int destLimit = destStart;
        int tempExtra = 0; // temp chars after destLimit

        // No increment clause: oOutput is advanced to nextIndex at the bottom of the body.
        for (oOutput = 0; oOutput < output.Length;)
        {
            if (oOutput == cursorPos)
            {
                // Record the position of the cursor
                newStart = buf.Length + destLimit - destStart; // relative to start
                // the buf.length() was inserted for bug 5789
                // the problem is that if we are accumulating into a buffer (when r == null below)
                // then the actual length of the text at that point needs to add the buf length.
                // there was an alternative suggested in #5789, but that looks like it won't work
                // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
            }
            int c = UTF16.CharAt(output, oOutput);

            // When we are at the last position copy the right style
            // context character into the temporary buffer. We don't
            // do this before because it will provide an incorrect
            // right context for previous replace() operations.
            int nextIndex = oOutput + UTF16.GetCharCount(c);
            if (nextIndex == output.Length)
            {
                tempExtra = UTF16.GetCharCount(text.Char32At(limit));
                text.Copy(limit, limit + tempExtra, destLimit);
            }

            IUnicodeReplacer r = data.LookupReplacer(c);
            if (r == null)
            {
                // Accumulate straight (non-segment) text.
                UTF16.Append(buf, c);
            }
            else
            {
                isComplex = true;

                // Insert any accumulated straight text.
                if (buf.Length > 0)
                {
                    text.Replace(destLimit, destLimit, buf.ToString());
                    destLimit += buf.Length;
                    buf.Length = 0;
                }

                // Delegate output generation to replacer object
                int len = r.Replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput = nextIndex;
        }

        // Insert any accumulated straight text.
        if (buf.Length > 0)
        {
            text.Replace(destLimit, destLimit, buf.ToString());
            destLimit += buf.Length;
        }
        // Handles a cursor positioned exactly at the end of 'output',
        // which the loop above never visits.
        if (oOutput == cursorPos)
        {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        // (the copy inserts outLen code units at 'start', so every later
        // index is shifted by outLen in the two deletions below)
        text.Copy(destStart, destLimit, start);
        text.Replace(tempStart + outLen, destLimit + tempExtra + outLen, "");

        // Delete the old text (the key)
        text.Replace(start + outLen, limit + outLen, "");
    }

    if (hasCursor)
    {
        // Adjust the cursor for positions outside the key. These
        // refer to code points rather than code units. If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0)
        {
            newStart = start;
            int n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0)
            {
                newStart -= UTF16.GetCharCount(text.Char32At(newStart - 1));
                ++n;
            }
            // If the start of the text was reached first, n is still negative
            // and the leftover is applied as a plain offset.
            newStart += n;
        }
        else if (cursorPos > output.Length)
        {
            newStart = start + outLen;
            int n = cursorPos - output.Length;
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.Length)
            {
                newStart += UTF16.GetCharCount(text.Char32At(newStart));
                --n;
            }
            // If the end of the text was reached first, n is still positive
            // and the leftover is applied as a plain offset.
            newStart += n;
        }
        else
        {
            // Cursor is within output string. It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor[0] = newStart;
    }

    return(outLen);
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Scans <paramref name="text"/> from <c>offsets.Start</c> up to <c>offsets.Limit</c>
/// for sequences made of the open pattern (<c>OPEN_PAT</c>), a character name and
/// <c>CLOSE_DELIM</c>, and replaces every sequence whose name resolves through
/// <c>UCharacter.GetCharFromExtendedName</c> with the named code point.
/// </summary>
/// <param name="text">The text to scan and modify in place.</param>
/// <param name="offsets">The range to process. On return <c>Limit</c> and
/// <c>ContextLimit</c> are adjusted by the change in text length and <c>Start</c>
/// is moved to where processing stopped.</param>
/// <param name="isIncremental">When <c>true</c> and an open delimiter candidate is
/// still recorded, <c>offsets.Start</c> is set to that candidate instead of the
/// scan position so that it can be re-examined later.</param>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    int maxLen = UCharacterName.Instance.MaxCharNameLength + 1; // allow for temporary trailing space

    StringBuffer name = new StringBuffer(maxLen);

    // Get the legal character set
    UnicodeSet legal = new UnicodeSet();
    UCharacterName.Instance.GetCharNameCharacters(legal);

    int cursor = offsets.Start;
    int limit = offsets.Limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int mode = 0;
    int openPos = -1; // open delim candidate pos

    while (cursor < limit)
    {
        int c = text.Char32At(cursor);

        switch (mode)
        {
            case 0: // looking for open delimiter
                if (c == OPEN_DELIM)
                {
                    // quick check first
                    openPos = cursor;
                    int i = Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                    if (i >= 0 && i < limit)
                    {
                        mode = 1;
                        name.Length = 0;
                        cursor = i;
                        continue; // *** reprocess char32At(cursor)
                    }
                }
                break;

            case 1: // after open delimiter
                // Look for legal chars. If \s+ is found, convert it
                // to a single space. If closeDelimiter is found, exit
                // the loop. If any other character is found, exit the
                // loop. If the limit is reached, exit the loop.

                // Convert \s+ => SPACE. This assumes there are no
                // runs of >1 space characters in names.
                if (PatternProps.IsWhiteSpace(c))
                {
                    // Ignore leading whitespace
                    if (name.Length > 0 && name[name.Length - 1] != SPACE)
                    {
                        name.Append(SPACE);
                        // If we are too long then abort. maxLen includes
                        // temporary trailing space, so use '>'.
                        if (name.Length > maxLen)
                        {
                            mode = 0;
                        }
                    }
                    break;
                }

                if (c == CLOSE_DELIM)
                {
                    int len = name.Length;

                    // Delete trailing space, if any
                    if (len > 0 && name[len - 1] == SPACE)
                    {
                        name.Length = --len;
                    }

                    // Kept in its own local so the scanned code point 'c' is not
                    // overwritten by the lookup result.
                    int named = UCharacter.GetCharFromExtendedName(name.ToString());
                    if (named != -1)
                    {
                        // Lookup succeeded

                        // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        string str = UTF16.ValueOf(named);
                        text.Replace(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string. Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int delta = cursor - openPos - str.Length;
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                    // If the lookup failed, we leave things as-is and
                    // still switch to mode 0 and continue.
                    mode = 0;
                    openPos = -1; // close off candidate
                    continue; // *** reprocess char32At(cursor)
                }

                if (legal.Contains(c))
                {
                    UTF16.Append(name, c);
                    // If we go past the longest possible name then abort.
                    // maxLen includes temporary trailing space, so use '>='.
                    if (name.Length >= maxLen)
                    {
                        mode = 0;
                    }
                }
                else
                {
                    // Invalid character: abandon the name and reprocess this
                    // same code point in mode 0. The cursor is left untouched
                    // rather than decremented by one and re-advanced by the code
                    // point's length, because that only cancels out for one-unit
                    // code points and would otherwise leave the cursor inside a
                    // surrogate pair.
                    mode = 0;
                    continue; // *** reprocess char32At(cursor)
                }

                break;
        }

        cursor += UTF16.GetCharCount(c);
    }

    offsets.ContextLimit += limit - offsets.Limit;
    offsets.Limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.Start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
/// <summary>
/// Applies the mapping step to every code point supplied by <paramref name="iter"/>
/// and collects the outcome in a new buffer. Depending on the values looked up for a
/// code point it is replaced by an entry of <c>mappingData</c>, shifted by a delta,
/// dropped, or copied through unchanged.
/// </summary>
/// <param name="iter">Iterator that supplies the source code points.</param>
/// <param name="options">Option flags; only
/// <see cref="StringPrepOptions.AllowUnassigned"/> is consulted here.</param>
/// <returns>The buffer holding the mapped text.</returns>
/// <exception cref="StringPrepParseException">An unassigned code point is read while
/// <see cref="StringPrepOptions.AllowUnassigned"/> is not set.</exception>
private StringBuffer Map(UCharacterIterator iter, StringPrepOptions options)
{
    Values lookup = new Values();
    StringBuffer mapped = new StringBuffer();
    bool permitUnassigned = ((options & StringPrepOptions.AllowUnassigned) > 0);

    for (int codePoint = iter.NextCodePoint();
         codePoint != UCharacterIterator.DONE;
         codePoint = iter.NextCodePoint())
    {
        char trieWord = GetCodePointValue(codePoint);
        GetValues(trieWord, lookup);

        // Unassigned code points are rejected unless the caller allows them.
        if (lookup.type == UNASSIGNED && !permitUnassigned)
        {
            throw new StringPrepParseException("An unassigned code point was found in the input",
                                               StringPrepErrorType.UnassignedError,
                                               iter.GetText(), iter.Index);
        }

        // Deleted code points are consumed without producing output.
        if (lookup.type == DELETE)
        {
            continue;
        }

        if (lookup.type == MAP)
        {
            if (lookup.isIndex)
            {
                // The value is an offset into mappingData; the range it falls in
                // determines how many chars the mapping has.
                int offset = lookup.value;
                int count;

                if (offset >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                    offset < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                {
                    count = 1;
                }
                else if (offset >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                         offset < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                {
                    count = 2;
                }
                else if (offset >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                         offset < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                {
                    count = 3;
                }
                else
                {
                    // Otherwise the length is stored in front of the mapping itself.
                    count = mappingData[offset++];
                }

                mapped.Append(mappingData, offset, count);
                continue;
            }

            // The value is a delta to subtract from the source code point.
            codePoint -= lookup.value;
        }

        // Copy the (possibly shifted) code point to the destination.
        UTF16.Append(mapped, codePoint);
    }

    return mapped;
}
/// <summary>
/// Checks whether the decomposition of <paramref name="comp"/> can be matched, in
/// order, against the code points of <paramref name="segment"/> beginning at
/// <paramref name="segmentPos"/>. Code points of the segment that are skipped while
/// matching, plus everything after the last matched code point, form the remainder.
/// </summary>
/// <param name="comp">The code point whose decomposition is searched for; when
/// <c>nfcImpl.GetDecomposition</c> returns <c>null</c> the code point itself is used.</param>
/// <param name="segment">The text to match against.</param>
/// <param name="segmentPos">Index in <paramref name="segment"/> at which matching starts.</param>
/// <param name="buf">Working buffer; it is cleared here and then receives the remainder.</param>
/// <returns>
/// <c>null</c> if the decomposition is not fully matched, or if <paramref name="comp"/>
/// followed by the remainder does not compare equal to the segment tail under
/// <c>Normalizer.Compare</c>; <c>SET_WITH_NULL_STRING</c> if there is no remainder;
/// otherwise the result of <c>GetEquivalents2</c> for the remainder.
/// </returns>
private ISet<string> Extract(int comp, string segment, int segmentPos, StringBuffer buf)
{
    if (PROGRESS)
    {
        Console.Out.WriteLine(" extract: " + Utility.Hex(UTF16.ValueOf(comp))
            + ", " + Utility.Hex(segment.Substring(segmentPos)));
    }

    string decomp = nfcImpl.GetDecomposition(comp) ?? UTF16.ValueOf(comp);

    // 'wanted' is the next decomposition code point to find in the segment;
    // 'consumed' is the offset in decomp just past it.
    int wanted = UTF16.CharAt(decomp, 0);
    int consumed = UTF16.GetCharCount(wanted);

    buf.Length = 0; // initialize working buffer, shared among callees

    bool matchedAll = false;
    int pos = segmentPos;
    while (pos < segment.Length)
    {
        int current = UTF16.CharAt(segment, pos);
        int currentLen = UTF16.GetCharCount(current);

        if (current != wanted)
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" buffer: " + Utility.Hex(UTF16.ValueOf(current)));
            }

            // brute force approach: set the unmatched code point aside
            UTF16.Append(buf, current);
        }
        else
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" matches: " + Utility.Hex(UTF16.ValueOf(current)));
            }

            if (consumed == decomp.Length)
            {
                // Every decomposition code point was found; the rest of the
                // segment belongs to the remainder as well.
                buf.Append(segment.Substring(pos + currentLen));
                matchedAll = true;
                break;
            }

            wanted = UTF16.CharAt(decomp, consumed);
            consumed += UTF16.GetCharCount(wanted);
        }

        pos += currentLen;
    }

    if (!matchedAll)
    {
        return null; // the segment ended before the decomposition was fully matched
    }

    if (PROGRESS)
    {
        Console.Out.WriteLine("Matches");
    }

    if (buf.Length == 0)
    {
        return SET_WITH_NULL_STRING; // succeed, but no remainder
    }

    string remainder = buf.ToString();

    // Brute force check that comp + remainder is canonically equivalent
    // to the part of the segment that was examined.
    if (Normalizer.Compare(UTF16.ValueOf(comp) + remainder, segment.Substring(segmentPos), 0) != 0)
    {
        return null;
    }

    // get the remaining combinations
    return GetEquivalents2(remainder);
}