/// <summary>
/// Steps the context iteration by one code point in the current direction
/// and moves <c>index</c> past it.
/// </summary>
/// <returns>The code point that was stepped over, or -1 when there is
/// nothing left to read in the current direction.</returns>
private int Next()
{
    if (!forward)
    {
        // Backward iteration: read the code point that ends just before index.
        if (index <= contextStart)
        {
            return -1;
        }

        int previous = rep.Char32At(index - 1);
        index -= UTF16.GetCharCount(previous);
        return previous;
    }

    if (index >= contextLimit)
    {
        // forward context iteration reached the limit
        reachedLimit = true;
        return -1;
    }

    int current = rep.Char32At(index);
    index += UTF16.GetCharCount(current);
    return current;
}
/// <summary>
/// Default implementation of <see cref="IUnicodeMatcher.Matches(IReplaceable, int[], int, bool)"/> for Unicode
/// filters. Tests the single code point read at <c>offset[0]</c> against this filter.
/// </summary>
/// <remarks>
/// When <c>offset[0]</c> is below <paramref name="limit"/> a successful match advances
/// <c>offset[0]</c> past the code point; when it is above <paramref name="limit"/> a
/// successful match moves <c>offset[0]</c> backwards instead.
/// </remarks>
/// <param name="text">The text to read from.</param>
/// <param name="offset">One-element in/out array holding the current position.</param>
/// <param name="limit">The position at which matching stops.</param>
/// <param name="incremental">If true, reaching <paramref name="limit"/> yields a partial match.</param>
/// <returns><see cref="MatchDegree.Match"/>, <see cref="MatchDegree.PartialMatch"/> or
/// <see cref="MatchDegree.Mismatch"/>.</returns>
/// <stable>ICU 2.0</stable>
public virtual MatchDegree Matches(IReplaceable text, int[] offset, int limit, bool incremental)
{
    int position = offset[0];

    if (position < limit)
    {
        // Forward direction: consume the code point if the filter accepts it.
        int codePoint = text.Char32At(position);
        if (Contains(codePoint))
        {
            offset[0] += UTF16.GetCharCount(codePoint);
            return MatchDegree.Match;
        }
    }
    else if (position > limit)
    {
        if (Contains(text.Char32At(position)))
        {
            // Backup offset by 1, unless the preceding character is a
            // surrogate pair -- then backup by 2 (keep offset pointing at
            // the lead surrogate).
            offset[0] = position - 1;
            if (offset[0] >= 0)
            {
                offset[0] -= UTF16.GetCharCount(text.Char32At(offset[0])) - 1;
            }
            return MatchDegree.Match;
        }
    }
    else if (incremental)
    {
        // Sitting exactly on the limit: more text could still arrive.
        return MatchDegree.PartialMatch;
    }

    return MatchDegree.Mismatch;
}
/// <summary>
/// Returns the current element of the iteration as a string. Intended for use after
/// <see cref="Next()"/>, not after <see cref="NextRange()"/>.
/// </summary>
/// <returns><c>String</c> when <c>Codepoint</c> equals <c>IS_STRING</c>; otherwise the
/// string form of <c>Codepoint</c>.</returns>
/// <stable>ICU 4.0</stable>
public virtual string GetString() // ICU4N TODO: API String vs GetString() - confusing. This should be made into String property and the current string property made into a private field.
{
    return Codepoint == IS_STRING
        ? String
        : UTF16.ValueOf(Codepoint);
}
/// <summary>
/// Implement <see cref="IUnicodeMatcher"/>.
/// </summary>
/// <param name="v">The index value to test.</param>
/// <returns><c>true</c> for an empty pattern; otherwise the answer of the matcher registered
/// for the first code point of the pattern, or, when there is none, whether the low byte of
/// that code point equals <paramref name="v"/>.</returns>
public virtual bool MatchesIndexValue(int v)
{
    if (pattern.Length == 0)
    {
        return true;
    }

    int first = UTF16.CharAt(pattern, 0);
    IUnicodeMatcher nested = data.LookupMatcher(first);
    if (nested != null)
    {
        return nested.MatchesIndexValue(v);
    }

    return (first & 0xFF) == v;
}
/// <summary>
/// Internal method. Returns the 8-bit index value for this rule: the low byte of the
/// first code point of the key, or -1 when that code point stands for a matcher or the
/// rule can otherwise match multiple keys.
/// </summary>
/// <returns>A value in 0..255, or -1.</returns>
internal int GetIndexValue()
{
    // A pattern with just ante context {such as foo)>bar} can
    // match any key.
    if (anteContextLength == pattern.Length)
    {
        return -1;
    }

    int keyStart = UTF16.CharAt(pattern, anteContextLength);
    if (data.LookupMatcher(keyStart) != null)
    {
        // The key starts with a matcher, so no single byte identifies it.
        return -1;
    }

    return keyStart & 0xFF;
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Replaces every character between <c>pos.Start</c> and <c>pos.Limit</c> with its escaped
/// form (prefix, number, suffix) and updates the position indices for the change in length.
/// </summary>
protected override void HandleTransliterate(IReplaceable text, Position pos, bool incremental)
{
    int cursor = pos.Start;
    int end = pos.Limit;

    // The buffer normally keeps this transliterator's prefix at its front so that
    // only the digits and the suffix have to be rewritten for each character.
    StringBuilder escaped = new StringBuilder(prefix);
    int ownPrefixLength = prefix.Length;
    bool prefixOverwritten = false;

    while (cursor < end)
    {
        int codePoint;
        int unitCount;
        if (grokSupplementals)
        {
            codePoint = text.Char32At(cursor);
            unitCount = UTF16.GetCharCount(codePoint);
        }
        else
        {
            codePoint = text[cursor];
            unitCount = 1;
        }

        bool useSupplementalHandler = (codePoint & 0xFFFF0000) != 0 && supplementalHandler != null;
        if (useSupplementalHandler)
        {
            escaped.Length = 0;
            escaped.Append(supplementalHandler.prefix);
            Utility.AppendNumber(escaped, codePoint, supplementalHandler.radix,
                supplementalHandler.minDigits);
            escaped.Append(supplementalHandler.suffix);
            // The buffer no longer starts with our own prefix.
            prefixOverwritten = true;
        }
        else
        {
            if (prefixOverwritten)
            {
                escaped.Length = 0;
                escaped.Append(prefix);
                prefixOverwritten = false;
            }
            else
            {
                // Own prefix is still in place; just drop the previous digits and suffix.
                escaped.Length = ownPrefixLength;
            }
            Utility.AppendNumber(escaped, codePoint, radix, minDigits);
            escaped.Append(suffix);
        }

        text.Replace(cursor, cursor + unitCount, escaped.ToString());
        cursor += escaped.Length;
        end += escaped.Length - unitCount;
    }

    pos.ContextLimit += end - pos.Limit;
    pos.Limit = end;
    pos.Start = cursor;
}
/// <summary>
/// Iterate forward through the string to fetch the next code point
/// to be case-mapped, and set the context indexes for it.
/// </summary>
/// <returns>The next code point to be case-mapped, or a negative value when the
/// iteration is done.</returns>
public virtual int NextCaseMapCP()
{
    if (cpLimit >= limit)
    {
        return -1;
    }

    // The new code point starts where the previous one ended.
    cpStart = cpLimit;
    int codePoint = rep.Char32At(cpLimit);
    cpLimit += UTF16.GetCharCount(codePoint);
    return codePoint;
}
/// <summary>
/// Set a new source for this iterator. Allows object reuse.
/// </summary>
/// <param name="newSource">The source string to iterate against. This allows the same iterator to be used
/// while changing the source string, saving object creation.</param>
/// <stable>ICU 2.4</stable>
public void SetSource(string newSource)
{
    source = nfd.Normalize(newSource);
    done = false;

    // Degenerate case: an empty input gets a single empty segment.
    if (newSource.Length == 0)
    {
        pieces = new string[][] { new string[] { "" } };
        current = new int[1];
        return;
    }

    // Break the normalized string into segments. A new segment begins at every
    // code point, other than the first, that is a canonical segment starter.
    IList<string> segments = new List<string>();
    int segmentStart = 0;
    int position = UTF16.FindOffsetFromCodePoint(source, 1); // end of the first code point
    while (position < source.Length)
    {
        int codePoint = source.CodePointAt(position);
        if (nfcImpl.IsCanonSegmentStarter(codePoint))
        {
            segments.Add(source.Substring(start: segmentStart, length: position - segmentStart));
            segmentStart = position;
        }
        position += Character.CharCount(codePoint);
    }
    // Whatever is left forms the last segment.
    segments.Add(source.Substring(start: segmentStart, length: position - segmentStart));

    // Allocate the arrays, and find the equivalent strings for each segment.
    int segmentCount = segments.Count;
    pieces = new string[segmentCount][];
    current = new int[segmentCount];
    for (int slot = 0; slot < pieces.Length; ++slot)
    {
        if (PROGRESS)
        {
            Console.Out.WriteLine("SEGMENT");
        }
        pieces[slot] = GetEquivalents(segments[slot]);
    }
}
/// <summary>
/// Find the source and target sets, subject to the input filter.
/// There is a known issue with filters containing multiple characters.
/// </summary>
/// <remarks>
/// The key portion of the pattern is walked code point by code point. If any position
/// cannot be matched by anything in <paramref name="filter"/>, the method returns without
/// touching <paramref name="sourceSet"/> or <paramref name="targetSet"/>.
/// </remarks>
/// <param name="filter">The set of characters the rule is allowed to see.</param>
/// <param name="sourceSet">Receives the characters the key can match.</param>
/// <param name="targetSet">Receives the characters the output can produce.</param>
/// <param name="revisiting">Not used by this method.</param>
// TODO: Problem: the rule is [{ab}]c > x
// The filter is [a{bc}].
// If the input is abc, then the rule will work.
// However, following code applying the filter won't catch that case.
internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
{
    int limit = anteContextLength + keyLength;
    UnicodeSet tempSource = new UnicodeSet();
    UnicodeSet temp = new UnicodeSet();

    // We need to walk through the pattern.
    // Iff some of the characters at ALL of the positions are matched by the filter,
    // then we add tempSource to sourceSet.
    for (int i = anteContextLength; i < limit;)
    {
        int ch = UTF16.CharAt(pattern, i);
        i += UTF16.GetCharCount(ch);

        IUnicodeMatcher matcher = data.LookupMatcher(ch);
        if (matcher == null)
        {
            // Literal code point: the filter must contain it.
            if (!filter.Contains(ch))
            {
                return;
            }
            tempSource.Add(ch);
        }
        else if (matcher is UnicodeSet matcherSet)
        {
            // The matcher is itself a set, so it can be intersected with the filter
            // directly. A type test is used instead of a cast guarded by a catch block
            // so that no exception is raised on the ordinary non-set path.
            if (!filter.ContainsSome(matcherSet))
            {
                return;
            }
            matcher.AddMatchSetTo(tempSource);
        }
        else
        {
            // The matcher is not a UnicodeSet: materialize its match set first.
            temp.Clear();
            matcher.AddMatchSetTo(temp);
            if (!filter.ContainsSome(temp))
            {
                return;
            }
            tempSource.AddAll(temp);
        }
    }

    // if we made our way through the gauntlet, add to source/target
    sourceSet.AddAll(tempSource);
    output.AddReplacementSetTo(targetSet);
}
/// <summary>
/// Adds the permutations of the code points of <paramref name="source"/> to
/// <paramref name="output"/>.
/// </summary>
/// <param name="source">The string whose code points are permuted.</param>
/// <param name="skipZeros">If true, a code point with canonical combining class zero is
/// never moved to the front, except for the one already at index 0.</param>
/// <param name="output">The set that receives the permutations.</param>
public static void Permute(string source, bool skipZeros, ISet<string> output)
{
    // TODO: optimize

    // Optimization: with zero or one code point there is nothing to permute.
    // The length is checked first to avoid counting code points all the time.
    if (source.Length <= 2 && UTF16.CountCodePoint(source) <= 1)
    {
        output.Add(source);
        return;
    }

    // Otherwise let each eligible code point lead in turn, and recursively
    // permute all the other characters behind it.
    ISet<string> tails = new HashSet<string>();
    int offset = 0;
    while (offset < source.Length)
    {
        int codePoint = UTF16.CharAt(source, offset);
        int width = UTF16.GetCharCount(codePoint);

        // Optimization: a code point of canonical combining class zero is not permuted.
        bool mayLead = !skipZeros || offset == 0 || UCharacter.GetCombiningClass(codePoint) != 0;
        if (mayLead)
        {
            // Permutations of everything before and after this code point.
            string others = source.Substring(0, offset) + source.Substring(offset + width);
            tails.Clear();
            Permute(others, skipZeros, tails);

            // Prefix this code point to all of them.
            string lead = UTF16.ValueOf(source, offset);
            foreach (string tail in tails)
            {
                output.Add(lead + tail);
            }
        }

        offset += width;
    }
}
/// <summary>
/// Return the 32-bit code point at the given 16-bit offset into
/// the text. This assumes the text is stored as 16-bit code units
/// with surrogate pairs intermixed. If the offset of a leading or
/// trailing code unit of a surrogate pair is given, return the
/// code point of the surrogate pair.
/// <para/>
/// Usage Note: the contents of the underlying buffer are cached as a string so that
/// repeated calls are not expensive. If a <see cref="StringBuffer"/> passed into the
/// <see cref="ReplaceableString"/> constructor is modified externally in a way that
/// changes its contents but not its length, it is recommended to call
/// <see cref="ReplaceableString.ToString()"/> before calling this method;
/// <see cref="ReplaceableString.ToString()"/> forces a reload of the cache.
/// </summary>
/// <param name="offset">An integer between 0 and <see cref="Length"/>-1 inclusive.</param>
/// <returns>32-bit code point of text at given offset.</returns>
/// <stable>ICU 2.0</stable>
public virtual int Char32At(int offset)
{
    // ICU4N: In .NET, the StringBuilder indexer is extremely slow,
    // so we realize (cache) a string whenever a change is detected.
    // GetHashCode() is not a 100% reliable way to determine if the contents
    // of the StringBuilder have changed but more reliable than Length.
    // The Length property is a bit cheaper, so we check that first.
    string snapshot = realized;
    bool cacheIsStale = snapshot is null
        || changed
        || previousLength != buf.Length
        || previousHashCode != buf.GetHashCode();

    if (cacheIsStale)
    {
        snapshot = RealizeString();
    }

    return UTF16.CharAt(snapshot, offset);
}
/// <summary>
/// Transliterate the given text with the given position indices.
/// Returns <c>true</c> if the transliteration should continue, or <c>false</c> if it
/// should halt because a rule reported a partial match.
/// </summary>
/// <param name="text">The text to be transliterated.</param>
/// <param name="pos">The position indices, which will be updated.</param>
/// <param name="incremental">If <c>true</c>, assume new text may be inserted
/// at the limit, and return <c>false</c> if there is a partial match.</param>
/// <returns><c>true</c> unless a partial match has been obtained,
/// indicating that transliteration should stop until more text
/// arrives.</returns>
public virtual bool Transliterate(IReplaceable text, TransliterationPosition pos, bool incremental)
{
    // Only the rules filed under the low byte of the current code point are tried.
    int bucket = text.Char32At(pos.Start) & 0xFF;

    int ruleIndex = index[bucket];
    while (ruleIndex < index[bucket + 1])
    {
        var rule = rules[ruleIndex];
        MatchDegree degree = rule.MatchAndReplace(text, pos, incremental);

        if (degree == MatchDegree.Match)
        {
            if (Transliterator.DEBUG)
            {
                Console.Out.WriteLine((incremental ? "Rule.i: match " : "Rule: match ") +
                    rule.ToRule(true) + " => " +
                    UtilityExtensions.FormatInput(text, pos));
            }
            return true;
        }

        if (degree == MatchDegree.PartialMatch)
        {
            if (Transliterator.DEBUG)
            {
                Console.Out.WriteLine((incremental ? "Rule.i: partial match " : "Rule: partial match ") +
                    rule.ToRule(true) + " => " +
                    UtilityExtensions.FormatInput(text, pos));
            }
            return false;
        }

        if (Transliterator.DEBUG)
        {
            Console.Out.WriteLine("Rule: no match " + rule);
        }
        ruleIndex++;
    }

    // No match or partial match from any rule: step over one code point.
    pos.Start += UTF16.GetCharCount(text.Char32At(pos.Start));
    if (Transliterator.DEBUG)
    {
        Console.Out.WriteLine((incremental ? "Rule.i: no match => " : "Rule: no match => ") +
            UtilityExtensions.FormatInput(text, pos));
    }
    return true;
}
/// <summary>
/// Union the set of all characters that may be output by this object
/// into the given set.
/// </summary>
/// <param name="toUnionTo">The set into which to union the output characters.</param>
public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
{
    int position = 0;
    while (position < output.Length)
    {
        int codePoint = UTF16.CharAt(output, position);
        IUnicodeReplacer nested = data.LookupReplacer(codePoint);
        if (nested != null)
        {
            // The code point stands for a nested replacer; let it report its own output.
            nested.AddReplacementSetTo(toUnionTo);
        }
        else
        {
            toUnionTo.Add(codePoint);
        }
        position += UTF16.GetCharCount(codePoint);
    }
}
/// <summary>
/// Implementation of <see cref="IUnicodeMatcher"/> API. Union the set of all
/// characters that may be matched by this object into the given
/// set.
/// </summary>
/// <param name="toUnionTo">The set into which to union the source characters.</param>
public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
{
    int position = 0;
    while (position < pattern.Length)
    {
        int codePoint = UTF16.CharAt(pattern, position);
        IUnicodeMatcher nested = data.LookupMatcher(codePoint);
        if (nested != null)
        {
            // The code point stands for a nested matcher; let it report its own match set.
            nested.AddMatchSetTo(toUnionTo);
        }
        else
        {
            toUnionTo.Add(codePoint);
        }
        position += UTF16.GetCharCount(codePoint);
    }
}
/// <summary>
/// Retreat to the start of the previous code point in the text, and return it (pre-decrement semantics). If the
/// index is not preceded by a valid surrogate pair, the behavior is the same as <see cref="Previous()"/>. Otherwise
/// the iterator is decremented to the start of the surrogate pair, and the code point represented by the pair is
/// returned.
/// </summary>
/// <returns>The previous code point in the text, or <see cref="UForwardCharacterIterator.Done"/> if the new index is before the start of the text.</returns>
/// <stable>ICU 2.4</stable>
public virtual int PreviousCodePoint()
{
    int trail = Previous();
    if (!UTF16.IsTrailSurrogate((char)trail))
    {
        return trail;
    }

    int lead = Previous();
    if (UTF16.IsLeadSurrogate((char)lead))
    {
        return Character.ToCodePoint((char)lead, (char)trail);
    }

    if (lead != Done)
    {
        // unmatched trail surrogate so back out
        Next();
    }
    return trail;
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Replaces each code point between <c>offsets.Start</c> and <c>offsets.Limit</c> that has
/// an extended name with that name wrapped in the open and close delimiters.
/// </summary>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    int cursor = offsets.Start;
    int limit = offsets.Limit;

    // The open delimiter stays at the front of the buffer for the whole run.
    StringBuilder replacement = new StringBuilder();
    replacement.Append(OPEN_DELIM);

    while (cursor < limit)
    {
        int codePoint = text.Char32At(cursor);
        string name = UCharacter.GetExtendedName(codePoint);
        if (name == null)
        {
            ++cursor;
            continue;
        }

        // Drop the previous name but keep the open delimiter.
        replacement.Length = OPEN_DELIM_LEN;
        replacement.Append(name);
        replacement.Append(CLOSE_DELIM);

        int replacedLength = UTF16.GetCharCount(codePoint);
        text.Replace(cursor, cursor + replacedLength, replacement.ToString());

        int insertedLength = replacement.Length;
        cursor += insertedLength;                  // continue after the inserted name
        limit += insertedLength - replacedLength;  // change in length
    }

    offsets.ContextLimit += limit - offsets.Limit;
    offsets.Limit = limit;
    offsets.Start = cursor;
}
/// <summary>
/// RBBISymbolTable::parseReference. This function from the abstract symbol table
/// interface looks for a $variable name in the source text. It does not look the
/// name up, it only scans for it. It is used by the UnicodeSet parser.
/// </summary>
/// <param name="text">The text to scan.</param>
/// <param name="pos">On entry, the index at which to start scanning; on success it is
/// moved to the end of the name. It is left unchanged on failure.</param>
/// <param name="limit">The index at which scanning stops.</param>
/// <returns>The name that was found, or the empty string if there were no valid
/// name characters.</returns>
public virtual string ParseReference(string text, ParsePosition pos, int limit)
{
    int begin = pos.Index;
    int end = begin;

    while (end < limit)
    {
        int codePoint = UTF16.CharAt(text, end);

        // The first code point must be an identifier start; every code point,
        // including the first, must be an identifier part.
        bool allowedHere = (end != begin || UChar.IsUnicodeIdentifierStart(codePoint))
            && UChar.IsUnicodeIdentifierPart(codePoint);
        if (!allowedHere)
        {
            break;
        }

        end += UTF16.GetCharCount(codePoint);
    }

    if (end == begin)
    {
        // No valid name chars: indicate failure with empty string.
        return "";
    }

    pos.Index = end;
    return text.Substring(begin, end - begin);
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, TransliterationPosition, bool)"/>.
/// Applies full case folding to each code point between <c>offsets.Start</c> and
/// <c>offsets.Limit</c>, replacing the text in place wherever the folding changes it.
/// </summary>
protected override void HandleTransliterate(IReplaceable text, TransliterationPosition offsets, bool isIncremental)
{
    lock (syncLock)
    {
        if (csp == null || offsets.Start >= offsets.Limit)
        {
            return;
        }

        iter.SetText(text);
        result.Length = 0;

        // Walk through original string.
        // If there is a case change, modify corresponding position in replaceable.
        iter.SetIndex(offsets.Start);
        iter.SetLimit(offsets.Limit);
        iter.SetContextLimits(offsets.ContextStart, offsets.ContextLimit);

        for (;;)
        {
            int codePoint = iter.NextCaseMapCP();
            if (codePoint < 0)
            {
                break;
            }

            int folded = csp.ToFullFolding(codePoint, result, 0);

            if (iter.DidReachLimit && isIncremental)
            {
                // the case mapping function tried to look beyond the context limit
                // wait for more input
                offsets.Start = iter.CaseMapCPStart;
                return;
            }

            // Decode the result.
            if (folded < 0)
            {
                // The code point mapped to itself, no change.
                continue;
            }

            // Small values mean the mapping string was written to the buffer;
            // anything larger is a single code point mapping.
            bool mappingIsInBuffer = folded <= UCaseProperties.MaxStringLength;
            string replacement = mappingIsInBuffer ? result.ToString() : UTF16.ValueOf(folded);
            int delta = iter.Replace(replacement);
            if (mappingIsInBuffer)
            {
                result.Length = 0;
            }

            if (delta != 0)
            {
                offsets.Limit += delta;
                offsets.ContextLimit += delta;
            }
        }

        offsets.Start = offsets.Limit;
    }
}
/// <summary>
/// Collects the strings obtained from <paramref name="segment"/> by replacing, at any
/// position, a run that <c>Extract</c> matches against a code point from the canonical
/// start set of the code point at that position.
/// </summary>
/// <param name="segment">The segment to expand.</param>
/// <returns>A set that always contains <paramref name="segment"/> itself, plus every
/// variant that was found.</returns>
private ISet<string> GetEquivalents2(string segment)
{
    ISet<string> equivalents = new HashSet<string>();

    if (PROGRESS)
    {
        Console.Out.WriteLine("Adding: " + Utility.Hex(segment));
    }
    equivalents.Add(segment);

    StringBuffer scratch = new StringBuffer();
    UnicodeSet starts = new UnicodeSet();

    // Cycle through all the code points of the segment.
    int offset = 0;
    while (offset < segment.Length)
    {
        int codePoint = segment.CodePointAt(offset);

        // See if this code point is at the start of some decomposition.
        if (nfcImpl.GetCanonStartSet(codePoint, starts))
        {
            // If so, see which decompositions match.
            UnicodeSetIterator candidates = new UnicodeSetIterator(starts);
            while (candidates.Next())
            {
                int composite = candidates.Codepoint;
                ISet<string> remainders = Extract(composite, segment, offset, scratch);
                if (remainders != null)
                {
                    // There were some matches, so add all the possibilities to the set.
                    string head = segment.Substring(0, offset) + UTF16.ValueOf(composite);
                    foreach (string tail in remainders)
                    {
                        equivalents.Add(head + tail);
                    }
                }
            }
        }

        offset += Character.CharCount(codePoint);
    }

    return equivalents;
}
/// <summary>
/// Returns the index that follows the code point at <paramref name="pos"/>.
/// </summary>
/// <param name="str">The text being walked.</param>
/// <param name="pos">The current index; may lie outside the text.</param>
/// <returns><paramref name="pos"/> advanced by the length of the code point found there,
/// or by 1 when <paramref name="pos"/> is outside the text.</returns>
internal static int PosAfter(IReplaceable str, int pos)
{
    if (pos < 0 || pos >= str.Length)
    {
        return pos + 1;
    }

    return pos + UTF16.GetCharCount(str.Char32At(pos));
}
/// <summary>
/// Inserts <c>insertion</c> at every break-iterator boundary below <c>pos.Limit</c> whose
/// neighbouring code points on both sides have a category selected by
/// <c>LETTER_OR_MARK_MASK</c>, then adjusts the position indices for the inserted text.
/// </summary>
protected override void HandleTransliterate(IReplaceable text, TransliterationPosition pos, bool incremental)
{
    lock (this)
    {
        boundaryCount = 0;
        GetBreakIterator(); // Lazy-create it if necessary
        bi.SetText(new ReplaceableCharacterIterator(text, pos.Start, pos.Limit, pos.Start));
        // TODO: fix clumsy workaround used below.

        // Reports whether the code point read at the given offset is a letter or a mark.
        bool IsLetterOrMarkAt(int offset)
        {
            int category = UChar.GetUnicodeCategory(UTF16.CharAt(text, offset)).ToInt32();
            return ((1 << category) & LETTER_OR_MARK_MASK) != 0;
        }

        // To make things much easier, the boundaries are stacked first and the
        // insertions are done at the end. Generally there won't be too many of them,
        // since the input is filtered.
        for (int candidate = bi.First();
             candidate != BreakIterator.Done && candidate < pos.Limit;
             candidate = bi.Next())
        {
            // HACK: only keep boundaries that have a letter or mark on both sides.
            if (candidate == 0 || !IsLetterOrMarkAt(candidate - 1) || !IsLetterOrMarkAt(candidate))
            {
                continue;
            }

            if (boundaryCount >= boundaries.Length)
            {
                // Grow the stack if necessary.
                int[] grown = new int[boundaries.Length * 2];
                boundaries.CopyTo(grown, 0);
                boundaries = grown;
            }
            boundaries[boundaryCount++] = candidate;
        }

        int inserted = 0;
        int lastInsertionPoint = 0;
        if (boundaryCount != 0)
        {
            inserted = boundaryCount * insertion.Length;
            lastInsertionPoint = boundaries[boundaryCount - 1];

            // Insert from the end backwards, so that the recorded offsets
            // do not have to be updated along the way.
            while (boundaryCount > 0)
            {
                int at = boundaries[--boundaryCount];
                text.Replace(at, at, insertion);
            }
        }

        // Now fix up the return values.
        pos.ContextLimit += inserted;
        pos.Limit += inserted;
        if (incremental)
        {
            pos.Start = lastInsertionPoint + inserted;
        }
        else
        {
            pos.Start = pos.Limit;
        }
    }
}
/// <summary>
/// See if the decomposition of <paramref name="comp"/> is at segment starting at <paramref name="segmentPos"/>
/// (with canonical rearrangement!).
/// If so, take the remainder, and return the equivalents.
/// </summary>
/// <param name="comp">The code point whose decomposition is looked for. If it has no
/// decomposition, the code point itself is used.</param>
/// <param name="segment">The string to search in.</param>
/// <param name="segmentPos">The index in <paramref name="segment"/> at which the search starts.</param>
/// <param name="buf">A working buffer. It is cleared on entry and then collects the code
/// points of <paramref name="segment"/> that were not consumed by the decomposition.</param>
/// <returns><c>null</c> if the decomposition was not fully found, or if replacing it by
/// <paramref name="comp"/> does not compare equal to the rest of the segment under
/// <c>Normalizer.Compare</c>; <c>SET_WITH_NULL_STRING</c> if nothing is left over; otherwise
/// the result of <c>GetEquivalents2</c> for the left-over characters.</returns>
private ISet <string> Extract(int comp, string segment, int segmentPos, StringBuffer buf)
{
    if (PROGRESS)
    {
        Console.Out.WriteLine(" extract: " + Utility.Hex(UTF16.ValueOf(comp)) + ", " + Utility.Hex(segment.Substring(segmentPos)));
    }

    string decomp = nfcImpl.GetDecomposition(comp);
    if (decomp == null)
    {
        decomp = UTF16.ValueOf(comp);
    }

    // See if it matches the start of segment (at segmentPos)
    bool ok = false;
    int cp;
    int decompPos = 0;
    // decompCp is the next code point of the decomposition that still has to be found.
    int decompCp = UTF16.CharAt(decomp, 0);
    decompPos += UTF16.GetCharCount(decompCp); // adjust position to skip first char
    //int decompClass = getClass(decompCp);
    buf.Length = 0; // initialize working buffer, shared among callees

    for (int i = segmentPos; i < segment.Length; i += UTF16.GetCharCount(cp))
    {
        cp = UTF16.CharAt(segment, i);
        if (cp == decompCp)
        { // if equal, eat another cp from decomp
            if (PROGRESS)
            {
                Console.Out.WriteLine(" matches: " + Utility.Hex(UTF16.ValueOf(cp)));
            }
            if (decompPos == decomp.Length)
            { // done, have all decomp characters!
                buf.Append(segment.Substring(i + UTF16.GetCharCount(cp))); // add remaining segment chars
                ok = true;
                break;
            }
            decompCp = UTF16.CharAt(decomp, decompPos);
            decompPos += UTF16.GetCharCount(decompCp);
            //decompClass = getClass(decompCp);
        }
        else
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" buffer: " + Utility.Hex(UTF16.ValueOf(cp)));
            }
            // brute force approach: a code point that is not the next one expected
            // from the decomposition is kept as part of the remainder
            UTF16.Append(buf, cp);
            /* TODO: optimize
             * // since we know that the classes are monotonically increasing, after zero
             * // e.g. 0 5 7 9 0 3
             * // we can do an optimization
             * // there are only a few cases that work: zero, less, same, greater
             * // if both classes are the same, we fail
             * // if the decomp class < the segment class, we fail
             *
             * segClass = getClass(cp);
             * if (decompClass <= segClass) return null;
             */
        }
    }
    if (!ok)
    {
        return(null); // we failed, characters left over
    }
    if (PROGRESS)
    {
        Console.Out.WriteLine("Matches");
    }
    if (buf.Length == 0)
    {
        return(SET_WITH_NULL_STRING); // succeed, but no remainder
    }
    string remainder = buf.ToString();

    // brute force approach
    // to check to make sure result is canonically equivalent
    /*
     * String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
     * if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
     */
    if (0 != Normalizer.Compare(UTF16.ValueOf(comp) + remainder, segment.Substring(segmentPos), 0))
    {
        return(null);
    }

    // get the remaining combinations
    return(GetEquivalents2(remainder));
}
/// <summary>
/// Builds the mapped form of the text delivered by <paramref name="iter"/>: each code point
/// is looked up and then replaced by a mapping, adjusted by a delta, dropped, or copied
/// unchanged.
/// </summary>
/// <param name="iter">The source of code points.</param>
/// <param name="options">If <see cref="StringPrepOptions.AllowUnassigned"/> is not set,
/// an unassigned code point causes an exception.</param>
/// <returns>A new buffer holding the mapped text.</returns>
/// <exception cref="StringPrepParseException">An unassigned code point was found and
/// unassigned code points are not allowed.</exception>
private StringBuffer Map(UCharacterIterator iter, StringPrepOptions options)
{
    Values val = new Values();
    StringBuffer dest = new StringBuffer();
    bool allowUnassigned = (options & StringPrepOptions.AllowUnassigned) > 0;

    for (;;)
    {
        int codePoint = iter.NextCodePoint();
        if (codePoint == UCharacterIterator.DONE)
        {
            break;
        }

        char lookup = GetCodePointValue(codePoint);
        GetValues(lookup, val);

        // check if the source codepoint is unassigned
        if (val.type == UNASSIGNED && !allowUnassigned)
        {
            throw new StringPrepParseException("An unassigned code point was found in the input",
                StringPrepErrorType.UnassignedError,
                iter.GetText(), iter.Index);
        }

        if (val.type == MAP)
        {
            if (val.isIndex)
            {
                // The value is an index into the mapping data. The index range tells
                // the mapping length, unless the length is stored in the data itself.
                int mappingStart = val.value;
                int mappingLength;
                if (mappingStart >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                    mappingStart < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                {
                    mappingLength = 1;
                }
                else if (mappingStart >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                         mappingStart < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                {
                    mappingLength = 2;
                }
                else if (mappingStart >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                         mappingStart < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                {
                    mappingLength = 3;
                }
                else
                {
                    mappingLength = mappingData[mappingStart];
                    mappingStart++;
                }

                // copy mapping to destination
                dest.Append(mappingData, mappingStart, mappingLength);
                continue;
            }

            // The value is a delta to subtract from the code point.
            codePoint -= val.value;
        }
        else if (val.type == DELETE)
        {
            // just consume the codepoint and continue
            continue;
        }

        // copy the (possibly adjusted) source code point into the destination
        UTF16.Append(dest, codePoint);
    }

    return dest;
}
/// <summary>
/// Returns the index of the code point that ends just before <paramref name="pos"/>.
/// </summary>
/// <param name="str">The text being walked.</param>
/// <param name="pos">The current index.</param>
/// <returns><paramref name="pos"/> moved back by the length of the preceding code point,
/// or by 1 when <paramref name="pos"/> is zero or negative.</returns>
internal static int PosBefore(IReplaceable str, int pos)
{
    if (pos <= 0)
    {
        return pos - 1;
    }

    return pos - UTF16.GetCharCount(str.Char32At(pos - 1));
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// Scans the text between <c>offsets.Start</c> and <c>offsets.Limit</c> for character names
/// enclosed by the open pattern and the close delimiter, and replaces each one that
/// <c>UCharacter.GetCharFromExtendedName</c> resolves with the resulting code point.
/// </summary>
/// <param name="text">The text to modify in place.</param>
/// <param name="offsets">The position indices; <c>Start</c>, <c>Limit</c> and
/// <c>ContextLimit</c> are updated.</param>
/// <param name="isIncremental">If true and an open delimiter candidate is still pending
/// at the end, <c>offsets.Start</c> is set to that candidate instead of the cursor.</param>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    int maxLen = UCharacterName.Instance.MaxCharNameLength + 1; // allow for temporary trailing space

    // Accumulates the candidate name between the delimiters.
    StringBuffer name = new StringBuffer(maxLen);

    // Get the legal character set
    UnicodeSet legal = new UnicodeSet();
    UCharacterName.Instance.GetCharNameCharacters(legal);

    int cursor = offsets.Start;
    int limit = offsets.Limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int mode = 0;
    int openPos = -1; // open delim candidate pos

    int c;
    while (cursor < limit)
    {
        c = text.Char32At(cursor);

        switch (mode)
        {
            case 0: // looking for open delimiter
                if (c == OPEN_DELIM)
                { // quick check first
                    openPos = cursor;
                    int i = Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                    if (i >= 0 && i < limit)
                    {
                        mode = 1;
                        name.Length = 0;
                        cursor = i;
                        continue; // *** reprocess char32At(cursor)
                    }
                }
                break;

            case 1: // after open delimiter
                // Look for legal chars. If \s+ is found, convert it
                // to a single space. If closeDelimiter is found, exit
                // the loop. If any other character is found, exit the
                // loop. If the limit is reached, exit the loop.

                // Convert \s+ => SPACE. This assumes there are no
                // runs of >1 space characters in names.
                if (PatternProps.IsWhiteSpace(c))
                {
                    // Ignore leading whitespace
                    if (name.Length > 0 && name[name.Length - 1] != SPACE)
                    {
                        name.Append(SPACE);
                        // If we are too long then abort. maxLen includes
                        // temporary trailing space, so use '>'.
                        if (name.Length > maxLen)
                        {
                            mode = 0;
                        }
                    }
                    break;
                }

                if (c == CLOSE_DELIM)
                {
                    int len = name.Length;

                    // Delete trailing space, if any
                    if (len > 0 && name[len - 1] == SPACE)
                    {
                        name.Length = --len;
                    }

                    // From here on c holds the looked-up code point, or -1 on failure.
                    c = UCharacter.GetCharFromExtendedName(name.ToString());
                    if (c != -1)
                    {
                        // Lookup succeeded

                        // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                        cursor++; // advance over CLOSE_DELIM

                        string str = UTF16.ValueOf(c);
                        text.Replace(openPos, cursor, str);

                        // Adjust indices for the change in the length of
                        // the string. Do not assume that str.length() ==
                        // 1, in case of surrogates.
                        int delta = cursor - openPos - str.Length;
                        cursor -= delta;
                        limit -= delta;
                        // assert(cursor == openPos + str.length());
                    }
                    // If the lookup failed, we leave things as-is and
                    // still switch to mode 0 and continue.
                    mode = 0;
                    openPos = -1; // close off candidate
                    continue; // *** reprocess char32At(cursor)
                }

                if (legal.Contains(c))
                {
                    UTF16.Append(name, c);
                    // If we go past the longest possible name then abort.
                    // maxLen includes temporary trailing space, so use '>='.
                    if (name.Length >= maxLen)
                    {
                        mode = 0;
                    }
                }

                // Invalid character
                else
                {
                    // NOTE(review): together with the increment after the switch, the net
                    // advance is GetCharCount(c) - 1, i.e. zero only for a one-unit c.
                    --cursor; // Backup and reprocess this character
                    mode = 0;
                }

                break;
        }

        cursor += UTF16.GetCharCount(c);
    }

    offsets.ContextLimit += limit - offsets.Limit;
    offsets.Limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter candidate.
    offsets.Start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
// Disabled upstream (Java) factory method, kept as a porting record:
//= public static UnicodeReplacer valueOf(String output,
//=                                       int cursorPos,
//=                                       RuleBasedTransliterator.Data data) {
//=     if (output.length() == 1) {
//=         char c = output.charAt(0);
//=         UnicodeReplacer r = data.lookupReplacer(c);
//=         if (r != null) {
//=             return r;
//=         }
//=     }
//=     return new StringReplacer(output, cursorPos, data);
//= }

/// <summary>
/// <see cref="IUnicodeReplacer"/> API: replaces the range
/// [<paramref name="start"/>, <paramref name="limit"/>) of <paramref name="text"/>
/// with this replacer's output and reports the length of the inserted text.
/// </summary>
/// <remarks>
/// Two paths exist. While <c>isComplex</c> is <c>false</c>, <c>output</c> is written
/// directly over the range. Otherwise the output is assembled in a temporary area
/// appended to the end of <paramref name="text"/>, delegating to any nested replacer
/// returned by <c>data.LookupReplacer</c>, and then copied back over the key.
/// The complex path recomputes <c>isComplex</c>: it stays <c>true</c> only if at least
/// one nested replacer was found, so an output without nested replacers uses the
/// simple path on later calls.
/// </remarks>
/// <param name="text">The text being edited in place.</param>
/// <param name="start">Start index (inclusive) of the range to replace.</param>
/// <param name="limit">Limit index (exclusive) of the range to replace.</param>
/// <param name="cursor">
/// Array whose element 0 receives the new cursor position when <c>hasCursor</c> is set.
/// The same array is also handed to nested replacers.
/// </param>
/// <returns>The length, in code units, of the text that replaced the range.</returns>
public virtual int Replace(IReplaceable text, int start, int limit, int[] cursor)
{
    int outLen;
    int newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower. If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex)
    {
        text.Replace(start, limit, output);
        outLen = output.Length;

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else
    {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data. Copy everything to the
         * end of the string, then copy them back over the key. This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        StringBuffer buf = new StringBuffer();
        int oOutput; // offset into 'output'

        // Reset here; set back to true below only if a nested replacer is found.
        isComplex = false;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit + tempExtra. The start of the buffer has a single
        // character from before the key. This provides style
        // data when additional characters are filled into the
        // temporary buffer. If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int tempStart = text.Length; // start of temp buffer
        int destStart = tempStart; // copy new text to here
        if (start > 0)
        {
            // Copy the whole code point preceding the key (1 or 2 code units).
            int len = UTF16.GetCharCount(text.Char32At(start - 1));
            text.Copy(start - len, start, tempStart);
            destStart += len;
        }
        else
        {
            text.Replace(tempStart, tempStart, "\uFFFF");
            destStart++;
        }
        int destLimit = destStart;
        int tempExtra = 0; // temp chars after destLimit

        // Walk 'output' one code point at a time; the loop advances via nextIndex.
        for (oOutput = 0; oOutput < output.Length;)
        {
            if (oOutput == cursorPos)
            {
                // Record the position of the cursor
                newStart = buf.Length + destLimit - destStart; // relative to start
                // the buf.length() was inserted for bug 5789
                // the problem is that if we are accumulating into a buffer (when r == null below)
                // then the actual length of the text at that point needs to add the buf length.
                // there was an alternative suggested in #5789, but that looks like it won't work
                // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
            }
            int c = UTF16.CharAt(output, oOutput);

            // When we are at the last position copy the right style
            // context character into the temporary buffer. We don't
            // do this before because it will provide an incorrect
            // right context for previous replace() operations.
            int nextIndex = oOutput + UTF16.GetCharCount(c);
            if (nextIndex == output.Length)
            {
                tempExtra = UTF16.GetCharCount(text.Char32At(limit));
                text.Copy(limit, limit + tempExtra, destLimit);
            }

            IUnicodeReplacer r = data.LookupReplacer(c);
            if (r == null)
            {
                // Accumulate straight (non-segment) text.
                UTF16.Append(buf, c);
            }
            else
            {
                isComplex = true;

                // Insert any accumulated straight text.
                if (buf.Length > 0)
                {
                    text.Replace(destLimit, destLimit, buf.ToString());
                    destLimit += buf.Length;
                    buf.Length = 0;
                }

                // Delegate output generation to replacer object
                int len = r.Replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput = nextIndex;
        }

        // Insert any accumulated straight text.
        if (buf.Length > 0)
        {
            text.Replace(destLimit, destLimit, buf.ToString());
            destLimit += buf.Length;
        }
        // Cursor sitting exactly at the end of 'output' is not seen inside the loop.
        if (oOutput == cursorPos)
        {
            // Record the position of the cursor
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        // NOTE(review): the "+ outLen" offsets below presumably account for Copy
        // inserting outLen code units at 'start' and shifting everything after it;
        // confirm against the IReplaceable.Copy contract.
        text.Copy(destStart, destLimit, start);
        text.Replace(tempStart + outLen, destLimit + tempExtra + outLen, "");

        // Delete the old text (the key)
        text.Replace(start + outLen, limit + outLen, "");
    }

    if (hasCursor)
    {
        // Adjust the cursor for positions outside the key. These
        // refer to code points rather than code units. If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0)
        {
            newStart = start;
            int n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0)
            {
                newStart -= UTF16.GetCharCount(text.Char32At(newStart - 1));
                ++n;
            }
            // Any count left over after hitting the start of the text is applied as-is.
            newStart += n;
        }
        else if (cursorPos > output.Length)
        {
            newStart = start + outLen;
            int n = cursorPos - output.Length;
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.Length)
            {
                newStart += UTF16.GetCharCount(text.Char32At(newStart));
                --n;
            }
            // Any count left over after hitting the end of the text is applied as-is.
            newStart += n;
        }
        else
        {
            // Cursor is within output string. It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor[0] = newStart;
    }

    return(outLen);
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// </summary>
/// <remarks>
/// Each code point that is not case-ignorable is mapped either to title case or to
/// lower case: title case after an uncased character (or at the start of the context),
/// lower case after a cased character. Case-ignorable code points are left untouched
/// and do not change the mode.
/// </remarks>
/// <param name="text">The text to modify in place.</param>
/// <param name="offsets">
/// The positions to operate on; <c>Start</c>, <c>Limit</c> and <c>ContextLimit</c>
/// are updated to reflect the work done and any change in text length.
/// </param>
/// <param name="isIncremental">
/// If <c>true</c>, processing stops early when a case mapping needed to look past
/// the context limit, leaving <c>offsets.Start</c> at the unprocessed code point.
/// </param>
protected override void HandleTransliterate(IReplaceable text, Position offsets, bool isIncremental)
{
    lock (this)
    {
        // TODO reimplement, see ustrcase.c
        // - use a real word break iterator instead of just looking for a
        //   transition between cased and uncased characters
        // - call CaseMapTransliterator::handleTransliterate() for lowercasing? (set fMap)
        // - take isIncremental into account, because case mappings are context-sensitive
        // - also detect when the lowercasing function did not finish because of context
        if (offsets.Start >= offsets.Limit)
        {
            return;
        }

        // Current mode: true = map the next letter to title case,
        // false = map it to lower case.
        bool titleNext = true;

        // Look backwards through the preceding context. A cased character
        // followed only by case-ignorable characters means we are already
        // inside a word, so start in lower-case mode. Anything else
        // (including an empty context) keeps title-case mode.
        int back = offsets.Start - 1;
        while (back >= offsets.ContextStart)
        {
            int prev = text.Char32At(back);
            // >0 cased, ==0 uncased, <0 case-ignorable
            int prevKind = csp.GetTypeOrIgnorable(prev);
            if (prevKind > 0)
            {
                titleNext = false;
                break;
            }
            if (prevKind == 0)
            {
                break;
            }
            // case-ignorable: keep scanning backwards
            back -= UTF16.GetCharCount(prev);
        }

        iter.SetText(text);
        iter.SetIndex(offsets.Start);
        iter.SetLimit(offsets.Limit);
        iter.SetContextLimits(offsets.ContextStart, offsets.ContextLimit);

        result.Length = 0;

        // Walk the original text; wherever the case mapping differs from the
        // input, rewrite that position in the replaceable.
        int cp;
        while ((cp = iter.NextCaseMapCP()) >= 0)
        {
            int kind = csp.GetTypeOrIgnorable(cp);
            if (kind < 0)
            {
                // case-ignorable: copied as-is, mode unchanged
                continue;
            }

            int mapped = titleNext
                ? csp.ToFullTitle(cp, iter, result, caseLocale)
                : csp.ToFullLower(cp, iter, result, caseLocale);

            // After an uncased character the next letter gets title case.
            titleNext = (kind == 0);

            if (iter.DidReachLimit && isIncremental)
            {
                // The case mapping tried to look beyond the context limit;
                // stop here and wait for more input.
                offsets.Start = iter.CaseMapCPStart;
                return;
            }

            if (mapped < 0)
            {
                // The code point maps to itself; nothing to replace.
                continue;
            }

            int lengthChange;
            if (mapped <= UCaseProps.MAX_STRING_LENGTH)
            {
                // The mapping is a string that was written into 'result'.
                lengthChange = iter.Replace(result.ToString());
                result.Length = 0;
            }
            else
            {
                // The mapping is a single code point.
                lengthChange = iter.Replace(UTF16.ValueOf(mapped));
            }

            if (lengthChange != 0)
            {
                offsets.Limit += lengthChange;
                offsets.ContextLimit += lengthChange;
            }
        }

        offsets.Start = offsets.Limit;
    }
}
/// <summary>
/// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
/// </summary>
/// <remarks>
/// At each position the escape forms described by <c>spec</c> are tried in order.
/// Each form is a five-value header (prefix length, suffix length, radix, minimum
/// digits, maximum digits) followed by the prefix and suffix characters. The first
/// form that matches is replaced by the code point its digits encode.
/// </remarks>
/// <param name="text">The text to modify in place.</param>
/// <param name="pos">
/// The positions to operate on; <c>Start</c>, <c>Limit</c> and <c>ContextLimit</c>
/// are updated on return.
/// </param>
/// <param name="isIncremental">
/// If <c>true</c>, a partial match that runs into the limit stops processing so
/// that the caller can supply more input.
/// </param>
protected override void HandleTransliterate(IReplaceable text, Position pos, bool isIncremental)
{
    int scanPos = pos.Start;
    int scanLimit = pos.Limit;

    while (scanPos < scanLimit)
    {
        // Try every form in spec[]. Leave this loop on the first match;
        // leave the outer loop when a partial match is seen in
        // incremental mode.
        int specIndex = 0;
        while (spec[specIndex] != END)
        {
            // Header of the current form
            int prefixLen = spec[specIndex++];
            int suffixLen = spec[specIndex++];
            int radix = spec[specIndex++];
            int minDigits = spec[specIndex++];
            int maxDigits = spec[specIndex++];

            // 'probe' runs ahead of scanPos over the characters being parsed.
            int probe = scanPos;
            bool matched = true;
            int value = 0;

            // Phase 1: prefix
            for (int k = 0; k < prefixLen; ++k)
            {
                if (probe >= scanLimit && k > 0)
                {
                    // At least one prefix character matched already, so this
                    // is a partial match: stop in incremental mode, otherwise
                    // move on to the next form.
                    if (isIncremental)
                    {
                        goto done;
                    }
                    matched = false;
                    break;
                }
                if (text[probe++] != spec[specIndex + k])
                {
                    matched = false;
                    break;
                }
            }

            // Phase 2: digits
            if (matched)
            {
                int digitCount = 0;
                while (true)
                {
                    if (probe >= scanLimit)
                    {
                        // Partial match in incremental mode
                        if (probe > scanPos && isIncremental)
                        {
                            goto done;
                        }
                        break;
                    }
                    int ch = text.Char32At(probe);
                    int digit = UCharacter.Digit(ch, radix);
                    if (digit < 0)
                    {
                        break;
                    }
                    probe += UTF16.GetCharCount(ch);
                    value = (value * radix) + digit;
                    if (++digitCount == maxDigits)
                    {
                        break;
                    }
                }
                matched = (digitCount >= minDigits);
            }

            // Phase 3: suffix
            if (matched)
            {
                for (int k = 0; k < suffixLen; ++k)
                {
                    if (probe >= scanLimit)
                    {
                        // Partial match in incremental mode
                        if (probe > scanPos && isIncremental)
                        {
                            goto done;
                        }
                        matched = false;
                        break;
                    }
                    if (text[probe++] != spec[specIndex + prefixLen + k])
                    {
                        matched = false;
                        break;
                    }
                }
            }

            // Phase 4: full match, so substitute the decoded code point
            if (matched)
            {
                string replacement = UTF16.ValueOf(value);
                text.Replace(scanPos, probe, replacement);
                scanLimit -= probe - scanPos - replacement.Length;
                // Stop trying further forms; continue with the next input character.
                break;
            }

            // Skip this form's prefix and suffix characters to reach the next header.
            specIndex += prefixLen + suffixLen;
        }

        if (scanPos < scanLimit)
        {
            scanPos += UTF16.GetCharCount(text.Char32At(scanPos));
        }
    }

done:
    pos.ContextLimit += scanLimit - pos.Limit;
    pos.Limit = scanLimit;
    pos.Start = scanPos;
}
/// <summary>
/// Builds the source cache for <paramref name="transform"/> by probing every code
/// point from U+0000 through U+10FFFF.
/// </summary>
/// <remarks>
/// A code point is added to <c>sourceCache</c> when transforming it changes it.
/// When a normalizer is supplied, the cache is seeded with <c>[:^ccc=0:]</c>, code
/// points that are not inert under the normalizer are added as well, and every
/// <c>NFC</c> decomposition that the transform changes is recorded in
/// <c>sourceStrings</c>. The cache is frozen at the end.
/// </remarks>
/// <param name="transform">The transform whose source set is being computed.</param>
/// <param name="normalizer">
/// Optional normalizer; when <c>null</c>, only the direct per-code-point check is done.
/// </param>
public SourceTargetUtility(ITransform<string, string> transform, Normalizer2 normalizer)
{
    this.transform = transform;

    bool useNormalizer = normalizer != null;

    // NOTE: the upstream code carries a disabled (commented-out) block at this
    // point that would lazily build a TRAILING_COMBINING set from the NFC
    // decompositions. It is not executed.
    sourceCache = useNormalizer ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
    sourceStrings = new HashSet<string>();

    for (int cp = 0; cp <= 0x10FFFF; ++cp)
    {
        // Does the transform change this code point on its own?
        string mapped = transform.Transform(UTF16.ValueOf(cp));
        bool changed = !CharSequences.Equals(cp, mapped);
        if (changed)
        {
            sourceCache.Add(cp);
        }

        if (!useNormalizer)
        {
            continue;
        }

        string decomposition = NFC.GetDecomposition(cp);
        if (decomposition == null)
        {
            continue;
        }

        // Record decompositions that the transform alters.
        mapped = transform.Transform(decomposition);
        if (!decomposition.Equals(mapped))
        {
            sourceStrings.Add(decomposition);
        }

        // Code points not already cached are added when the normalizer
        // does not treat them as inert.
        if (!changed && !normalizer.IsInert(cp))
        {
            sourceCache.Add(cp);
        }

        // NOTE: the upstream code carries two more disabled (commented-out)
        // experiments here: checking whether any trailing combining mark
        // changes the transformed result, and adding all initial substrings
        // of the decomposition to sourceStrings. Neither is executed.
    }

    sourceCache.Freeze();
}
/// <summary>
/// Gets the 32-bit code point located at a 16-bit offset into the text.
/// The text is assumed to be stored as 16-bit code units with surrogate
/// pairs intermixed. If <paramref name="offset"/> addresses either the
/// leading or the trailing code unit of a surrogate pair, the code point
/// of the whole pair is returned.
/// </summary>
/// <param name="offset">An integer between 0 and <see cref="Length"/>-1 inclusive.</param>
/// <returns>The 32-bit code point of the text at the given offset.</returns>
/// <stable>ICU 2.0</stable>
public virtual int Char32At(int offset)
{
    // Decoding is delegated to the UTF-16 helper over the backing buffer.
    int codePoint = UTF16.CharAt(buf, offset);
    return codePoint;
}