/// <summary>
/// Implements <see cref="IUnicodeMatcher"/>: reports whether this matcher is
/// compatible with the given index value.
/// </summary>
/// <param name="v">The index value to test.</param>
/// <returns>
/// <c>true</c> if the pattern is empty; otherwise the answer of the nested matcher
/// registered for the pattern's first code point, or, when there is no such matcher,
/// whether the low byte of that code point equals <paramref name="v"/>.
/// </returns>
public virtual bool MatchesIndexValue(int v)
{
    // An empty pattern places no constraint on the index value.
    if (pattern.Length == 0)
    {
        return true;
    }

    int first = UTF16.CharAt(pattern, 0);
    IUnicodeMatcher nested = data.LookupMatcher(first);
    if (nested != null)
    {
        // The first pattern element stands for a matcher; let it decide.
        return nested.MatchesIndexValue(v);
    }

    // Plain literal: compare its low byte with the requested value.
    return (first & 0xFF) == v;
}
/// <summary>
/// Internal method. Returns the 8-bit index value for this rule.
/// This is the low byte of the first character of the key, unless that
/// character stands for a matcher (such as a set) or the rule can otherwise
/// match multiple keys, in which case the index value is -1.
/// </summary>
/// <returns>The low byte of the first key code point, or -1.</returns>
internal int GetIndexValue()
{
    // A pattern with just ante context {such as foo)>bar} can match any key.
    if (anteContextLength == pattern.Length)
    {
        return -1;
    }

    int keyStart = UTF16.CharAt(pattern, anteContextLength);
    if (data.LookupMatcher(keyStart) != null)
    {
        // The key starts with a matcher, so no single byte identifies it.
        return -1;
    }

    return keyStart & 0xFF;
}
// TODO: Problem: the rule is [{ab}]c > x
//       The filter is [a{bc}].
//       If the input is abc, then the rule will work.
//       However, the code below that applies the filter won't catch that case.
/// <summary>
/// Find the source and target sets, subject to the input filter.
/// There is a known issue with filters containing multiple characters.
/// </summary>
/// <param name="filter">Filter that must match at least one character at every key position.</param>
/// <param name="sourceSet">Receives the characters collected from the key when the filter passes.</param>
/// <param name="targetSet">Receives the replacement characters of the output when the filter passes.</param>
/// <param name="revisiting">Not used by this implementation.</param>
internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
{
    int limit = anteContextLength + keyLength;
    UnicodeSet tempSource = new UnicodeSet();
    UnicodeSet temp = new UnicodeSet();

    // Walk the pattern from anteContextLength up to limit. Only if the filter
    // matches at least some character at EVERY position do we add the collected
    // characters to sourceSet; the first position that fails aborts the whole rule.
    for (int i = anteContextLength; i < limit;)
    {
        int ch = UTF16.CharAt(pattern, i);
        i += UTF16.GetCharCount(ch);

        IUnicodeMatcher matcher = data.LookupMatcher(ch);
        if (matcher == null)
        {
            // Literal code point.
            if (!filter.Contains(ch))
            {
                return;
            }
            tempSource.Add(ch);
        }
        else if (matcher is UnicodeSet matcherSet)
        {
            // The matcher is itself a set: test it against the filter directly.
            // A type test is used instead of cast-and-catch so that no exception
            // is raised on the ordinary "not a UnicodeSet" path.
            if (!filter.ContainsSome(matcherSet))
            {
                return;
            }
            matcher.AddMatchSetTo(tempSource);
        }
        else
        {
            // Any other matcher: materialize its match set first, then test it.
            temp.Clear();
            matcher.AddMatchSetTo(temp);
            if (!filter.ContainsSome(temp))
            {
                return;
            }
            tempSource.AddAll(temp);
        }
    }

    // If we made our way through the gauntlet, add to source/target.
    sourceSet.AddAll(tempSource);
    output.AddReplacementSetTo(targetSet);
}
/// <summary>
/// Return the 32-bit code point at the given 16-bit offset into
/// the text. This assumes the text is stored as 16-bit code units
/// with surrogate pairs intermixed. If the offset of a leading or
/// trailing code unit of a surrogate pair is given, return the
/// code point of the surrogate pair.
/// <para/>
/// Usage Note: If you are making external changes to a <see cref="StringBuffer"/>
/// that is passed into the <see cref="ReplaceableString"/> constructor,
/// it is recommended to call <see cref="ReplaceableString.ToString()"/> if
/// the contents of the <see cref="StringBuffer"/> changed but the length
/// did not change before calling this method. Because the buffer's indexer
/// is slow in .NET, the contents are cached as a string so that consecutive
/// calls to this method are not expensive.
/// <see cref="ReplaceableString.ToString()"/> forces a reload of the cache.
/// </summary>
/// <param name="offset">An integer between 0 and <see cref="Length"/>-1 inclusive.</param>
/// <returns>32-bit code point of text at given offset.</returns>
/// <stable>ICU 2.0</stable>
public virtual int Char32At(int offset)
{
    // ICU4N: the cached string is rebuilt whenever a change is detected.
    // GetHashCode() is not a 100% reliable change detector, but it is more
    // reliable than Length alone. Length is cheaper, so it is checked first.
    string text = realized;
    bool cacheIsStale = text is null
        || changed
        || previousLength != buf.Length
        || previousHashCode != buf.GetHashCode();

    if (cacheIsStale)
    {
        text = RealizeString();
    }

    return UTF16.CharAt(text, offset);
}
/// <summary>
/// Adds to <paramref name="output"/> the strings obtained by permuting the
/// code points of <paramref name="source"/>.
/// </summary>
/// <param name="source">The string whose code points are permuted.</param>
/// <param name="skipZeros">
/// When <c>true</c>, a code point that is not at index 0 and whose canonical
/// combining class is zero is never chosen as the leading character.
/// </param>
/// <param name="output">The set that receives the permutations.</param>
public static void Permute(string source, bool skipZeros, ISet<string> output)
{
    // TODO: optimize

    // Shortcut: a string of zero or one code point is its own only permutation.
    // The Length test comes first so code points are not counted every time.
    if (source.Length <= 2 && UTF16.CountCodePoint(source) <= 1)
    {
        output.Add(source);
        return;
    }

    // Otherwise pick each code point in turn as the head and recursively
    // permute all the remaining characters.
    ISet<string> tails = new HashSet<string>();
    int index = 0;
    while (index < source.Length)
    {
        int cp = UTF16.CharAt(source, index);
        int next = index + UTF16.GetCharCount(cp);

        // Optimization: a non-initial character of combining class zero is not permuted.
        bool keepInPlace = skipZeros && index != 0 && UCharacter.GetCombiningClass(cp) == 0;
        if (!keepInPlace)
        {
            // Permutations of everything before and after this code point.
            tails.Clear();
            string others = source.Substring(0, index) + source.Substring(next);
            Permute(others, skipZeros, tails);

            // Prefix this code point to each of them.
            string head = UTF16.ValueOf(source, index);
            foreach (string tail in tails)
            {
                output.Add(head + tail);
            }
        }

        index = next;
    }
}
/// <summary>
/// Unions into the given set every character that this object may output.
/// </summary>
/// <param name="toUnionTo">The set into which to union the output characters.</param>
public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
{
    int index = 0;
    while (index < output.Length)
    {
        int cp = UTF16.CharAt(output, index);
        IUnicodeReplacer nested = data.LookupReplacer(cp);
        if (nested != null)
        {
            // The code point stands for a nested replacer; it contributes its own set.
            nested.AddReplacementSetTo(toUnionTo);
        }
        else
        {
            // Plain literal output character.
            toUnionTo.Add(cp);
        }

        index += UTF16.GetCharCount(cp);
    }
}
/// <summary>
/// Implementation of the <see cref="IUnicodeMatcher"/> API. Unions into the given
/// set every character that may be matched by this object.
/// </summary>
/// <param name="toUnionTo">The set into which to union the source characters.</param>
public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
{
    int index = 0;
    while (index < pattern.Length)
    {
        int cp = UTF16.CharAt(pattern, index);
        IUnicodeMatcher nested = data.LookupMatcher(cp);
        if (nested != null)
        {
            // The code point stands for a nested matcher; it contributes its own set.
            nested.AddMatchSetTo(toUnionTo);
        }
        else
        {
            // Plain literal pattern character.
            toUnionTo.Add(cp);
        }

        index += UTF16.GetCharCount(cp);
    }
}
/// <summary>
/// RBBISymbolTable::parseReference. This function from the abstract symbol table
/// interface looks for a $variable name in the source text. It does not look the
/// name up, it only scans for it. It is used by the UnicodeSet parser.
/// </summary>
/// <param name="text">The text to scan.</param>
/// <param name="pos">
/// On entry, the index at which scanning starts. On success, advanced to the
/// index just past the name; left unchanged on failure.
/// </param>
/// <param name="limit">Scanning stops once the current index reaches this value.</param>
/// <returns>The scanned name, or the empty string if no valid name characters were found.</returns>
public virtual string ParseReference(string text, ParsePosition pos, int limit)
{
    int start = pos.Index;
    int end = start;

    while (end < limit)
    {
        int c = UTF16.CharAt(text, end);

        // The first character must be an identifier start; every character
        // (including the first) must be an identifier part.
        bool isNameChar = (end != start || UChar.IsUnicodeIdentifierStart(c))
            && UChar.IsUnicodeIdentifierPart(c);
        if (!isNameChar)
        {
            break;
        }

        end += UTF16.GetCharCount(c);
    }

    if (end == start)
    {
        // No valid name chars: indicate failure with empty string.
        return "";
    }

    pos.Index = end;
    return text.Substring(start, end - start); // ICU4N: Corrected 2nd parameter
}
//=    public static UnicodeReplacer valueOf(String output,
//=                                          int cursorPos,
//=                                          RuleBasedTransliterator.Data data) {
//=        if (output.length() == 1) {
//=            char c = output.charAt(0);
//=            UnicodeReplacer r = data.lookupReplacer(c);
//=            if (r != null) {
//=                return r;
//=            }
//=        }
//=        return new StringReplacer(output, cursorPos, data);
//=    }

/// <summary>
/// <see cref="IUnicodeReplacer"/> API. Replaces the range
/// [<paramref name="start"/>, <paramref name="limit"/>) of <paramref name="text"/>
/// with the output of this replacer and, if this replacer has a cursor,
/// stores the new cursor position in <paramref name="cursor"/>.
/// </summary>
/// <remarks>
/// Two paths exist. When <c>isComplex</c> is false the output string is inserted
/// directly. Otherwise the output is built in a temporary area appended to the end
/// of <paramref name="text"/>, delegating to nested replacers, and then copied
/// over the key. The complex path recomputes <c>isComplex</c>: it is reset to
/// false and set back to true only if a nested replacer is actually found.
/// </remarks>
/// <param name="text">The text to modify in place.</param>
/// <param name="start">Start offset of the range to replace.</param>
/// <param name="limit">Limit offset of the range to replace.</param>
/// <param name="cursor">
/// Array whose element 0 receives the new cursor position when <c>hasCursor</c>
/// is true; it is also passed through to nested replacers.
/// </param>
/// <returns>The length, in code units, of the text that replaced the range.</returns>
public virtual int Replace(IReplaceable text, int start, int limit, int[] cursor)
{
    int outLen;
    int newStart = 0;

    // NOTE: It should be possible to _always_ run the complex
    // processing code; just slower. If not, then there is a bug
    // in the complex processing code.

    // Simple (no nested replacers) Processing Code :
    if (!isComplex)
    {
        text.Replace(start, limit, output);
        outLen = output.Length;

        // Setup default cursor position (for cursorPos within output)
        newStart = cursorPos;
    }

    // Complex (nested replacers) Processing Code :
    else
    {
        /* When there are segments to be copied, use the Replaceable.copy()
         * API in order to retain out-of-band data. Copy everything to the
         * end of the string, then copy them back over the key. This preserves
         * the integrity of indices into the key and surrounding context while
         * generating the output text.
         */
        StringBuffer buf = new StringBuffer();
        int oOutput; // offset into 'output'
        isComplex = false;

        // The temporary buffer starts at tempStart, and extends
        // to destLimit + tempExtra. The start of the buffer has a single
        // character from before the key. This provides style
        // data when additional characters are filled into the
        // temporary buffer. If there is nothing to the left, use
        // the non-character U+FFFF, which Replaceable subclasses
        // should treat specially as a "no-style character."
        // destStart points to the point after the style context
        // character, so it is tempStart+1 or tempStart+2.
        int tempStart = text.Length; // start of temp buffer
        int destStart = tempStart; // copy new text to here
        if (start > 0)
        {
            // Copy the code point preceding the key (1 or 2 code units) as left context.
            int len = UTF16.GetCharCount(text.Char32At(start - 1));
            text.Copy(start - len, start, tempStart);
            destStart += len;
        }
        else
        {
            text.Replace(tempStart, tempStart, "\uFFFF");
            destStart++;
        }
        int destLimit = destStart;
        int tempExtra = 0; // temp chars after destLimit

        for (oOutput = 0; oOutput < output.Length;)
        {
            if (oOutput == cursorPos)
            {
                // Record the position of the cursor
                newStart = buf.Length + destLimit - destStart; // relative to start
                // the buf.length() was inserted for bug 5789
                // the problem is that if we are accumulating into a buffer (when r == null below)
                // then the actual length of the text at that point needs to add the buf length.
                // there was an alternative suggested in #5789, but that looks like it won't work
                // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
            }
            int c = UTF16.CharAt(output, oOutput);

            // When we are at the last position copy the right style
            // context character into the temporary buffer. We don't
            // do this before because it will provide an incorrect
            // right context for previous replace() operations.
            int nextIndex = oOutput + UTF16.GetCharCount(c);
            if (nextIndex == output.Length)
            {
                tempExtra = UTF16.GetCharCount(text.Char32At(limit));
                text.Copy(limit, limit + tempExtra, destLimit);
            }

            IUnicodeReplacer r = data.LookupReplacer(c);
            if (r == null)
            {
                // Accumulate straight (non-segment) text.
                UTF16.Append(buf, c);
            }
            else
            {
                isComplex = true;

                // Insert any accumulated straight text.
                if (buf.Length > 0)
                {
                    text.Replace(destLimit, destLimit, buf.ToString());
                    destLimit += buf.Length;
                    buf.Length = 0;
                }

                // Delegate output generation to replacer object
                int len = r.Replace(text, destLimit, destLimit, cursor);
                destLimit += len;
            }
            oOutput = nextIndex;
        }
        // Insert any accumulated straight text.
        if (buf.Length > 0)
        {
            text.Replace(destLimit, destLimit, buf.ToString());
            destLimit += buf.Length;
        }
        if (oOutput == cursorPos)
        {
            // Record the position of the cursor
            // (cursor sits exactly at the end of the output string)
            newStart = destLimit - destStart; // relative to start
        }

        outLen = destLimit - destStart;

        // Copy new text to start, and delete it
        // (the temp area has shifted right by outLen because of the copy)
        text.Copy(destStart, destLimit, start);
        text.Replace(tempStart + outLen, destLimit + tempExtra + outLen, "");

        // Delete the old text (the key)
        text.Replace(start + outLen, limit + outLen, "");
    }

    if (hasCursor)
    {
        // Adjust the cursor for positions outside the key. These
        // refer to code points rather than code units. If cursorPos
        // is within the output string, then use newStart, which has
        // already been set above.
        if (cursorPos < 0)
        {
            newStart = start;
            int n = cursorPos;
            // Outside the output string, cursorPos counts code points
            while (n < 0 && newStart > 0)
            {
                newStart -= UTF16.GetCharCount(text.Char32At(newStart - 1));
                ++n;
            }
            // Any count left over after reaching the start of the text is applied as-is.
            newStart += n;
        }
        else if (cursorPos > output.Length)
        {
            newStart = start + outLen;
            int n = cursorPos - output.Length;
            // Outside the output string, cursorPos counts code points
            while (n > 0 && newStart < text.Length)
            {
                newStart += UTF16.GetCharCount(text.Char32At(newStart));
                --n;
            }
            // Any count left over after reaching the end of the text is applied as-is.
            newStart += n;
        }
        else
        {
            // Cursor is within output string. It has been set up above
            // to be relative to start.
            newStart += start;
        }

        cursor[0] = newStart;
    }

    return(outLen);
}
/// <summary>
/// Inserts <c>insertion</c> into <paramref name="text"/> at each boundary reported
/// by the break iterator that lies before <c>pos.Limit</c>, is not offset 0, and has
/// a letter or mark (per <c>LETTER_OR_MARK_MASK</c>) on both sides. Then updates
/// <paramref name="pos"/> to account for the inserted text.
/// </summary>
/// <param name="text">The text to modify in place.</param>
/// <param name="pos">The position object; its limits and start are updated on return.</param>
/// <param name="incremental">
/// When <c>true</c>, <c>pos.Start</c> is set from the last recorded boundary plus the
/// inserted length; otherwise it is set to the new <c>pos.Limit</c>.
/// </param>
protected override void HandleTransliterate(IReplaceable text, TransliterationPosition pos, bool incremental)
{
    // True when the code point at the given offset has a category bit set in LETTER_OR_MARK_MASK.
    bool IsLetterOrMarkAt(int offset)
    {
        int category = UChar.GetUnicodeCategory(UTF16.CharAt(text, offset)).ToInt32();
        return ((1 << category) & LETTER_OR_MARK_MASK) != 0;
    }

    lock (this)
    {
        boundaryCount = 0;
        GetBreakIterator(); // Lazy-create it if necessary
        bi.SetText(new ReplaceableCharacterIterator(text, pos.Start, pos.Limit, pos.Start));
        // TODO: fix clumsy workaround used below.
        /*
         * char[] tempBuffer = new char[text.length()];
         * text.getChars(0, text.length(), tempBuffer, 0);
         * bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
         */
        // end debugging

        // To make things much easier, the boundaries are stacked first and the
        // insertions are performed afterwards. Generally there won't be too many,
        // since the input is filtered.
        int boundary = bi.First();
        while (boundary != BreakIterator.Done && boundary < pos.Limit)
        {
            // HACK: only keep boundaries that sit between two letters/marks.
            bool insertHere = boundary != 0
                && IsLetterOrMarkAt(boundary - 1)
                && IsLetterOrMarkAt(boundary);

            if (insertHere)
            {
                if (boundaryCount >= boundaries.Length)
                {
                    // Grow the stack to twice its size.
                    int[] grown = new int[boundaries.Length * 2];
                    System.Array.Copy(boundaries, 0, grown, 0, boundaries.Length);
                    boundaries = grown;
                }

                boundaries[boundaryCount++] = boundary;
            }

            boundary = bi.Next();
        }

        int delta = 0;
        int lastBoundary = 0;

        if (boundaryCount != 0)
        {
            // We found something, so adjust.
            delta = boundaryCount * insertion.Length;
            lastBoundary = boundaries[boundaryCount - 1];

            // Insert from the end backwards so earlier offsets stay valid.
            while (boundaryCount > 0)
            {
                boundaryCount--;
                int at = boundaries[boundaryCount];
                text.Replace(at, at, insertion);
            }
        }

        // Now fix up the return values.
        pos.ContextLimit += delta;
        pos.Limit += delta;
        if (incremental)
        {
            pos.Start = lastBoundary + delta;
        }
        else
        {
            pos.Start = pos.Limit;
        }
    }
}
/// <summary>
/// Returns the 32-bit code point found at the given 16-bit offset into the text.
/// The text is assumed to be stored as 16-bit code units with surrogate pairs
/// intermixed. If the offset addresses either the leading or the trailing code
/// unit of a surrogate pair, the code point of the whole pair is returned.
/// </summary>
/// <param name="offset">An integer between 0 and <see cref="Length"/>-1 inclusive.</param>
/// <returns>32-bit code point of text at given offset.</returns>
/// <stable>ICU 2.0</stable>
public virtual int Char32At(int offset) => UTF16.CharAt(buf, offset);
/// <summary>
/// See if the decomposition of <paramref name="comp"/> is found in the segment starting at
/// <paramref name="segmentPos"/> (with canonical rearrangement!).
/// If so, take the remainder, and return the equivalents.
/// </summary>
/// <param name="comp">The code point whose decomposition is searched for.</param>
/// <param name="segment">The segment to search in.</param>
/// <param name="segmentPos">Offset into <paramref name="segment"/> at which the search starts.</param>
/// <param name="buf">Working buffer shared among callees; it is cleared on entry.</param>
/// <returns>
/// <c>null</c> if the decomposition is not fully matched or the result is not canonically
/// equivalent to the segment tail; <c>SET_WITH_NULL_STRING</c> if it matches with no
/// remainder; otherwise the equivalents of the remainder.
/// </returns>
private ISet<string> Extract(int comp, string segment, int segmentPos, StringBuffer buf)
{
    if (PROGRESS)
    {
        Console.Out.WriteLine(" extract: " + Utility.Hex(UTF16.ValueOf(comp))
            + ", " + Utility.Hex(segment.Substring(segmentPos)));
    }

    // A code point without a decomposition stands for itself.
    string decomp = nfcImpl.GetDecomposition(comp) ?? UTF16.ValueOf(comp);

    // See if it matches the start of segment (at segmentPos).
    // 'expected' is the next decomposition code point still to be found;
    // 'decompPos' already points past it.
    int expected = UTF16.CharAt(decomp, 0);
    int decompPos = UTF16.GetCharCount(expected);
    bool matchedAll = false;

    buf.Length = 0; // initialize working buffer, shared among callees

    int index = segmentPos;
    while (index < segment.Length)
    {
        int cp = UTF16.CharAt(segment, index);
        int next = index + UTF16.GetCharCount(cp);

        if (cp != expected)
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" buffer: " + Utility.Hex(UTF16.ValueOf(cp)));
            }

            // Brute force approach: anything that is not the expected
            // decomposition character goes into the remainder.
            // TODO: optimize. Since the combining classes are monotonically
            // increasing after zero (e.g. 0 5 7 9 0 3), only a few cases can work:
            // zero, less, same, greater. If both classes are the same, or the decomp
            // class is less than the segment class, the match could fail early.
            UTF16.Append(buf, cp);
            index = next;
            continue;
        }

        // Equal: eat another code point from the decomposition.
        if (PROGRESS)
        {
            Console.Out.WriteLine(" matches: " + Utility.Hex(UTF16.ValueOf(cp)));
        }

        if (decompPos == decomp.Length)
        {
            // Done, we have all decomposition characters;
            // the rest of the segment belongs to the remainder.
            buf.Append(segment.Substring(next));
            matchedAll = true;
            break;
        }

        expected = UTF16.CharAt(decomp, decompPos);
        decompPos += UTF16.GetCharCount(expected);
        index = next;
    }

    if (!matchedAll)
    {
        return null; // we failed, characters left over
    }

    if (PROGRESS)
    {
        Console.Out.WriteLine("Matches");
    }

    if (buf.Length == 0)
    {
        return SET_WITH_NULL_STRING; // succeed, but no remainder
    }

    string remainder = buf.ToString();

    // Brute force check that the result is canonically equivalent.
    /*
     * String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0);
     * if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
     */
    if (0 != Normalizer.Compare(UTF16.ValueOf(comp) + remainder, segment.Substring(segmentPos), 0))
    {
        return null;
    }

    // Get the remaining combinations.
    return GetEquivalents2(remainder);
}