/// <summary>
/// Walks the byte trie built over <c>characters</c> along the code points read from
/// <paramref name="text_"/>, recording every prefix for which the trie holds a value.
/// </summary>
/// <param name="text_">The text to read code points from, starting at its current position.</param>
/// <param name="maxLength">Upper bound on the number of code points to examine.</param>
/// <param name="lengths">Receives, for each recorded match, the number of code points consumed so far.</param>
/// <param name="count_">Element 0 receives the number of matches recorded.</param>
/// <param name="limit">Maximum number of matches to record in <paramref name="lengths"/> and <paramref name="values"/>.</param>
/// <param name="values">If not <c>null</c>, receives the trie value of each recorded match.</param>
/// <returns>The number of code points read from the text, or 0 if the text was already exhausted.</returns>
public override int Matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values)
{
    UCharacterIterator text = UCharacterIterator.GetInstance(text_);
    BytesTrie trie = new BytesTrie(characters, 0);

    int codePoint = text.NextCodePoint();
    if (codePoint == UCharacterIterator.DONE)
    {
        // Nothing to match; note that count_ is left untouched on this path.
        return 0;
    }

    Result step = trie.First(Transform(codePoint));

    // TODO: should numChars count Character.charCount() ?
    // As written, each code point counts as one regardless of its UTF-16 length.
    int numChars = 1;
    int found = 0;

    while (true)
    {
        if (step.HasValue())
        {
            // Record the match only while there is room; further matches are dropped.
            if (found < limit)
            {
                if (values != null)
                {
                    values[found] = trie.GetValue();
                }
                lengths[found] = numChars;
                found++;
            }

            // A final value means the trie has no continuation from here.
            if (step == Result.FinalValue)
            {
                break;
            }
        }
        else if (step == Result.NoMatch)
        {
            break;
        }

        if (numChars >= maxLength)
        {
            break;
        }

        codePoint = text.NextCodePoint();
        if (codePoint == UCharacterIterator.DONE)
        {
            break;
        }

        ++numChars;
        step = trie.Next(Transform(codePoint));
    }

    count_[0] = found;
    return numChars;
}
/// <summary>
/// Set a new source string iterator for iteration, and reset the
/// offset to the beginning of the text.
/// <para/>
/// The source iterator's integrity will be preserved since a new copy
/// will be created for use.
/// </summary>
/// <param name="source">The new source string iterator for iteration.</param>
/// <stable>ICU 2.8</stable>
public void SetText(UCharacterIterator source)
{
    string_ = source.GetText(); // TODO: do we need to remember the source string in a field?

    // Note: In C++, we just setText(source.getText()).
    // In Java, we actually operate on a character iterator.
    // (The old code apparently did so only for a CharacterIterator;
    // for a UCharacterIterator it also just used source.getText()).
    //
    // The Java original wrapped the clone below in a try/catch for
    // CloneNotSupportedException and fell back to the ICU 52 behavior of
    // iterating over the text contents; no such fallback is used here.
    // TODO: do we need to remember the cloned iterator in a field?
    UCharacterIterator copy = (UCharacterIterator)source.Clone();
    copy.SetToStart();

    bool numeric = rbc_.settings.ReadOnly.IsNumeric;

    // Pick the FCD-checking iterator unless the collator settings opt out of it.
    CollationIterator freshIter;
    if (!rbc_.settings.ReadOnly.DontCheckFCD)
    {
        freshIter = new FCDIterCollationIterator(rbc_.data, numeric, copy, 0);
    }
    else
    {
        freshIter = new IterCollationIterator(rbc_.data, numeric, copy);
    }

    // Install the new iterator and reset the iteration state.
    iter_ = freshIter;
    otherHalf_ = 0;
    dir_ = 0;
}
/// <summary>
/// Prepare the input string for use in applications with the given profile. This operation maps,
/// normalizes (NFKC), and checks for prohibited and BiDi characters in the order defined by RFC 3454,
/// depending on the options specified in the profile.
/// </summary>
/// <param name="src">A string.</param>
/// <param name="options">A bit set of options:
/// <list type="bullet">
///     <item><term><see cref="StringPrepOptions.Default"/></term><description>Prohibit processing of unassigned code points in the input.</description></item>
///     <item><term><see cref="StringPrepOptions.AllowUnassigned"/></term><description>Treat the unassigned code points in the input as normal Unicode code points.</description></item>
/// </list>
/// </param>
/// <returns>A string containing the output.</returns>
/// <exception cref="StringPrepParseException">An exception occurs when parsing a string is invalid.</exception>
/// <stable>ICU 4.2</stable>
public string Prepare(string src, StringPrepOptions options)
{
    // Delegate to the iterator-based overload and flatten its buffer to a string.
    return Prepare(UCharacterIterator.GetInstance(src), options).ToString();
}
/*
 * boolean isLabelSeparator(int ch){
 *     int result = getCodePointValue(ch);
 *     if( (result & 0x07) == LABEL_SEPARATOR){
 *         return true;
 *     }
 *     return false;
 * }
 */

/*
 * 1) Map -- For each character in the input, check if it has a mapping
 *    and, if so, replace it with its mapping.
 *
 * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
 *    normalization.
 *
 * 3) Prohibit -- Check for any characters that are not allowed in the
 *    output. If any are found, return an error.
 *
 * 4) Check bidi -- Possibly check for right-to-left characters, and if
 *    any are found, make sure that the whole string satisfies the
 *    requirements for bidirectional strings. If the string does not
 *    satisfy the requirements for bidirectional strings, return an
 *    error.
 *    [Unicode3.2] defines several bidirectional categories; each character
 *    has one bidirectional category assigned to it. For the purposes of
 *    the requirements below, an "RandALCat character" is a character that
 *    has Unicode bidirectional categories "R" or "AL"; an "LCat character"
 *    is a character that has Unicode bidirectional category "L". Note
 *    that there are many characters which fall in neither of the above
 *    definitions; Latin digits (<U+0030> through <U+0039>) are examples of
 *    this because they have bidirectional category "EN".
 *
 *    In any profile that specifies bidirectional character handling, all
 *    three of the following requirements MUST be met:
 *
 *    1) The characters in section 5.8 MUST be prohibited.
 *
 *    2) If a string contains any RandALCat character, the string MUST NOT
 *       contain any LCat character.
 *
 *    3) If a string contains any RandALCat character, a RandALCat
 *       character MUST be the first character of the string, and a
 *       RandALCat character MUST be the last character of the string.
 */

/// <summary>
/// Prepare the input buffer for use in applications with the given profile. This operation maps,
/// normalizes (NFKC), and checks for prohibited and BiDi characters in the order defined by RFC 3454,
/// depending on the options specified in the profile.
/// </summary>
/// <param name="src">A <see cref="UCharacterIterator"/> object containing the source string.</param>
/// <param name="options">A bit set of options:
/// <list type="bullet">
///     <item><term><see cref="StringPrepOptions.Default"/></term><description>Prohibit processing of unassigned code points in the input.</description></item>
///     <item><term><see cref="StringPrepOptions.AllowUnassigned"/></term><description>Treat the unassigned code points in the input as normal Unicode code points.</description></item>
/// </list>
/// </param>
/// <returns>A <see cref="StringBuffer"/> containing the output.</returns>
/// <exception cref="StringPrepParseException">An exception occurs when parsing a string is invalid.</exception>
/// <stable>ICU 2.8</stable>
public StringBuffer Prepare(UCharacterIterator src, StringPrepOptions options)
{
    // Step 1: map.
    StringBuffer mapped = Map(src, options);

    // Step 2: normalize, but only when the profile requests NFKC.
    StringBuffer normOut = doNFKC ? Normalize(mapped) : mapped;

    UCharacterIterator iter = UCharacterIterator.GetInstance(normOut);
    Values val = new Values();

    // CharDirectionCount doubles as the "no direction seen yet" sentinel; it is
    // marked obsolete, so capture it once under a single warning suppression.
#pragma warning disable 612, 618
    UCharacterDirection unsetDirection = UCharacterDirection.CharDirectionCount;
#pragma warning restore 612, 618

    UCharacterDirection lastDir = unsetDirection;   // direction of the most recent code point
    UCharacterDirection firstDir = unsetDirection;  // direction of the first code point
    int rtlPos = -1;
    int ltrPos = -1;
    bool sawRtl = false;
    bool sawLtr = false;

    // Steps 3 and 4 (data gathering): scan the mapped/normalized text once.
    int ch;
    while ((ch = iter.NextCodePoint()) != UCharacterIterator.DONE)
    {
        char trieWord = GetCodePointValue(ch);
        GetValues(trieWord, val);

        if (val.type == PROHIBITED)
        {
            throw new StringPrepParseException("A prohibited code point was found in the input",
                                     StringPrepErrorType.ProhibitedError, iter.GetText(), val.value);
        }

        if (!checkBiDi)
        {
            continue;
        }

        lastDir = (UCharacterDirection)bdp.GetClass(ch);
        if (firstDir == unsetDirection)
        {
            firstDir = lastDir;
        }

        if (lastDir == UCharacterDirection.LeftToRight)
        {
            sawLtr = true;
            ltrPos = iter.Index - 1;
        }

        if (lastDir == UCharacterDirection.RightToLeft || lastDir == UCharacterDirection.RightToLeftArabic)
        {
            sawRtl = true;
            rtlPos = iter.Index - 1;
        }
    }

    if (checkBiDi)
    {
        // Requirement 2: RandALCat and LCat characters must not be mixed.
        bool mixesDirections = sawLtr && sawRtl;

        // Requirement 3: if any RandALCat character is present, both the first
        // and the last character must be RandALCat.
        bool startsRtl = firstDir == UCharacterDirection.RightToLeft
            || firstDir == UCharacterDirection.RightToLeftArabic;
        bool endsRtl = lastDir == UCharacterDirection.RightToLeft
            || lastDir == UCharacterDirection.RightToLeftArabic;
        bool rtlNotAtBothEnds = sawRtl && !(startsRtl && endsRtl);

        if (mixesDirections || rtlNotAtBothEnds)
        {
            throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
                                      StringPrepErrorType.CheckBiDiError, iter.GetText(),
                                      (rtlPos > ltrPos) ? rtlPos : ltrPos);
        }
    }

    return normOut;
}
/// <summary>
/// Applies the profile's mapping step to every code point read from <paramref name="iter"/>:
/// code points are replaced by their mapping, deleted, or copied through unchanged.
/// </summary>
/// <param name="iter">Iterator supplying the source code points.</param>
/// <param name="options">Options; <see cref="StringPrepOptions.AllowUnassigned"/> permits unassigned code points.</param>
/// <returns>A new <see cref="StringBuffer"/> holding the mapped text.</returns>
/// <exception cref="StringPrepParseException">
/// An unassigned code point was found and <see cref="StringPrepOptions.AllowUnassigned"/> is not set.
/// </exception>
private StringBuffer Map(UCharacterIterator iter, StringPrepOptions options)
{
    Values val = new Values();
    StringBuffer dest = new StringBuffer();
    bool allowUnassigned = (options & StringPrepOptions.AllowUnassigned) > 0;

    int ch;
    while ((ch = iter.NextCodePoint()) != UCharacterIterator.DONE)
    {
        char trieWord = GetCodePointValue(ch);
        GetValues(trieWord, val);

        // Reject unassigned code points unless the caller opted in.
        if (val.type == UNASSIGNED && !allowUnassigned)
        {
            throw new StringPrepParseException("An unassigned code point was found in the input",
                                     StringPrepErrorType.UnassignedError, iter.GetText(), iter.Index);
        }

        if (val.type == DELETE)
        {
            // Just consume the code point and continue.
            continue;
        }

        if (val.type == MAP)
        {
            if (val.isIndex)
            {
                // The value is an offset into mappingData. The mapping length is
                // implied by which index range the offset falls in; outside those
                // ranges the length is stored in front of the mapping itself.
                int index = val.value;
                int length;
                if (index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                    index < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                {
                    length = 1;
                }
                else if (index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                         index < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                {
                    length = 2;
                }
                else if (index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                         index < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                {
                    length = 3;
                }
                else
                {
                    length = mappingData[index++];
                }

                /* copy mapping to destination */
                dest.Append(mappingData, index, length);
                continue;
            }

            // Otherwise the value is a delta to subtract from the code point.
            ch -= val.value;
        }

        // Copy the (possibly adjusted) source code point into the destination.
        UTF16.Append(dest, ch);
    }

    return dest;
}
/// <summary>
/// <see cref="CollationElementIterator"/> constructor. This takes a source
/// character iterator and a <see cref="Text.RuleBasedCollator"/>. The iterator will
/// walk through the source string based on the rules defined by
/// the collator. If the source string is empty, <see cref="NullOrder"/> will be
/// returned on the first call to <see cref="Next()"/>.
/// </summary>
/// <param name="source">The source string iterator.</param>
/// <param name="collator">The <see cref="Text.RuleBasedCollator"/>.</param>
/// <stable>ICU 2.8</stable>
internal CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator)
    : this(collator)
{
    // The collator-only constructor runs first; then attach the text to iterate.
    SetText(source);
}