/// <summary>
/// Decodes a 16-bit trie word into the supplied <paramref name="values"/> holder.
/// </summary>
/// <remarks>
/// The holder is reset first and then filled in as follows:
/// a zero word yields TYPE_LIMIT (the initial value stored in the mapping
/// table, so that the source code point is copied to the destination);
/// a word at or above TYPE_THRESHOLD yields the type
/// <c>trieWord - TYPE_THRESHOLD</c>; a word whose upper 14 bits equal
/// MAX_INDEX_VALUE yields DELETE; any other word yields MAP, carrying
/// either an index (bit 0x02 set) or a signed delta (bit 0x02 clear).
/// </remarks>
/// <param name="trieWord">The raw word read from the trie.</param>
/// <param name="values">The holder that receives the decoded type, flag and value.</param>
private static void GetValues(char trieWord, StringPrep.Values values)
{
    values.Reset();

    if (trieWord == 0)
    {
        // Initial value stored in the mapping table: report TYPE_LIMIT so
        // that the source code point is copied to the destination.
        values.type = TYPE_LIMIT;
        return;
    }

    if (trieWord >= TYPE_THRESHOLD)
    {
        values.type = (trieWord - TYPE_THRESHOLD);
        return;
    }

    // Everything above the two low bits is the payload.
    int payload = trieWord >> 2;

    if (payload == MAX_INDEX_VALUE)
    {
        // Sentinel payload: the code point is deleted, no value is carried.
        values.type = DELETE;
        values.isIndex = false;
        values.value_ren = 0;
        return;
    }

    values.type = MAP;

    // Bit 0x02 tells whether the payload is an index or a delta.
    if ((trieWord & 0x02) > 0)
    {
        values.isIndex = true;
        values.value_ren = payload;
    }
    else
    {
        values.isIndex = false;
        // Shifting left then arithmetically right by 16 sign-extends the
        // 16-bit word, so the delta keeps its sign when the low bits are dropped.
        values.value_ren = (((int)(trieWord << 16)) >> 16) >> 2;
    }
}
/*
 * boolean isLabelSeparator(int ch){
 *     int result = getCodePointValue(ch);
 *     if( (result & 0x07) == LABEL_SEPARATOR){
 *         return true;
 *     }
 *     return false;
 * }
 */

/*
 * 1) Map -- For each character in the input, check if it has a mapping and,
 * if so, replace it with its mapping.
 *
 * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
 * normalization.
 *
 * 3) Prohibit -- Check for any characters that are not allowed in the
 * output. If any are found, return an error.
 *
 * 4) Check bidi -- Possibly check for right-to-left characters, and if any
 * are found, make sure that the whole string satisfies the requirements for
 * bidirectional strings. If the string does not satisfy the requirements
 * for bidirectional strings, return an error. [Unicode3.2] defines several
 * bidirectional categories; each character has one bidirectional category
 * assigned to it. For the purposes of the requirements below, an
 * "RandALCat character" is a character that has Unicode bidirectional
 * categories "R" or "AL"; an "LCat character" is a character that has
 * Unicode bidirectional category "L". Note that there are many characters
 * which fall in neither of the above definitions; Latin digits (<U+0030>
 * through <U+0039>) are examples of this because they have bidirectional
 * category "EN".
 *
 * In any profile that specifies bidirectional character handling, all three
 * of the following requirements MUST be met:
 *
 * 1) The characters in section 5.8 MUST be prohibited.
 *
 * 2) If a string contains any RandALCat character, the string MUST NOT
 * contain any LCat character.
 *
 * 3) If a string contains any RandALCat character, a RandALCat character
 * MUST be the first character of the string, and a RandALCat character MUST
 * be the last character of the string.
 */

/// <summary>
/// Prepare the input buffer for use in applications with the given profile.
/// This operation maps, normalizes (NFKC), and checks for prohibited and BiDi
/// characters, in the order defined by RFC 3454, depending on the options
/// specified in the profile.
/// </summary>
///
/// <param name="src">A UCharacterIterator object containing the source string</param>
/// <param name="options">A bit set of options:
/// - StringPrep.NONE Prohibit processing of unassigned code points in the input
/// - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points in the input
///   as normal Unicode code points.</param>
/// <returns>A StringBuilder containing the output</returns>
/// <exception cref="StringPrepParseException">
/// Thrown when the mapping step rejects the input, when a prohibited code
/// point is found, or when BiDi checking is enabled and the text violates
/// the BiDi requirements.
/// </exception>
/// @stable ICU 2.8
public StringBuilder Prepare(UCharacterIterator src, int options)
{
    // Step 1: map. Step 2: normalize, only when the profile asks for NFKC.
    StringBuilder mapped = Map(src, options);
    StringBuilder prepared = doNFKC ? Normalize(mapped) : mapped;

    UCharacterIterator scanner = IBM.ICU.Text.UCharacterIterator.GetInstance(prepared);
    StringPrep.Values props = new StringPrep.Values();

    // CHAR_DIRECTION_COUNT doubles as the "not seen yet" marker for both directions.
    int lastDir = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.CHAR_DIRECTION_COUNT;
    int firstDir = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.CHAR_DIRECTION_COUNT;
    int lastLtrIndex = -1;
    int lastRtlIndex = -1;
    bool sawLtr = false;
    bool sawRtl = false;

    int codePoint;
    while ((codePoint = scanner.NextCodePoint()) != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE)
    {
        GetValues(GetCodePointValue(codePoint), props);

        // Step 3: prohibit.
        if (val_IsProhibited(props.type))
        {
            // NOTE(review): the reported error position is the decoded trie
            // value rather than an iterator index -- confirm this is intended.
            throw new StringPrepParseException(
                "A prohibited code point was found in the input",
                IBM.ICU.Text.StringPrepParseException.PROHIBITED_ERROR,
                scanner.GetText(),
                props.value_ren);
        }

        if (!checkBiDi)
        {
            continue;
        }

        // Step 4 (collection phase): remember the first and the latest
        // direction, and where the latest L and R/AL characters were seen.
        lastDir = bdp.GetClass(codePoint);
        if (firstDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.CHAR_DIRECTION_COUNT)
        {
            firstDir = lastDir;
        }

        if (lastDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.LEFT_TO_RIGHT)
        {
            sawLtr = true;
            lastLtrIndex = scanner.GetIndex() - 1;
        }

        bool isRandAL =
            lastDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT
            || lastDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC;
        if (isRandAL)
        {
            sawRtl = true;
            lastRtlIndex = scanner.GetIndex() - 1;
        }
    }

    // Step 4 (verdict): both BiDi requirements only apply once an R/AL
    // character has been seen.
    if (checkBiDi && sawRtl)
    {
        bool startsRandAL =
            firstDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT
            || firstDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC;
        bool endsRandAL =
            lastDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT
            || lastDir == IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC;

        // Requirement 2: no L character may accompany an R/AL character.
        // Requirement 3: the text must both start and end with an R/AL character.
        if (sawLtr || !startsRandAL || !endsRandAL)
        {
            throw new StringPrepParseException(
                "The input does not conform to the rules for BiDi code points.",
                IBM.ICU.Text.StringPrepParseException.CHECK_BIDI_ERROR,
                scanner.GetText(),
                (lastRtlIndex > lastLtrIndex) ? lastRtlIndex : lastLtrIndex);
        }
    }

    return prepared;
}

// Tells whether a decoded trie type marks a prohibited code point.
private static bool val_IsProhibited(int decodedType)
{
    return decodedType == PROHIBITED;
}
/// <summary>
/// Applies the profile's mapping table to every code point read from
/// <paramref name="iter"/> and returns the mapped text.
/// </summary>
/// <remarks>
/// For each code point the decoded trie entry decides the outcome:
/// an indexed MAP entry appends a run of characters taken from the mapping
/// data, a delta MAP entry appends the code point minus the delta, a DELETE
/// entry appends nothing, and any other entry appends the code point unchanged.
/// </remarks>
/// <param name="iter">Iterator over the source text.</param>
/// <param name="options">Bit set of options; only ALLOW_UNASSIGNED is examined here.</param>
/// <returns>A new StringBuilder holding the mapped text.</returns>
/// <exception cref="StringPrepParseException">
/// Thrown when an unassigned code point is found and ALLOW_UNASSIGNED is not set.
/// </exception>
private StringBuilder Map(UCharacterIterator iter, int options)
{
    StringPrep.Values props = new StringPrep.Values();
    StringBuilder output = new StringBuilder();
    bool allowUnassigned = (options & ALLOW_UNASSIGNED) > 0;

    int codePoint;
    while ((codePoint = iter.NextCodePoint()) != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE)
    {
        GetValues(GetCodePointValue(codePoint), props);

        // Unassigned code points are rejected unless the caller opted in.
        if (props.type == UNASSIGNED && !allowUnassigned)
        {
            throw new StringPrepParseException(
                "An unassigned code point was found in the input",
                IBM.ICU.Text.StringPrepParseException.UNASSIGNED_ERROR,
                iter.GetText(),
                iter.GetIndex());
        }

        if (props.type == MAP)
        {
            if (props.isIndex)
            {
                int start = props.value_ren;
                int count;

                // The index range the entry falls in fixes the mapping length.
                if (start >= indexes[ONE_UCHAR_MAPPING_INDEX_START]
                    && start < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                {
                    count = 1;
                }
                else if (start >= indexes[TWO_UCHARS_MAPPING_INDEX_START]
                    && start < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                {
                    count = 2;
                }
                else if (start >= indexes[THREE_UCHARS_MAPPING_INDEX_START]
                    && start < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                {
                    count = 3;
                }
                else
                {
                    // Outside those ranges the length is stored in the data
                    // itself, immediately before the mapped characters.
                    count = mappingData[start];
                    start++;
                }

                // Copy the mapping to the destination; the source code point is dropped.
                output.Append(mappingData, start, count);
                continue;
            }

            // Delta mapping: the result is the source code point minus the delta.
            codePoint -= props.value_ren;
        }
        else if (props.type == DELETE)
        {
            // Just consume the code point and continue.
            continue;
        }

        // Copy the (possibly delta-adjusted) code point into the destination.
        IBM.ICU.Text.UTF16.Append(output, codePoint);
    }

    return output;
}