Beispiel #1
0
        /// <summary>
        /// Decodes a 16-bit trie word into <paramref name="values"/>. The target is
        /// reset first, then its type, index flag and value are filled in according
        /// to the layout of the word.
        /// </summary>
        /// <param name="trieWord">The raw word to decode.</param>
        /// <param name="values">Receives the decoded result; always reset before being populated.</param>
        private static void GetValues(char trieWord, StringPrep.Values values)
        {
            values.Reset();

            // A zero word is the table's initial value: report TYPE_LIMIT so that the
            // caller copies the source code point through unchanged.
            if (trieWord == 0)
            {
                values.type = TYPE_LIMIT;
                return;
            }

            // Words at or above the threshold carry only a type, stored as an offset
            // from the threshold.
            if (trieWord >= TYPE_THRESHOLD)
            {
                values.type = trieWord - TYPE_THRESHOLD;
                return;
            }

            // Everything above the two low bits is the payload.
            int payload = trieWord >> 2;

            // The maximum index value is reserved to mark a deletion; it takes
            // precedence over the index/delta interpretation below.
            if (payload == MAX_INDEX_VALUE)
            {
                values.type      = DELETE;
                values.isIndex   = false;
                values.value_ren = 0;
                return;
            }

            values.type = MAP;

            if ((trieWord & 0x02) != 0)
            {
                // Bit 1 set: the payload is an index.
                values.isIndex   = true;
                values.value_ren = payload;
            }
            else
            {
                // Bit 1 clear: the word is a signed 16-bit delta. Sign-extend it
                // before dropping the two low bits so negative deltas survive.
                values.isIndex   = false;
                values.value_ren = unchecked((short)trieWord) >> 2;
            }
        }
Beispiel #2
0
        /*
         * boolean isLabelSeparator(int ch){ int result = getCodePointValue(ch); if(
         * (result & 0x07) == LABEL_SEPARATOR){ return true; } return false; }
         */
        /*
         * 1) Map -- For each character in the input, check if it has a mapping and,
         * if so, replace it with its mapping.
         *
         * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
         * normalization.
         *
         * 3) Prohibit -- Check for any characters that are not allowed in the
         * output. If any are found, return an error.
         *
         * 4) Check bidi -- Possibly check for right-to-left characters, and if any
         * are found, make sure that the whole string satisfies the requirements for
         * bidirectional strings. If the string does not satisfy the requirements
         * for bidirectional strings, return an error. [Unicode3.2] defines several
         * bidirectional categories; each character has one bidirectional category
         * assigned to it. For the purposes of the requirements below, an
         * "RandALCat character" is a character that has Unicode bidirectional
         * categories "R" or "AL"; an "LCat character" is a character that has
         * Unicode bidirectional category "L". Note that there are many characters
         * which fall in neither of the above definitions; Latin digits (<U+0030>
         * through <U+0039>) are examples of this because they have bidirectional
         * category "EN".
         *
         * In any profile that specifies bidirectional character handling, all three
         * of the following requirements MUST be met:
         *
         * 1) The characters in section 5.8 MUST be prohibited.
         *
         * 2) If a string contains any RandALCat character, the string MUST NOT
         * contain any LCat character.
         *
         * 3) If a string contains any RandALCat character, a RandALCat character
         * MUST be the first character of the string, and a RandALCat character MUST
         * be the last character of the string.
         */
        /// <summary>
        /// Prepares the input for use in applications with this profile. The text is
        /// mapped, normalized when the profile enables it, scanned for prohibited
        /// code points and, when the profile enables BiDi checking, validated against
        /// the bidirectional requirements of RFC 3454.
        /// </summary>
        ///
        /// <param name="src">A UCharacterIterator over the source string.</param>
        /// <param name="options">A bit set of options, passed through to the mapping step: StringPrep.NONE prohibits unassigned code points in the input; StringPrep.ALLOW_UNASSIGNED treats unassigned code points as normal Unicode code points.</param>
        /// <returns>A StringBuilder containing the mapped (and, if enabled, normalized) output.</returns>
        /// <exception cref="StringPrepParseException">A prohibited code point was found, the BiDi rules were violated, or the mapping step rejected the input.</exception>
        /// <remarks>@stable ICU 2.8</remarks>
        public StringBuilder Prepare(UCharacterIterator src, int options)
        {
            // Steps 1 and 2: map, then normalize only when the profile asks for it.
            StringBuilder mapped   = Map(src, options);
            StringBuilder prepared = doNFKC ? Normalize(mapped) : mapped;

            UCharacterIterator scan = IBM.ICU.Text.UCharacterIterator.GetInstance(prepared);
            StringPrep.Values  info = new StringPrep.Values();

            // Direction classes used below; "unset" doubles as the not-yet-seen marker.
            int unsetDir     = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.CHAR_DIRECTION_COUNT;
            int ltrDir       = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.LEFT_TO_RIGHT;
            int rtlDir       = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT;
            int rtlArabicDir = IBM.ICU.Lang.UCharacterEnums.ECharacterDirection.RIGHT_TO_LEFT_ARABIC;

            int  firstDir   = unsetDir;   // direction of the first code point
            int  lastDir    = unsetDir;   // direction of the most recent code point
            int  lastLtrPos = -1;
            int  lastRtlPos = -1;
            bool sawLtr     = false;
            bool sawRtl     = false;

            // Step 3 (prohibit) and data gathering for step 4 (BiDi) in a single pass.
            for (int cp = scan.NextCodePoint();
                 cp != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
                 cp = scan.NextCodePoint())
            {
                GetValues(GetCodePointValue(cp), info);

                if (info.type == PROHIBITED)
                {
                    // NOTE(review): the error position passed here is the decoded
                    // value, not the iterator index -- confirm this is intended.
                    throw new StringPrepParseException(
                              "A prohibited code point was found in the input",
                              IBM.ICU.Text.StringPrepParseException.PROHIBITED_ERROR,
                              scan.GetText(), info.value_ren);
                }

                if (!checkBiDi)
                {
                    continue;
                }

                lastDir = bdp.GetClass(cp);
                if (firstDir == unsetDir)
                {
                    firstDir = lastDir;
                }

                if (lastDir == ltrDir)
                {
                    sawLtr     = true;
                    lastLtrPos = scan.GetIndex() - 1;
                }

                if (lastDir == rtlDir || lastDir == rtlArabicDir)
                {
                    sawRtl     = true;
                    lastRtlPos = scan.GetIndex() - 1;
                }
            }

            if (checkBiDi)
            {
                // Both BiDi errors report the later of the two recorded positions.
                int errorPos = (lastRtlPos > lastLtrPos) ? lastRtlPos : lastLtrPos;

                // Requirement 2: RandALCat and LCat characters must not be mixed.
                if (sawLtr && sawRtl)
                {
                    throw new StringPrepParseException(
                              "The input does not conform to the rules for BiDi code points.",
                              IBM.ICU.Text.StringPrepParseException.CHECK_BIDI_ERROR,
                              scan.GetText(), errorPos);
                }

                // Requirement 3: if any RandALCat character is present, both the first
                // and the last character must be RandALCat.
                bool firstIsRandAL = (firstDir == rtlDir || firstDir == rtlArabicDir);
                bool lastIsRandAL  = (lastDir == rtlDir || lastDir == rtlArabicDir);
                if (sawRtl && (!firstIsRandAL || !lastIsRandAL))
                {
                    throw new StringPrepParseException(
                              "The input does not conform to the rules for BiDi code points.",
                              IBM.ICU.Text.StringPrepParseException.CHECK_BIDI_ERROR,
                              scan.GetText(), errorPos);
                }
            }

            return prepared;
        }
Beispiel #3
0
        /// <summary>
        /// Applies the profile's mapping table to every code point produced by
        /// <paramref name="iter"/> and returns the mapped text.
        /// </summary>
        /// <param name="iter">Iterator over the source text; consumed until it reports DONE.</param>
        /// <param name="options">Bit set of options; when ALLOW_UNASSIGNED is not set, unassigned code points are rejected.</param>
        /// <returns>A new StringBuilder holding the mapped output.</returns>
        /// <exception cref="StringPrepParseException">An unassigned code point was found and ALLOW_UNASSIGNED is not set.</exception>
        private StringBuilder Map(UCharacterIterator iter, int options)
        {
            bool              unassignedAllowed = (options & ALLOW_UNASSIGNED) > 0;
            StringPrep.Values info              = new StringPrep.Values();
            StringBuilder     output            = new StringBuilder();

            for (int cp = iter.NextCodePoint();
                 cp != IBM.ICU.Text.UForwardCharacterIterator_Constants.DONE;
                 cp = iter.NextCodePoint())
            {
                GetValues(GetCodePointValue(cp), info);

                // Unassigned code points are an error unless explicitly allowed.
                if (info.type == UNASSIGNED && !unassignedAllowed)
                {
                    throw new StringPrepParseException(
                              "An unassigned code point was found in the input",
                              IBM.ICU.Text.StringPrepParseException.UNASSIGNED_ERROR,
                              iter.GetText(), iter.GetIndex());
                }

                if (info.type == MAP)
                {
                    if (info.isIndex)
                    {
                        // The value is an offset into mappingData. The replacement
                        // length is implied by the index range the offset falls in;
                        // outside those ranges it is stored at the offset itself.
                        int start = info.value_ren;
                        int count;

                        if (start >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                            start < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                        {
                            count = 1;
                        }
                        else if (start >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                                 start < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                        {
                            count = 2;
                        }
                        else if (start >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                                 start < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                        {
                            count = 3;
                        }
                        else
                        {
                            count = mappingData[start++];
                        }

                        // Emit the replacement instead of the source code point.
                        output.Append(mappingData, start, count);
                        continue;
                    }

                    // Delta mapping: shift the code point and fall through to the copy.
                    cp -= info.value_ren;
                }
                else if (info.type == DELETE)
                {
                    // Deleted code points are consumed without producing output.
                    continue;
                }

                // Copy the (possibly shifted) code point to the output.
                IBM.ICU.Text.UTF16.Append(output, cp);
            }

            return output;
        }