Exemplo n.º 1
0
        /// <summary>
        /// Feeds code points read from <paramref name="text_"/> into a <see cref="BytesTrie"/>
        /// built over <c>characters</c> and records every point at which the trie reports a value.
        /// </summary>
        /// <param name="text_">Source of the code points to match.</param>
        /// <param name="maxLength">Upper bound on the number of code points consumed. The bound is only
        /// tested after a code point has been processed, so one code point is always consumed when
        /// the text is not exhausted.</param>
        /// <param name="lengths">Receives, per recorded match, the number of code points consumed so far.</param>
        /// <param name="count_">Element 0 receives the number of recorded matches. It is not written
        /// when the text yields no code point at all.</param>
        /// <param name="limit">Maximum number of matches that are recorded.</param>
        /// <param name="values">Optional. When non-null, receives the trie value of each recorded match.</param>
        /// <returns>The number of code points consumed, or 0 when the text is exhausted immediately.</returns>
        public override int Matches(CharacterIterator text_, int maxLength, int[] lengths, int[] count_, int limit, int[] values)
        {
            UCharacterIterator input = UCharacterIterator.GetInstance(text_);
            BytesTrie trie = new BytesTrie(characters, 0);

            int codePoint = input.NextCodePoint();
            if (codePoint == UCharacterIterator.DONE)
            {
                // Nothing to read; count_ is left unwritten on this path.
                return 0;
            }

            Result state = trie.First(Transform(codePoint));
            // TODO: should consumed count Character.charCount() ?
            // Each code point counts as one unit here, regardless of its UTF-16 length.
            int consumed = 1;
            int found = 0;

            while (true)
            {
                if (state.HasValue())
                {
                    // Record the match only while there is room; matching itself continues either way.
                    if (found < limit)
                    {
                        if (values != null)
                        {
                            values[found] = trie.GetValue();
                        }
                        lengths[found] = consumed;
                        found++;
                    }

                    // A final value means the trie has no longer continuation.
                    if (state == Result.FinalValue)
                    {
                        break;
                    }
                }
                else if (state == Result.NoMatch)
                {
                    break;
                }

                if (consumed >= maxLength)
                {
                    break;
                }

                codePoint = input.NextCodePoint();
                if (codePoint == UCharacterIterator.DONE)
                {
                    break;
                }

                ++consumed;
                state = trie.Next(Transform(codePoint));
            }

            count_[0] = found;
            return consumed;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Installs a new source string iterator and restarts iteration at the
        /// beginning of its text.
        /// <para/>
        /// The iterator passed in is cloned and only the clone is repositioned
        /// and handed to the collation iterator.
        /// </summary>
        /// <param name="source">The new source string iterator for iteration.</param>
        /// <stable>ICU 2.8</stable>
        public void SetText(UCharacterIterator source)
        {
            // TODO: do we need to remember the source string in a field?
            string_ = source.GetText();

            // Note: In C++, we just setText(source.getText()).
            // In Java, we actually operate on a character iterator.
            // (The old code apparently did so only for a CharacterIterator;
            // for a UCharacterIterator it also just used source.getText()).
            // The Java original falls back to setText(source.getText()) on
            // CloneNotSupportedException; here Clone() is called unguarded.
            // TODO: do we need to remember the cloned iterator in a field?
            UCharacterIterator copy = (UCharacterIterator)source.Clone();
            copy.SetToStart();

            bool numeric = rbc_.settings.ReadOnly.IsNumeric;

            // Pick the FCD-checking iterator unless the collator settings opt out of the check.
            if (rbc_.settings.ReadOnly.DontCheckFCD)
            {
                iter_ = new IterCollationIterator(rbc_.data, numeric, copy);
            }
            else
            {
                iter_ = new FCDIterCollationIterator(rbc_.data, numeric, copy, 0);
            }

            // Reset the remaining iteration state for the new text.
            otherHalf_ = 0;
            dir_ = 0;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Prepares the input string for use in applications with the given profile. The operation maps,
        /// normalizes (NFKC), and checks for prohibited and BiDi characters in the order defined by
        /// RFC 3454, depending on the options specified in the profile.
        /// </summary>
        /// <param name="src">A string.</param>
        /// <param name="options">A bit set of options:
        /// <list type="bullet">
        ///     <item><term><see cref="StringPrepOptions.Default"/></term><description>Prohibit processing of unassigned code points in the input.</description></item>
        ///     <item><term><see cref="StringPrepOptions.AllowUnassigned"/></term><description>Treat unassigned code points in the input as normal Unicode code points.</description></item>
        /// </list>
        /// </param>
        /// <returns>A string containing the output.</returns>
        /// <exception cref="StringPrepParseException">Thrown by the iterator-based overload this method delegates to.</exception>
        /// <stable>ICU 4.2</stable>
        public string Prepare(string src, StringPrepOptions options)
        {
            // Wrap the string in an iterator and delegate to the iterator-based overload.
            return Prepare(UCharacterIterator.GetInstance(src), options).ToString();
        }
Exemplo n.º 4
0
        /*
         * boolean isLabelSeparator(int ch){
         *  int result = getCodePointValue(ch);
         *  if( (result & 0x07)  == LABEL_SEPARATOR){
         *      return true;
         *  }
         *  return false;
         * }
         */
        /*
         * 1) Map -- For each character in the input, check if it has a mapping
         *   and, if so, replace it with its mapping.
         *
         * 2) Normalize -- Possibly normalize the result of step 1 using Unicode
         *   normalization.
         *
         * 3) Prohibit -- Check for any characters that are not allowed in the
         *   output.  If any are found, return an error.
         *
         * 4) Check bidi -- Possibly check for right-to-left characters, and if
         *   any are found, make sure that the whole string satisfies the
         *   requirements for bidirectional strings.  If the string does not
         *   satisfy the requirements for bidirectional strings, return an
         *   error.
         *   [Unicode3.2] defines several bidirectional categories; each character
         *    has one bidirectional category assigned to it.  For the purposes of
         *    the requirements below, an "RandALCat character" is a character that
         *    has Unicode bidirectional categories "R" or "AL"; an "LCat character"
         *    is a character that has Unicode bidirectional category "L".  Note
         *    that there are many characters which fall in neither of the above
         *    definitions; Latin digits (<U+0030> through <U+0039>) are examples of
         *    this because they have bidirectional category "EN".
         *
         *    In any profile that specifies bidirectional character handling, all
         *    three of the following requirements MUST be met:
         *
         *    1) The characters in section 5.8 MUST be prohibited.
         *
         *    2) If a string contains any RandALCat character, the string MUST NOT
         *       contain any LCat character.
         *
         *    3) If a string contains any RandALCat character, a RandALCat
         *       character MUST be the first character of the string, and a
         *       RandALCat character MUST be the last character of the string.
         */

        /// <summary>
        /// Prepares the input buffer for use in applications with the given profile. The operation maps,
        /// normalizes (NFKC, only when the profile enables it), and checks for prohibited and BiDi
        /// characters in the order defined by RFC 3454.
        /// </summary>
        /// <param name="src">A <see cref="UCharacterIterator"/> object containing the source string.</param>
        /// <param name="options">A bit set of options:
        /// <list type="bullet">
        ///     <item><term><see cref="StringPrepOptions.Default"/></term><description>Prohibit processing of unassigned code points in the input.</description></item>
        ///     <item><term><see cref="StringPrepOptions.AllowUnassigned"/></term><description>Treat unassigned code points in the input as normal Unicode code points.</description></item>
        /// </list>
        /// </param>
        /// <returns>A <see cref="StringBuffer"/> containing the mapped (and, if enabled, normalized) output.</returns>
        /// <exception cref="StringPrepParseException">Thrown when the processed text contains a prohibited
        /// code point or, when BiDi checking is enabled, violates the BiDi requirements; the mapping step
        /// may also throw it.</exception>
        /// <stable>ICU 2.8</stable>
        public StringBuffer Prepare(UCharacterIterator src, StringPrepOptions options)
        {
            // Step 1: map. Step 2: normalize, but only when the profile asks for NFKC.
            StringBuffer mapped = Map(src, options);
            StringBuffer normalized = doNFKC ? Normalize(mapped) : mapped;

            UCharacterIterator iter = UCharacterIterator.GetInstance(normalized);
            Values val = new Values();

#pragma warning disable 612, 618
            // CharDirectionCount serves as the "no character seen yet" marker for both variables.
            UCharacterDirection lastDir = UCharacterDirection.CharDirectionCount;
            UCharacterDirection firstDir = UCharacterDirection.CharDirectionCount;
#pragma warning restore 612, 618
            int lastRtlIndex = -1;
            int lastLtrIndex = -1;
            bool sawRtl = false;
            bool sawLtr = false;

            // Step 3 (prohibit) and the data gathering for step 4 (BiDi) share one pass.
            for (int ch = iter.NextCodePoint(); ch != UCharacterIterator.DONE; ch = iter.NextCodePoint())
            {
                GetValues(GetCodePointValue(ch), val);

                if (val.type == PROHIBITED)
                {
                    // NOTE(review): the last argument is val.value, whereas the other throw sites
                    // pass an iterator index - confirm this is the intended value.
                    throw new StringPrepParseException("A prohibited code point was found in the input",
                                                       StringPrepErrorType.ProhibitedError, iter.GetText(), val.value);
                }

                if (!checkBiDi)
                {
                    continue;
                }

                lastDir = (UCharacterDirection)bdp.GetClass(ch);
#pragma warning disable 612, 618
                if (firstDir == UCharacterDirection.CharDirectionCount)
#pragma warning restore 612, 618
                {
                    firstDir = lastDir;
                }

                if (lastDir == UCharacterDirection.LeftToRight)
                {
                    sawLtr = true;
                    lastLtrIndex = iter.Index - 1;
                }

                if (lastDir == UCharacterDirection.RightToLeft || lastDir == UCharacterDirection.RightToLeftArabic)
                {
                    sawRtl = true;
                    lastRtlIndex = iter.Index - 1;
                }
            }

            if (checkBiDi)
            {
                // Requirement 2: a string with any RandALCat character must not contain an LCat character.
                bool mixedDirections = sawLtr && sawRtl;

                // Requirement 3: with any RandALCat character present, both the first and the last
                // character must be RandALCat.
                bool firstIsRtl = firstDir == UCharacterDirection.RightToLeft || firstDir == UCharacterDirection.RightToLeftArabic;
                bool lastIsRtl = lastDir == UCharacterDirection.RightToLeft || lastDir == UCharacterDirection.RightToLeftArabic;
                bool badEnds = sawRtl && !(firstIsRtl && lastIsRtl);

                if (mixedDirections || badEnds)
                {
                    // Both violations raise the same error, reported at the later of the two recorded positions.
                    throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
                                                       StringPrepErrorType.CheckBiDiError, iter.GetText(),
                                                       (lastRtlIndex > lastLtrIndex) ? lastRtlIndex : lastLtrIndex);
                }
            }

            return normalized;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Mapping step: reads every code point from <paramref name="iter"/> and, depending on the
        /// type looked up for it, replaces it with its mapping, drops it, or copies it unchanged
        /// into the result.
        /// </summary>
        /// <param name="iter">Iterator supplying the source code points.</param>
        /// <param name="options">Option bits; <see cref="StringPrepOptions.AllowUnassigned"/> lets
        /// unassigned code points pass instead of raising an error.</param>
        /// <returns>A new <see cref="StringBuffer"/> holding the mapped text.</returns>
        /// <exception cref="StringPrepParseException">Thrown when an unassigned code point is found
        /// and <see cref="StringPrepOptions.AllowUnassigned"/> is not set.</exception>
        private StringBuffer Map(UCharacterIterator iter, StringPrepOptions options)
        {
            Values val = new Values();
            StringBuffer dest = new StringBuffer();
            bool allowUnassigned = (options & StringPrepOptions.AllowUnassigned) > 0;

            for (int ch = iter.NextCodePoint(); ch != UCharacterIterator.DONE; ch = iter.NextCodePoint())
            {
                GetValues(GetCodePointValue(ch), val);

                // Unassigned code points are an error unless the caller explicitly allowed them.
                if (val.type == UNASSIGNED && !allowUnassigned)
                {
                    throw new StringPrepParseException("An unassigned code point was found in the input",
                                                       StringPrepErrorType.UnassignedError,
                                                       iter.GetText(), iter.Index);
                }

                if (val.type == MAP)
                {
                    if (!val.isIndex)
                    {
                        // The value is a delta: subtract it from the code point and copy the result.
                        int shifted = ch - val.value;
                        UTF16.Append(dest, shifted);
                        continue;
                    }

                    // The value is an index into mappingData. The range the index falls in
                    // determines the mapping length; outside the fixed-length ranges the
                    // length is read from mappingData itself, just before the mapping.
                    int start = val.value;
                    int length;
                    if (start >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                        start < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                    {
                        length = 1;
                    }
                    else if (start >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                             start < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                    {
                        length = 2;
                    }
                    else if (start >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                             start < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                    {
                        length = 3;
                    }
                    else
                    {
                        length = mappingData[start++];
                    }

                    // Copy the mapping to the destination.
                    dest.Append(mappingData, start, length);
                    continue;
                }

                if (val.type == DELETE)
                {
                    // Just consume the code point and continue.
                    continue;
                }

                // No special handling applies: copy the source code point into the destination.
                UTF16.Append(dest, ch);
            }

            return dest;
        }
Exemplo n.º 6
0
 /// <summary>
 /// Creates a <see cref="CollationElementIterator"/> over the text of a source character
 /// iterator, using the rules of the given <see cref="Text.RuleBasedCollator"/>.
 /// If the source string is empty, <see cref="NullOrder"/> will be returned on the
 /// first call to <see cref="Next()"/>.
 /// </summary>
 /// <param name="source">The source string iterator.</param>
 /// <param name="collator">The <see cref="Text.RuleBasedCollator"/>.</param>
 /// <stable>ICU 2.8</stable>
 internal CollationElementIterator(UCharacterIterator source, RuleBasedCollator collator) : this(collator)
 {
     // The collator-only constructor runs first; the source text is attached afterwards.
     SetText(source);
 }