private int Next()
        {
            // Returns the next code point in the current iteration direction and
            // moves index past it, or -1 when nothing more can be read.
            if (!forward)
            {
                if (index <= contextStart)
                {
                    // backward iteration has consumed everything down to contextStart
                    return(-1);
                }

                int before = rep.Char32At(index - 1);
                index -= UTF16.GetCharCount(before);
                return(before);
            }

            if (index >= contextLimit)
            {
                // forward context iteration reached the limit
                reachedLimit = true;
                return(-1);
            }

            int after = rep.Char32At(index);
            index += UTF16.GetCharCount(after);
            return(after);
        }
Beispiel #2
0
        /// <summary>
        /// Default implementation of <see cref="IUnicodeMatcher.Matches(IReplaceable, int[], int, bool)"/> for Unicode
        /// filters. Tests the code point found at <c>offset[0]</c> against this filter and,
        /// on success, moves <c>offset[0]</c> one code point towards or past <paramref name="limit"/>.
        /// </summary>
        /// <param name="text">The text being matched.</param>
        /// <param name="offset">Single-element in/out cursor; only <c>offset[0]</c> is read and written.</param>
        /// <param name="limit">Where matching stops. A cursor below the limit matches forward,
        /// a cursor above it matches backward.</param>
        /// <param name="incremental">When true and the cursor already equals <paramref name="limit"/>,
        /// <see cref="MatchDegree.PartialMatch"/> is returned instead of a mismatch.</param>
        /// <returns>The degree of match.</returns>
        /// <stable>ICU 2.0</stable>
        public virtual MatchDegree Matches(IReplaceable text,
                                           int[] offset,
                                           int limit,
                                           bool incremental)
        {
            int cursor = offset[0];

            if (cursor < limit)
            {
                // Forward direction: consume the code point at the cursor.
                int current = text.Char32At(cursor);
                if (!Contains(current))
                {
                    return(MatchDegree.Mismatch);
                }
                offset[0] = cursor + UTF16.GetCharCount(current);
                return(MatchDegree.Match);
            }

            if (cursor > limit)
            {
                // Backward direction.
                if (!Contains(text.Char32At(cursor)))
                {
                    return(MatchDegree.Mismatch);
                }

                // Step back by one unit, then by one more when the code point now
                // under the cursor occupies two units, so that the cursor rests on
                // the lead surrogate.
                int stepped = cursor - 1;
                offset[0] = stepped;
                if (stepped >= 0)
                {
                    offset[0] = stepped - (UTF16.GetCharCount(text.Char32At(stepped)) - 1);
                }
                return(MatchDegree.Match);
            }

            // The cursor sits exactly on the limit.
            if (incremental)
            {
                return(MatchDegree.PartialMatch);
            }
            return(MatchDegree.Mismatch);
        }
Beispiel #3
0
 /// <summary>
 /// Returns the iterator's current element as a string: the <c>String</c> member when
 /// <c>Codepoint</c> equals <c>IS_STRING</c>, otherwise the string form of <c>Codepoint</c>.
 /// Only use after calling <see cref="Next()"/>, not <see cref="NextRange()"/>.
 /// </summary>
 /// <returns>The current element as a string.</returns>
 /// <stable>ICU 4.0</stable>
 public virtual string GetString() // ICU4N TODO: API String vs GetString() - confusing. This should be made into String property and the current string property made into a private field.
 {
     return(Codepoint == IS_STRING ? String : UTF16.ValueOf(Codepoint));
 }
Beispiel #4
0
        /// <summary>
        /// Implement <see cref="IUnicodeMatcher"/>. Reports whether this matcher can match
        /// the 8-bit index value <paramref name="v"/>.
        /// </summary>
        /// <param name="v">The index value to test.</param>
        /// <returns><c>true</c> for an empty pattern; otherwise the answer of the matcher
        /// registered for the pattern's first code point, or, when none is registered,
        /// whether the low byte of that code point equals <paramref name="v"/>.</returns>
        public virtual bool MatchesIndexValue(int v)
        {
            if (pattern.Length > 0)
            {
                int first = UTF16.CharAt(pattern, 0);
                IUnicodeMatcher nested = data.LookupMatcher(first);

                if (nested != null)
                {
                    return(nested.MatchesIndexValue(v));
                }
                return((first & 0xFF) == v);
            }
            return(true);
        }
Beispiel #5
0
        /// <summary>
        /// Internal method. Returns the 8-bit index value for this rule: the low byte of
        /// the first code point of the key, or -1 when the rule has no key beyond its
        /// ante context or when a matcher is registered for that first code point.
        /// </summary>
        /// <returns>A value in 0..255, or -1.</returns>
        internal int GetIndexValue()
        {
            if (anteContextLength != pattern.Length)
            {
                int keyStart = UTF16.CharAt(pattern, anteContextLength);

                if (data.LookupMatcher(keyStart) == null)
                {
                    return(keyStart & 0xFF);
                }
            }
            // Either the pattern consists of ante context only (so it can match any
            // key), or the first key element is a matcher rather than a literal.
            return(-1);
        }
Beispiel #6
0
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// Replaces every character between <c>pos.Start</c> and <c>pos.Limit</c> by
        /// prefix + number + suffix, then updates <paramref name="pos"/> for the new text length.
        /// </summary>
        /// <param name="text">The text to rewrite in place.</param>
        /// <param name="pos">The range to process; Start, Limit and ContextLimit are updated.</param>
        /// <param name="incremental">Not consulted by this implementation.</param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position pos, bool incremental)
        {
            int cursor = pos.Start;
            int end    = pos.Limit;

            StringBuilder escaped         = new StringBuilder(prefix);
            int           ownPrefixLength = prefix.Length;
            bool          prefixClobbered = false; // true while the buffer holds the supplemental handler's prefix

            while (cursor < end)
            {
                int codePoint;
                int unitCount;
                if (grokSupplementals)
                {
                    codePoint = text.Char32At(cursor);
                    unitCount = UTF16.GetCharCount(codePoint);
                }
                else
                {
                    codePoint = text[cursor];
                    unitCount = 1;
                }

                bool aboveBmp = (codePoint & 0xFFFF0000) != 0;
                if (aboveBmp && supplementalHandler != null)
                {
                    // Format with the supplemental handler's own settings.
                    escaped.Length = 0;
                    escaped.Append(supplementalHandler.prefix);
                    Utility.AppendNumber(escaped, codePoint, supplementalHandler.radix,
                                         supplementalHandler.minDigits);
                    escaped.Append(supplementalHandler.suffix);
                    prefixClobbered = true;
                }
                else
                {
                    if (!prefixClobbered)
                    {
                        // Our prefix is still at the front of the buffer; just drop the rest.
                        escaped.Length = ownPrefixLength;
                    }
                    else
                    {
                        escaped.Length = 0;
                        escaped.Append(prefix);
                        prefixClobbered = false;
                    }
                    Utility.AppendNumber(escaped, codePoint, radix, minDigits);
                    escaped.Append(suffix);
                }

                text.Replace(cursor, cursor + unitCount, escaped.ToString());
                cursor += escaped.Length;
                end    += escaped.Length - unitCount;
            }

            pos.ContextLimit += end - pos.Limit;
            pos.Limit         = end;
            pos.Start         = cursor;
        }
        /// <summary>
        /// Iterate forward through the string to fetch the next code point
        /// to be case-mapped, recording its start and end offsets in
        /// <c>cpStart</c> and <c>cpLimit</c>.
        /// </summary>
        /// <returns>The next code point to be case-mapped, or -1 once <c>cpLimit</c> has reached <c>limit</c>.</returns>
        public virtual int NextCaseMapCP()
        {
            if (cpLimit >= limit)
            {
                return(-1);
            }

            cpStart = cpLimit;
            int codePoint = rep.Char32At(cpLimit);
            cpLimit += UTF16.GetCharCount(codePoint);
            return(codePoint);
        }
Beispiel #8
0
        /// <summary>
        /// Set a new source for this iterator. Allows object reuse.
        /// </summary>
        /// <param name="newSource">The source string to iterate against. This allows the same iterator to be used
        /// while changing the source string, saving object creation.</param>
        /// <stable>ICU 2.4</stable>
        public void SetSource(string newSource)
        {
            source = nfd.Normalize(newSource);
            done   = false;

            // Degenerate case: an empty input yields a single empty piece.
            if (newSource.Length == 0)
            {
                pieces  = new string[][] { new string[] { "" } };
                current = new int[1];
                return;
            }

            // Break the normalized string into segments. Scanning starts at the end
            // of the first code point; a new segment begins at every code point
            // that nfcImpl reports as a canonical segment starter.
            IList<string> segments     = new List<string>();
            int           segmentStart = 0;
            int           position     = UTF16.FindOffsetFromCodePoint(source, 1);

            while (position < source.Length)
            {
                int codePoint = source.CodePointAt(position);
                if (nfcImpl.IsCanonSegmentStarter(codePoint))
                {
                    segments.Add(source.Substring(segmentStart, position - segmentStart)); // ICU4N: second argument is a length
                    segmentStart = position;
                }
                position += Character.CharCount(codePoint);
            }
            // Whatever remains after the last starter is the final segment.
            segments.Add(source.Substring(segmentStart, position - segmentStart)); // ICU4N: second argument is a length

            // One slot per segment, each holding the strings GetEquivalents returns for it.
            pieces  = new string[segments.Count][];
            current = new int[segments.Count];
            for (int s = 0; s < pieces.Length; ++s)
            {
                if (PROGRESS)
                {
                    Console.Out.WriteLine("SEGMENT");
                }
                pieces[s] = GetEquivalents(segments[s]);
            }
        }
Beispiel #9
0
        /// <summary>
        /// <summary>
        /// Find the source and target sets, subject to the input filter.
        /// There is a known issue with filters containing multiple characters.
        /// </summary>
        /// <remarks>
        /// The key portion of the pattern is walked element by element. If any element
        /// cannot be matched by <paramref name="filter"/>, the method returns without
        /// touching <paramref name="sourceSet"/> or <paramref name="targetSet"/>.
        /// </remarks>
        /// <param name="filter">The filter every key element must overlap.</param>
        /// <param name="sourceSet">Receives the characters collected from the key.</param>
        /// <param name="targetSet">Receives the replacement set of this rule's output.</param>
        /// <param name="revisiting">Not used by this method.</param>
        // TODO: Problem: the rule is [{ab}]c > x
        // The filter is [a{bc}].
        // If the input is abc, then the rule will work.
        // However, following code applying the filter won't catch that case.
        internal void AddSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)
        {
            int        limit      = anteContextLength + keyLength;
            UnicodeSet tempSource = new UnicodeSet();
            UnicodeSet temp       = new UnicodeSet();

            // We need to walk through the pattern.
            // Only if the filter matches something at ALL of the positions do we
            // add the collected characters to sourceSet.
            for (int i = anteContextLength; i < limit;)
            {
                int ch = UTF16.CharAt(pattern, i);
                i += UTF16.GetCharCount(ch);
                IUnicodeMatcher matcher = data.LookupMatcher(ch);
                if (matcher == null)
                {
                    // Literal character.
                    if (!filter.Contains(ch))
                    {
                        return;
                    }
                    tempSource.Add(ch);
                }
                else if (matcher is UnicodeSet matcherSet)
                {
                    // Type test instead of cast-and-catch: no exception is raised for
                    // non-set matchers, and an InvalidCastException coming from the
                    // calls below is no longer mistaken for "not a UnicodeSet".
                    if (!filter.ContainsSome(matcherSet))
                    {
                        return;
                    }
                    matcher.AddMatchSetTo(tempSource);
                }
                else
                {
                    // The matcher is not a UnicodeSet: materialize its match set first.
                    temp.Clear();
                    matcher.AddMatchSetTo(temp);
                    if (!filter.ContainsSome(temp))
                    {
                        return;
                    }
                    tempSource.AddAll(temp);
                }
            }
            // if we made our way through the gauntlet, add to source/target
            sourceSet.AddAll(tempSource);
            output.AddReplacementSetTo(targetSet);
        }
Beispiel #10
0
        /// <summary>
        /// Adds to <paramref name="output"/> the strings obtained by permuting the code
        /// points of <paramref name="source"/>.
        /// </summary>
        /// <param name="source">The string whose code points are permuted.</param>
        /// <param name="skipZeros">When true, a code point other than the first whose
        /// combining class is zero is never moved to the front of a permutation.</param>
        /// <param name="output">The set that receives the results.</param>
        public static void Permute(string source, bool skipZeros, ISet<string> output)
        {
            // TODO: optimize

            // Shortcut: a string of zero or one code point is its own only permutation.
            // The cheap length test (at most two code units) runs first so that code
            // points are not counted for longer strings.
            bool atMostOneCodePoint = source.Length <= 2 && UTF16.CountCodePoint(source) <= 1;
            if (atMostOneCodePoint)
            {
                output.Add(source);
                return;
            }

            // Otherwise pick each code point in turn as the head and recursively
            // permute the remaining ones.
            ISet<string> tails  = new HashSet<string>();
            int          offset = 0;

            while (offset < source.Length)
            {
                int codePoint = UTF16.CharAt(source, offset);

                bool skip = skipZeros && offset != 0 && UCharacter.GetCombiningClass(codePoint) == 0;
                if (!skip)
                {
                    // Permutations of everything before and after this code point.
                    tails.Clear();
                    string remaining = source.Substring(0, offset)
                                       + source.Substring(offset + UTF16.GetCharCount(codePoint));
                    Permute(remaining, skipZeros, tails);

                    // Prefix this code point to each of them.
                    string head = UTF16.ValueOf(source, offset);
                    foreach (string tail in tails)
                    {
                        output.Add(head + tail);
                    }
                }

                offset += UTF16.GetCharCount(codePoint);
            }
        }
Beispiel #11
0
        /// <summary>
        /// Return the 32-bit code point at the given 16-bit offset into
        /// the text.  This assumes the text is stored as 16-bit code units
        /// with surrogate pairs intermixed.  If the offset of a leading or
        /// trailing code unit of a surrogate pair is given, return the
        /// code point of the surrogate pair.
        /// <para/>
        /// Usage Note: If you are making external changes to a <see cref="StringBuffer"/>
        /// that is passed into the <see cref="ReplaceableString"/> constructor,
        /// it is recommended to call <see cref="ReplaceableString.ToString()"/> if
        /// the contents of the <see cref="StringBuffer"/> changed but the length
        /// did not change before calling this method. The contents are cached
        /// internally as a string, so repeated calls to this method are not expensive.
        /// <see cref="ReplaceableString.ToString()"/> forces a reload of the cache.
        /// </summary>
        /// <param name="offset">An integer between 0 and <see cref="Length"/>-1 inclusive.</param>
        /// <returns>32-bit code point of text at given offset.</returns>
        /// <stable>ICU 2.0</stable>
        public virtual int Char32At(int offset)
        {
            // ICU4N: the buffer's indexer is slow in .NET, so reads go through a
            // cached string that is rebuilt whenever a change is detected.
            // The checks run cheapest first: no cache yet, explicit change flag,
            // length difference, and finally the hash code. The hash code is not
            // a fully reliable change detector, but it catches more than Length.
            string snapshot = realized;

            bool stale = snapshot is null
                         || changed
                         || previousLength != buf.Length
                         || previousHashCode != buf.GetHashCode();
            if (stale)
            {
                snapshot = RealizeString();
            }

            return(UTF16.CharAt(snapshot, offset));
        }
Beispiel #12
0
        /// <summary>
        /// Transliterate the given text with the given position indices.
        /// The rules indexed under the low byte of the code point at <c>pos.Start</c>
        /// are tried in order. When none of them matches, <c>pos.Start</c> is
        /// advanced past that code point.
        /// </summary>
        /// <param name="text">The text to be transliterated.</param>
        /// <param name="pos">The position indices, which will be updated.</param>
        /// <param name="incremental">If true, assume new text may be inserted
        /// at the limit, and return false if there is a partial match.</param>
        /// <returns><c>true</c> if transliteration should continue; <c>false</c> if a rule
        /// reported a partial match, meaning transliteration should stop until more
        /// text arrives.</returns>
        public virtual bool Transliterate(IReplaceable text,
                                          TransliterationPosition pos,
                                          bool incremental)
        {
            int bucket = text.Char32At(pos.Start) & 0xFF;

            for (int r = index[bucket]; r < index[bucket + 1]; ++r)
            {
                MatchDegree degree = rules[r].MatchAndReplace(text, pos, incremental);

                if (degree == MatchDegree.Match)
                {
                    if (Transliterator.DEBUG)
                    {
                        string label = incremental ? "Rule.i: match " : "Rule: match ";
                        Console.Out.WriteLine(label +
                                              rules[r].ToRule(true) + " => " +
                                              UtilityExtensions.FormatInput(text, pos));
                    }
                    return(true);
                }

                if (degree == MatchDegree.PartialMatch)
                {
                    if (Transliterator.DEBUG)
                    {
                        string label = incremental ? "Rule.i: partial match " : "Rule: partial match ";
                        Console.Out.WriteLine(label +
                                              rules[r].ToRule(true) + " => " +
                                              UtilityExtensions.FormatInput(text, pos));
                    }
                    return(false);
                }

                // Any other result: this rule did not apply, try the next one.
                if (Transliterator.DEBUG)
                {
                    Console.Out.WriteLine("Rule: no match " + rules[r]);
                }
            }

            // No match or partial match from any rule: step over the current code point.
            pos.Start += UTF16.GetCharCount(text.Char32At(pos.Start));
            if (Transliterator.DEBUG)
            {
                string label = incremental ? "Rule.i: no match => " : "Rule: no match => ";
                Console.Out.WriteLine(label +
                                      UtilityExtensions.FormatInput(text, pos));
            }
            return(true);
        }
Beispiel #13
0
        /// <summary>
        /// Union the set of all characters that may be output by this object
        /// into the given set.
        /// </summary>
        /// <param name="toUnionTo">The set into which to union the output characters.</param>
        public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
        {
            int position = 0;

            while (position < output.Length)
            {
                int codePoint = UTF16.CharAt(output, position);
                position += UTF16.GetCharCount(codePoint);

                IUnicodeReplacer nested = data.LookupReplacer(codePoint);
                if (nested != null)
                {
                    // The code point stands for a replacer: let it contribute its own set.
                    nested.AddReplacementSetTo(toUnionTo);
                }
                else
                {
                    toUnionTo.Add(codePoint);
                }
            }
        }
Beispiel #14
0
        /// <summary>
        /// Implementation of <see cref="IUnicodeMatcher"/> API.  Union the set of all
        /// characters that may be matched by this object into the given
        /// set.
        /// </summary>
        /// <param name="toUnionTo">The set into which to union the source characters.</param>
        public virtual void AddMatchSetTo(UnicodeSet toUnionTo)
        {
            int position = 0;

            while (position < pattern.Length)
            {
                int codePoint = UTF16.CharAt(pattern, position);
                position += UTF16.GetCharCount(codePoint);

                IUnicodeMatcher nested = data.LookupMatcher(codePoint);
                if (nested != null)
                {
                    // The code point stands for a matcher: let it contribute its own set.
                    nested.AddMatchSetTo(toUnionTo);
                }
                else
                {
                    toUnionTo.Add(codePoint);
                }
            }
        }
Beispiel #15
0
        /// <summary>
        /// Retreat to the start of the previous code point in the text, and return it (pre-decrement semantics). If the
        /// index is not preceded by a valid surrogate pair, the behavior is the same as <see cref="Previous()"/>. Otherwise
        /// the iterator is decremented to the start of the surrogate pair, and the code point represented by the pair is
        /// returned.
        /// </summary>
        /// <returns>The previous code point in the text, or <see cref="UForwardCharacterIterator.Done"/> if the new index is before the start of the text.</returns>
        /// <stable>ICU 2.4</stable>
        public virtual int PreviousCodePoint()
        {
            int last = Previous();

            if (!UTF16.IsTrailSurrogate((char)last))
            {
                return(last);
            }

            // A trail surrogate: look one unit further back for its lead.
            int beforeLast = Previous();
            if (UTF16.IsLeadSurrogate((char)beforeLast))
            {
                return(Character.ToCodePoint((char)beforeLast, (char)last));
            }

            if (beforeLast != Done)
            {
                // unmatched trail surrogate so back out
                Next();
            }
            return(last);
        }
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// Replaces each code point in the range that has an extended name by
        /// <c>OPEN_DELIM</c> + name + <c>CLOSE_DELIM</c>, then updates <paramref name="offsets"/>.
        /// </summary>
        /// <param name="text">The text to rewrite in place.</param>
        /// <param name="offsets">The range to process; Start, Limit and ContextLimit are updated.</param>
        /// <param name="isIncremental">Not consulted by this implementation.</param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position offsets, bool isIncremental)
        {
            int position = offsets.Start;
            int end      = offsets.Limit;

            // The buffer always starts with the opening delimiter; only the part
            // after it is rewritten for each code point.
            StringBuilder named = new StringBuilder();
            named.Append(OPEN_DELIM);

            while (position < end)
            {
                int    codePoint = text.Char32At(position);
                string name      = UCharacter.GetExtendedName(codePoint);

                if (name == null)
                {
                    ++position;
                    continue;
                }

                named.Length = OPEN_DELIM_LEN;
                named.Append(name).Append(CLOSE_DELIM);

                int replacedUnits = UTF16.GetCharCount(codePoint);
                text.Replace(position, position + replacedUnits, named.ToString());

                int insertedUnits = named.Length;
                position += insertedUnits;                 // skip over the inserted text
                end      += insertedUnits - replacedUnits; // change in length
            }

            offsets.ContextLimit += end - offsets.Limit;
            offsets.Limit         = end;
            offsets.Start         = position;
        }
Beispiel #17
0
        /// <summary>
        /// RBBISymbolTable::parseReference. This function from the abstract symbol table
        /// interface scans the source text for a variable name. It does not look the name
        /// up, only scans for it. It is used by the UnicodeSet parser.
        /// </summary>
        /// <param name="text">The text to scan.</param>
        /// <param name="pos">Scanning starts at <c>pos.Index</c>; on success the index is
        /// moved just past the name, on failure it is left unchanged.</param>
        /// <param name="limit">Offset at which scanning stops.</param>
        /// <returns>The name that was scanned, or the empty string when no valid name
        /// characters were found.</returns>
        public virtual string ParseReference(string text, ParsePosition pos, int limit)
        {
            int nameStart = pos.Index;
            int nameEnd   = nameStart;

            while (nameEnd < limit)
            {
                int  codePoint = UTF16.CharAt(text, nameEnd);
                bool accepted  = (nameEnd != nameStart || UChar.IsUnicodeIdentifierStart(codePoint)) &&
                                 UChar.IsUnicodeIdentifierPart(codePoint);
                if (!accepted)
                {
                    break;
                }
                nameEnd += UTF16.GetCharCount(codePoint);
            }

            if (nameEnd == nameStart)
            {
                // No valid name chars: indicate failure with empty string
                return("");
            }

            pos.Index = nameEnd;
            return(text.Substring(nameStart, nameEnd - nameStart)); // ICU4N: second argument is a length
        }
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, TransliterationPosition, bool)"/>.
        /// Walks the range code point by code point, replaces each one whose full case
        /// folding differs from it, and keeps <paramref name="offsets"/> in step with the
        /// resulting length changes.
        /// </summary>
        /// <param name="text">The text to rewrite in place.</param>
        /// <param name="offsets">The range to process; Start, Limit and ContextLimit are updated.</param>
        /// <param name="isIncremental">When true, processing stops early if the iterator
        /// reports that the context limit was reached.</param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    TransliterationPosition offsets, bool isIncremental)
        {
            lock (syncLock)
            {
                // Nothing to do without case properties or with an empty range.
                if (csp == null || offsets.Start >= offsets.Limit)
                {
                    return;
                }

                iter.SetText(text);
                result.Length = 0;

                // Walk through original string
                // If there is a case change, modify corresponding position in replaceable
                iter.SetIndex(offsets.Start);
                iter.SetLimit(offsets.Limit);
                iter.SetContextLimits(offsets.ContextStart, offsets.ContextLimit);

                for (int codePoint = iter.NextCaseMapCP(); codePoint >= 0; codePoint = iter.NextCaseMapCP())
                {
                    int folded = csp.ToFullFolding(codePoint, result, 0); // arguments: code point, output buffer, options

                    if (iter.DidReachLimit && isIncremental)
                    {
                        // the case mapping function tried to look beyond the context limit
                        // wait for more input
                        offsets.Start = iter.CaseMapCPStart;
                        return;
                    }

                    /* decode the result */
                    if (folded < 0)
                    {
                        /* the code point mapped to itself, no change */
                        continue;
                    }

                    int lengthChange;
                    if (folded > UCaseProperties.MaxStringLength)
                    {
                        /* replace by single-code point mapping */
                        lengthChange = iter.Replace(UTF16.ValueOf(folded));
                    }
                    else
                    {
                        /* replace by the mapping string */
                        lengthChange  = iter.Replace(result.ToString());
                        result.Length = 0;
                    }

                    if (lengthChange != 0)
                    {
                        offsets.Limit        += lengthChange;
                        offsets.ContextLimit += lengthChange;
                    }
                }
                offsets.Start = offsets.Limit;
            }
        }
Beispiel #19
0
        /// <summary>
        /// Collects <paramref name="segment"/> itself plus every string obtained by
        /// replacing the text from some position onward by a code point from that
        /// position's canonical start set followed by one of the remainders that
        /// <c>Extract</c> returns for it.
        /// </summary>
        /// <param name="segment">The segment to expand.</param>
        /// <returns>The set of strings collected; it always contains <paramref name="segment"/>.</returns>
        private ISet<string> GetEquivalents2(string segment)
        {
            ISet<string> equivalents = new HashSet<string>();

            if (PROGRESS)
            {
                Console.Out.WriteLine("Adding: " + Utility.Hex(segment));
            }

            equivalents.Add(segment);
            StringBuffer scratch = new StringBuffer();
            UnicodeSet   starts  = new UnicodeSet();

            // cycle through all the characters
            int position = 0;
            while (position < segment.Length)
            {
                // see if any character is at the start of some decomposition
                int codePoint = segment.CodePointAt(position);

                if (nfcImpl.GetCanonStartSet(codePoint, starts))
                {
                    // if so, see which decompositions match
                    UnicodeSetIterator candidates = new UnicodeSetIterator(starts);
                    while (candidates.Next())
                    {
                        int           composite = candidates.Codepoint;
                        ISet<string> remainder = Extract(composite, segment, position, scratch);
                        if (remainder == null)
                        {
                            continue;
                        }

                        // there were some matches, so add all the possibilities to the set.
                        string head = segment.Substring(0, position) + UTF16.ValueOf(composite);
                        foreach (string tail in remainder)
                        {
                            equivalents.Add(head + tail);
                        }
                    }
                }

                position += Character.CharCount(codePoint);
            }
            return(equivalents);
        }
Beispiel #20
0
 // Returns the offset just after the code point at pos. When pos lies outside
 // [0, str.Length) there is no code point to measure, so the result is pos + 1.
 internal static int PosAfter(IReplaceable str, int pos)
 {
     if (pos < 0 || pos >= str.Length)
     {
         return(pos + 1);
     }
     return(pos + UTF16.GetCharCount(str.Char32At(pos)));
 }
Beispiel #21
0
        /// <summary>
        /// Inserts <c>insertion</c> at every break-iterator boundary inside the range
        /// that has a letter or mark on both sides, then updates <paramref name="pos"/>.
        /// </summary>
        /// <param name="text">The text to rewrite in place.</param>
        /// <param name="pos">The range to process; Start, Limit and ContextLimit are updated.</param>
        /// <param name="incremental">Selects how <c>pos.Start</c> is set on return: just after the
        /// last insertion point when true, at the new limit when false.</param>
        protected override void HandleTransliterate(IReplaceable text, TransliterationPosition pos, bool incremental)
        {
            // NOTE(review): locking on `this` exposes the lock to callers; a private lock
            // object would be safer, but that requires a field outside this method.
            lock (this)
            {
                boundaryCount = 0;
                GetBreakIterator(); // Lazy-create it if necessary
                bi.SetText(new ReplaceableCharacterIterator(text, pos.Start, pos.Limit, pos.Start));
                // TODO: fix clumsy workaround used above.

                // To make things much easier, the boundaries are stacked first and the
                // insertions are performed at the end. Generally there won't be too
                // many, since the input is filtered.
                int boundary;
                for (boundary = bi.First(); boundary != BreakIterator.Done && boundary < pos.Limit; boundary = bi.Next())
                {
                    if (boundary == 0)
                    {
                        continue;
                    }

                    // HACK: Check to see that the preceding item was a letter or mark
                    int codePoint = UTF16.CharAt(text, boundary - 1);
                    int category  = UChar.GetUnicodeCategory(codePoint).ToInt32();
                    if (((1 << category) & LETTER_OR_MARK_MASK) == 0)
                    {
                        continue;
                    }

                    // ... and that the following item is one as well
                    codePoint = UTF16.CharAt(text, boundary);
                    category  = UChar.GetUnicodeCategory(codePoint).ToInt32();
                    if (((1 << category) & LETTER_OR_MARK_MASK) == 0)
                    {
                        continue;
                    }

                    if (boundaryCount >= boundaries.Length)
                    {
                        // the stack is full: double its capacity
                        int[] grown = new int[boundaries.Length * 2];
                        System.Array.Copy(boundaries, grown, boundaries.Length);
                        boundaries = grown;
                    }

                    boundaries[boundaryCount++] = boundary;
                }

                int delta        = 0;
                int lastBoundary = 0;

                if (boundaryCount != 0)
                {
                    // something was found, so adjust
                    delta        = boundaryCount * insertion.Length;
                    lastBoundary = boundaries[boundaryCount - 1];

                    // Insert from the end backwards so that the offsets still on the
                    // stack stay valid.
                    while (boundaryCount > 0)
                    {
                        int insertAt = boundaries[--boundaryCount];
                        text.Replace(insertAt, insertAt, insertion);
                    }
                }

                // Now fix up the return values
                pos.ContextLimit += delta;
                pos.Limit        += delta;
                if (incremental)
                {
                    pos.Start = lastBoundary + delta;
                }
                else
                {
                    pos.Start = pos.Limit;
                }
            }
        }
Beispiel #22
0
        /// <summary>
        /// Tests whether the decomposition of <paramref name="comp"/> occurs, in order, among the code
        /// points of <paramref name="segment"/> from <paramref name="segmentPos"/> onward (other code
        /// points may be interleaved). On success the unmatched code points form a remainder, and the
        /// equivalents of that remainder are returned.
        /// </summary>
        /// <param name="comp">Code point whose decomposition is searched for.</param>
        /// <param name="segment">String that is scanned.</param>
        /// <param name="segmentPos">Index in <paramref name="segment"/> where scanning begins.</param>
        /// <param name="buf">Shared working buffer; it is cleared here and then receives the remainder.</param>
        /// <returns>
        /// <c>null</c> if the decomposition is not fully matched, or if <paramref name="comp"/> followed by
        /// the remainder does not compare equal (via <c>Normalizer.Compare</c>) to the scanned part of the
        /// segment; <c>SET_WITH_NULL_STRING</c> if the remainder is empty; otherwise the result of
        /// <c>GetEquivalents2</c> for the remainder.
        /// </returns>
        private ISet <string> Extract(int comp, string segment, int segmentPos, StringBuffer buf)
        {
            if (PROGRESS)
            {
                Console.Out.WriteLine(" extract: " + Utility.Hex(UTF16.ValueOf(comp))
                                      + ", " + Utility.Hex(segment.Substring(segmentPos)));
            }

            // Fall back to the code point itself when there is no decomposition.
            string expansion = nfcImpl.GetDecomposition(comp) ?? UTF16.ValueOf(comp);

            // 'wanted' is the next decomposition code point we are looking for;
            // 'expansionPos' already points just past it.
            int wanted       = UTF16.CharAt(expansion, 0);
            int expansionPos = UTF16.GetCharCount(wanted);

            buf.Length = 0; // reset the working buffer shared among callees

            bool complete = false;
            int  scanPos  = segmentPos;

            while (scanPos < segment.Length)
            {
                int current = UTF16.CharAt(segment, scanPos);
                int width   = UTF16.GetCharCount(current);

                if (current != wanted)
                {
                    if (PROGRESS)
                    {
                        Console.Out.WriteLine("  buffer: " + Utility.Hex(UTF16.ValueOf(current)));
                    }
                    // Not part of the decomposition: keep it for the remainder.
                    UTF16.Append(buf, current);
                }
                else
                {
                    if (PROGRESS)
                    {
                        Console.Out.WriteLine("  matches: " + Utility.Hex(UTF16.ValueOf(current)));
                    }
                    if (expansionPos == expansion.Length)
                    {
                        // Every decomposition code point was consumed; whatever
                        // follows in the segment belongs to the remainder as well.
                        buf.Append(segment.Substring(scanPos + width));
                        complete = true;
                        break;
                    }
                    // Move on to the next decomposition code point.
                    wanted        = UTF16.CharAt(expansion, expansionPos);
                    expansionPos += UTF16.GetCharCount(wanted);
                }

                scanPos += width;
            }

            if (!complete)
            {
                return(null);     // decomposition code points were left unmatched
            }
            if (PROGRESS)
            {
                Console.Out.WriteLine("Matches");
            }
            if (buf.Length == 0)
            {
                return(SET_WITH_NULL_STRING);                 // matched with nothing left over
            }

            string remainder = buf.ToString();

            // Brute-force check that comp + remainder is canonically equivalent
            // to the part of the segment that was scanned.
            string candidate = UTF16.ValueOf(comp) + remainder;
            if (0 != Normalizer.Compare(candidate, segment.Substring(segmentPos), 0))
            {
                return(null);
            }

            // Expand the remainder into its own equivalents.
            return(GetEquivalents2(remainder));
        }
Beispiel #23
0
        /// <summary>
        /// Runs the mapping step over every code point produced by <paramref name="iter"/> and
        /// returns the mapped text.
        /// </summary>
        /// <param name="iter">Source of the code points to map; read until it reports <c>DONE</c>.</param>
        /// <param name="options">Option flags; only <c>AllowUnassigned</c> is examined here.</param>
        /// <returns>A new buffer holding the mapped output.</returns>
        /// <exception cref="StringPrepParseException">
        /// An unassigned code point was read and <c>AllowUnassigned</c> is not set.
        /// </exception>
        private StringBuffer Map(UCharacterIterator iter, StringPrepOptions options)
        {
            bool         permitUnassigned = ((options & StringPrepOptions.AllowUnassigned) > 0);
            Values       entry            = new Values();
            StringBuffer mapped           = new StringBuffer();

            for (int cp = iter.NextCodePoint(); cp != UCharacterIterator.DONE; cp = iter.NextCodePoint())
            {
                char lookup = GetCodePointValue(cp);
                GetValues(lookup, entry);

                // Reject unassigned code points unless the caller opted in.
                if (entry.type == UNASSIGNED && !permitUnassigned)
                {
                    throw new StringPrepParseException("An unassigned code point was found in the input",
                                                       StringPrepErrorType.UnassignedError,
                                                       iter.GetText(), iter.Index);
                }

                if (entry.type == MAP)
                {
                    if (entry.isIndex)
                    {
                        // The value is an offset into mappingData. The range the offset
                        // falls in determines how many chars the mapping has; outside the
                        // fixed ranges the length is stored in front of the mapping itself.
                        int offset = entry.value;
                        int count;

                        if (offset >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                            offset < indexes[TWO_UCHARS_MAPPING_INDEX_START])
                        {
                            count = 1;
                        }
                        else if (offset >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                                 offset < indexes[THREE_UCHARS_MAPPING_INDEX_START])
                        {
                            count = 2;
                        }
                        else if (offset >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                                 offset < indexes[FOUR_UCHARS_MAPPING_INDEX_START])
                        {
                            count = 3;
                        }
                        else
                        {
                            count = mappingData[offset++];
                        }

                        // Emit the mapping instead of the source code point.
                        mapped.Append(mappingData, offset, count);
                        continue;
                    }

                    // The value is a delta applied to the source code point.
                    cp -= entry.value;
                }
                else if (entry.type == DELETE)
                {
                    // Drop the code point entirely.
                    continue;
                }

                // Copy the (possibly adjusted) code point to the output.
                UTF16.Append(mapped, cp);
            }

            return(mapped);
        }
Beispiel #24
0
 /// <summary>
 /// Returns the index of the code point that precedes <paramref name="pos"/> in
 /// <paramref name="str"/>, or <c>pos - 1</c> when <paramref name="pos"/> is not positive.
 /// </summary>
 internal static int PosBefore(IReplaceable str, int pos)
 {
     if (pos <= 0)
     {
         // Nothing precedes this position; just step back one unit.
         return(pos - 1);
     }

     // Step back over the whole preceding code point (one or two code units).
     int previous = str.Char32At(pos - 1);
     return(pos - UTF16.GetCharCount(previous));
 }
Beispiel #25
0
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// </summary>
        /// <remarks>
        /// Scans <paramref name="text"/> from <c>offsets.Start</c> up to <c>offsets.Limit</c> with a
        /// two-state machine. In state 0 it looks for <c>OPEN_DELIM</c> followed by the
        /// <c>OPEN_PAT</c> pattern. In state 1 it collects a character name (runs of white space
        /// collapsed to one <c>SPACE</c>) until <c>CLOSE_DELIM</c> is seen; the collected name is then
        /// looked up with <c>UCharacter.GetCharFromExtendedName</c> and, if the lookup succeeds, the
        /// whole delimited span is replaced by the resulting code point.
        /// </remarks>
        /// <param name="text">Text that is scanned and modified in place.</param>
        /// <param name="offsets">
        /// Range to process. On return <c>Limit</c> and <c>ContextLimit</c> reflect the change in text
        /// length, and <c>Start</c> is either the last open-delimiter candidate (incremental mode with
        /// a pending candidate) or the final cursor position.
        /// </param>
        /// <param name="isIncremental">
        /// When <c>true</c> and an open-delimiter candidate is still pending, <c>offsets.Start</c> is
        /// left at that candidate instead of the final cursor position.
        /// </param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position offsets, bool isIncremental)
        {
            int maxLen = UCharacterName.Instance.MaxCharNameLength + 1; // allow for temporary trailing space

            // Accumulates the candidate character name while in mode 1.
            StringBuffer name = new StringBuffer(maxLen);

            // Get the legal character set
            UnicodeSet legal = new UnicodeSet();

            UCharacterName.Instance.GetCharNameCharacters(legal);

            int cursor = offsets.Start;
            int limit  = offsets.Limit;

            // Modes:
            // 0 - looking for open delimiter
            // 1 - after open delimiter
            int mode    = 0;
            int openPos = -1; // open delim candidate pos

            int c;

            while (cursor < limit)
            {
                c = text.Char32At(cursor);

                switch (mode)
                {
                case 0:   // looking for open delimiter
                    if (c == OPEN_DELIM)
                    {     // quick check first
                        openPos = cursor;
                        // i is the index returned by ParsePattern; it is accepted only
                        // when it is non-negative and still inside the range.
                        int i = Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                        if (i >= 0 && i < limit)
                        {
                            mode        = 1;
                            name.Length = 0;
                            cursor      = i;
                            continue;     // *** reprocess char32At(cursor)
                        }
                    }
                    break;

                case 1:     // after open delimiter
                            // Look for legal chars.  If \s+ is found, convert it
                            // to a single space.  If closeDelimiter is found, exit
                            // the loop.  If any other character is found, exit the
                            // loop.  If the limit is reached, exit the loop.

                    // Convert \s+ => SPACE.  This assumes there are no
                    // runs of >1 space characters in names.
                    if (PatternProps.IsWhiteSpace(c))
                    {
                        // Ignore leading whitespace
                        if (name.Length > 0 &&
                            name[name.Length - 1] != SPACE)
                        {
                            name.Append(SPACE);
                            // If we are too long then abort.  maxLen includes
                            // temporary trailing space, so use '>'.
                            if (name.Length > maxLen)
                            {
                                mode = 0;
                            }
                        }
                        break;
                    }

                    if (c == CLOSE_DELIM)
                    {
                        int len = name.Length;

                        // Delete trailing space, if any
                        if (len > 0 &&
                            name[len - 1] == SPACE)
                        {
                            name.Length = --len;
                        }

                        // c is reused here: from this point on it holds the looked-up
                        // code point (or -1), not the character read from the text.
                        c = UCharacter.GetCharFromExtendedName(name.ToString());
                        if (c != -1)
                        {
                            // Lookup succeeded

                            // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                            cursor++;     // advance over CLOSE_DELIM

                            string str = UTF16.ValueOf(c);
                            text.Replace(openPos, cursor, str);

                            // Adjust indices for the change in the length of
                            // the string.  Do not assume that str.length() ==
                            // 1, in case of surrogates.
                            int delta = cursor - openPos - str.Length;
                            cursor -= delta;
                            limit  -= delta;
                            // assert(cursor == openPos + str.length());
                        }
                        // If the lookup failed, we leave things as-is and
                        // still switch to mode 0 and continue.
                        mode    = 0;
                        openPos = -1; // close off candidate
                        continue;     // *** reprocess char32At(cursor)
                    }

                    if (legal.Contains(c))
                    {
                        UTF16.Append(name, c);
                        // If we go past the longest possible name then abort.
                        // maxLen includes temporary trailing space, so use '>='.
                        if (name.Length >= maxLen)
                        {
                            mode = 0;
                        }
                    }

                    // Invalid character
                    else
                    {
                        // NOTE(review): the loop tail below still adds GetCharCount(c), so the
                        // net movement is zero only when c occupies one code unit -- confirm
                        // the intended behavior for supplementary code points.
                        --cursor;     // Backup and reprocess this character
                        mode = 0;
                    }

                    break;
                }

                // Reached via 'break' from the switch; the 'continue' paths above skip this.
                cursor += UTF16.GetCharCount(c);
            }

            // Propagate the net change in text length to the caller's position.
            offsets.ContextLimit += limit - offsets.Limit;
            offsets.Limit         = limit;
            // In incremental mode, only advance the cursor up to the last
            // open delimiter candidate.
            offsets.Start = (isIncremental && openPos >= 0) ? openPos : cursor;
        }
Beispiel #26
0
        //=    public static UnicodeReplacer valueOf(String output,
        //=                                          int cursorPos,
        //=                                          RuleBasedTransliterator.Data data) {
        //=        if (output.length() == 1) {
        //=            char c = output.charAt(0);
        //=            UnicodeReplacer r = data.lookupReplacer(c);
        //=            if (r != null) {
        //=                return r;
        //=            }
        //=        }
        //=        return new StringReplacer(output, cursorPos, data);
        //=    }

        /// <summary>
        /// <see cref="IUnicodeReplacer"/> API
        /// </summary>
        /// <remarks>
        /// Replaces the range <paramref name="start"/>..<paramref name="limit"/> of
        /// <paramref name="text"/> with this replacer's output. When <c>isComplex</c> is
        /// <c>false</c> the stored <c>output</c> string is inserted directly. Otherwise each code point
        /// of <c>output</c> is looked up with <c>data.LookupReplacer</c>; code points that map to a
        /// nested replacer delegate to it, and all other code points are copied as literal text. The
        /// complex path builds the result in a temporary area appended to the end of
        /// <paramref name="text"/> and then copies it over the key.
        /// </remarks>
        /// <param name="text">Text to modify in place.</param>
        /// <param name="start">Start index of the range to replace.</param>
        /// <param name="limit">Limit index of the range to replace.</param>
        /// <param name="cursor">
        /// Array whose element 0 receives the new cursor position when <c>hasCursor</c> is set; it is
        /// also handed on to nested replacers.
        /// </param>
        /// <returns>The length of the text that replaced the range.</returns>
        public virtual int Replace(IReplaceable text,
                                   int start,
                                   int limit,
                                   int[] cursor)
        {
            int outLen;
            int newStart = 0;

            // NOTE: It should be possible to _always_ run the complex
            // processing code; just slower.  If not, then there is a bug
            // in the complex processing code.

            // Simple (no nested replacers) Processing Code :
            if (!isComplex)
            {
                text.Replace(start, limit, output);
                outLen = output.Length;

                // Setup default cursor position (for cursorPos within output)
                newStart = cursorPos;
            }

            // Complex (nested replacers) Processing Code :
            else
            {
                /* When there are segments to be copied, use the Replaceable.copy()
                 * API in order to retain out-of-band data.  Copy everything to the
                 * end of the string, then copy them back over the key.  This preserves
                 * the integrity of indices into the key and surrounding context while
                 * generating the output text.
                 */
                StringBuffer buf = new StringBuffer();
                int          oOutput; // offset into 'output'
                // Cleared here and set again below only if a nested replacer is
                // actually found, so a later call can take the simple path.
                isComplex = false;

                // The temporary buffer starts at tempStart, and extends
                // to destLimit + tempExtra.  The start of the buffer has a single
                // character from before the key.  This provides style
                // data when addition characters are filled into the
                // temporary buffer.  If there is nothing to the left, use
                // the non-character U+FFFF, which Replaceable subclasses
                // should treat specially as a "no-style character."
                // destStart points to the point after the style context
                // character, so it is tempStart+1 or tempStart+2.
                int tempStart = text.Length; // start of temp buffer
                int destStart = tempStart;   // copy new text to here
                if (start > 0)
                {
                    int len = UTF16.GetCharCount(text.Char32At(start - 1));
                    text.Copy(start - len, start, tempStart);
                    destStart += len;
                }
                else
                {
                    text.Replace(tempStart, tempStart, "\uFFFF");
                    destStart++;
                }
                int destLimit = destStart;
                int tempExtra = 0; // temp chars after destLimit

                for (oOutput = 0; oOutput < output.Length;)
                {
                    if (oOutput == cursorPos)
                    {
                        // Record the position of the cursor (relative to start).
                        // The buf.length() was inserted for bug 5789:
                        // the problem is that if we are accumulating into a buffer (when r == null below)
                        // then the actual length of the text at that point needs to add the buf length.
                        // There was an alternative suggested in #5789, but that looks like it won't work
                        // if we have accumulated some stuff in the dest part AND have a non-zero buffer.
                        newStart = buf.Length + destLimit - destStart; // relative to start
                    }
                    int c = UTF16.CharAt(output, oOutput);

                    // When we are at the last position copy the right style
                    // context character into the temporary buffer.  We don't
                    // do this before because it will provide an incorrect
                    // right context for previous replace() operations.
                    int nextIndex = oOutput + UTF16.GetCharCount(c);
                    if (nextIndex == output.Length)
                    {
                        tempExtra = UTF16.GetCharCount(text.Char32At(limit));
                        text.Copy(limit, limit + tempExtra, destLimit);
                    }

                    IUnicodeReplacer r = data.LookupReplacer(c);
                    if (r == null)
                    {
                        // Accumulate straight (non-segment) text.
                        UTF16.Append(buf, c);
                    }
                    else
                    {
                        isComplex = true;

                        // Insert any accumulated straight text.
                        if (buf.Length > 0)
                        {
                            text.Replace(destLimit, destLimit, buf.ToString());
                            destLimit += buf.Length;
                            buf.Length = 0;
                        }

                        // Delegate output generation to replacer object
                        int len = r.Replace(text, destLimit, destLimit, cursor);
                        destLimit += len;
                    }
                    oOutput = nextIndex;
                }
                // Insert any accumulated straight text.
                if (buf.Length > 0)
                {
                    text.Replace(destLimit, destLimit, buf.ToString());
                    destLimit += buf.Length;
                }
                // Handles a cursor positioned exactly at the end of the output,
                // which the loop above never visits.
                if (oOutput == cursorPos)
                {
                    // Record the position of the cursor
                    newStart = destLimit - destStart; // relative to start
                }

                outLen = destLimit - destStart;

                // Copy new text to start, and delete it
                // (the copy inserts outLen units at 'start', so every index at or
                // after 'start' is shifted by outLen in the two calls below).
                text.Copy(destStart, destLimit, start);
                text.Replace(tempStart + outLen, destLimit + tempExtra + outLen, "");

                // Delete the old text (the key)
                text.Replace(start + outLen, limit + outLen, "");
            }

            if (hasCursor)
            {
                // Adjust the cursor for positions outside the key.  These
                // refer to code points rather than code units.  If cursorPos
                // is within the output string, then use newStart, which has
                // already been set above.
                if (cursorPos < 0)
                {
                    newStart = start;
                    int n = cursorPos;
                    // Outside the output string, cursorPos counts code points
                    while (n < 0 && newStart > 0)
                    {
                        newStart -= UTF16.GetCharCount(text.Char32At(newStart - 1));
                        ++n;
                    }
                    // If the beginning of the text was reached first, n is still
                    // negative and the leftover count is applied as-is.
                    newStart += n;
                }
                else if (cursorPos > output.Length)
                {
                    newStart = start + outLen;
                    int n = cursorPos - output.Length;
                    // Outside the output string, cursorPos counts code points
                    while (n > 0 && newStart < text.Length)
                    {
                        newStart += UTF16.GetCharCount(text.Char32At(newStart));
                        --n;
                    }
                    // If the end of the text was reached first, n is still
                    // positive and the leftover count is applied as-is.
                    newStart += n;
                }
                else
                {
                    // Cursor is within output string.  It has been set up above
                    // to be relative to start.
                    newStart += start;
                }

                cursor[0] = newStart;
            }

            return(outLen);
        }
Beispiel #27
0
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// </summary>
        /// <remarks>
        /// Walks the range <c>offsets.Start</c>..<c>offsets.Limit</c> and maps each character that is
        /// not case-ignorable either to title case or to lower case. The first such character after an
        /// uncased one (or at the beginning, when the preceding context does not end in a cased
        /// character) is title-cased; those following a cased character are lower-cased.
        /// </remarks>
        /// <param name="text">Text that is modified in place.</param>
        /// <param name="offsets">
        /// Range and context to process. <c>Limit</c> and <c>ContextLimit</c> are adjusted for length
        /// changes; <c>Start</c> is moved to <c>Limit</c>, or to the start of the current code point
        /// when processing stops early in incremental mode.
        /// </param>
        /// <param name="isIncremental">
        /// When <c>true</c>, processing stops as soon as a case mapping needed to look past the
        /// context limit.
        /// </param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position offsets, bool isIncremental)
        {
            lock (this)
            {
                // TODO reimplement, see ustrcase.c
                // using a real word break iterator
                //   instead of just looking for a transition between cased and uncased characters
                // call CaseMapTransliterator::handleTransliterate() for lowercasing? (set fMap)
                // needs to take isIncremental into account because case mappings are context-sensitive
                //   also detect when lowercasing function did not finish because of context

                if (offsets.Start >= offsets.Limit)
                {
                    return;
                }

                // Current mode: true => next eligible character goes to title case,
                // false => it goes to lower case.
                bool titleNext = true;

                // Look backwards through the preceding context, skipping case-ignorable
                // characters.  Type values: >0 cased, ==0 uncased, <0 case-ignorable.
                // If the first character that is not ignorable is cased, start in
                // lower-case mode; otherwise (uncased, or no context) stay in title mode.
                int scan = offsets.Start - 1;
                while (scan >= offsets.ContextStart)
                {
                    int prior     = text.Char32At(scan);
                    int priorType = csp.GetTypeOrIgnorable(prior);
                    if (priorType >= 0)
                    {
                        titleNext = priorType == 0;
                        break;
                    }
                    scan -= UTF16.GetCharCount(prior);
                }

                iter.SetText(text);
                iter.SetIndex(offsets.Start);
                iter.SetLimit(offsets.Limit);
                iter.SetContextLimits(offsets.ContextStart, offsets.ContextLimit);

                result.Length = 0;

                // Visit every code point in the range; wherever the mapping differs
                // from the original, patch the replaceable text at that position.
                for (int cp = iter.NextCaseMapCP(); cp >= 0; cp = iter.NextCaseMapCP())
                {
                    int kind = csp.GetTypeOrIgnorable(cp);
                    if (kind < 0)
                    {
                        // Case-ignorable: left untouched, mode unchanged.
                        continue;
                    }

                    int mapped = titleNext
                                 ? csp.ToFullTitle(cp, iter, result, caseLocale)
                                 : csp.ToFullLower(cp, iter, result, caseLocale);

                    // After an uncased character the next one is title-cased.
                    titleNext = kind == 0;

                    if (iter.DidReachLimit && isIncremental)
                    {
                        // The mapping wanted to see beyond the context limit;
                        // stop here and wait for more input.
                        offsets.Start = iter.CaseMapCPStart;
                        return;
                    }

                    if (mapped < 0)
                    {
                        // The code point maps to itself.
                        continue;
                    }

                    int delta;
                    if (mapped <= UCaseProps.MAX_STRING_LENGTH)
                    {
                        // The mapping is the string left in 'result'.
                        delta         = iter.Replace(result.ToString());
                        result.Length = 0;
                    }
                    else
                    {
                        // The mapping is a single code point.
                        delta = iter.Replace(UTF16.ValueOf(mapped));
                    }

                    if (delta != 0)
                    {
                        offsets.Limit        += delta;
                        offsets.ContextLimit += delta;
                    }
                }

                offsets.Start = offsets.Limit;
            }
        }
        /// <summary>
        /// Implements <see cref="Transliterator.HandleTransliterate(IReplaceable, Position, bool)"/>.
        /// </summary>
        /// <remarks>
        /// At each position from <c>pos.Start</c> to <c>pos.Limit</c>, tries every form described in
        /// <c>spec</c>. As read by this method, <c>spec</c> is a sequence of records terminated by
        /// <c>END</c>; each record holds five header values (prefix length, suffix length, radix,
        /// minimum digit count, maximum digit count) followed by the prefix and suffix characters.
        /// A form matches when the prefix, an allowed number of digits in the given radix, and the
        /// suffix are all present; the matched span is then replaced by the code point whose value
        /// the digits encode.
        /// </remarks>
        /// <param name="text">Text that is scanned and modified in place.</param>
        /// <param name="pos">
        /// Range to process. On return <c>Limit</c> and <c>ContextLimit</c> reflect the change in text
        /// length and <c>Start</c> is the position at which scanning stopped.
        /// </param>
        /// <param name="isIncremental">
        /// When <c>true</c>, scanning stops at the first form that is only partially present because
        /// the range ended, leaving <c>pos.Start</c> at the beginning of that partial match.
        /// </param>
        protected override void HandleTransliterate(IReplaceable text,
                                                    Position pos, bool isIncremental)
        {
            int start = pos.Start;
            int limit = pos.Limit;
            int i, ipat;

            //loop:
            while (start < limit)
            {
                // Loop over the forms in spec[].  Exit this loop when we
                // match one of the specs.  Exit the outer loop if a
                // partial match is detected and isIncremental is true.
                for (ipat = 0; spec[ipat] != END;)
                {
                    // Read the header
                    int prefixLen = spec[ipat++];
                    int suffixLen = spec[ipat++];
                    int radix     = spec[ipat++];
                    int minDigits = spec[ipat++];
                    int maxDigits = spec[ipat++];

                    // ipat now indexes the first prefix character of this form.

                    // s is a copy of start that is advanced over the
                    // characters as we parse them.
                    int  s     = start;
                    bool match = true;

                    // Match the prefix, one code unit at a time.
                    for (i = 0; i < prefixLen; ++i)
                    {
                        if (s >= limit)
                        {
                            if (i > 0)
                            {
                                // We've already matched a character.  This is
                                // a partial match, so we return if in
                                // incremental mode.  In non-incremental mode,
                                // go to the next spec.
                                if (isIncremental)
                                {
                                    goto loop_break;
                                }
                                match = false;
                                break;
                            }
                        }
                        char c = text[s++];
                        if (c != spec[ipat + i])
                        {
                            match = false;
                            break;
                        }
                    }

                    if (match)
                    {
                        // Accumulate up to maxDigits digits of the given radix into u.
                        int u          = 0;
                        int digitCount = 0;
                        for (; ;)
                        {
                            if (s >= limit)
                            {
                                // Check for partial match in incremental mode.
                                if (s > start && isIncremental)
                                {
                                    goto loop_break;
                                }
                                break;
                            }
                            int ch    = text.Char32At(s);
                            int digit = UCharacter.Digit(ch, radix);
                            if (digit < 0)
                            {
                                break;
                            }
                            s += UTF16.GetCharCount(ch);
                            u  = (u * radix) + digit;
                            if (++digitCount == maxDigits)
                            {
                                break;
                            }
                        }

                        match = (digitCount >= minDigits);

                        if (match)
                        {
                            // Match the suffix, which follows the prefix in spec[].
                            for (i = 0; i < suffixLen; ++i)
                            {
                                if (s >= limit)
                                {
                                    // Check for partial match in incremental mode.
                                    if (s > start && isIncremental)
                                    {
                                        goto loop_break;
                                    }
                                    match = false;
                                    break;
                                }
                                char c = text[s++];
                                if (c != spec[ipat + prefixLen + i])
                                {
                                    match = false;
                                    break;
                                }
                            }

                            if (match)
                            {
                                // At this point, we have a match
                                string str = UTF16.ValueOf(u);
                                text.Replace(start, s, str);
                                // Shrink limit by the number of code units removed.
                                limit -= s - start - str.Length;
                                // The following break statement leaves the
                                // loop that is traversing the forms in
                                // spec[].  We then parse the next input
                                // character.
                                break;
                            }
                        }
                    }

                    // No match for this form: skip its pattern characters to
                    // reach the next record's header.
                    ipat += prefixLen + suffixLen;
                }

                // Step over one code point: either the replacement just written
                // or an input character that matched no form.
                if (start < limit)
                {
                    start += UTF16.GetCharCount(text.Char32At(start));
                }
            }
            // Target of the 'goto' statements above: leaves the outer loop
            // without advancing start past a partial match.
            loop_break : { }

            pos.ContextLimit += limit - pos.Limit;
            pos.Limit         = limit;
            pos.Start         = start;
        }
Beispiel #29
0
        /// <summary>
        /// Builds the source cache and source-string set for <paramref name="transform"/> by
        /// probing every code point from 0 through 0x10FFFF.
        /// </summary>
        /// <param name="transform">Transform that is applied to each probe string.</param>
        /// <param name="normalizer">
        /// Optional normalizer. When <c>null</c>, only the single-code-point probe is made and
        /// the cache starts out empty; otherwise decompositions are probed as well and the cache
        /// is seeded from the pattern <c>[:^ccc=0:]</c>.
        /// </param>
        public SourceTargetUtility(ITransform <string, string> transform, Normalizer2 normalizer)
        {
            this.transform = transform;

            bool useNormalizer = normalizer != null;

            // Seed set: "[:^ccc=0:]" (code points with a non-zero canonical combining class)
            // when normalizing, an empty set otherwise.
            sourceCache   = useNormalizer ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
            sourceStrings = new HashSet<string>();

            for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint)
            {
                // Probe 1: does the transform alter the code point on its own?
                string transformed    = transform.Transform(UTF16.ValueOf(codePoint));
                bool   changedByItself = !CharSequences.Equals(codePoint, transformed);
                if (changedByItself)
                {
                    sourceCache.Add(codePoint);
                }

                if (!useNormalizer)
                {
                    continue;
                }

                // NOTE: the decomposition is taken from the shared NFC instance, not from
                // the normalizer argument.
                string decomposition = NFC.GetDecomposition(codePoint);
                if (decomposition == null)
                {
                    continue;
                }

                // Probe 2: does the transform alter the decomposed form?
                transformed = transform.Transform(decomposition);
                if (!decomposition.Equals(transformed))
                {
                    sourceStrings.Add(decomposition);
                }

                // A code point not already cached is still recorded when the normalizer
                // does not treat it as inert. IsInert is only consulted when needed.
                if (!changedByItself && !normalizer.IsInert(codePoint))
                {
                    sourceCache.Add(codePoint);
                }

                // Earlier, disabled experiments in this spot (kept out of the build):
                //  * appending each trailing combining mark to the transformed text and
                //    caching the code point if the result no longer started with it;
                //  * adding every initial substring of the decomposition (cut on
                //    character boundaries) that the transform changes to sourceStrings.
            }

            sourceCache.Freeze();
        }
Beispiel #30
0
 /// <summary>
 /// Gets the 32-bit code point located at the supplied 16-bit offset in the text.
 /// The text is assumed to be held as 16-bit code units with surrogate pairs
 /// intermixed. When <paramref name="offset"/> addresses either the leading or the
 /// trailing code unit of a surrogate pair, the code point of the whole pair is
 /// returned.
 /// </summary>
 /// <param name="offset">An integer between 0 and <see cref="Length"/>-1 inclusive.</param>
 /// <returns>The 32-bit code point of the text at <paramref name="offset"/>.</returns>
 /// <stable>ICU 2.0</stable>
 public virtual int Char32At(int offset)
 {
     // Delegate the lookup to the UTF16 helper, reading from the backing buffer.
     int codePoint = UTF16.CharAt(buf, offset);
     return codePoint;
 }