/// <summary>
        /// Implements the <c>HandleTransliterate</c> hook of <c>IBM.ICU.Text.Transliterator</c>.
        /// Converts the text between <c>offsets.start</c> and <c>offsets.limit</c> one segment
        /// at a time (per the in-code notes, each segment is normalized) and then adjusts
        /// <paramref name="offsets"/> for any change in text length.
        /// </summary>
        ///
        /// <param name="text">the text that is converted in place</param>
        /// <param name="offsets">the range to process; on return <c>limit</c> and
        /// <c>contextLimit</c> are shifted by the accumulated length change and <c>start</c>
        /// is moved to the end of the last converted segment</param>
        /// <param name="isIncremental">when <c>false</c> the trailing segment is always
        /// converted; when <c>true</c> it is only accepted if <c>Convert</c> does not report
        /// <c>Int32.MinValue</c> for it</param>
        protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position offsets,
                                                             bool isIncremental)
        {
            int start = offsets.start;
            int limit = offsets.limit;

            if (start >= limit)
            {
                return;
            }

            int overallDelta = 0;

            // Walk through the string looking for safe characters.
            // Whenever we hit one, convert from the start of the last
            // safe character up to just before the next safe character.
            // Also, if we hit the end and we are not in incremental mode,
            // convert through to the end.

            // TODO: add QuickCheck, so we rarely convert OK stuff

            int lastSafe = start;     // go back to start in any event
            int cp;

            // Begin scanning at the code point AFTER the one at 'start'. A fixed
            // 'start + 1' lands on the trail surrogate when the first character is
            // supplementary, and a segment boundary there would split the pair.
            int scanStart = start + IBM.ICU.Text.UTF16.GetCharCount(text.Char32At(start));

            for (int i = scanStart; i < limit; i += IBM.ICU.Text.UTF16.GetCharCount(cp))
            {
                cp = text.Char32At(i);
                if (IBM.ICU.Lang.UCharacter.GetCombiningClass(cp) == 0 &&
                    !unsafeStart.Contains(cp))
                {
                    // 'cp' is a safe boundary: convert everything before it and
                    // shift every index by the resulting change in length.
                    int delta = Convert(text, lastSafe, i, null);
                    i            += delta;
                    limit        += delta;
                    overallDelta += delta;
                    lastSafe      = i;
                }
            }
            if (!isIncremental)
            {
                // Not incremental: the remaining tail is always converted.
                int delta_0 = Convert(text, lastSafe, limit, null);
                overallDelta += delta_0;
                lastSafe      = limit + delta_0;
            }
            else
            {
                // We are incremental, so accept the last characters IF they turn
                // into skippables. Int32.MinValue is treated as "not accepted".
                int delta_1 = Convert(text, lastSafe, limit, skippable);
                if (delta_1 != Int32.MinValue)
                {
                    overallDelta += delta_1;
                    lastSafe      = limit + delta_1;
                }
            }
            offsets.contextLimit += overallDelta;
            offsets.limit        += overallDelta;
            offsets.start         = lastSafe;
        }
Exemple #2
0
            /// <summary>
            /// Tests whether the code point of <paramref name="s"/> at <c>p.index</c> is
            /// contained in <c>source</c>.
            /// </summary>
            /// <param name="s">the string being matched</param>
            /// <param name="p">the current position; <c>index</c> is advanced past the code
            /// point on success, otherwise <c>SetMax("codePoint")</c> is called on it</param>
            /// <returns><c>true</c> when the code point is in <c>source</c></returns>
            protected internal override bool Match(String s, Pick.Position p)
            {
                int codePoint = IBM.ICU.Text.UTF16.CharAt(s, p.index);

                // Failure path first: record the miss and leave the index untouched.
                if (!source.Contains(codePoint))
                {
                    p.SetMax("codePoint");
                    return false;
                }

                // Success: step over the whole code point (one or two code units).
                p.index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
                return true;
            }
Exemple #3
0
        // private methods ------------------------------------------------------

        /// <summary>
        /// Finds the index of the first delimiter at or after <paramref name="offset"/>.
        /// </summary>
        ///
        /// <param name="offset">index into the source string at which the search begins</param>
        /// <returns>the index of the delimiter found, or <c>-1 - m_length_</c> when
        /// <paramref name="offset"/> is negative or no delimiter is found before
        /// <c>m_length_</c></returns>
        private int GetNextDelimiter(int offset)
        {
            int notFound = -1 - m_length_;

            if (offset < 0)
            {
                return notFound;
            }

            // With no lookup table, membership is tested against the delimiter set;
            // otherwise the table is indexed directly by code point.
            bool useSet = (delims == null);
            int  pos    = offset;

            // The character at 'offset' is always examined at least once.
            do
            {
                int ch = IBM.ICU.Text.UTF16.CharAt(m_source_, pos);
                bool isDelimiter = useSet
                    ? m_delimiters_.Contains(ch)
                    : (ch < delims.Length && delims[ch]);
                if (isDelimiter)
                {
                    break;
                }
                pos++;
            } while (pos < m_length_);

            return (pos < m_length_) ? pos : notFound;
        }
Exemple #4
0
        /// <summary>
        /// Cross-checks the per-script metadata reported by <c>UScript</c> (usage, sample
        /// string, right-to-left, cased, breaks-between-letters) against character sets
        /// built from Unicode properties.
        /// </summary>
        public void TestScriptMetadata()
        {
            // Characters still waiting to be claimed by a right-to-left script.
            UnicodeSet rtl = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]");
            // So far, sample characters are uppercase.
            // Georgian is special.
            UnicodeSet cased = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]");

            for (int script = 0; script < UScript.CodeLimit; ++script)
            {
                String      shortName   = UScript.GetShortName(script);
                ScriptUsage scriptUsage = UScript.GetUsage(script);
                String      sampleText  = UScript.GetSampleString(script);
                UnicodeSet  members     = new UnicodeSet();
                members.ApplyInt32PropertyValue(UProperty.Script, script);

                if (scriptUsage == ScriptUsage.NotEncoded)
                {
                    // A script that is not encoded must report no metadata and no characters.
                    assertTrue(shortName + " not encoded, no sample", sampleText.Length == 0);
                    assertFalse(shortName + " not encoded, not RTL", UScript.IsRightToLeft(script));
                    assertFalse(shortName + " not encoded, not LB letters", UScript.BreaksBetweenLetters(script));
                    assertFalse(shortName + " not encoded, not cased", UScript.IsCased(script));
                    assertTrue(shortName + " not encoded, no characters", members.IsEmpty);
                    continue;
                }

                // Encoded script: the sample's first code point must agree with the flags.
                assertFalse(shortName + " encoded, has a sample character", sampleText.Length == 0);
                int sampleCodePoint = sampleText.CodePointAt(0);
                int expectedScript  = GetCharScript(script);
                assertEquals(shortName + " script(sample(script))",
                             expectedScript, UScript.GetScript(sampleCodePoint));
                assertEquals(shortName + " RTL vs. set", rtl.Contains(sampleCodePoint), UScript.IsRightToLeft(script));
                assertEquals(shortName + " cased vs. set", cased.Contains(sampleCodePoint), UScript.IsCased(script));
                assertEquals(shortName + " encoded, has characters", script == expectedScript, !members.IsEmpty);

                // Strike this script's characters from the sets it accounts for.
                if (UScript.IsRightToLeft(script))
                {
                    rtl.RemoveAll(members);
                }
                if (UScript.IsCased(script))
                {
                    cased.RemoveAll(members);
                }
            }

            // Every RTL / cased character must have been claimed by some script.
            assertEquals("no remaining RTL characters", "[]", rtl.ToPattern(true));
            assertEquals("no remaining cased characters", "[]", cased.ToPattern(true));

            assertTrue("Hani breaks between letters", UScript.BreaksBetweenLetters(UScript.Han));
            assertTrue("Thai breaks between letters", UScript.BreaksBetweenLetters(UScript.Thai));
            assertFalse("Latn does not break between letters", UScript.BreaksBetweenLetters(UScript.Latin));
        }
 /// <summary>
 /// Tells whether a token has the shape of a keyword: its first character is in
 /// <c>START_CHARS</c> and every following character is in <c>CONT_CHARS</c>.
 /// </summary>
 ///
 /// <param name="token">the token to be checked</param>
 /// <returns>true if the token is a valid keyword; false for an empty token.</returns>
 private static bool IsValidKeyword(String token)
 {
     // An empty token, or one with a bad first character, is rejected outright.
     if (token.Length == 0 || !START_CHARS.Contains(token[0]))
     {
         return false;
     }

     int index = 1;
     while (index < token.Length)
     {
         if (!CONT_CHARS.Contains(token[index]))
         {
             return false;
         }
         ++index;
     }
     return true;
 }
        /// <summary>
        /// Builds a copy of <paramref name="source"/> without the code points contained
        /// in <paramref name="removals"/>.
        /// </summary>
        /// <param name="source">the string to filter</param>
        /// <param name="removals">the set of code points to drop</param>
        /// <returns>the filtered string</returns>
        public static String Remove(String source, UnicodeSet removals)
        {
            StringBuilder kept  = new StringBuilder();
            int           index = 0;

            while (index < source.Length)
            {
                int codePoint = IBM.ICU.Text.UTF16.CharAt(source, index);
                if (!removals.Contains(codePoint))
                {
                    IBM.ICU.Text.UTF16.Append(kept, codePoint);
                }
                // Advance by whole code points (one or two code units).
                index += IBM.ICU.Text.UTF16.GetCharCount(codePoint);
            }
            return kept.ToString();
        }
        /// <summary>
        /// Appends one code point to <c>target</c>. Code points in <c>toQuote</c> are written
        /// through <c>quoter</c> when one is set, otherwise as a <c>\uXXXX</c> / <c>\UXXXXXXXX</c>
        /// escape. Any other code point is appended as-is, preceded by a backslash when it is
        /// one of the listed syntax characters or is contained in <c>patternWhitespace</c>.
        /// </summary>
        /// <param name="codePoint">the code point to append</param>
        /// <returns>this instance</returns>
        internal PrettyPrinter AppendQuoted(int codePoint)
        {
            if (toQuote.Contains(codePoint))
            {
                if (quoter != null)
                {
                    target.Append(quoter.Transliterate(IBM.ICU.Text.UTF16.ValueOf(codePoint)));
                }
                else
                {
                    // Above the BMP the 8-digit form is used, otherwise the 4-digit form.
                    bool supplementary = codePoint > 0xFFFF;
                    target.Append(supplementary ? "\\U" : "\\u");
                    target.Append(IBM.ICU.Impl.Utility.Hex(codePoint, supplementary ? 8 : 4));
                }
                return this;
            }

            bool needsBackslash;
            switch (codePoint)
            {
            case '[':     // SET_OPEN
            case ']':     // SET_CLOSE
            case '-':     // HYPHEN
            case '^':     // COMPLEMENT
            case '&':     // INTERSECTION
            case '\\':    // BACKSLASH
            case '{':
            case '}':
            case '$':
            case ':':
                needsBackslash = true;
                break;

            default:
                // Whitespace is escaped as well.
                needsBackslash = patternWhitespace.Contains(codePoint);
                break;
            }

            if (needsBackslash)
            {
                target.Append('\\');
            }
            IBM.ICU.Text.UTF16.Append(target, codePoint);
            return this;
        }
Exemple #8
0
        /// <summary>
        /// Builds a copy of <paramref name="source"/> in which every code point contained in
        /// <paramref name="set"/> is replaced by <paramref name="replacement"/>.
        /// </summary>
        /// <param name="source">the string to scan</param>
        /// <param name="set">the code points to replace</param>
        /// <param name="replacement">the text substituted for each matching code point</param>
        /// <returns>the resulting string</returns>
        public static String ReplaceAll(String source, UnicodeSet set, String replacement)
        {
            StringBuffer output = new StringBuffer();
            int          pos    = 0;

            while (pos < source.Length)
            {
                int codePoint = UTF16.CharAt(source, pos);
                if (!set.Contains(codePoint))
                {
                    UTF16.Append(output, codePoint);
                }
                else
                {
                    output.Append(replacement);
                }
                // Advance by whole code points (one or two code units).
                pos += UTF16.GetCharCount(codePoint);
            }
            return output.ToString();
        }
        /// <summary>
        /// Returns <c>true</c> if the text between <paramref name="current"/> and
        /// <paramref name="next"/> (both relative to <c>start</c>) begins with an emoji
        /// character or emoji sequence.
        /// </summary>
        private bool IsEmoji(int current, int next)
        {
            int begin     = start + current;
            int end       = start + next;
            int codepoint = UTF16.CharAt(text, 0, end, begin);

            if (!EMOJI.Contains(codepoint))
            {
                return false;
            }
            if (!EMOJI_RK.Contains(codepoint))
            {
                return true;
            }

            // In EMOJI_RK: only treated as emoji when U+FE0F or U+20E3 follows
            // immediately, i.e. there is evidence that it forms an emoji sequence.
            int trailer = begin + Character.CharCount(codepoint);
            if (trailer >= end)
            {
                return false;
            }
            return text[trailer] == 0xFE0F || text[trailer] == 0x20E3;
        }
	    // ---------------------------------------------------------------------------------
	    //
	    // Parse RBBI rules. The state machine for rules parsing is here.
	    // The state tables are hand-written in the file rbbirpt.txt,
	    // and converted to the form used here by a perl script, rbbicst.pl.
	    //
	    // The machine starts in state 1 and stops when the state becomes 0 or when
	    // DoParseActions() returns false. Afterwards a default reverse rule tree is
	    // installed if none was built, and optional debug output is printed depending
	    // on the contents of fRB.fDebugEnv ("scan", "symbols", "ptree").
	    //
	    // ---------------------------------------------------------------------------------
	    internal void Parse() {
	        int state;
	        RBBIRuleParseTable.RBBIRuleTableElement tableEl;
	
	        state = 1;
	        NextChar(fC);
	        //
	        // Main loop for the rule parsing state machine.
	        // Runs once per state transition.
	        // Each time through optionally performs, depending on the state table,
	        // - an advance to the next input char
	        // - an action to be performed.
	        // - pushing or popping a state to/from the local state return stack.
	        //
	        for (;;) {
	            // Quit if state == 0. This is the normal way to exit the state
	            // machine.
	            //
	            if (state == 0) {
	                break;
	            }
	
	            // Find the state table element that matches the input char from the
	            // rule, or the class of the input character. Start with the first
	            // table row for this state, then linearly scan forward until we find
	            // a row that matches the character. The last row for each state
	            // always matches all characters, so the search will stop there, if
	            // not before.
	            //
	            tableEl = IBM.ICU.Text.RBBIRuleParseTable.gRuleParseStateTable[state];
	            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0) {
	                System.Console.Out.WriteLine("char, line, col = (\'" + (char) fC.fChar
	                        + "\', " + fLineNum + ", " + fCharNum + "    state = "
	                        + tableEl.fStateName);
	            }
	
	            for (int tableRow = state;; tableRow++) { // loop over the state table rows associated with this state.
	                tableEl = IBM.ICU.Text.RBBIRuleParseTable.gRuleParseStateTable[tableRow];
	                if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0) {
	                    System.Console.Out.Write(".");
	                }
	                if (tableEl.fCharClass < 127 && fC.fEscaped == false
	                        && tableEl.fCharClass == fC.fChar) {
	                    // Table row specified an individual character, not a set,
	                    // and the input character is not escaped, and
	                    // the input character matched it.
	                    break;
	                }
	                if (tableEl.fCharClass == 255) {
	                    // Table row specified default, match anything character
	                    // class.
	                    break;
	                }
	                if (tableEl.fCharClass == 254 && fC.fEscaped) {
	                    // Table row specified "escaped" and the char was escaped.
	                    break;
	                }
	                if (tableEl.fCharClass == 253 && fC.fEscaped
	                        && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
	                    // Table row specified "escaped P" and the char is either
	                    // 'p' or 'P'.
	                    break;
	                }
	                if (tableEl.fCharClass == 252 && fC.fChar == (int) -1) {
	                    // Table row specified eof and we hit eof on the input.
	                    break;
	                }
	
	                if (tableEl.fCharClass >= 128 && tableEl.fCharClass < 240 && // Table specs a char class &&
	                        fC.fEscaped == false && // char is not escaped &&
	                        fC.fChar != (int) -1) { // char is not EOF
	                    // Classes 128..239 index into fRuleSets, offset by 128.
	                    UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
	                    if (uniset.Contains(fC.fChar)) {
	                        // Table row specified a character class, or set of
	                        // characters, and the current char matches it.
	                        break;
	                    }
	                }
	            }
	
	            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0) {
	                System.Console.Out.WriteLine("");
	            }
	            //
	            // We've found the row of the state table that matches the current
	            // input character from the rules string.
	            // Perform any action specified by this row in the state table.
	            if (DoParseActions(tableEl.fAction) == false) {
	                // Break out of the state machine loop if the action signalled
	                // some kind of error, or the action was to exit, which occurs
	                // on normal end-of-rules-input.
	                break;
	            }
	
	            if (tableEl.fPushState != 0) {
	                // Push the return state named by this row onto the state stack.
	                fStackPtr++;
	                if (fStackPtr >= kStackSize) {
	                    System.Console.Out
	                            .WriteLine("RBBIRuleScanner.parse() - state stack overflow.");
	                    // NOTE(review): presumably Error() does not return normally;
	                    // otherwise the store below would index past the stack - confirm.
	                    Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
	                }
	                fStack[fStackPtr] = tableEl.fPushState;
	            }
	
	            if (tableEl.fNextChar) {
	                NextChar(fC);
	            }
	
	            // Get the next state from the table entry, or from the
	            // state stack if the next state was specified as "pop" (255).
	            if (tableEl.fNextState != 255) {
	                state = tableEl.fNextState;
	            } else {
	                state = fStack[fStackPtr];
	                fStackPtr--;
	                // The check runs after the pop, so it fires once the pointer
	                // has dropped below slot 0.
	                if (fStackPtr < 0) {
	                    System.Console.Out
	                            .WriteLine("RBBIRuleScanner.parse() - state stack underflow.");
	                    Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
	                }
	            }
	
	        }
	
	        //
	        // If there were NO user specified reverse rules, set up the equivalent
	        // of ".*;": an opStar node whose left child is a setRef operand for kAny.
	        //
	        if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree] == null) {
	            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree] = PushNewNode(IBM.ICU.Text.RBBINode.opStar);
	            RBBINode operand = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
	            FindSetFor(kAny, operand, null);
	            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
	            operand.fParent = fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree];
	            // Undo the two PushNewNode() calls above on the node stack pointer.
	            fNodeStackPtr -= 2;
	        }
	
	        //
	        // Parsing of the input RBBI rules is complete.
	        // We now have a parse tree for the rule expressions
	        // and a list of all UnicodeSets that are referenced.
	        // The remainder is optional debug output.
	        //
	        if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("symbols") >= 0) {
	            fSymbolTable.RbbiSymtablePrint();
	        }
	        if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("ptree") >= 0) {
	            System.Console.Out.WriteLine("Completed Forward Rules Parse Tree...");
	            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fForwardTree].PrintTree(true);
	            System.Console.Out.WriteLine("\nCompleted Reverse Rules Parse Tree...");
	            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree].PrintTree(true);
	            System.Console.Out
	                    .WriteLine("\nCompleted Safe Point Forward Rules Parse Tree...");
	            // The safe-point trees may be absent, so they are null-checked before printing.
	            if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree] == null) {
	                System.Console.Out.WriteLine("  -- null -- ");
	            } else {
	                fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree].PrintTree(true);
	            }
	            System.Console.Out
	                    .WriteLine("\nCompleted Safe Point Reverse Rules Parse Tree...");
	            if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree] == null) {
	                System.Console.Out.WriteLine("  -- null -- ");
	            } else {
	                fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree].PrintTree(true);
	            }
	        }
	    }
        /// <summary>
        /// Implements the <c>HandleTransliterate</c> hook of <c>IBM.ICU.Text.Transliterator</c>.
        /// Scans the text between <c>offsets.start</c> and <c>offsets.limit</c> for a character
        /// name enclosed between the open pattern (<c>OPEN_PAT</c>) and <c>CLOSE_DELIM</c>, and
        /// replaces each such sequence whose name can be resolved with the character that
        /// <c>UCharacter.GetCharFromExtendedName</c> returns for it.
        /// </summary>
        ///
        /// <param name="text">the text that is scanned and modified in place</param>
        /// <param name="offsets">the range to process; on return <c>limit</c> and
        /// <c>contextLimit</c> reflect the change in text length and <c>start</c> is the new
        /// resume position</param>
        /// <param name="isIncremental">when <c>true</c> and an open-delimiter candidate is
        /// still pending, <c>offsets.start</c> is set to that candidate instead of the cursor</param>
        protected internal override void HandleTransliterate(Replaceable text, Transliterator.Position offsets,
                                                             bool isIncremental)
        {
            int maxLen = IBM.ICU.Impl.UCharacterName.GetInstance().GetMaxCharNameLength() + 1;     // allow for temporary trailing space

            // Accumulates the candidate character name between the delimiters.
            StringBuilder name = new StringBuilder(maxLen);

            // Get the legal character set
            UnicodeSet legal = new UnicodeSet();

            IBM.ICU.Impl.UCharacterName.GetInstance().GetCharNameCharacters(legal);

            int cursor = offsets.start;
            int limit  = offsets.limit;

            // Modes:
            // 0 - looking for open delimiter
            // 1 - after open delimiter
            int mode    = 0;
            int openPos = -1;     // open delim candidate pos

            int c;

            while (cursor < limit)
            {
                c = text.Char32At(cursor);

                switch (mode)
                {
                case 0:                  // looking for open delimiter
                    if (c == OPEN_DELIM) // quick check first
                    {
                        openPos = cursor;
                        // A non-negative result below 'limit' is taken as the index
                        // just past the matched open pattern.
                        int i = IBM.ICU.Impl.Utility.ParsePattern(OPEN_PAT, text, cursor, limit);
                        if (i >= 0 && i < limit)
                        {
                            mode        = 1;
                            name.Length = 0;
                            cursor      = i;
                            continue;     // *** reprocess char32At(cursor)
                        }
                    }
                    break;

                case 1:     // after open delimiter
                    // Look for legal chars. If \s+ is found, convert it
                    // to a single space. If closeDelimiter is found, exit
                    // the loop. If any other character is found, exit the
                    // loop. If the limit is reached, exit the loop.

                    // Convert \s+ => SPACE. This assumes there are no
                    // runs of >1 space characters in names.
                    if (IBM.ICU.Impl.UCharacterProperty.IsRuleWhiteSpace(c))
                    {
                        // Ignore leading whitespace
                        if (name.Length > 0 &&
                            name[name.Length - 1] != SPACE)
                        {
                            name.Append(SPACE);
                            // If we are too long then abort. maxLen includes
                            // temporary trailing space, so use '>'.
                            if (name.Length > maxLen)
                            {
                                mode = 0;
                            }
                        }
                        break;
                    }

                    if (c == CLOSE_DELIM)
                    {
                        int len = name.Length;

                        // Delete trailing space, if any
                        if (len > 0 && name[len - 1] == SPACE)
                        {
                            name.Length = --len;
                        }

                        // From here on 'c' holds the looked-up character (-1 on failure).
                        c = IBM.ICU.Lang.UCharacter.GetCharFromExtendedName(name.ToString());
                        if (c != -1)
                        {
                            // Lookup succeeded

                            // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
                            cursor++;     // advance over CLOSE_DELIM

                            String str = IBM.ICU.Text.UTF16.ValueOf(c);
                            text.Replace(openPos, cursor, str);

                            // Adjust indices for the change in the length of
                            // the string. Do not assume that str.length() ==
                            // 1, in case of surrogates.
                            int delta = cursor - openPos - str.Length;
                            cursor -= delta;
                            limit  -= delta;
                            // assert(cursor == openPos + str.length());
                        }
                        // If the lookup failed, we leave things as-is and
                        // still switch to mode 0 and continue.
                        mode    = 0;
                        openPos = -1; // close off candidate
                        continue;     // *** reprocess char32At(cursor)
                    }

                    if (legal.Contains(c))
                    {
                        IBM.ICU.Text.UTF16.Append(name, c);
                        // If we go past the longest possible name then abort.
                        // maxLen includes temporary trailing space, so use '>='.
                        if (name.Length >= maxLen)
                        {
                            mode = 0;
                        }
                    }

                    // Invalid character
                    else
                    {
                        --cursor;     // Backup and reprocess this character
                        mode = 0;
                    }

                    break;
                }

                // Step over the code point just handled (one or two code units).
                cursor += IBM.ICU.Text.UTF16.GetCharCount(c);
            }

            // Propagate the net change in text length to the caller's offsets.
            offsets.contextLimit += limit - offsets.limit;
            offsets.limit         = limit;
            // In incremental mode, only advance the cursor up to the last
            // open delimiter candidate.
            offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
        }
        /// <summary>
        /// Formats a set as a bracketed pattern string. Items are emitted in the order
        /// given by <c>ordering</c>, except that the code points the set shares with
        /// <c>sortAtEnd</c> are emitted last.
        /// </summary>
        ///
        /// <param name="uset">the set to format</param>
        /// <returns>formatted UnicodeSet</returns>
        public String ToPattern(UnicodeSet uset)
        {
            first = true;

            // The part of the set that is held back and written at the very end
            // (the original note calls this "the unassigned gorp").
            UnicodeSet trailingSet = new UnicodeSet(uset).RetainAll(sortAtEnd);

            // Make sure that comparison separates all strings, even canonically
            // equivalent ones.
            ILOG.J2CsMapping.Collections.ISet sortedItems = new SortedSet(ordering);

            UnicodeSetIterator rangeIter = new UnicodeSetIterator(uset);
            while (rangeIter.NextRange())
            {
                if (rangeIter.codepoint == IBM.ICU.Text.UnicodeSetIterator.IS_STRING)
                {
                    ILOG.J2CsMapping.Collections.Generics.Collections.Add(sortedItems, rangeIter.str0);
                    continue;
                }

                // Expand the range, skipping anything reserved for the end.
                for (int cp = rangeIter.codepoint; cp <= rangeIter.codepointEnd; ++cp)
                {
                    if (trailingSet.Contains(cp))
                    {
                        continue;
                    }
                    ILOG.J2CsMapping.Collections.Generics.Collections.Add(sortedItems, IBM.ICU.Text.UTF16.ValueOf(cp));
                }
            }

            target.Length = 0;
            target.Append("[");

            IIterator itemIter = new ILOG.J2CsMapping.Collections.IteratorAdapter(sortedItems.GetEnumerator());
            while (itemIter.HasNext())
            {
                AppendUnicodeSetItem((String)itemIter.Next());
            }

            // Add back the code points that were held for the end.
            UnicodeSetIterator tailIter = new UnicodeSetIterator(trailingSet);
            while (tailIter.Next())
            {
                AppendUnicodeSetItem(tailIter.codepoint);
            }

            FlushLast();
            target.Append("]");

            // Note: an earlier revision carried a commented-out round-trip check here
            // (re-parse the result and compare it with uset); it was never enabled.
            return target.ToString();
        }
        public int Next(StringBuilder buffer)
        {
            if (start >= limit)
            {
                return(DONE);
            }
            int status      = UNKNOWN;
            int lastQuote   = UNKNOWN;
            int quoteStatus = NONE;
            int hexCount    = 0;
            int hexValue    = 0;
            int cp;

            main : {
                for (int i = start; i < limit; i += IBM.ICU.Text.UTF16.GetCharCount(cp))
                {
                    cp = IBM.ICU.Text.UTF16.CharAt(pattern, i);
                    // if we are in a quote, then handle it.
                    switch (quoteStatus)
                    {
                    case SLASH_START:
                        switch (cp)
                        {
                        case 'u':
                            quoteStatus = HEX;
                            hexCount    = 4;
                            hexValue    = 0;
                            goto main;

                        case 'U':
                            quoteStatus = HEX;
                            hexCount    = 8;
                            hexValue    = 0;
                            goto main;

                        default:
                            if (usingSlash)
                            {
                                IBM.ICU.Text.UTF16.Append(buffer, cp);
                                quoteStatus = NONE;
                                goto main;
                            }
                            else
                            {
                                buffer.Append(BACK_SLASH);
                                quoteStatus = NONE;
                            }
                            break;
                        }
                        break;     // fall through to NONE

                    case HEX:
                        hexValue <<= 4;
                        hexValue  += cp;
                        switch (cp)
                        {
                        case '0':
                        case '1':
                        case '2':
                        case '3':
                        case '4':
                        case '5':
                        case '6':
                        case '7':
                        case '8':
                        case '9':
                            hexValue -= '0';
                            break;

                        case 'a':
                        case 'b':
                        case 'c':
                        case 'd':
                        case 'e':
                        case 'f':
                            hexValue -= 'a' - 10;
                            break;

                        case 'A':
                        case 'B':
                        case 'C':
                        case 'D':
                        case 'E':
                        case 'F':
                            hexValue -= 'A' - 10;
                            break;

                        default:
                            start = i;
                            return(BROKEN_ESCAPE);
                        }
                        --hexCount;
                        if (hexCount == 0)
                        {
                            quoteStatus = NONE;
                            IBM.ICU.Text.UTF16.Append(buffer, hexValue);
                        }
                        goto main;

                    case AFTER_QUOTE:
                        // see if we get another quote character
                        // if we just ended a quote BUT the following character is the
                        // lastQuote character, then we have a situation like
                        // '...''...', so we restart the quote
                        if (cp == lastQuote)
                        {
                            IBM.ICU.Text.UTF16.Append(buffer, cp);
                            quoteStatus = NORMAL_QUOTE;
                            goto main;
                        }
                        quoteStatus = NONE;
                        break;     // fall through to NONE

                    case START_QUOTE:
                        // if we are at the very start of a quote, and we hit another
                        // quote mark then we emit a literal quote character and end the
                        // quote
                        if (cp == lastQuote)
                        {
                            IBM.ICU.Text.UTF16.Append(buffer, cp);
                            quoteStatus = NONE;     // get out of quote, with no trace
                                                    // remaining
                            continue;
                        }
                        // otherwise get into quote
                        IBM.ICU.Text.UTF16.Append(buffer, cp);
                        quoteStatus = NORMAL_QUOTE;
                        goto main;

                    case NORMAL_QUOTE:
                        if (cp == lastQuote)
                        {
                            quoteStatus = AFTER_QUOTE;     // get out of quote
                            goto main;
                        }
                        IBM.ICU.Text.UTF16.Append(buffer, cp);
                        goto main;
                    }

                    if (ignorableCharacters.Contains(cp))
                    {
                        continue;
                    }
                    // do syntax characters
                    if (syntaxCharacters.Contains(cp))
                    {
                        if (status == UNKNOWN)
                        {
                            IBM.ICU.Text.UTF16.Append(buffer, cp);
                            start = i + IBM.ICU.Text.UTF16.GetCharCount(cp);
                            return(SYNTAX);
                        }
                        else         // LITERAL, so back up and break
                        {
                            start = i;
                            return(status);
                        }
                    }
                    // otherwise it is a literal; keep on going
                    status = LITERAL;
                    if (cp == BACK_SLASH)
                    {
                        quoteStatus = SLASH_START;
                        continue;
                    }
                    else if (usingQuote && cp == SINGLE_QUOTE)
                    {
                        lastQuote   = cp;
                        quoteStatus = START_QUOTE;
                        continue;
                    }
                    // normal literals
                    IBM.ICU.Text.UTF16.Append(buffer, cp);
                }
            }
gotomain:
            ;
            // handle final cleanup
            start = limit;
            switch (quoteStatus)
            {
            case HEX:
                status = BROKEN_ESCAPE;
                break;

            case SLASH_START:
                if (usingSlash)
                {
                    status = BROKEN_ESCAPE;
                }
                else
                {
                    buffer.Append(BACK_SLASH);
                }
                break;

            case START_QUOTE:
            case NORMAL_QUOTE:
                status = BROKEN_QUOTE;
                break;
            }
            return(status);
        }
        /// <summary>
        /// Produces a quoted form of a literal string using the current settings.
        /// Characters in the escape set are written through <c>AppendEscaped</c>;
        /// syntax, ignorable and extra-quoting characters (plus the backslash and
        /// the single quote when those mechanisms are enabled) are protected with
        /// a backslash, with single quotes, or with an escape, in that order of
        /// preference.
        /// </summary>
        ///
        /// <param name="str0">the literal text to quote</param>
        /// <returns>the quoted form of <paramref name="str0"/></returns>
        public String QuoteLiteral(String str0)
        {
            // Build the "must be protected" set on first use.
            if (needingQuoteCharacters == null)
            {
                needingQuoteCharacters = new UnicodeSet()
                                         .AddAll(syntaxCharacters)
                                         .AddAll(ignorableCharacters)
                                         .AddAll(extraQuotingCharacters);
                if (usingSlash)
                {
                    needingQuoteCharacters.Add(BACK_SLASH);
                }
                if (usingQuote)
                {
                    needingQuoteCharacters.Add(SINGLE_QUOTE);
                }
            }

            StringBuilder quoted     = new StringBuilder();
            int           quoteState = NO_QUOTE;
            int           pos        = 0;

            while (pos < str0.Length)
            {
                // Read one code point and step past it (one or two UTF-16 units).
                int cp = IBM.ICU.Text.UTF16.CharAt(str0, pos);
                pos += IBM.ICU.Text.UTF16.GetCharCount(cp);

                if (escapeCharacters.Contains(cp))
                {
                    // Escapes are written outside quotes: close a pending quote first.
                    if (quoteState == IN_QUOTE)
                    {
                        quoted.Append(SINGLE_QUOTE);
                        quoteState = NO_QUOTE;
                    }
                    AppendEscaped(quoted, cp);
                }
                else if (!needingQuoteCharacters.Contains(cp))
                {
                    // Ordinary character: close a pending quote, then copy it through.
                    if (quoteState == IN_QUOTE)
                    {
                        quoted.Append(SINGLE_QUOTE);
                        quoteState = NO_QUOTE;
                    }
                    IBM.ICU.Text.UTF16.Append(quoted, cp);
                }
                else if (quoteState == IN_QUOTE)
                {
                    // Already inside a quoted run: copy it, doubling a quote mark.
                    IBM.ICU.Text.UTF16.Append(quoted, cp);
                    if (usingQuote && cp == SINGLE_QUOTE)
                    {
                        quoted.Append(SINGLE_QUOTE);
                    }
                }
                else if (usingSlash)
                {
                    // Backslash protection takes precedence over opening a quote.
                    quoted.Append(BACK_SLASH);
                    IBM.ICU.Text.UTF16.Append(quoted, cp);
                }
                else if (usingQuote)
                {
                    quoted.Append(SINGLE_QUOTE);
                    if (cp == SINGLE_QUOTE)
                    {
                        // A lone quote mark becomes a doubled quote; no run is opened.
                        quoted.Append(SINGLE_QUOTE);
                    }
                    else
                    {
                        // Open a quoted run that later characters may extend.
                        IBM.ICU.Text.UTF16.Append(quoted, cp);
                        quoteState = IN_QUOTE;
                    }
                }
                else
                {
                    // Neither slash nor quote is available: fall back to \u or \U.
                    AppendEscaped(quoted, cp);
                }
            }

            // Close a quoted run left open at the end of the input.
            if (quoteState == IN_QUOTE)
            {
                quoted.Append(SINGLE_QUOTE);
            }
            return(quoted.ToString());
        }
Exemple #15
0
        // Compares the tailoring mapping (ce32) of code point c with its base
        // mapping (baseCE32). Prefix and contraction contexts are handed to the
        // dedicated compare/add helpers and replaced by their default mappings;
        // the remaining non-contextual mappings are then compared directly, and
        // Add(c) is called when they differ.
        private void Compare(int c, int ce32, int baseCE32)
        {
            // ---- prefix contexts ----
            if (Collation.IsPrefixCE32(ce32))
            {
                int tailoredCtx = Collation.IndexFromCE32(ce32);
                ce32 = data.GetFinalCE32(data.GetCE32FromContexts(tailoredCtx));
                if (!Collation.IsPrefixCE32(baseCE32))
                {
                    // Only the tailoring has prefixes.
                    AddPrefixes(data, c, data.contexts, tailoredCtx + 2);
                }
                else
                {
                    int baseCtx = Collation.IndexFromCE32(baseCE32);
                    baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseCtx));
                    ComparePrefixes(c, data.contexts, tailoredCtx + 2, baseData.contexts, baseCtx + 2);
                }
            }
            else if (Collation.IsPrefixCE32(baseCE32))
            {
                // Only the base has prefixes.
                int baseCtx = Collation.IndexFromCE32(baseCE32);
                baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseCtx));
                AddPrefixes(baseData, c, baseData.contexts, baseCtx + 2);
            }

            // ---- contraction contexts ----
            if (Collation.IsContractionCE32(ce32))
            {
                int tailoredCtx = Collation.IndexFromCE32(ce32);
                ce32 = (ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0
                       ? data.GetFinalCE32(data.GetCE32FromContexts(tailoredCtx))
                       : Collation.NO_CE32;
                if (!Collation.IsContractionCE32(baseCE32))
                {
                    // Only the tailoring has contractions.
                    AddContractions(c, data.contexts, tailoredCtx + 2);
                }
                else
                {
                    int baseCtx = Collation.IndexFromCE32(baseCE32);
                    baseCE32 = (baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0
                               ? baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseCtx))
                               : Collation.NO_CE32;
                    CompareContractions(c, data.contexts, tailoredCtx + 2, baseData.contexts, baseCtx + 2);
                }
            }
            else if (Collation.IsContractionCE32(baseCE32))
            {
                // Only the base has contractions.
                int baseCtx = Collation.IndexFromCE32(baseCE32);
                baseCE32 = baseData.GetFinalCE32(baseData.GetCE32FromContexts(baseCtx));
                AddContractions(c, baseData.contexts, baseCtx + 2);
            }

            // ---- special tags of the remaining mappings (-1 = not special) ----
            int tag = -1;
            if (Collation.IsSpecialCE32(ce32))
            {
                tag = Collation.TagFromCE32(ce32);
                Debug.Assert(tag != Collation.PREFIX_TAG);
                Debug.Assert(tag != Collation.CONTRACTION_TAG);
                // Currently, the tailoring data builder does not write offset tags.
                // They might be useful for saving space,
                // but they would complicate the builder,
                // and in tailorings we assume that performance of tailored characters is more important.
                Debug.Assert(tag != Collation.OFFSET_TAG);
            }

            int baseTag = -1;
            if (Collation.IsSpecialCE32(baseCE32))
            {
                baseTag = Collation.TagFromCE32(baseCE32);
                Debug.Assert(baseTag != Collation.PREFIX_TAG);
                Debug.Assert(baseTag != Collation.CONTRACTION_TAG);
            }

            // ---- non-contextual mappings, expansions, etc. ----
            if (baseTag == Collation.OFFSET_TAG)
            {
                // We might be comparing a tailoring CE which is a copy of
                // a base offset-tag CE, via the [optimize [set]] syntax
                // or when a single-character mapping was copied for tailored contractions.
                // Offset tags always result in long-primary CEs,
                // with common secondary/tertiary weights.
                if (!Collation.IsLongPrimaryCE32(ce32))
                {
                    Add(c);
                    return;
                }
                long offsetCE      = baseData.ces[Collation.IndexFromCE32(baseCE32)];
                long offsetPrimary = Collation.GetThreeBytePrimaryForOffsetData(c, offsetCE);
                if (Collation.PrimaryFromLongPrimaryCE32(ce32) != offsetPrimary)
                {
                    Add(c);
                    return;
                }
            }

            if (tag != baseTag)
            {
                Add(c);
                return;
            }

            if (tag == Collation.EXPANSION32_TAG)
            {
                // Expansions stored as CE32s: same length and identical units required.
                int count     = Collation.LengthFromCE32(ce32);
                int baseCount = Collation.LengthFromCE32(baseCE32);
                if (count != baseCount)
                {
                    Add(c);
                    return;
                }

                int tailoredStart = Collation.IndexFromCE32(ce32);
                int baseStart     = Collation.IndexFromCE32(baseCE32);
                for (int k = 0; k < count; ++k)
                {
                    if (data.ce32s[tailoredStart + k] != baseData.ce32s[baseStart + k])
                    {
                        Add(c);
                        return;
                    }
                }
            }
            else if (tag == Collation.EXPANSION_TAG)
            {
                // Expansions stored as 64-bit CEs: same length and identical CEs required.
                int count     = Collation.LengthFromCE32(ce32);
                int baseCount = Collation.LengthFromCE32(baseCE32);
                if (count != baseCount)
                {
                    Add(c);
                    return;
                }

                int tailoredStart = Collation.IndexFromCE32(ce32);
                int baseStart     = Collation.IndexFromCE32(baseCE32);
                for (int k = 0; k < count; ++k)
                {
                    if (data.ces[tailoredStart + k] != baseData.ces[baseStart + k])
                    {
                        Add(c);
                        return;
                    }
                }
            }
            else if (tag == Collation.HANGUL_TAG)
            {
                // c is added when any of its decomposed Jamo is in the tailored set.
                StringBuilder jamos     = new StringBuilder();
                int           jamoCount = Hangul.Decompose(c, jamos);
                bool          anyJamoTailored =
                    tailored.Contains(jamos[0]) || tailored.Contains(jamos[1]) ||
                    (jamoCount == 3 && tailored.Contains(jamos[2]));
                if (anyJamoTailored)
                {
                    Add(c);
                }
            }
            else if (ce32 != baseCE32)
            {
                Add(c);
            }
        }
Exemple #16
0
        /// <summary>
        /// Builds the list of non-overlapping character ranges from the rule
        /// builder's Unicode sets, assigns a group number to each distinct
        /// combination of sets, attaches the reserved {eof}/{bof} columns, and
        /// fills the trie that maps each code point to its group number.
        /// </summary>
        internal void Build()
        {
            RBBINode usetNode;

            RBBISetBuilder.RangeDescriptor rlRange;

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("usets") >= 0)
            {
                PrintSets();
            }

            // Initialize the process by creating a single range that encompasses
            // all characters and belongs to no sets.
            //
            fRangeList            = new RBBISetBuilder.RangeDescriptor();
            fRangeList.fStartChar = 0;
            fRangeList.fEndChar   = 0x10ffff;

            //
            // Find the set of non-overlapping ranges of characters
            //
            IIterator ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());

            while (ni.HasNext())
            {
                usetNode = (RBBINode)ni.Next();

                UnicodeSet inputSet           = usetNode.fInputSet;
                int        inputSetRangeCount = inputSet.GetRangeCount();
                int        inputSetRangeIndex = 0;
                rlRange = fRangeList;

                for (;;)
                {
                    if (inputSetRangeIndex >= inputSetRangeCount)
                    {
                        break;
                    }
                    int inputSetRangeBegin = inputSet
                                             .GetRangeStart(inputSetRangeIndex);
                    int inputSetRangeEnd = inputSet.GetRangeEnd(inputSetRangeIndex);

                    // Skip over ranges from the range list that are completely
                    // below the current range from the input Unicode set.
                    while (rlRange.fEndChar < inputSetRangeBegin)
                    {
                        rlRange = rlRange.fNext;
                    }

                    // If the range-list range starts before the Unicode set
                    // range, split it in two: one part wholly outside the set
                    // and one part containing the rest. Then continue the loop;
                    // the part before the split point is skipped on the next pass.
                    if (rlRange.fStartChar < inputSetRangeBegin)
                    {
                        rlRange.Split(inputSetRangeBegin);
                        continue;
                    }

                    // Same thing at the end of the ranges: if the range-list
                    // range extends past the end of the Unicode set range, split
                    // it so that the first part is wholly inside the set.
                    if (rlRange.fEndChar > inputSetRangeEnd)
                    {
                        rlRange.Split(inputSetRangeEnd + 1);
                    }

                    // The current rlRange is now entirely within the UnicodeSet
                    // range. Add this Unicode set to the list of sets for rlRange.
                    if (rlRange.fIncludesSets.IndexOf(usetNode) == -1)
                    {
                        ILOG.J2CsMapping.Collections.Generics.Collections.Add(rlRange.fIncludesSets, usetNode);
                    }

                    // Advance over ranges that we are finished with.
                    if (inputSetRangeEnd == rlRange.fEndChar)
                    {
                        inputSetRangeIndex++;
                    }
                    rlRange = rlRange.fNext;
                }
            }

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("range") >= 0)
            {
                PrintRanges();
            }

            //
            // Group the above ranges, with each group consisting of one or more
            // ranges that are in exactly the same set of original UnicodeSets.
            // The groups are numbered, and these group numbers are the set of
            // input symbols recognized by the run-time state machine.
            //
            // Numbering: # 0 (state table column 0) is unused.
            // # 1 is reserved - table column 1 is for end-of-input
            // # 2 is reserved - table column 2 is for beginning-in-input
            // # 3 is the first range list.
            //
            RBBISetBuilder.RangeDescriptor rlSearchRange;
            for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
            {
                for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext)
                {
                    // Compare the two membership lists element by element.
                    // Calling Equals on the lists themselves only tests reference
                    // identity on a standard .NET list (presumably what
                    // fIncludesSets is - confirm against RangeDescriptor), so
                    // distinct ranges would never be recognised as one group.
                    bool sameSets = rlRange.fIncludesSets.Count == rlSearchRange.fIncludesSets.Count;
                    for (int k = 0; sameSets && k < rlRange.fIncludesSets.Count; k++)
                    {
                        sameSets = object.Equals(rlRange.fIncludesSets[k], rlSearchRange.fIncludesSets[k]);
                    }
                    if (sameSets)
                    {
                        rlRange.fNum = rlSearchRange.fNum;
                        break;
                    }
                }
                if (rlRange.fNum == 0)
                {
                    // No earlier range has the same membership: start a new group.
                    fGroupCount++;
                    rlRange.fNum = fGroupCount + 2;
                    rlRange.SetDictionaryFlag();
                    AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
                }
            }

            // Handle input sets that contain the special strings {eof} and {bof}.
            // Column 1 of the state table is reserved for EOF on input.
            // Column 2 is reserved for before-the-start-input.
            // (This column can be optimized away later if there are no rule
            // references to {bof}.)
            // Add this column value (1 or 2) to the equivalent expression
            // subtree for each UnicodeSet that contains the string.
            // Because {bof} and {eof} are not characters in the normal sense,
            // they don't affect the computation of ranges or the trie.

            String eofString = "eof";
            String bofString = "bof";

            ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());
            while (ni.HasNext())
            {
                usetNode = (RBBINode)ni.Next();
                UnicodeSet inputSet_0 = usetNode.fInputSet;
                if (inputSet_0.Contains(eofString))
                {
                    AddValToSet(usetNode, 1);
                }
                if (inputSet_0.Contains(bofString))
                {
                    AddValToSet(usetNode, 2);
                    fSawBOF = true;
                }
            }

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rgroup") >= 0)
            {
                PrintRangeGroups();
            }
            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("esets") >= 0)
            {
                PrintSets();
            }

            // IntTrieBuilder(int aliasdata[], int maxdatalength,
            // int initialvalue, int leadunitvalue,
            // boolean latin1linear)

            fTrie = new IntTrieBuilder(null,   // Data array (utrie will allocate one)
                                       100000, // Max Data Length
                                       0,      // Initial value for all code points
                                       0,      // Lead Surrogate unit value,
                                       true);  // Keep Latin 1 in separately.

            // Record every range's group number in the trie; the end argument is
            // passed as fEndChar + 1.
            for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
            {
                fTrie.SetRange(rlRange.fStartChar, rlRange.fEndChar + 1,
                               rlRange.fNum, true);
            }
        }