Пример #1
0
 /// <summary>
 /// Copy constructor: initialises this descriptor from <paramref name="other"/>.
 /// </summary>
 /// <remarks>
 /// The start/end code points and the group number are copied by value, and
 /// <c>fIncludesSets</c> becomes a new list built from the contents of
 /// <c>other.fIncludesSets</c>. <c>fNext</c> is not assigned here, so the new
 /// descriptor is not linked into any list until the caller does so.
 /// </remarks>
 /// <param name="other">The descriptor whose fields are copied.</param>
 internal RangeDescriptor(RBBISetBuilder.RangeDescriptor other)
 {
     // Build a separate list object from the source's entries.
     this.fIncludesSets = new ArrayList(other.fIncludesSets);

     // Plain value fields.
     this.fNum       = other.fNum;
     this.fEndChar   = other.fEndChar;
     this.fStartChar = other.fStartChar;
 }
Пример #2
0
            // -------------------------------------------------------------------------------------
            //
            // RangeDescriptor::split()
            //
            // -------------------------------------------------------------------------------------
            /// <summary>
            /// Splits this range in two at <paramref name="where"/>. Afterwards this
            /// descriptor ends at <c>where - 1</c>, and a newly created descriptor that
            /// starts at <paramref name="where"/> and keeps the original end is linked
            /// in directly after it.
            /// </summary>
            /// <param name="where">
            /// First code point of the new upper range; asserted to satisfy
            /// <c>fStartChar &lt; where &lt;= fEndChar</c>.
            /// </param>
            internal void Split(int where)
            {
                IBM.ICU.Impl.Assert.Assrt(fStartChar < where && where <= fEndChar);

                // Take the copy first, while fEndChar still holds the original end:
                // the new descriptor inherits it and only needs its start adjusted.
                RBBISetBuilder.RangeDescriptor upperPart = new RBBISetBuilder.RangeDescriptor(this);
                upperPart.fStartChar = where;

                // Splice the new descriptor into the linked list right after this one.
                upperPart.fNext = this.fNext;
                this.fNext      = upperPart;

                // Finally shrink this descriptor to the lower part.
                this.fEndChar = where - 1;

                // TODO: fIncludesSets is not updated here. Presumably acceptable
                // because the sets have not been populated at the point of the
                // split, but this should be verified.
            }
Пример #3
0
        // ------------------------------------------------------------------------
        //
        // Build - build the list of non-overlapping character ranges
        // from the Unicode Sets.
        //
        // ------------------------------------------------------------------------
        /// <summary>
        /// Builds the list of non-overlapping character ranges from the Unicode sets
        /// in <c>fRB.fUSetNodes</c>, assigns a group number to every range, and fills
        /// <c>fTrie</c> with the range-to-group-number mapping.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Start from one range covering 0..0x10FFFF and split it at the boundaries
        ///    of every range of every input set, recording on each resulting range
        ///    which set nodes contain it.
        /// 2. Give ranges that belong to exactly the same sets the same group number.
        /// 3. Add the reserved values 1 / 2 to sets containing the strings "eof" / "bof".
        /// 4. Create the trie builder and set every range to its group number.
        /// Debug output is printed when <c>fRB.fDebugEnv</c> contains the matching
        /// keyword ("usets", "range", "rgroup", "esets").
        /// </remarks>
        internal void Build()
        {
            RBBINode usetNode;

            RBBISetBuilder.RangeDescriptor rlRange;

            // Debug keywords are matched ordinally (plain character comparison),
            // not with the current culture's linguistic rules.
            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("usets", StringComparison.Ordinal) >= 0)
            {
                PrintSets();
            }

            // Initialize the process by creating a single range encompassing all
            // characters, which is in no sets.
            fRangeList            = new RBBISetBuilder.RangeDescriptor();
            fRangeList.fStartChar = 0;
            fRangeList.fEndChar   = 0x10ffff;

            //
            // Find the set of non-overlapping ranges of characters
            //
            IIterator ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());

            while (ni.HasNext())
            {
                usetNode = (RBBINode)ni.Next();

                UnicodeSet inputSet           = usetNode.fInputSet;
                int        inputSetRangeCount = inputSet.GetRangeCount();
                int        inputSetRangeIndex = 0;
                rlRange = fRangeList;

                for (;;)
                {
                    if (inputSetRangeIndex >= inputSetRangeCount)
                    {
                        break;
                    }
                    int inputSetRangeBegin = inputSet.GetRangeStart(inputSetRangeIndex);
                    int inputSetRangeEnd   = inputSet.GetRangeEnd(inputSetRangeIndex);

                    // Skip over ranges from the range list that are completely
                    // below the current range from the input Unicode set.
                    while (rlRange.fEndChar < inputSetRangeBegin)
                    {
                        rlRange = rlRange.fNext;
                    }

                    // If the range-list range starts before the start of the
                    // Unicode set range, split it in two: one part wholly outside
                    // the Unicode set, the other containing the rest. Then restart
                    // the loop body; the lower part is skipped by the loop above.
                    if (rlRange.fStartChar < inputSetRangeBegin)
                    {
                        rlRange.Split(inputSetRangeBegin);
                        continue;
                    }

                    // Same thing at the end of the ranges: if the range-list range
                    // extends past the end of the Unicode set range, split it so
                    // that the first part is wholly inside the Unicode set.
                    if (rlRange.fEndChar > inputSetRangeEnd)
                    {
                        rlRange.Split(inputSetRangeEnd + 1);
                    }

                    // The current rlRange is now entirely within the Unicode set
                    // range. Add this set node to the range's list of sets, once.
                    if (rlRange.fIncludesSets.IndexOf(usetNode) == -1)
                    {
                        ILOG.J2CsMapping.Collections.Generics.Collections.Add(rlRange.fIncludesSets, usetNode);
                    }

                    // Advance over ranges that we are finished with.
                    if (inputSetRangeEnd == rlRange.fEndChar)
                    {
                        inputSetRangeIndex++;
                    }
                    rlRange = rlRange.fNext;
                }
            }

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("range", StringComparison.Ordinal) >= 0)
            {
                PrintRanges();
            }

            //
            // Group the above ranges, with each group consisting of one or more
            // ranges that are in exactly the same set of original UnicodeSets.
            // The groups are numbered, and these group numbers are the set of
            // input symbols recognized by the run-time state machine.
            //
            // Numbering: # 0 (state table column 0) is unused.
            // # 1 is reserved - table column 1 is for end-of-input
            // # 2 is reserved - table column 2 is for beginning-of-input
            // # 3 is the first range list.
            //
            RBBISetBuilder.RangeDescriptor rlSearchRange;
            for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
            {
                int includedSetCount = rlRange.fIncludesSets.Count;

                for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext)
                {
                    // Compare the two set lists element by element, in order.
                    // Calling Equals() on the lists themselves would only test
                    // whether they are the same list object, which is never true
                    // for two distinct ranges, so no ranges would ever be grouped.
                    bool sameSets = rlSearchRange.fIncludesSets.Count == includedSetCount;
                    for (int i = 0; sameSets && i < includedSetCount; i++)
                    {
                        sameSets = object.Equals(rlRange.fIncludesSets[i], rlSearchRange.fIncludesSets[i]);
                    }

                    if (sameSets)
                    {
                        rlRange.fNum = rlSearchRange.fNum;
                        break;
                    }
                }
                if (rlRange.fNum == 0)
                {
                    // No earlier range has the same sets: open a new group.
                    fGroupCount++;
                    rlRange.fNum = fGroupCount + 2;
                    rlRange.SetDictionaryFlag();
                    AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
                }
            }

            // Handle input sets that contain the special string {eof}.
            // Column 1 of the state table is reserved for EOF on input.
            // Column 2 is reserved for before-the-start-input.
            // (This column can be optimized away later if there are no rule
            // references to {bof}.)
            // Add this column value (1 or 2) to the equivalent expression
            // subtree for each UnicodeSet that contains the string {eof} or {bof}.
            // Because {bof} and {eof} are not characters in the normal sense,
            // they don't affect the computation of ranges or the trie.

            String eofString = "eof";
            String bofString = "bof";

            ni = new ILOG.J2CsMapping.Collections.IteratorAdapter(fRB.fUSetNodes.GetEnumerator());
            while (ni.HasNext())
            {
                usetNode = (RBBINode)ni.Next();
                UnicodeSet inputSet_0 = usetNode.fInputSet;
                if (inputSet_0.Contains(eofString))
                {
                    AddValToSet(usetNode, 1);
                }
                if (inputSet_0.Contains(bofString))
                {
                    AddValToSet(usetNode, 2);
                    fSawBOF = true;
                }
            }

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rgroup", StringComparison.Ordinal) >= 0)
            {
                PrintRangeGroups();
            }
            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("esets", StringComparison.Ordinal) >= 0)
            {
                PrintSets();
            }

            // IntTrieBuilder(int aliasdata[], int maxdatalength,
            // int initialvalue, int leadunitvalue,
            // boolean latin1linear)

            fTrie = new IntTrieBuilder(null,   // Data array (utrie will allocate one)
                                       100000, // Max Data Length
                                       0,      // Initial value for all code points
                                       0,      // Lead Surrogate unit value,
                                       true);  // Keep Latin 1 in separately.

            // Map every code point of each range to the range's group number.
            // fEndChar + 1 is passed as the second argument; presumably SetRange
            // treats it as an exclusive limit - confirm against IntTrieBuilder.
            for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext)
            {
                fTrie.SetRange(rlRange.fStartChar, rlRange.fEndChar + 1,
                               rlRange.fNum, true);
            }
        }