/// <summary>
/// Builds the list of non-overlapping character ranges from the Unicode
/// sets in <c>fRB.fUSetNodes</c>, groups ranges that belong to exactly the
/// same sets under a shared number, and loads the range-to-number mapping
/// into <c>fTrie</c>.
/// </summary>
internal void Build() {
    RBBISetBuilder.RangeDescriptor rlRange;

    // Ordinal comparison: the debug flags are fixed keywords, not
    // linguistic text, so the match must not depend on the current culture.
    if (fRB.fDebugEnv != null
            && fRB.fDebugEnv.IndexOf("usets", System.StringComparison.Ordinal) >= 0) {
        PrintSets();
    }

    // Initialize the process by creating a single range encompassing all
    // characters, which is in no sets.
    fRangeList = new RBBISetBuilder.RangeDescriptor();
    fRangeList.fStartChar = 0;
    fRangeList.fEndChar = 0x10ffff;

    //
    // Find the set of non-overlapping ranges of characters.
    //
    foreach (RBBINode usetNode in fRB.fUSetNodes) {
        UnicodeSet inputSet = usetNode.fInputSet;
        int inputSetRangeCount = inputSet.GetRangeCount();
        int inputSetRangeIndex = 0;
        rlRange = fRangeList;

        while (inputSetRangeIndex < inputSetRangeCount) {
            int inputSetRangeBegin = inputSet.GetRangeStart(inputSetRangeIndex);
            int inputSetRangeEnd = inputSet.GetRangeEnd(inputSetRangeIndex);

            // Skip over ranges from the range list that are completely
            // below the current range from the input Unicode set.
            while (rlRange.fEndChar < inputSetRangeBegin) {
                rlRange = rlRange.fNext;
            }

            // If the range-list range starts before the Unicode set range,
            // split it in two: one part wholly outside the Unicode set and
            // the other containing the rest. Then restart the loop; the
            // part before the set is skipped by the loop above.
            if (rlRange.fStartChar < inputSetRangeBegin) {
                rlRange.Split(inputSetRangeBegin);
                continue;
            }

            // Same thing at the end of the ranges: if the range-list range
            // extends past the end of the Unicode set range, split it so
            // that the first part is wholly inside the Unicode set.
            if (rlRange.fEndChar > inputSetRangeEnd) {
                rlRange.Split(inputSetRangeEnd + 1);
            }

            // The current rlRange is now entirely within the Unicode set
            // range. Record this set in the range's list of sets (once).
            if (rlRange.fIncludesSets.IndexOf(usetNode) == -1) {
                ILOG.J2CsMapping.Collections.Generics.Collections.Add(rlRange.fIncludesSets, usetNode);
            }

            // Advance over ranges that we are finished with.
            if (inputSetRangeEnd == rlRange.fEndChar) {
                inputSetRangeIndex++;
            }
            rlRange = rlRange.fNext;
        }
    }

    if (fRB.fDebugEnv != null
            && fRB.fDebugEnv.IndexOf("range", System.StringComparison.Ordinal) >= 0) {
        PrintRanges();
    }

    //
    // Group the above ranges, with each group consisting of one or more
    // ranges that are in exactly the same set of original UnicodeSets.
    // The groups are numbered, and these group numbers are the set of
    // input symbols recognized by the run-time state machine.
    //
    // Numbering: # 0 (state table column 0) is unused.
    //            # 1 is reserved - table column 1 is for end-of-input
    //            # 2 is reserved - table column 2 is for beginning-of-input
    //            # 3 is the first range list.
    //
    RBBISetBuilder.RangeDescriptor rlSearchRange;
    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
        for (rlSearchRange = fRangeList; rlSearchRange != rlRange; rlSearchRange = rlSearchRange.fNext) {
            // Compare the lists by content. Calling Equals() on a .NET
            // list only tests reference identity, which would never match
            // two different ranges and would give every range its own group.
            if (HaveSameIncludedSets(rlRange.fIncludesSets, rlSearchRange.fIncludesSets)) {
                rlRange.fNum = rlSearchRange.fNum;
                break;
            }
        }
        if (rlRange.fNum == 0) {
            // No earlier range has the same sets: start a new group.
            fGroupCount++;
            rlRange.fNum = fGroupCount + 2;
            rlRange.SetDictionaryFlag();
            AddValToSets(rlRange.fIncludesSets, fGroupCount + 2);
        }
    }

    // Handle input sets that contain the special strings {eof} and {bof}.
    // Column 1 of the state table is reserved for EOF on input.
    // Column 2 is reserved for before-the-start-of-input.
    // (This column can be optimized away later if there are no rule
    // references to {bof}.)
    // Add the column value (1 or 2) to the equivalent expression subtree
    // for each UnicodeSet that contains the corresponding string.
    // Because {bof} and {eof} are not characters in the normal sense,
    // they don't affect the computation of ranges or the trie.
    String eofString = "eof";
    String bofString = "bof";
    foreach (RBBINode usetNode in fRB.fUSetNodes) {
        UnicodeSet inputSet = usetNode.fInputSet;
        if (inputSet.Contains(eofString)) {
            AddValToSet(usetNode, 1);
        }
        if (inputSet.Contains(bofString)) {
            AddValToSet(usetNode, 2);
            fSawBOF = true;
        }
    }

    if (fRB.fDebugEnv != null
            && fRB.fDebugEnv.IndexOf("rgroup", System.StringComparison.Ordinal) >= 0) {
        PrintRangeGroups();
    }
    if (fRB.fDebugEnv != null
            && fRB.fDebugEnv.IndexOf("esets", System.StringComparison.Ordinal) >= 0) {
        PrintSets();
    }

    // IntTrieBuilder(int aliasdata[], int maxdatalength,
    //                int initialvalue, int leadunitvalue,
    //                boolean latin1linear)
    fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
            100000, // Max data length
            0, // Initial value for all code points
            0, // Lead surrogate unit value
            true); // Keep Latin 1 in separately.

    for (rlRange = fRangeList; rlRange != null; rlRange = rlRange.fNext) {
        // fEndChar is inclusive, hence the + 1 for the limit argument.
        fTrie.SetRange(rlRange.fStartChar, rlRange.fEndChar + 1, rlRange.fNum, true);
    }
}

/// <summary>
/// Returns true when both sequences contain equal elements in the same
/// order (the contract of Java's <c>List.equals</c>, which the range
/// grouping in <see cref="Build"/> relies on).
/// </summary>
/// <param name="first">First list of set nodes; may be null.</param>
/// <param name="second">Second list of set nodes; may be null.</param>
/// <returns>True if the sequences are element-wise equal.</returns>
private static bool HaveSameIncludedSets(System.Collections.IEnumerable first,
        System.Collections.IEnumerable second) {
    if (object.ReferenceEquals(first, second)) {
        return true;
    }
    if (first == null || second == null) {
        return false;
    }

    System.Collections.IEnumerator left = first.GetEnumerator();
    System.Collections.IEnumerator right = second.GetEnumerator();
    try {
        while (true) {
            bool hasLeft = left.MoveNext();
            bool hasRight = right.MoveNext();
            if (hasLeft != hasRight) {
                return false; // different lengths
            }
            if (!hasLeft) {
                return true; // both exhausted with no mismatch
            }
            if (!object.Equals(left.Current, right.Current)) {
                return false;
            }
        }
    } finally {
        // Generic enumerators are disposable; release them if so.
        System.IDisposable leftDisposable = left as System.IDisposable;
        if (leftDisposable != null) {
            leftDisposable.Dispose();
        }
        System.IDisposable rightDisposable = right as System.IDisposable;
        if (rightDisposable != null) {
            rightDisposable.Dispose();
        }
    }
}
// Invariant: stringIterator is null when there are no (more) strings
// remaining.
// NOTE(review): this invariant appears to describe a stringIterator member
// that is not visible in this part of the file - confirm its placement.

/// <summary>
/// Loads the bounds of one range of <c>set</c> into the iteration state:
/// <c>nextElement</c> receives the value of <c>set.GetRangeStart</c> and
/// <c>endElement</c> receives the value of <c>set.GetRangeEnd</c>.
/// </summary>
/// <param name="range_0">Index of the range to load from <c>set</c>.</param>
/// <exclude/>
protected internal void LoadRange(int range_0) {
    this.nextElement = this.set.GetRangeStart(range_0);
    this.endElement = this.set.GetRangeEnd(range_0);
}