// -------------------------------------------------------------------------
//
// PrintNode   Debugging aid: write a one-line summary of a single node
//             to the console.
//
//             A null node is reported as " -- null --". For any other
//             node the columns are, in order: serial number, type name
//             (looked up in nodeTypeNames), the serial numbers of the
//             parent, left child and right child (0 when that link is
//             null), fFirstPos and fVal. The text of a varRef node is
//             appended after the columns. The line is terminated in
//             both cases.
//
// -------------------------------------------------------------------------
// /CLOVER:OFF
static internal void PrintNode(RBBINode n) {
    if (n != null) {
        // Resolve the three tree links up front; a missing link prints as 0.
        int parentSerial = (n.fParent == null) ? 0 : n.fParent.fSerialNum;
        int leftSerial = (n.fLeftChild == null) ? 0 : n.fLeftChild.fSerialNum;
        int rightSerial = (n.fRightChild == null) ? 0 : n.fRightChild.fSerialNum;

        RBBINode.PrintInt(n.fSerialNum, 10);
        RBBINode.PrintString(nodeTypeNames[n.fType], 11);
        RBBINode.PrintInt(parentSerial, 11);
        RBBINode.PrintInt(leftSerial, 11);
        RBBINode.PrintInt(rightSerial, 12);
        RBBINode.PrintInt(n.fFirstPos, 12);
        RBBINode.PrintInt(n.fVal, 7);

        // Only variable references carry text worth showing here.
        if (n.fType == varRef) {
            System.Console.Out.Write(" " + n.fText);
        }
    } else {
        System.Console.Out.Write(" -- null --\n");
    }
    System.Console.Out.WriteLine("");
}
// ------------------------------------------------------------------------
//
// PrintRanges   Debugging aid: dump every range definition in fRangeList.
//
//               Each range is written on its own line as its number and
//               its start and end characters (as integers), followed by
//               the names of the sets it is included in. A set is named
//               after the varRef node found two parent links above its
//               uset node; when there is no such node it is shown as
//               "anon".
//
// ------------------------------------------------------------------------
// /CLOVER:OFF
internal void PrintRanges() {
    System.Console.Out.Write("\n\n Nonoverlapping Ranges ...\n");

    RBBISetBuilder.RangeDescriptor range = fRangeList;
    while (range != null) {
        System.Console.Out.Write(" " + range.fNum + " " + (int)range.fStartChar + "-" + (int)range.fEndChar);

        for (int idx = 0; idx < range.fIncludesSets.Count; idx++) {
            RBBINode setNode = (RBBINode)range.fIncludesSets[idx];

            // uset -> (parent) setRef -> (parent) varRef, when the set was named.
            RBBINode refNode = setNode.fParent;
            RBBINode varNode = (refNode == null) ? null : refNode.fParent;

            String label = "anon";
            if (varNode != null && varNode.fType == IBM.ICU.Text.RBBINode.varRef) {
                label = varNode.fText;
            }
            System.Console.Out.Write(label);
            System.Console.Out.Write(" ");
        }

        System.Console.Out.WriteLine("");
        range = range.fNext;
    }
}
// -------------------------------------------------------------------------
//
// CloneTree   Produce a copy of the subtree rooted at this node.
//
//             Variable references are not copied: the clone of a varRef
//             is the clone of its left child (the variable's definition).
//             uset nodes are not copied either; the same node is returned.
//             Every other node is duplicated with the copy constructor and
//             its children are cloned recursively and re-parented to the
//             copy.
//
//             Used to replicate the expression underneath variable
//             references in preparation for generating the DFA tables.
//
// -------------------------------------------------------------------------
internal RBBINode CloneTree() {
    // A variable reference is transparent: clone what it stands for.
    if (fType == RBBINode.varRef) {
        return fLeftChild.CloneTree();
    }

    // uset nodes are handed back as-is rather than duplicated.
    if (fType == RBBINode.uset) {
        return this;
    }

    RBBINode copy = new RBBINode(this);
    if (fLeftChild != null) {
        RBBINode leftCopy = fLeftChild.CloneTree();
        copy.fLeftChild = leftCopy;
        leftCopy.fParent = copy;
    }
    if (fRightChild != null) {
        RBBINode rightCopy = fRightChild.CloneTree();
        copy.fRightChild = rightCopy;
        rightCopy.fParent = copy;
    }
    return copy;
}
// /CLOVER:ON
// ---------------------------------------------------------------------------------
//
// PushNewNode   Advance the node stack pointer, create a new RBBINode of
//               the requested type in that slot, and return it.
//
//               When the advanced pointer reaches kStackSize a message is
//               written to the console and Error() is invoked with
//               U_BRK_INTERNAL_ERROR before the slot is written.
//
// ---------------------------------------------------------------------------------
internal RBBINode PushNewNode(int nodeType) {
    fNodeStackPtr += 1;
    if (kStackSize <= fNodeStackPtr) {
        System.Console.Out.WriteLine("RBBIRuleScanner.pushNewNode - stack overflow.");
        Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
    }

    RBBINode created = new RBBINode(nodeType);
    fNodeStack[fNodeStackPtr] = created;
    return created;
}
// ------------------------------------------------------------------------
//
// AddValToSets   Add one runtime-mapped input value to every uset node in
//                a list. (val corresponds to a state table column.)
//
//                Each original Unicode set corresponds directly to a uset
//                node. For each one, an equivalent expression is built in
//                terms of the remapped runtime input symbols: the tree
//                that ORs together all of the symbols belonging to the
//                set. This function contributes one such symbol to each
//                set in the list by delegating to AddValToSet.
//
// ------------------------------------------------------------------------
internal void AddValToSets(IList sets, int val) {
    foreach (Object entry in sets) {
        AddValToSet((RBBINode)entry, val);
    }
}
// -----------------------------------------------------------------------------
//
// PrintSet   Debugging aid: print the serial number of every node in a
//            collection of nodes, then end the line.
//
// -----------------------------------------------------------------------------
internal void PrintSet(ICollection s) {
    for (IIterator members = new ILOG.J2CsMapping.Collections.IteratorAdapter(s.GetEnumerator()); members.HasNext();) {
        RBBINode member = (RBBINode)members.Next();
        IBM.ICU.Text.RBBINode.PrintInt(member.fSerialNum, 8);
    }
    System.Console.Out.WriteLine();
}
// ----------------------------------------------------------------------------------------
//
// FindSetFor   Given a string, attach the matching uset node to the
//              caller's node (which should be a setRef node) as its left
//              child.
//
//              fSetTable caches one uset node per string so the same node
//              is always used for the same string:
//                - On a cache hit the cached node is attached and nothing
//                  else happens; a set supplied in setToAdopt is not used.
//                - On a miss a new uset node is created, appended to the
//                  rule builder's list of uset nodes and recorded in the
//                  cache. Its UnicodeSet is setToAdopt when one was
//                  supplied; otherwise it is the full range
//                  0x000000-0x10ffff when the string equals kAny, or else
//                  the single code point read from the start of the
//                  string.
//
// ----------------------------------------------------------------------------------------
internal void FindSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
    // Reuse the node already cached for this string, if there is one.
    RBBIRuleScanner.RBBISetTableEl cached =
        (RBBIRuleScanner.RBBISetTableEl)ILOG.J2CsMapping.Collections.Collections.Get(fSetTable, s);
    if (cached != null) {
        node.fLeftChild = cached.val;
        IBM.ICU.Impl.Assert.Assrt(node.fLeftChild.fType == IBM.ICU.Text.RBBINode.uset);
        return;
    }

    // First sighting of this string: build the set ourselves unless the
    // caller handed one over.
    if (setToAdopt == null) {
        if (s.Equals(kAny)) {
            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
        } else {
            int codePoint = IBM.ICU.Text.UTF16.CharAt(s, 0);
            setToAdopt = new UnicodeSet(codePoint, codePoint);
        }
    }

    // Create the uset node and hang it under the caller's setRef node.
    RBBINode created = new RBBINode(IBM.ICU.Text.RBBINode.uset);
    created.fInputSet = setToAdopt;
    created.fText = s;
    created.fParent = node;
    node.fLeftChild = created;

    // Register it in the list of all uset nodes ...
    ILOG.J2CsMapping.Collections.Generics.Collections.Add(fRB.fUSetNodes, created);

    // ... and remember it under its string for later lookups.
    RBBIRuleScanner.RBBISetTableEl entry = new RBBIRuleScanner.RBBISetTableEl();
    entry.key = s;
    entry.val = created;
    ILOG.J2CsMapping.Collections.Collections.Put(fSetTable, entry.key, entry);
}
//
// RBBISymbolTable::LookupNode   Given a key (a variable name), return the
//                               node stored for it in fHashTable, or null
//                               when the table has no entry for that name.
//
internal RBBINode LookupNode(String key_0) {
    RBBISymbolTable.RBBISymbolTableEntry entry =
        (RBBISymbolTable.RBBISymbolTableEntry)ILOG.J2CsMapping.Collections.Collections.Get(fHashTable, key_0);
    return (entry == null) ? null : entry.val;
}
//
// RBBISymbolTable::AddEntry   Add a new name/node pair to the symbol table.
//
//                             When the name is already present the table
//                             is left untouched and the rule scanner's
//                             Error() is called with
//                             U_BRK_VARIABLE_REDFINITION; this will only
//                             occur for duplicate variable assignments.
//
internal void AddEntry(String key_0, RBBINode val_1) {
    RBBISymbolTable.RBBISymbolTableEntry existing =
        (RBBISymbolTable.RBBISymbolTableEntry)ILOG.J2CsMapping.Collections.Collections.Get(fHashTable, key_0);

    if (existing == null) {
        RBBISymbolTable.RBBISymbolTableEntry created = new RBBISymbolTable.RBBISymbolTableEntry();
        created.key = key_0;
        created.val = val_1;
        ILOG.J2CsMapping.Collections.Collections.Put(fHashTable, created.key, created);
    } else {
        fRuleScanner.Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_VARIABLE_REDFINITION);
    }
}
// Copy constructor. The new node receives a fresh serial number and copies
// of the other node's type, input set, precedence, text, positions,
// nullable flag and value. The three position sets are new HashedSet
// instances built from the other node's sets. Parent and child links are
// not assigned here.
internal RBBINode(RBBINode other) {
    this.fPrecedence = precZero;

    // Identity: every node, copies included, takes the next serial number.
    this.fSerialNum = ++gLastSerial;

    // Plain values carried over from the source node.
    this.fType = other.fType;
    this.fVal = other.fVal;
    this.fText = other.fText;
    this.fInputSet = other.fInputSet;
    this.fPrecedence = other.fPrecedence;
    this.fNullable = other.fNullable;
    this.fFirstPos = other.fFirstPos;
    this.fLastPos = other.fLastPos;

    // Position sets get their own set objects.
    this.fFirstPosSet = new HashedSet(other.fFirstPosSet);
    this.fLastPosSet = new HashedSet(other.fLastPosSet);
    this.fFollowPos = new HashedSet(other.fFollowPos);
}
// -----------------------------------------------------------------------------
//
// BofFixup   Fixup for state tables that include {bof} (beginning of
//            input) testing.
//
//            Performs a swizzle similar to chaining: the followPos set of
//            the fake bofNode at the start of the tree is extended with
//            the followPos sets of the {bof} leaves that were written
//            explicitly into rules and that can start a match.
//
//            The parse tree is expected to look like this:
//
//                fTree root --> <cat>
//                               /    \
//                            <cat>   <#end node>
//                           /     \
//                     <bofNode>   rest of tree
//
//            This function has much in common with CalcChainedFollowPos().
//
// -----------------------------------------------------------------------------
internal void BofFixup() {
    // The inner <cat> node: its left child is the fake bofNode, its right
    // child is the user-written part of the tree.
    RBBINode innerCat = fRB.fTreeRoots[fRootIx].fLeftChild;

    RBBINode bofNode = innerCat.fLeftChild;
    IBM.ICU.Impl.Assert.Assrt(bofNode.fType == IBM.ICU.Text.RBBINode.leafChar);
    IBM.ICU.Impl.Assert.Assrt(bofNode.fVal == 2);

    // Nodes that can start a match of the user-written rules, i.e. the
    // first positions of "rest of tree" (the fake bofNode is excluded).
    ILOG.J2CsMapping.Collections.ISet matchStartNodes = innerCat.fRightChild.fFirstPosSet;

    for (IIterator starts = new ILOG.J2CsMapping.Collections.IteratorAdapter(matchStartNodes.GetEnumerator()); starts.HasNext();) {
        RBBINode candidate = (RBBINode)starts.Next();

        // A leaf with the bof value is a {bof} that was explicitly written
        // into a rule: add everything from its followPos set to the
        // followPos set of the fake bofNode.
        if (candidate.fType == IBM.ICU.Text.RBBINode.leafChar && candidate.fVal == bofNode.fVal) {
            ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(candidate.fFollowPos, bofNode.fFollowPos);
        }
    }
}
// -----------------------------------------------------------------------------
//
// CalcNullable   Compute the fNullable flag for every node of a subtree.
//                See Aho, section 3.9, for the theory.
//
//                - setRef and endMark nodes: not nullable, no recursion.
//                - lookAhead and tag nodes: nullable, no recursion; they
//                  do not match any literal text from the input stream.
//                - Everything else: children first, then table 3.40 of
//                  Aho: or = left || right, cat = left && right,
//                  star and question = true, any other type = false.
//
// -----------------------------------------------------------------------------
internal void CalcNullable(RBBINode n) {
    if (n == null) {
        return;
    }

    int kind = n.fType;

    if (kind == IBM.ICU.Text.RBBINode.setRef || kind == IBM.ICU.Text.RBBINode.endMark) {
        n.fNullable = false;
        return;
    }
    if (kind == IBM.ICU.Text.RBBINode.lookAhead || kind == IBM.ICU.Text.RBBINode.tag) {
        n.fNullable = true;
        return;
    }

    // Not one of the leaf types handled above: settle the children first.
    CalcNullable(n.fLeftChild);
    CalcNullable(n.fRightChild);

    bool nullable;
    if (kind == IBM.ICU.Text.RBBINode.opOr) {
        nullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
    } else if (kind == IBM.ICU.Text.RBBINode.opCat) {
        nullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
    } else if (kind == IBM.ICU.Text.RBBINode.opStar || kind == IBM.ICU.Text.RBBINode.opQuestion) {
        nullable = true;
    } else {
        nullable = false;
    }
    n.fNullable = nullable;
}
// -------------------------------------------------------------------------
//
// FlattenVariables   Walk a parse tree, replacing every variable reference
//                    with a copy of the variable's definition. Apart from
//                    that the tree is not changed.
//
//                    Returns the root of the resulting tree. A root that
//                    is not a variable reference is returned unchanged;
//                    a root that is one is replaced by the root of the
//                    newly cloned definition.
//
//                    The walk does nothing until it meets a variable
//                    reference, at which point it calls CloneTree() on
//                    the definition. References nested inside that
//                    definition are resolved by CloneTree(), not here.
//
// -------------------------------------------------------------------------
internal RBBINode FlattenVariables() {
    if (fType == varRef) {
        // Substitute a copy of the definition for the reference itself.
        return fLeftChild.CloneTree();
    }

    if (fLeftChild != null) {
        RBBINode flatLeft = fLeftChild.FlattenVariables();
        fLeftChild = flatLeft;
        flatLeft.fParent = this;
    }
    if (fRightChild != null) {
        RBBINode flatRight = fRightChild.FlattenVariables();
        fRightChild = flatRight;
        flatRight.fParent = this;
    }
    return this;
}
// -----------------------------------------------------------------------------
//
// PrintPosSets   Debugging aid: for every node of the tree, node first and
//                then its left and right subtrees, print the node itself
//                followed by its nullable flag and its firstpos, lastpos
//                and followpos sets.
//
// -----------------------------------------------------------------------------
internal void PrintPosSets(RBBINode n) {
    if (n != null) {
        IBM.ICU.Text.RBBINode.PrintNode(n);
        System.Console.Out.Write(" Nullable: " + n.fNullable);

        System.Console.Out.Write(" firstpos: ");
        PrintSet(n.fFirstPosSet);

        System.Console.Out.Write(" lastpos: ");
        PrintSet(n.fLastPosSet);

        System.Console.Out.Write(" followpos: ");
        PrintSet(n.fFollowPos);

        PrintPosSets(n.fLeftChild);
        PrintPosSets(n.fRightChild);
    }
}
// -----------------------------------------------------------------------------
//
// CalcLastPos   Compute the lastpos set (fLastPosSet) for every node of a
//               subtree. See Aho, section 3.9, for the theory.
//
//               - leafChar, endMark, lookAhead and tag nodes are leaves:
//                 the node itself is added to its own set.
//               - Otherwise the children are processed first and table
//                 3.40 of Aho is applied:
//                   or                   : left and right
//                   cat                  : right, plus left when the
//                                          right child is nullable
//                   star, question, plus : left
//
// -----------------------------------------------------------------------------
internal void CalcLastPos(RBBINode n) {
    if (n == null) {
        return;
    }

    int kind = n.fType;
    ILOG.J2CsMapping.Collections.ISet target = n.fLastPosSet;

    bool isLeaf = kind == IBM.ICU.Text.RBBINode.leafChar
            || kind == IBM.ICU.Text.RBBINode.endMark
            || kind == IBM.ICU.Text.RBBINode.lookAhead
            || kind == IBM.ICU.Text.RBBINode.tag;
    if (isLeaf) {
        ILOG.J2CsMapping.Collections.Generics.Collections.Add(target, n);
        return;
    }

    // Interior node: the children's sets must exist before they are merged.
    CalcLastPos(n.fLeftChild);
    CalcLastPos(n.fRightChild);

    if (kind == IBM.ICU.Text.RBBINode.opOr) {
        ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fLeftChild.fLastPosSet, target);
        ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fRightChild.fLastPosSet, target);
    } else if (kind == IBM.ICU.Text.RBBINode.opCat) {
        ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fRightChild.fLastPosSet, target);
        if (n.fRightChild.fNullable) {
            ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fLeftChild.fLastPosSet, target);
        }
    } else if (kind == IBM.ICU.Text.RBBINode.opStar
            || kind == IBM.ICU.Text.RBBINode.opQuestion
            || kind == IBM.ICU.Text.RBBINode.opPlus) {
        ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fLeftChild.fLastPosSet, target);
    }
}
// -----------------------------------------------------------------------------
//
// CalcFollowPos   Compute the followpos sets (fFollowPos) for a subtree.
//                 See Aho, section 3.9, for the theory.
//
//                 Null, leafChar and endMark nodes are skipped. For any
//                 other node the children are processed first, then:
//                   Aho rule #1 (cat)        : firstpos of the right child
//                                              is added to followpos of
//                                              every node in lastpos of
//                                              the left child.
//                   Aho rule #2 (star, plus) : firstpos of the node is
//                                              added to followpos of every
//                                              node in its own lastpos.
//
// -----------------------------------------------------------------------------
internal void CalcFollowPos(RBBINode n) {
    if (n == null) {
        return;
    }

    int kind = n.fType;
    if (kind == IBM.ICU.Text.RBBINode.leafChar || kind == IBM.ICU.Text.RBBINode.endMark) {
        return;
    }

    CalcFollowPos(n.fLeftChild);
    CalcFollowPos(n.fRightChild);

    if (kind == IBM.ICU.Text.RBBINode.opCat) {
        // Aho rule #1
        ILOG.J2CsMapping.Collections.ISet leftLastPos = n.fLeftChild.fLastPosSet;
        for (IIterator it = new ILOG.J2CsMapping.Collections.IteratorAdapter(leftLastPos.GetEnumerator()); it.HasNext();) {
            RBBINode pos = (RBBINode)it.Next();   // 'i' in Aho's description
            ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fRightChild.fFirstPosSet, pos.fFollowPos);
        }
    } else if (kind == IBM.ICU.Text.RBBINode.opStar || kind == IBM.ICU.Text.RBBINode.opPlus) {
        // Aho rule #2
        for (IIterator it = new ILOG.J2CsMapping.Collections.IteratorAdapter(n.fLastPosSet.GetEnumerator()); it.HasNext();) {
            RBBINode pos = (RBBINode)it.Next();   // 'i' in Aho's description
            ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(n.fFirstPosSet, pos.fFollowPos);
        }
    }
}
// -------------------------------------------------------------------------
//
// FlattenSets   Walk the parse tree, replacing every child of type setRef
//               with a copy of the expression tree for its set. A set's
//               equivalent expression tree is precomputed and saved as
//               the left child of the uset node, which in turn is the
//               left child of the setRef node.
//
//               Children that are not setRef nodes are processed
//               recursively. The node this is called on must not itself
//               be a setRef (asserted).
//
// -------------------------------------------------------------------------
internal void FlattenSets() {
    IBM.ICU.Impl.Assert.Assrt(fType != setRef);

    if (fLeftChild != null) {
        if (fLeftChild.fType != setRef) {
            fLeftChild.FlattenSets();
        } else {
            // setRef -> uset -> precomputed expression tree
            RBBINode leftUset = fLeftChild.fLeftChild;
            RBBINode leftExpr = leftUset.fLeftChild;
            fLeftChild = leftExpr.CloneTree();
            fLeftChild.fParent = this;
        }
    }

    if (fRightChild != null) {
        if (fRightChild.fType != setRef) {
            fRightChild.FlattenSets();
        } else {
            // setRef -> uset -> precomputed expression tree
            RBBINode rightUset = fRightChild.fLeftChild;
            RBBINode rightExpr = rightUset.fLeftChild;
            fRightChild = rightExpr.CloneTree();
            fRightChild.fParent = this;
        }
    }
}
// -------------------------------------------------------------------------------------
//
// RangeDescriptor::SetDictionaryFlag
//
//     Character category numbers that include characters from the
//     original Unicode set named "dictionary" have bit 14 set to 1. The
//     RBBI runtime engine uses this to trigger use of the word dictionary.
//
//     This function looks through the Unicode sets that this range
//     includes and ORs 0x4000 into fNum as soon as one named "dictionary"
//     is found. A set's name is the text of the varRef node found two
//     parent links above its uset node; sets without such a node have no
//     name and never match.
//
//     TODO: a faster way would be to find the set node for "dictionary"
//           just once, rather than looking it up by name every time.
//
// -------------------------------------------------------------------------------------
internal void SetDictionaryFlag() {
    for (int idx = 0; idx < this.fIncludesSets.Count; idx++) {
        RBBINode setNode = (RBBINode)fIncludesSets[idx];

        // uset -> (parent) setRef -> (parent) varRef, when the set was named.
        RBBINode refNode = setNode.fParent;
        if (refNode == null) {
            continue;
        }
        RBBINode varNode = refNode.fParent;
        if (varNode == null || varNode.fType != IBM.ICU.Text.RBBINode.varRef) {
            continue;
        }

        if (varNode.fText.Equals("dictionary")) {
            this.fNum |= 0x4000;
            break;
        }
    }
}
// Add one runtime input value to the expression tree hanging under a uset
// node. A new leafChar node carrying val is created. If the uset node has
// no left child yet, the leaf becomes that child. Otherwise a new opOr
// node takes the previous subtree as its left child and the new leaf as
// its right child, and becomes the uset node's left child.
internal void AddValToSet(RBBINode usetNode, int val) {
    RBBINode leaf = new RBBINode(IBM.ICU.Text.RBBINode.leafChar);
    leaf.fVal = val;

    RBBINode previous = usetNode.fLeftChild;
    if (previous == null) {
        // First symbol for this set.
        usetNode.fLeftChild = leaf;
        leaf.fParent = usetNode;
        return;
    }

    // Symbols are already present: OR the new one onto what is there.
    RBBINode orNode = new RBBINode(IBM.ICU.Text.RBBINode.opOr);
    orNode.fLeftChild = previous;
    orNode.fRightChild = leaf;
    previous.fParent = orNode;
    leaf.fParent = orNode;

    usetNode.fLeftChild = orNode;
    orNode.fParent = usetNode;
}
// ---------------------------------------------------------------------------------
//
// Parse   Parse RBBI rules. The state machine for rules parsing is here.
//         The state tables are hand-written in the file rbbirpt.txt and
//         converted to the form used here by a perl script, rbbicst.pl.
//
//         Each pass of the main loop handles one state transition and,
//         depending on the matching state table row, optionally
//           - performs an action,
//           - pushes a state onto, or pops one from, the state stack,
//           - advances to the next input char.
//
//         When parsing ends without any user-specified reverse rules, the
//         equivalent of ".*;" is installed as the reverse tree. Debug
//         output is produced when fRB.fDebugEnv contains "scan",
//         "symbols" or "ptree".
//
// ---------------------------------------------------------------------------------
internal void Parse() {
    RBBIRuleParseTable.RBBIRuleTableElement row;
    int state = 1;
    NextChar(fC);

    // State 0 is the normal way to leave the state machine.
    while (state != 0) {
        row = IBM.ICU.Text.RBBIRuleParseTable.gRuleParseStateTable[state];

        // Evaluated once per transition; nothing between here and the
        // action dispatch below touches the rule builder.
        bool traceScan = fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("scan") >= 0;
        if (traceScan) {
            System.Console.Out.WriteLine("char, line, col = (\'" + (char) fC.fChar + "\', " + fLineNum + ", " + fCharNum + " state = " + row.fStateName);
        }

        // Find the table row that matches the current input char, or the
        // class of that char. Start at the first row for this state and
        // scan forward; the last row of every state matches all
        // characters, so the scan always stops.
        int rowIx = state;
        while (true) {
            row = IBM.ICU.Text.RBBIRuleParseTable.gRuleParseStateTable[rowIx];
            if (traceScan) {
                System.Console.Out.Write(".");
            }

            // Row names an individual character; the input char is not
            // escaped and is that character.
            if (row.fCharClass < 127 && !fC.fEscaped && row.fCharClass == fC.fChar) {
                break;
            }
            // Row is the default, match-anything class.
            if (row.fCharClass == 255) {
                break;
            }
            // Row is "escaped" and the char was escaped.
            if (row.fCharClass == 254 && fC.fEscaped) {
                break;
            }
            // Row is "escaped P" and the char is an escaped 'p' or 'P'.
            if (row.fCharClass == 253 && fC.fEscaped && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
                break;
            }
            // Row is eof and the input is at eof.
            if (row.fCharClass == 252 && fC.fChar == -1) {
                break;
            }
            // Row names a character class; the char must be unescaped,
            // not EOF, and a member of that class.
            if (row.fCharClass >= 128 && row.fCharClass < 240 && !fC.fEscaped && fC.fChar != -1) {
                UnicodeSet charClass = fRuleSets[row.fCharClass - 128];
                if (charClass.Contains(fC.fChar)) {
                    break;
                }
            }

            rowIx++;
        }
        if (traceScan) {
            System.Console.Out.WriteLine("");
        }

        // Perform the action this row asks for. A false result means the
        // action signalled an error or asked to exit, which also occurs
        // on normal end-of-rules-input.
        if (!DoParseActions(row.fAction)) {
            break;
        }

        if (row.fPushState != 0) {
            fStackPtr++;
            if (fStackPtr >= kStackSize) {
                System.Console.Out.WriteLine("RBBIRuleScanner.parse() - state stack overflow.");
                Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
            }
            fStack[fStackPtr] = row.fPushState;
        }

        if (row.fNextChar) {
            NextChar(fC);
        }

        // The next state comes from the row, or from the state stack when
        // the row says "pop" (255).
        if (row.fNextState == 255) {
            state = fStack[fStackPtr];
            fStackPtr--;
            if (fStackPtr < 0) {
                System.Console.Out.WriteLine("RBBIRuleScanner.parse() - state stack underflow.");
                Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
            }
        } else {
            state = row.fNextState;
        }
    }

    // With no user-specified reverse rules, set up the equivalent of ".*;"
    if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree] == null) {
        RBBINode star = PushNewNode(IBM.ICU.Text.RBBINode.opStar);
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree] = star;

        RBBINode operand = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
        FindSetFor(kAny, operand, null);
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
        operand.fParent = fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree];

        // Drop the two scratch nodes from the node stack again.
        fNodeStackPtr -= 2;
    }

    // Parsing of the input RBBI rules is complete. We now have a parse
    // tree for the rule expressions and a list of all UnicodeSets that
    // are referenced.
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("symbols") >= 0) {
        fSymbolTable.RbbiSymtablePrint();
    }

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("ptree") >= 0) {
        System.Console.Out.WriteLine("Completed Forward Rules Parse Tree...");
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fForwardTree].PrintTree(true);

        System.Console.Out.WriteLine("\nCompleted Reverse Rules Parse Tree...");
        fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fReverseTree].PrintTree(true);

        System.Console.Out.WriteLine("\nCompleted Safe Point Forward Rules Parse Tree...");
        if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree] != null) {
            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree].PrintTree(true);
        } else {
            System.Console.Out.WriteLine(" -- null -- ");
        }

        System.Console.Out.WriteLine("\nCompleted Safe Point Reverse Rules Parse Tree...");
        if (fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree] != null) {
            fRB.fTreeRoots[IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree].PrintTree(true);
        } else {
            System.Console.Out.WriteLine(" -- null -- ");
        }
    }
}
// -----------------------------------------------------------------------------
//
// RBBITableBuilder::Build   Main function for building the DFA state
//                           transition table from the RBBI rules parse
//                           tree selected by fRootIx.
//
//     Steps, in order:
//       1. Return immediately when there is no tree (this can easily
//          arise for the reverse rules).
//       2. Replace $variable references with copies of their definitions.
//       3. If the rules referenced {bof}, prepend a {bof} leaf under a
//          new cat node so that all matches must start with the {bof}
//          fake character.
//       4. Append a unique right-end marker under a new cat node.
//       5. Replace UnicodeSet references with their expression trees.
//       6. Compute nullable, firstpos, lastpos and followpos (see Aho),
//          then apply the chaining and {bof} fixups when they apply.
//       7. Build the state table, flag accepting / look-ahead / tagged
//          states, and merge the rule status values into the global set.
//
//     Debug output is produced when fRB.fDebugEnv contains "ftree",
//     "stree", "pos" or "states".
//
// -----------------------------------------------------------------------------
internal void Build() {
    // No rules for this tree: nothing to build.
    if (fRB.fTreeRoots[fRootIx] == null) {
        return;
    }

    // Walk the tree, replacing references to $variables with a copy of
    // the parse tree for the substitution expression.
    RBBINode flattened = fRB.fTreeRoots[fRootIx].FlattenVariables();
    fRB.fTreeRoots[fRootIx] = flattened;
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("ftree") >= 0) {
        System.Console.Out.WriteLine("Parse tree after flattening variable references.");
        fRB.fTreeRoots[fRootIx].PrintTree(true);
    }

    // If the rules contained any references to {bof}, put
    // {bof} <cat> <former root of tree> at the top of the tree.
    if (fRB.fSetBuilder.SawBOF()) {
        RBBINode bofTop = new RBBINode(IBM.ICU.Text.RBBINode.opCat);
        RBBINode bofLeaf = new RBBINode(IBM.ICU.Text.RBBINode.leafChar);
        bofLeaf.fVal = 2; // Reserved value for {bof}.
        bofLeaf.fParent = bofTop;
        bofTop.fLeftChild = bofLeaf;
        bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
        fRB.fTreeRoots[fRootIx] = bofTop;
    }

    // Add a unique right-end marker to the expression: a cat node whose
    // left child is the tree so far and whose right child is the marker.
    RBBINode withEnd = new RBBINode(IBM.ICU.Text.RBBINode.opCat);
    withEnd.fLeftChild = fRB.fTreeRoots[fRootIx];
    fRB.fTreeRoots[fRootIx].fParent = withEnd;
    RBBINode endMarker = new RBBINode(IBM.ICU.Text.RBBINode.endMark);
    withEnd.fRightChild = endMarker;
    endMarker.fParent = withEnd;
    fRB.fTreeRoots[fRootIx] = withEnd;

    // Replace all references to UnicodeSets with the tree for the
    // equivalent expression.
    fRB.fTreeRoots[fRootIx].FlattenSets();
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("stree") >= 0) {
        System.Console.Out.WriteLine("Parse tree after flattening Unicode Set references.");
        fRB.fTreeRoots[fRootIx].PrintTree(true);
    }

    // Calculate the functions nullable, firstpos, lastpos and followpos
    // on the nodes of the parse tree. See the algorithm description in
    // Aho; the code alone will not explain it.
    CalcNullable(fRB.fTreeRoots[fRootIx]);
    CalcFirstPos(fRB.fTreeRoots[fRootIx]);
    CalcLastPos(fRB.fTreeRoots[fRootIx]);
    CalcFollowPos(fRB.fTreeRoots[fRootIx]);
    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("pos") >= 0) {
        System.Console.Out.Write("\n");
        PrintPosSets(fRB.fTreeRoots[fRootIx]);
    }

    // For "chained" rules, modify the followPos sets.
    if (fRB.fChainRules) {
        CalcChainedFollowPos(fRB.fTreeRoots[fRootIx]);
    }

    // BOF (start of input) test fixup.
    if (fRB.fSetBuilder.SawBOF()) {
        BofFixup();
    }

    // Build the DFA state transition tables.
    BuildStateTable();
    FlagAcceptingStates();
    FlagLookAheadStates();
    FlagTaggedStates();

    // The rule builder has a global vector of rule status {tag} values
    // common to all tables; merge the ones from this table into it.
    MergeRuleStatusVals();

    if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("states") >= 0) {
        PrintStates();
    }
}
// -----------------------------------------------------------------------------
//
// CalcChainedFollowPos   Modify the previously calculated followPos sets
//                        to implement rule chaining. NOT described by Aho.
//
//     For every leaf that can end a match (its followPos contains an end
//     marker node) and every leaf that can start a match with the same
//     value (character class), the start leaf's followPos is added to the
//     end leaf's followPos. This lets a match state at the end leaf
//     transition to the second char of a match starting with the start
//     leaf.
//
//     When fRB.fLBCMNoChain is set, end leaves whose first character has
//     the COMBINING_MARK line break property are not chained from.
//
// -----------------------------------------------------------------------------
internal void CalcChainedFollowPos(RBBINode tree) {
    IList endMarkerNodes = new ArrayList();
    IList leafNodes = new ArrayList();

    // Collect all end marker nodes and all leaf nodes of the tree.
    tree.FindNodes(endMarkerNodes, IBM.ICU.Text.RBBINode.endMark);
    tree.FindNodes(leafNodes, IBM.ICU.Text.RBBINode.leafChar);

    // The nodes that can start a match are the first positions of the
    // part of the tree that corresponds to user-written rules. See the
    // tree description in BofFixup().
    RBBINode userRuleRoot = fRB.fSetBuilder.SawBOF() ? tree.fLeftChild.fRightChild : tree;
    IBM.ICU.Impl.Assert.Assrt(userRuleRoot != null);
    ILOG.J2CsMapping.Collections.ISet matchStartNodes = userRuleRoot.fFirstPosSet;

    for (IIterator leaves = new ILOG.J2CsMapping.Collections.IteratorAdapter(leafNodes.GetEnumerator()); leaves.HasNext();) {
        RBBINode leaf = (RBBINode)leaves.Next();

        // A leaf corresponds to an overall rule match position when its
        // followPos set includes one of the end marker nodes.
        bool endsMatch = false;
        for (IIterator markers = new ILOG.J2CsMapping.Collections.IteratorAdapter(endMarkerNodes.GetEnumerator()); markers.HasNext();) {
            RBBINode marker = (RBBINode)markers.Next();
            if (ILOG.J2CsMapping.Collections.Collections.Contains(marker, leaf.fFollowPos)) {
                endsMatch = true;
                break;
            }
        }
        if (!endsMatch) {
            // Not an end node; try the next leaf.
            continue;
        }

        // Line Break specific hack: if this node's val corresponds to the
        // $CM char class, don't chain from it.
        // TODO: Add rule syntax for this behavior, get specifics out of
        // here and into the rule file.
        if (fRB.fLBCMNoChain) {
            int firstChar = this.fRB.fSetBuilder.GetFirstChar(leaf.fVal);
            // firstChar == -1 occurs with sets containing only the {eof}
            // marker string.
            if (firstChar != -1) {
                int lineBreakProp = IBM.ICU.Lang.UCharacter.GetIntPropertyValue(firstChar, IBM.ICU.Lang.UProperty_Constants.LINE_BREAK);
                if (lineBreakProp == IBM.ICU.Lang.UCharacter.LineBreak.COMBINING_MARK) {
                    continue;
                }
            }
        }

        // Look among the nodes that can start a match for leaves with the
        // same char class as this ending node, and add all nodes from
        // the followPos of each such start node to the followPos set of
        // the end node.
        for (IIterator starts = new ILOG.J2CsMapping.Collections.IteratorAdapter(matchStartNodes.GetEnumerator()); starts.HasNext();) {
            RBBINode startNode = (RBBINode)starts.Next();
            if (startNode.fType == IBM.ICU.Text.RBBINode.leafChar && leaf.fVal == startNode.fVal) {
                ILOG.J2CsMapping.Collections.Generics.Collections.AddAll(startNode.fFollowPos, leaf.fFollowPos);
            }
        }
    }
}
// ----------------------------------------------------------------------------------------
//
// doParseAction   Carry out one action requested by the rule-parsing state
//                 machine. Actions build the parse tree and Unicode sets and
//                 maintain the node stack used for nested expressions.
//
//   action        one of the RBBIRuleParseTable.doXxx action codes.
//
//   returns       false for doExit, for an unrecognized action code, and for
//                 the error actions doRuleError, doTagExpectedError,
//                 doRuleErrorAssignExpr and doCheckVarDef (when the variable
//                 is undefined); true in every other case. Note that
//                 doVariableNameExpectedErr and an unrecognized !!option
//                 report an error through Error() but still return true.
//
// ----------------------------------------------------------------------------------------
internal bool DoParseActions(int action) {
    switch (action) {
        case IBM.ICU.Text.RBBIRuleParseTable.doExprStart: {
            PushNewNode(IBM.ICU.Text.RBBINode.opStart);
            fRuleNum++;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doExprOrOperator: {
            FixOpStack(IBM.ICU.Text.RBBINode.precOpCat);
            WrapStackTopInOperator(IBM.ICU.Text.RBBINode.opOr);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doExprCatOperator: {
            // Implicit concatenation of two adjacent terms that have no
            // other operator between them. This action runs between the
            // actions for the two terms.
            FixOpStack(IBM.ICU.Text.RBBINode.precOpCat);
            WrapStackTopInOperator(IBM.ICU.Text.RBBINode.opCat);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doLParen: {
            // The open-paren node is a dummy, low-precedence operator. It
            // makes any real binary operator inside the parentheses bind
            // more tightly to its operands than whatever is outside.
            PushNewNode(IBM.ICU.Text.RBBINode.opLParen);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doExprRParen: {
            FixOpStack(IBM.ICU.Text.RBBINode.precLParen);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doNOP:
        case IBM.ICU.Text.RBBIRuleParseTable.doExprFinished: {
            // Nothing to do for either of these.
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doStartAssign: {
            // "$variable = " has just been scanned, and the $variable ref
            // node is on top of the stack. Record where the right-hand-side
            // text starts (just past the '=') in the start-of-expression
            // node sitting below it; doEndAssign uses this position when it
            // saves the replacement text.
            RBBINode exprStart = fNodeStack[fNodeStackPtr - 1];
            exprStart.fFirstPos = fNextIndex;

            // The right-hand-side expression needs its own start node.
            PushNewNode(IBM.ICU.Text.RBBINode.opStart);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doEndAssign: {
            // The scanner is on the ';' that ends an assignment statement.
            // Finish the expression; its parse tree ends up in the node on
            // top of the stack.
            FixOpStack(IBM.ICU.Text.RBBINode.precStart);

            RBBINode exprStart = fNodeStack[fNodeStackPtr - 2];
            RBBINode variable = fNodeStack[fNodeStackPtr - 1];
            RBBINode rhs = fNodeStack[fNodeStackPtr];

            // Keep the source text of the right-hand side (without the
            // terminating ';') in the root node of its expression tree.
            rhs.fFirstPos = exprStart.fFirstPos;
            rhs.fLastPos = fScanIndex;
            rhs.fText = fRB.fRules.Substring(rhs.fFirstPos, rhs.fLastPos - rhs.fFirstPos);

            // The expression tree becomes the left child of the $variable
            // reference node, which is then entered in the symbol table.
            variable.fLeftChild = rhs;
            rhs.fParent = variable;
            fSymbolTable.AddEntry(variable.fText, variable);

            // Drop the three nodes used by the assignment.
            fNodeStackPtr -= 3;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doEndOfRule: {
            // Finish the expression for this rule.
            FixOpStack(IBM.ICU.Text.RBBINode.precStart);

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.IndexOf("rtree") >= 0) {
                PrintNodeStack("end of rule");
            }
            IBM.ICU.Impl.Assert.Assrt(fNodeStackPtr == 1);

            // A rule containing a look-ahead '/' gets an endMark node
            // concatenated onto the end of its expression tree.
            if (fLookAheadRule) {
                RBBINode ruleBody = fNodeStack[fNodeStackPtr];
                RBBINode lookAheadEnd = PushNewNode(IBM.ICU.Text.RBBINode.endMark);
                RBBINode joined = PushNewNode(IBM.ICU.Text.RBBINode.opCat);
                fNodeStackPtr -= 2;
                joined.fLeftChild = ruleBody;
                joined.fRightChild = lookAheadEnd;
                fNodeStack[fNodeStackPtr] = joined;
                lookAheadEnd.fVal = fRuleNum;
                lookAheadEnd.fLookAheadEnd = true;
            }

            // All rule expressions are ORed together; the ';' ending a rule
            // acts like a '|' of low precedence. Each of the four rule
            // groups (forward, reverse, safe_forward, safe_reverse) is
            // collected separately, so pick the group this rule belongs to.
            int destRules = fReverseRule
                    ? IBM.ICU.Text.RBBIRuleBuilder.fReverseTree
                    : fRB.fDefaultTree;
            RBBINode earlierRules = fRB.fTreeRoots[destRules];
            if (earlierRules == null) {
                // First rule for this direction: its tree simply becomes
                // the root for the group.
                fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
            } else {
                // OR the rules collected so far with the rule on the stack
                // and make the new OR node the root for the group.
                RBBINode currentRule = fNodeStack[fNodeStackPtr];
                RBBINode combined = PushNewNode(IBM.ICU.Text.RBBINode.opOr);
                combined.fLeftChild = earlierRules;
                earlierRules.fParent = combined;
                combined.fRightChild = currentRule;
                currentRule.fParent = combined;
                fRB.fTreeRoots[destRules] = combined;
            }

            // Reset per-rule state ready for the next rule.
            fReverseRule = false;
            fLookAheadRule = false;
            fNodeStackPtr = 0;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doRuleError: {
            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
            return false;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doVariableNameExpectedErr: {
            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
            break;
        }

        //
        // Unary operators + ? *
        // Each follows the operand it applies to, so when one is seen the
        // operand (possibly a whole sub-expression) is on top of the stack.
        // The operator node replaces it there and takes it as its only child.
        //
        case IBM.ICU.Text.RBBIRuleParseTable.doUnaryOpPlus: {
            WrapStackTopInOperator(IBM.ICU.Text.RBBINode.opPlus);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doUnaryOpQuestion: {
            WrapStackTopInOperator(IBM.ICU.Text.RBBINode.opQuestion);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doUnaryOpStar: {
            WrapStackTopInOperator(IBM.ICU.Text.RBBINode.opStar);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doRuleChar: {
            // A "rule character" is a single literal character in the
            // expression, such as a, b and c in "(abc*) | [:L:]". Break
            // rules mostly use sets, so for uniformity a literal is handled
            // as a set that happens to contain just that one character.
            RBBINode charNode = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
            string literal = ((char) fC.fChar).ToString();
            FindSetFor(literal, charNode, null);
            charNode.fFirstPos = fScanIndex;
            charNode.fLastPos = fNextIndex;
            charNode.fText = fRB.fRules.Substring(charNode.fFirstPos, charNode.fLastPos - charNode.fFirstPos);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doDotAny: {
            // A "." was scanned: match any single character.
            RBBINode anyNode = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
            FindSetFor(kAny, anyNode, null);
            anyNode.fFirstPos = fScanIndex;
            anyNode.fLastPos = fNextIndex;
            anyNode.fText = fRB.fRules.Substring(anyNode.fFirstPos, anyNode.fLastPos - anyNode.fFirstPos);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doSlash: {
            // A '/' was scanned; it marks a look-ahead break position.
            RBBINode slashNode = PushNewNode(IBM.ICU.Text.RBBINode.lookAhead);
            slashNode.fVal = fRuleNum;
            slashNode.fFirstPos = fScanIndex;
            slashNode.fLastPos = fNextIndex;
            slashNode.fText = fRB.fRules.Substring(slashNode.fFirstPos, slashNode.fLastPos - slashNode.fFirstPos);
            fLookAheadRule = true;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doStartTagValue: {
            // A '{' was scanned; it opens a tag value inside a rule.
            RBBINode tagNode = PushNewNode(IBM.ICU.Text.RBBINode.tag);
            tagNode.fVal = 0;
            tagNode.fFirstPos = fScanIndex;
            tagNode.fLastPos = fNextIndex;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doTagDigit: {
            // One more decimal digit of the tag value: shift and add.
            RBBINode tagNode = fNodeStack[fNodeStackPtr];
            int digit = ILOG.J2CsMapping.Util.Character.Digit((char) fC.fChar, 10);
            tagNode.fVal = tagNode.fVal * 10 + digit;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doTagValue: {
            // Record the end position and source text on the node at the
            // top of the stack.
            RBBINode tagNode = fNodeStack[fNodeStackPtr];
            tagNode.fLastPos = fNextIndex;
            tagNode.fText = fRB.fRules.Substring(tagNode.fFirstPos, tagNode.fLastPos - tagNode.fFirstPos);
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doTagExpectedError: {
            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
            return false;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doOptionStart: {
            // Start of the text of a !!option.
            fOptionStart = fScanIndex;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doOptionEnd: {
            string optionName = fRB.fRules.Substring(fOptionStart, fScanIndex - fOptionStart);
            switch (optionName) {
                case "chain":
                    fRB.fChainRules = true;
                    break;
                case "LBCMNoChain":
                    fRB.fLBCMNoChain = true;
                    break;
                case "forward":
                    fRB.fDefaultTree = IBM.ICU.Text.RBBIRuleBuilder.fForwardTree;
                    break;
                case "reverse":
                    fRB.fDefaultTree = IBM.ICU.Text.RBBIRuleBuilder.fReverseTree;
                    break;
                case "safe_forward":
                    fRB.fDefaultTree = IBM.ICU.Text.RBBIRuleBuilder.fSafeFwdTree;
                    break;
                case "safe_reverse":
                    fRB.fDefaultTree = IBM.ICU.Text.RBBIRuleBuilder.fSafeRevTree;
                    break;
                case "lookAheadHardBreak":
                    fRB.fLookAheadHardBreak = true;
                    break;
                default:
                    Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
                    break;
            }
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doReverseDir: {
            fReverseRule = true;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doStartVariableName: {
            RBBINode variable = PushNewNode(IBM.ICU.Text.RBBINode.varRef);
            variable.fFirstPos = fScanIndex;
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doEndVariableName: {
            RBBINode variable = fNodeStack[fNodeStackPtr];
            if (variable != null && variable.fType == IBM.ICU.Text.RBBINode.varRef) {
                variable.fLastPos = fScanIndex;
                // The stored name starts one character after fFirstPos.
                int nameStart = variable.fFirstPos + 1;
                variable.fText = fRB.fRules.Substring(nameStart, variable.fLastPos - nameStart);

                // Look the name up in the symbol table; when an entry
                // exists, the replacement expression becomes the left child
                // of the var ref. Assignments also pass through here; the
                // lookup then gives null, which is harmless apart from a
                // little wasted effort.
                variable.fLeftChild = fSymbolTable.LookupNode(variable.fText);
            } else {
                Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
            }
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doCheckVarDef: {
            // A var ref with no left child has no definition attached.
            RBBINode variable = fNodeStack[fNodeStackPtr];
            if (variable.fLeftChild == null) {
                Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
                return false;
            }
            break;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doRuleErrorAssignExpr: {
            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
            return false;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doExit: {
            return false;
        }

        case IBM.ICU.Text.RBBIRuleParseTable.doScanUnicodeSet: {
            ScanSet();
            break;
        }

        default: {
            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
            return false;
        }
    }
    return true;
}

// Pops the operand on top of the node stack, pushes a new node of the given
// operator type in its place, and links the operand in as that operator's
// left child (setting the operand's parent pointer to the new node).
private void WrapStackTopInOperator(int operatorType) {
    RBBINode operand = fNodeStack[fNodeStackPtr--];
    RBBINode operatorNode = PushNewNode(operatorType);
    operatorNode.fLeftChild = operand;
    operand.fParent = operatorNode;
}
// /CLOVER:ON
// ------------------------------------------------------------------------
//
// printRangeGroups  Debugging aid: write all of the range groups to the
//                   console. Each group is printed once, with the names of
//                   the sets it belongs to followed by the hex bounds of
//                   every range in the list that carries the same fNum.
//
// ------------------------------------------------------------------------
// /CLOVER:OFF
internal void PrintRangeGroups() {
    int highestGroupShown = 0;

    System.Console.Out.Write("\nRanges grouped by Unicode Set Membership...\n");
    for (RBBISetBuilder.RangeDescriptor first = fRangeList; first != null; first = first.fNext) {
        // The 0x4000 bit is reported separately as <DICT> below, so it is
        // masked out of the group number.
        int groupNum = first.fNum & 0xbfff;
        if (groupNum <= highestGroupShown) {
            // Only groups numbered above the highest one printed so far
            // are shown.
            continue;
        }
        highestGroupShown = groupNum;

        // Single-digit group numbers get a leading space.
        if (groupNum < 10) {
            System.Console.Out.Write(" ");
        }
        System.Console.Out.Write(groupNum + " ");

        if ((first.fNum & 0x4000) != 0) {
            System.Console.Out.Write(" <DICT> ");
        }

        // Name each set that includes this range. A set gets the name of
        // the variable reference two levels above its uset node, or "anon"
        // when there is no such variable reference.
        for (int setIx = 0; setIx < first.fIncludesSets.Count; setIx++) {
            RBBINode usetNode = (RBBINode)first.fIncludesSets[setIx];
            RBBINode setRef = usetNode.fParent;
            RBBINode owner = (setRef == null) ? null : setRef.fParent;
            String setName = (owner != null && owner.fType == IBM.ICU.Text.RBBINode.varRef)
                    ? owner.fText
                    : "anon";
            System.Console.Out.Write(setName);
            System.Console.Out.Write(" ");
        }

        // List every range from here onward that has the same fNum,
        // starting a new output line before every fifth range.
        int shown = 0;
        for (RBBISetBuilder.RangeDescriptor member = first; member != null; member = member.fNext) {
            if (member.fNum != first.fNum) {
                continue;
            }
            if (shown % 5 == 0) {
                System.Console.Out.Write("\n ");
            }
            shown++;
            IBM.ICU.Text.RBBINode.PrintHex((int)member.fStartChar, -1);
            System.Console.Out.Write("-");
            IBM.ICU.Text.RBBINode.PrintHex((int)member.fEndChar, 0);
        }
        System.Console.Out.Write("\n");
    }
    System.Console.Out.Write("\n");
}