//-----------------------------------------------------------------------------
//
//  mergeRuleStatusVals
//
//      Allocate positions in the global array of rule status {tag} values.
//
//      The RBBI runtime uses an array of {sets of status values} that can
//      be returned for boundaries.  Each accepting state that has non-zero
//      status includes an index into this array.  The format of the array is
//
//           Num of status values in group 1
//              status val
//              status val
//              ...
//           Num of status vals in group 2
//              status val
//              status val
//              ...
//           etc.
//
//      For every state in this state table:
//        - if the state's tag set is already present in the global status
//          set map, reuse the recorded array index;
//        - otherwise append the tag set as a new group and record its index.
//
//-----------------------------------------------------------------------------
internal virtual void MergeRuleStatusVals()
{
    // Pre-load a single tag group of {0} into the table.
    // It is the default for rule sets with no explicit tagging,
    // and for rule sets with explicit tagging of {0}.
    if (fRB.fRuleStatusVals.Count == 0)
    {
        fRB.fRuleStatusVals.Add(1); // Num of statuses in group
        fRB.fRuleStatusVals.Add(0); //   and our single status of zero

        // Register both the empty tag set and the explicit set {0} so that
        // either one resolves to the pre-loaded group at index 0 instead of
        // causing a duplicate {0} group to be appended later.
        int izero = 0;
        SortedSet<int> s0 = new SortedSet<int>();
        fRB.fStatusSets[s0] = izero;
        SortedSet<int> s1 = new SortedSet<int>();
        s1.Add(izero);
        fRB.fStatusSets[s1] = izero;
    }

    // For each state, check whether the state's status tag values are
    // already entered into the status values array, and add them if not.
    for (int n = 0; n < fDStates.Count; n++)
    {
        RBBIStateDescriptor sd = fDStates[n];
        SortedSet<int> statusVals = sd.fTagVals;
        int? arrayIndexI = fRB.fStatusSets.Get(statusVals);
        if (arrayIndexI == null)
        {
            // First encounter of this set of status values.
            // Record it in the statusSets map, which associates the set of
            // status values with its index in the runtime status values array.
            arrayIndexI = fRB.fRuleStatusVals.Count;
            fRB.fStatusSets[statusVals] = arrayIndexI;

            // Append the new group (count followed by the values) to the
            // vector that will eventually become the runtime array.
            fRB.fRuleStatusVals.Add(statusVals.Count);
            fRB.fRuleStatusVals.AddRange(statusVals);
        }

        // Save the runtime array index back into the state descriptor.
        sd.fTagsIdx = arrayIndexI.Value; // ICU4N NOTE: At this point the value cannot be null
    }
}
//-----------------------------------------------------------------------------
//
//  flagAcceptingStates    Mark the accepting states of the state table.
//
//      Collect every end marker node of the tree, then visit each state:
//      a state whose position set contains an end marker node is an
//      accepting state.
//
//-----------------------------------------------------------------------------
internal virtual void FlagAcceptingStates()
{
    IList<RBBINode> endMarkers = new JCG.List<RBBINode>();
    fRB.fTreeRoots[fRootIx].FindNodes(endMarkers, RBBINode.endMark);

    foreach (RBBINode marker in endMarkers)
    {
        foreach (RBBIStateDescriptor state in fDStates)
        {
            if (!state.fPositions.Contains(marker))
            {
                continue;
            }

            // Any non-zero fAccepting marks an accepting state; the value is
            // what gets reported to the user as the break status. When the
            // marker carries no value of its own, -1 is used instead.
            if (state.fAccepting == 0)
            {
                // Not yet marked as accepting: do it now.
                state.fAccepting = (marker.fVal == 0) ? -1 : marker.fVal;
            }

            // Both look-ahead and non-look-ahead accepting for this state:
            // favor the look-ahead value (expedient for line break).
            // TODO: need a more elegant resolution for conflicting rules.
            if (state.fAccepting == -1 && marker.fVal != 0)
            {
                state.fAccepting = marker.fVal;
            }
            // Otherwise fAccepting already holds a value other than 0 or -1
            // and is left untouched.

            // An end marker that comes from a look-ahead rule also sets the
            // state's fLookAhead field.
            // TODO: don't change value if already set?
            // TODO: allow for more than one active look-ahead rule in engine.
            //       Make value here an index to a side array in engine?
            if (marker.fLookAheadEnd)
            {
                state.fLookAhead = state.fAccepting;
            }
        }
    }
}
//-----------------------------------------------------------------------------
//
//  flagTaggedStates    Copy tag values from the tree's tag nodes into the
//                      states whose position sets contain those nodes.
//
//-----------------------------------------------------------------------------
internal virtual void FlagTaggedStates()
{
    IList<RBBINode> taggedNodes = new JCG.List<RBBINode>();
    fRB.fTreeRoots[fRootIx].FindNodes(taggedNodes, RBBINode.tag);

    // Every tag node is checked against every state (row of the state table).
    foreach (RBBINode node in taggedNodes)
    {
        foreach (RBBIStateDescriptor state in fDStates)
        {
            if (state.fPositions.Contains(node))
            {
                // The state includes this tag node: record the node's tag value.
                state.fTagVals.Add(node.fVal);
            }
        }
    }
}
//-----------------------------------------------------------------------------
//
//  flagLookAheadStates    Same traversal as flagAcceptingStates, but for
//                         look-ahead nodes: each state containing a
//                         look-ahead node gets that node's value as fLookAhead.
//
//-----------------------------------------------------------------------------
internal virtual void FlagLookAheadStates()
{
    IList<RBBINode> lookAheads = new JCG.List<RBBINode>();
    fRB.fTreeRoots[fRootIx].FindNodes(lookAheads, RBBINode.lookAhead);

    foreach (RBBINode node in lookAheads)
    {
        foreach (RBBIStateDescriptor state in fDStates)
        {
            if (state.fPositions.Contains(node))
            {
                // Nodes are visited in list order, so a later look-ahead node
                // overwrites the value written by an earlier one.
                state.fLookAhead = node.fVal;
            }
        }
    }
}
//-----------------------------------------------------------------------------
//
//  printStates    Debug function.  Writes the fully constructed state
//                 transition table to the console: two header lines, a
//                 separator, then one line per state.
//
//-----------------------------------------------------------------------------
internal virtual void PrintStates()
{
    int categoryCount = fRB.fSetBuilder.NumCharCategories;

    // Header: titles, then one column number per input category.
    Console.Out.Write("state | i n p u t s y m b o l s \n");
    Console.Out.Write(" | Acc LA Tag");
    for (int category = 0; category < categoryCount; category++)
    {
        RBBINode.PrintInt32(category, 3);
    }
    Console.Out.Write("\n");

    // Separator line, extended by three dashes per input category.
    Console.Out.Write(" |---------------");
    for (int category = 0; category < categoryCount; category++)
    {
        Console.Out.Write("---");
    }
    Console.Out.Write("\n");

    // One row per state: state number, accepting value, look-ahead value,
    // tags index, then the transition for every input category.
    for (int stateIndex = 0; stateIndex < fDStates.Count; stateIndex++)
    {
        RBBIStateDescriptor descriptor = fDStates[stateIndex];

        RBBINode.PrintInt32(stateIndex, 5);
        Console.Out.Write(" | ");
        RBBINode.PrintInt32(descriptor.fAccepting, 3);
        RBBINode.PrintInt32(descriptor.fLookAhead, 4);
        RBBINode.PrintInt32(descriptor.fTagsIdx, 6);
        Console.Out.Write(" ");

        for (int category = 0; category < categoryCount; category++)
        {
            RBBINode.PrintInt32(descriptor.fDtran[category], 3);
        }
        Console.Out.Write("\n");
    }

    Console.Out.Write("\n\n");
}
//-----------------------------------------------------------------------------
//
//  exportTable()    Export the state transition table in the ICU4C format.
//
//      Most of the table is 16 bit shorts, so the whole thing is exported
//      as an array of shorts.
//
//      The size of the array must be rounded up to a multiple of 8 bytes.
//
//      See struct RBBIStateTable in ICU4C, common/rbbidata.h
//
//      Returns an empty array when this table's tree root is null.
//
//-----------------------------------------------------------------------------
internal virtual short[] ExportTable()
{
    if (fRB.fTreeRoots[fRootIx] == null)
    {
        return new short[0];
    }

    int numCharCategories = fRB.fSetBuilder.NumCharCategories;
    int numStates = fDStates.Count;
    Assert.Assrt(numCharCategories < 0x7fff && numStates < 0x7fff);

    // Row length in shorts; the "4" is the size of struct RBBIStateTableRow,
    // the row header part only.
    int rowLen = 4 + numCharCategories;

    // GetTableSize() is halved to get the number of shorts to allocate.
    short[] table = new short[GetTableSize() / 2];

    //
    // Header fields.  Each one really wants to be an int, so it is written
    // as two consecutive shorts: upper 16 bits first, then lower 16 bits.
    //

    // RBBIStateTable.fNumStates
    table[RBBIDataWrapper.NUMSTATES] = (short)(numStates.TripleShift(16));
    table[RBBIDataWrapper.NUMSTATES + 1] = (short)(numStates & 0x0000ffff);

    // RBBIStateTable.fRowLen
    table[RBBIDataWrapper.ROWLEN] = (short)(rowLen.TripleShift(16));
    table[RBBIDataWrapper.ROWLEN + 1] = (short)(rowLen & 0x0000ffff);

    // RBBIStateTable.fFlags
    int flags = 0;
    if (fRB.fLookAheadHardBreak)
    {
        flags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
    }
    if (fRB.fSetBuilder.SawBOF)
    {
        flags |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
    }
    table[RBBIDataWrapper.FLAGS] = (short)(flags.TripleShift(16));
    table[RBBIDataWrapper.FLAGS + 1] = (short)(flags & 0x0000ffff);

    // State rows start at short offset 8 and are rowLen shorts apart.
    int rowStart = 8;
    for (int stateIndex = 0; stateIndex < numStates; stateIndex++)
    {
        RBBIStateDescriptor descriptor = fDStates[stateIndex];

        Assert.Assrt(-32768 < descriptor.fAccepting && descriptor.fAccepting <= 32767);
        Assert.Assrt(-32768 < descriptor.fLookAhead && descriptor.fLookAhead <= 32767);

        table[rowStart + RBBIDataWrapper.ACCEPTING] = (short)descriptor.fAccepting;
        table[rowStart + RBBIDataWrapper.LOOKAHEAD] = (short)descriptor.fLookAhead;
        table[rowStart + RBBIDataWrapper.TAGIDX] = (short)descriptor.fTagsIdx;

        for (int category = 0; category < numCharCategories; category++)
        {
            table[rowStart + RBBIDataWrapper.NEXTSTATES + category] = (short)descriptor.fDtran[category];
        }

        rowStart += rowLen;
    }

    return table;
}
//-----------------------------------------------------------------------------
//
//  buildStateTable()    Determine the set of runtime DFA states and the
//                       transition tables for these states, by the algorithm
//                       of fig. 3.44 in Aho.
//
//      Comments naming T, U and a refer to Aho's pseudo-code:
//        T = the unmarked state being processed   (local: current)
//        a = the input symbol                     (local: symbol)
//        U = union of followpos(p) for p in T     (local: followSet)
//
//-----------------------------------------------------------------------------
internal virtual void BuildStateTable()
{
    int lastInputSymbol = fRB.fSetBuilder.NumCharCategories - 1;

    // State 0 is a dummy stop state.  Not from Aho.
    fDStates.Add(new RBBIStateDescriptor(lastInputSymbol));

    // Initially, the only unmarked state in Dstates is firstpos(root),
    // where root is the root of the syntax tree for (r)#.
    RBBIStateDescriptor startState = new RBBIStateDescriptor(lastInputSymbol);
    startState.fPositions.UnionWith(fRB.fTreeRoots[fRootIx].fFirstPosSet);
    fDStates.Add(startState);

    // "while there is an unmarked state T in Dstates do begin"
    while (true)
    {
        // Locate the first unmarked state, skipping the stop state at index 0.
        RBBIStateDescriptor current = null;
        for (int tx = 1; tx < fDStates.Count; tx++)
        {
            RBBIStateDescriptor candidate = fDStates[tx];
            if (!candidate.fMarked)
            {
                current = candidate;
                break;
            }
        }
        if (current == null)
        {
            return; // every state has been processed
        }

        // "mark T"
        current.fMarked = true;

        // "for each input symbol a do begin"
        for (int symbol = 1; symbol <= lastInputSymbol; symbol++)
        {
            // "let U be the set of positions that are in followpos(p)
            //  for some position p in T such that the symbol at position p is a"
            ISet<RBBINode> followSet = null;
            foreach (RBBINode position in current.fPositions)
            {
                if (position.fType != RBBINode.leafChar || position.fVal != symbol)
                {
                    continue;
                }
                if (followSet == null)
                {
                    followSet = new JCG.HashSet<RBBINode>(position.fFollowPos.Count);
                }
                followSet.UnionWith(position.fFollowPos);
            }

            if (followSet == null)
            {
                // No position in T matches this symbol; Dtran[T, a] is left as is.
                continue;
            }
            Assert.Assrt(followSet.Count > 0);

            // "if U is not empty and not in Dstates then ..."
            // Look for an existing state with the same set of positions.
            int destination = -1;
            for (int ix = 0; ix < fDStates.Count; ix++)
            {
                if (SetEqualityComparer<RBBINode>.Default.Equals(followSet, fDStates[ix].fPositions))
                {
                    destination = ix;
                    break;
                }
            }

            // "Add U as an unmarked state to Dstates"
            if (destination < 0)
            {
                RBBIStateDescriptor addedState = new RBBIStateDescriptor(lastInputSymbol);
                addedState.fPositions = followSet;
                fDStates.Add(addedState);
                destination = fDStates.Count - 1;
            }

            // "Dtran[T, a] := U"
            current.fDtran[symbol] = destination;
        }
    }
}