private int mapped_charset_size; // size of the reduced character set

/***************************************************************
 * Function: simplify
 * Description: Runs computeClasses on m_spec, then relabels the
 * NFA so that edges refer to character-class indices instead of
 * raw characters, and finally stores the class map and the new
 * column count back into m_spec.
 **************************************************************/
public void simplify(CSpec m_spec)
{
    // Populate the class-mapping fields before touching the NFA.
    computeClasses(m_spec);

    // Walk every NFA state and translate its edge label.
    IEnumerator states = m_spec.m_nfa_states.elements();
    while (states.MoveNext())
    {
        CNfa state = (CNfa)states.Current;

        bool unlabelled = state.m_edge == CNfa.EMPTY
            || state.m_edge == CNfa.EPSILON;
        if (unlabelled)
        {
            // Nothing to translate on this state.
            continue;
        }

        if (state.m_edge != CNfa.CCL)
        {
            // Edge carries a single character: replace it with its class.
            state.m_edge = ccls[state.m_edge];
        }
        else
        {
            // Edge carries a character set: build the mapped set.
            CSet mapped = new CSet();
            mapped.map(state.m_set, ccls);
            state.m_set = mapped;
        }
    }

    // Publish the mapping on the spec.
    m_spec.m_ccls_map = ccls;
    m_spec.m_dtrans_ncols = mapped_charset_size;
}
/********************************************************
 * Function: mimic
 * Description: Overwrites the fields of this NFA state with
 * those of the given state. The character set is copied via
 * CSet.mimic and the state bit set via Clone; the remaining
 * fields are assigned directly.
 *******************************************************/
public void mimic(CNfa nfa)
{
    // Fields that are assigned directly.
    m_edge = nfa.m_edge;
    m_next = nfa.m_next;
    m_next2 = nfa.m_next2;
    m_accept = nfa.m_accept;
    m_anchor = nfa.m_anchor;

    // Character set: reuse our own CSet when we already have one.
    if (null == nfa.m_set)
    {
        m_set = null;
    }
    else
    {
        if (null == m_set)
        {
            m_set = new CSet();
        }
        m_set.mimic(nfa.m_set);
    }

    // State bit set: take a clone so the two states do not share it.
    if (null == nfa.m_states)
    {
        m_states = null;
    }
    else
    {
        m_states = (SparseBitSet)nfa.m_states.Clone();
    }
}
/***************************************************************
 * Function: CNfaPair
 * Description: Constructor. Creates a pair whose start and end
 * states are both unset.
 **************************************************************/
public CNfaPair()
{
    m_end = null;
    m_start = null;
}
/***************************************************************
 * Function: discardCNfa
 * Description: Removes the given NFA state from the spec's list
 * of NFA states.
 **************************************************************/
private void discardCNfa(CNfa nfa)
{
    m_spec.m_nfa_states.removeElement(nfa);
}
/********************************************************
 * Function: CNfa
 * Description: Constructor. Produces a state with an EMPTY
 * edge, no successors, no accept action, no anchor, no label
 * and no character or state sets.
 *******************************************************/
public CNfa()
{
    // Edge and outgoing transitions.
    m_edge = EMPTY;
    m_next = null;
    m_next2 = null;

    // Sets attached to the state.
    m_set = null;
    m_states = null;

    // Acceptance information and label.
    m_accept = null;
    m_anchor = CSpec.NONE;
    m_label = NO_LABEL;
}
/***************************************************************
 * Function: factor
 * Description: Recursive descent regular expression parser.
 * Parses a term and, when the current token is a closure, plus
 * closure or optional operator, wraps the term's NFA fragment in
 * a fresh start/end pair with the matching extra transitions and
 * then advances past the operator.
 **************************************************************/
private void factor(CNfaPair pair)
{
#if (DESCENT_DEBUG)
    {
        CUtility.enter("factor", m_spec.m_lexeme, m_spec.m_current_token);
    }
#endif

    term(pair);

    // Classify the operator once; nothing below changes the current
    // token until advance() is called.
    bool isClosure = CLexGen.CLOSURE == m_spec.m_current_token;
    bool isPlusClose = CLexGen.PLUS_CLOSE == m_spec.m_current_token;
    bool isOptional = CLexGen.OPTIONAL == m_spec.m_current_token;

    if (isClosure || isPlusClose || isOptional)
    {
        // Allocation order (start, then end) is kept as in the original.
        CNfa wrapStart = CAlloc.newCNfa(m_spec);
        CNfa wrapEnd = CAlloc.newCNfa(m_spec);

        wrapStart.m_next = pair.m_start;
        pair.m_end.m_next = wrapEnd;

        if (isClosure || isOptional)
        {
            // Allow the whole fragment to be skipped.
            wrapStart.m_next2 = wrapEnd;
        }

        if (isClosure || isPlusClose)
        {
            // Allow the fragment to repeat.
            pair.m_end.m_next2 = pair.m_start;
        }

        pair.m_start = wrapStart;
        pair.m_end = wrapEnd;
        m_lexGen.advance();
    }

#if (DESCENT_DEBUG)
    {
        CUtility.leave("factor", m_spec.m_lexeme, m_spec.m_current_token);
    }
#endif
}
/***************************************************************
 * Function: newCNfa
 * Description: Creates a new NFA state, appends it to the spec's
 * list of NFA states, marks its edge as EPSILON and returns it.
 **************************************************************/
public static CNfa newCNfa(CSpec spec)
{
    /* UNDONE: Buffer this? */
    CNfa state = new CNfa();

    // The label is not assigned here (the original assignment from
    // spec.m_nfa_states.size() is disabled).
    spec.m_nfa_states.addElement(state);
    state.m_edge = CNfa.EPSILON;

    return state;
}
/***************************************************************
 * Function: processStates
 * Description: For every lexer state index whose bit is set in
 * the given bit set, appends the given NFA state to that lexer
 * state's rule list.
 **************************************************************/
private void processStates(SparseBitSet states, CNfa current)
{
    // The count is read once, before the loop starts.
    int stateCount = m_spec.m_states.Count;

    for (int index = 0; index < stateCount; ++index)
    {
        if (!states.Get(index))
        {
            continue;
        }

        m_spec.m_state_rules[index].addElement(current);
    }
}
/***************************************************************
 * Function: CSpec
 * Description: Constructor. Stores the owning lexer generator and
 * resets every specification field to its initial value.
 **************************************************************/
public CSpec(CLexGen lexGen)
{
    m_lexGen = lexGen;

    /* Regular expression token state. */
    m_current_token = CLexGen.EOS;
    m_lexeme = '\0';
    m_in_quote = false;
    m_in_ccl = false;

    /* Lexer state table; YYINITIAL is registered first, using the
       table's current count as its value. */
    m_states = new Hashtable();
    m_states.Add("YYINITIAL", m_states.Count);

    /* Lexical macro table. */
    m_macros = new Hashtable();

    /* Lexer options. */
    m_integer_type = false;
    m_intwrap_type = false;
    m_count_lines = false;
    m_count_chars = true;
    m_cup_compatible = false;
    m_unix = true;
    m_public = false;
    m_yyeof = false;
    m_ignorecase = false;

    /* JLex runtime options. */
    m_verbose = false;

    /* NFA and DFA containers. */
    m_nfa_start = null;
    m_nfa_states = new Vector();
    m_dfa_states = new Vector();
    m_dfa_sets = new Hashtable();

    /* Transition table data. */
    m_dtrans_vector = new Vector();
    m_dtrans_ncols = CUtility.MAX_SEVEN_BIT + 1;
    m_row_map = null;
    m_col_map = null;
    m_state_dtrans = null;
    m_state_rules = null;

    /* Accept and anchor tables. */
    m_accept_vector = null;
    m_anchor_array = null;

    /* User code sections: buffers start out unset... */
    m_init_code = null;
    m_init_throw_code = null;
    m_yylex_throw_code = null;
    m_class_code = null;
    m_eof_code = null;
    m_eof_value_code = null;
    m_eof_throw_code = null;

    /* ...and their read counters start at zero. */
    m_init_read = 0;
    m_init_throw_read = 0;
    m_yylex_throw_read = 0;
    m_class_read = 0;
    m_eof_read = 0;
    m_eof_value_read = 0;
    m_eof_throw_read = 0;
}
/** Compute the minimum set of character classes needed to tell the
 *  NFA edges apart. All characters start in one class (class 0);
 *  each labelled edge is then examined in turn, and any class that
 *  has members both on and off that edge is split by moving the
 *  on-edge members into a newly numbered class. The results are
 *  stored in original_charset_size, ccls and mapped_charset_size.
 *  [CSA, 25-Jul-1999] */
private void computeClasses(CSpec m_spec)
{
    this.original_charset_size = m_spec.m_dtrans_ncols;
    this.ccls = new int[original_charset_size]; // all entries start at 0

    int classCount = 1;
    SparseBitSet onEdgeClasses = new SparseBitSet();
    SparseBitSet offEdgeClasses = new SparseBitSet();
    Hashtable renamed = new Hashtable();

    if (m_spec.m_verbose)
    {
        System.Console.Write("Working on character classes.");
    }

    IEnumerator states = m_spec.m_nfa_states.elements();
    while (states.MoveNext())
    {
        CNfa state = (CNfa)states.Current;

        bool carriesLabel = state.m_edge != CNfa.EMPTY
            && state.m_edge != CNfa.EPSILON;
        if (!carriesLabel)
        {
            // Such edges say nothing about how characters differ.
            continue;
        }

        // Record which classes have members on and off this edge.
        onEdgeClasses.clearAll();
        offEdgeClasses.clearAll();
        for (int ch = 0; ch < ccls.Length; ch++)
        {
            // True for a single-character edge equal to ch, or a
            // character-set edge whose set contains ch.
            bool onEdge = state.m_edge == ch
                || (state.m_edge == CNfa.CCL && state.m_set.contains(ch));
            if (onEdge)
            {
                onEdgeClasses.Set(ccls[ch]);
            }
            else
            {
                offEdgeClasses.Set(ccls[ch]);
            }
        }

        // Classes present on both sides are the ones to split.
        onEdgeClasses.and(offEdgeClasses);
        bool nothingToSplit = onEdgeClasses.size() == 0;
        if (m_spec.m_verbose)
        {
            System.Console.Write(nothingToSplit ? "." : ":");
        }
        if (nothingToSplit)
        {
            continue;
        }

        // Move the on-edge members of each split class to a new class;
        // 'renamed' maps an old class number to its replacement.
        renamed.Clear();
        for (int ch = 0; ch < ccls.Length; ch++)
        {
            if (!onEdgeClasses.Get(ccls[ch]))
            {
                continue; // class of ch is not being split
            }

            bool onEdge = state.m_edge == ch
                || (state.m_edge == CNfa.CCL && state.m_set.contains(ch));
            if (!onEdge)
            {
                continue; // off-edge members keep the old class
            }

            int oldClass = ccls[ch];
            object replacement = renamed[oldClass];
            if (replacement == null)
            {
                // First on-edge member of this class: allocate a number.
                replacement = classCount++;
                renamed.Add(oldClass, replacement);
            }
            ccls[ch] = (int)replacement;
        }
    }

    if (m_spec.m_verbose)
    {
        System.Console.WriteLine();
        System.Console.WriteLine("NFA has " + classCount + " distinct character classes.");
    }

    this.mapped_charset_size = classCount;
}
/***************************************************************
 * Function: rule
 * Description: Recursive descent regular expression parser.
 * Parses one rule: an optional beginning-of-line anchor, an
 * expression, and an optional end-of-line anchor. The final state
 * of the resulting NFA fragment receives the accept action and
 * the anchor flags. Returns the start state of the fragment.
 *
 * A rule without an end state is reported through
 * CError.parse_error with E_ZERO. That check is also made before
 * the end-of-line handling, because that handling dereferences
 * the end state.
 **************************************************************/
private CNfa rule()
{
    CNfaPair pair;
    CNfa start = null;
    CNfa end = null;
    int anchor = CSpec.NONE;

#if (DESCENT_DEBUG)
    {
        CUtility.enter("rule", m_spec.m_lexeme, m_spec.m_current_token);
    }
#endif

    pair = CAlloc.newCNfaPair();

    if (CLexGen.AT_BOL == m_spec.m_current_token)
    {
        anchor = anchor | CSpec.START;
        m_lexGen.advance();
        expr(pair);

        // CSA: fixed beginning-of-line operator. 8-aug-1999
        // A new state with a BOL edge is placed in front of the expression.
        start = CAlloc.newCNfa(m_spec);
        start.m_edge = m_spec.BOL;
        start.m_next = pair.m_start;
        end = pair.m_end;
    }
    else
    {
        expr(pair);
        start = pair.m_start;
        end = pair.m_end;
    }

    if (CLexGen.AT_EOL == m_spec.m_current_token)
    {
        m_lexGen.advance();

        /* The statements below dereference 'end', so a null rule must
           be reported here rather than only after this branch.
           NOTE(review): this assumes parse_error does not return
           normally - confirm; if it does return, the behavior is the
           same as before this check was added. */
        if (end == null)
        {
            CError.parse_error(CError.E_ZERO, m_input.m_line_number);
        }

        // CSA: fixed end-of-line operator. 8-aug-1999
        // After the expression, branch to either the NL pair or a
        // state with an EOF edge; both lead to the NL pair's end state.
        CNfaPair nlpair = CAlloc.newNLPair(m_spec);
        end.m_next = CAlloc.newCNfa(m_spec);
        end.m_next.m_next = nlpair.m_start;
        end.m_next.m_next2 = CAlloc.newCNfa(m_spec);
        end.m_next.m_next2.m_edge = m_spec.EOF;
        end.m_next.m_next2.m_next = nlpair.m_end;
        end = nlpair.m_end;
        anchor = anchor | CSpec.END;
    }

    /* Check for null rules. Charles Fischer found this bug. [CSA] */
    if (end == null)
    {
        CError.parse_error(CError.E_ZERO, m_input.m_line_number);
    }

    /* Handle end of regular expression.  See page 103. */
    end.m_accept = m_lexGen.packAccept();
    end.m_anchor = anchor;

    /* Begin: Removed for states. */
    /*m_lexGen.advance();*/
    /* End: Removed for states. */

#if (DESCENT_DEBUG)
    {
        CUtility.leave("rule", m_spec.m_lexeme, m_spec.m_current_token);
    }
#endif

    return (start);
}