// ----------------------------------------------------------------------------------------
//
// Constructor.
//
//   Allocates the scanner's working arrays, builds the fixed Unicode sets used
//   while scanning, and creates the symbol table over the builder's rule text.
//
//   rb   the rule builder this scanner belongs to; its fRules is handed to the
//        symbol table.
//
// ----------------------------------------------------------------------------------------
internal RBBIRuleScanner(RBBIRuleBuilder rb)
{
    // Each kRuleSet_* constant is reduced by this amount to get its slot in fRuleSets.
    const int ruleSetCodeBase = 128;

    fRB = rb;
    fLineNum = 1;

    // Working storage; both stacks are sized from kStackSize.
    kStackSize = 100;
    fStack = new short[kStackSize];
    fNodeStack = new RBBINode[kStackSize];
    fC = new RBBIRuleScanner.RBBIRuleChar();
    fSetTable = new Hashtable();

    // The constant Unicode sets.
    // Note: these could be made static and shared among all RBBIRuleScanner instances.
    UnicodeSet[] sets = new UnicodeSet[10];
    sets[IBM.ICU.Text.RBBIRuleParseTable.kRuleSet_rule_char - ruleSetCodeBase] =
        new UnicodeSet(gRuleSet_rule_char_pattern);
    sets[IBM.ICU.Text.RBBIRuleParseTable.kRuleSet_white_space - ruleSetCodeBase] =
        new UnicodeSet(gRuleSet_white_space_pattern);
    sets[IBM.ICU.Text.RBBIRuleParseTable.kRuleSet_name_char - ruleSetCodeBase] =
        new UnicodeSet(gRuleSet_name_char_pattern);
    sets[IBM.ICU.Text.RBBIRuleParseTable.kRuleSet_name_start_char - ruleSetCodeBase] =
        new UnicodeSet(gRuleSet_name_start_char_pattern);
    sets[IBM.ICU.Text.RBBIRuleParseTable.kRuleSet_digit_char - ruleSetCodeBase] =
        new UnicodeSet(gRuleSet_digit_char_pattern);
    fRuleSets = sets;

    // Created last: it is handed this scanner, so every other field is already in place.
    fSymbolTable = new RBBISymbolTable(this, rb.fRules);
}
// ---------------------------------------------------------------------------------
//
// NextChar for rules scanning. At this level, we handle stripping
// out comments and processing backslash character escapes.
// The rest of the rules grammar is handled at the next level up.
//
//   c   receives the result: fChar is the scanned character (-1 at end of
//       input) and fEscaped is true when the character came from a doubled
//       quote (''), from inside a quoted region, or from a backslash escape.
//
//   A lone single quote is never returned as itself: it toggles quote mode
//   and is reported as an unescaped '(' (quote opened) or ')' (quote closed).
//
// ---------------------------------------------------------------------------------
internal void NextChar(RBBIRuleScanner.RBBIRuleChar c)
{
    fScanIndex = fNextIndex;
    c.fChar = NextCharLL();
    c.fEscaped = false;

    //
    // Check for a '' sequence.
    // These are recognized in all contexts, whether in quoted text or not.
    //
    if (c.fChar == '\'')
    {
        // Only peek at the following character when one exists. A quote that is
        // the very last character of the rules leaves fNextIndex equal to the
        // rule length, and peeking there would index past the end of the text.
        // NOTE(review): assumes fRB.fRules is a string - confirm against RBBIRuleBuilder.
        bool isDoubledQuote = fNextIndex < fRB.fRules.Length
            && IBM.ICU.Text.UTF16.CharAt(fRB.fRules, fNextIndex) == '\'';

        if (isDoubledQuote)
        {
            // Fetch the second quote through NextCharLL so character counts stay correct.
            c.fChar = NextCharLL();
            c.fEscaped = true;
        }
        else
        {
            // Single quote, by itself: toggle quoting mode.
            // Return either '(' or ')', because quotes cause a grouping of
            // the quoted text.
            fQuoteMode = !fQuoteMode;
            c.fChar = fQuoteMode ? '(' : ')';
            c.fEscaped = false; // The paren that we return is not escaped.
            return;
        }
    }

    if (fQuoteMode)
    {
        // Everything inside a quoted region is taken literally.
        c.fEscaped = true;
        return;
    }

    // We are not in a 'quoted region' of the source.
    if (c.fChar == '#')
    {
        // Start of a comment. Consume the rest of it.
        // The new-line char that terminates the comment is always returned.
        // It will be treated as white-space, and serves to break up anything
        // that might otherwise incorrectly clump together with a comment in
        // the middle (a variable name, for example.)
        for (;;)
        {
            c.fChar = NextCharLL();
            if (c.fChar == -1 // EOF
                || c.fChar == '\r'
                || c.fChar == '\n'
                || c.fChar == chNEL
                || c.fChar == chLS)
            {
                break;
            }
        }
    }

    if (c.fChar == -1)
    {
        return;
    }

    //
    // Check for backslash escaped characters.
    // Utility.UnescapeAt() decodes them.
    //
    if (c.fChar == '\\')
    {
        c.fEscaped = true;

        // One-element array used as an in/out position for UnescapeAt.
        int[] unescapeIndex = new int[1];
        unescapeIndex[0] = fNextIndex;
        c.fChar = IBM.ICU.Impl.Utility.UnescapeAt(fRB.fRules, unescapeIndex);

        // An unchanged position means nothing could be decoded after the backslash.
        if (unescapeIndex[0] == fNextIndex)
        {
            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
        }

        // Account for the characters the escape sequence consumed.
        fCharNum += unescapeIndex[0] - fNextIndex;
        fNextIndex = unescapeIndex[0];
    }
}