// ---------------------------------------------------------------------------------
//
// ScanSet   Build a UnicodeSet from the rule text that begins at the current
//           scan position, then move the scan position to the first
//           character after the set pattern.
//
//           A new RBBI setRef node referring to the set is pushed onto the
//           node stack.
//
//           Rule text is normally consumed under the control of the
//           rule-parsing state machine. UnicodeSet patterns are the
//           exception: they are parsed by the UnicodeSet constructor, not by
//           the RBBI rule parser, so the scanner is re-synchronized with the
//           constructor's end position afterwards.
//
// ---------------------------------------------------------------------------------
internal void ScanSet()
{
    int startPos = fScanIndex;
    ILOG.J2CsMapping.Text.ParsePosition pos = new ILOG.J2CsMapping.Text.ParsePosition(startPos);

    UnicodeSet uset = null;
    try
    {
        uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, IBM.ICU.Text.UnicodeSet.IGNORE_SPACE);
    }
    catch (Exception)
    {
        // TODO: catch fewer exception types.
        // Repackage UnicodeSet errors as RBBI rule builder errors, with
        // location info.
        Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_MALFORMED_SET);
    }

    // Verify that the set contains at least one code point.
    // NOTE(review): if Error(...) above can return normally, uset is still
    // null at this point -- confirm that Error always throws.
    if (uset.IsEmpty())
    {
        // An empty set is reported as an error because it is almost
        // certainly not what the rule author intended, and rejecting it
        // avoids corner cases in the tree manipulation code that runs later.
        // TODO: this shouldn't be an error; it does happen.
        Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
    }

    // Advance the RBBI parse position over the UnicodeSet pattern one
    // character at a time. Assigning fScanIndex directly would throw off
    // the line/char positions maintained for error reporting.
    int patternEnd = pos.GetIndex();
    while (fNextIndex < patternEnd)
    {
        NextCharLL();
    }

    // Record the span of rule text that the set pattern occupied.
    RBBINode node = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
    node.fFirstPos = startPos;
    node.fLastPos = fNextIndex;
    int patternLength = node.fLastPos - node.fFirstPos;
    node.fText = fRB.fRules.Substring(node.fFirstPos, patternLength);

    // FindSetFor() serves several purposes here:
    //  - Adopts storage for the UnicodeSet, and will be responsible for
    //    deleting it.
    //  - Maintains the collection of all sets in use, needed later for
    //    establishing character categories for the run time engine.
    //  - Eliminates multiple instances of the same set.
    //  - Creates a new uset node if necessary (if this isn't a duplicate).
    FindSetFor(node.fText, node, uset);
}
/// <exclude/>
/// <summary>
/// Return the set of all characters that may be modified by this
/// Transliterator, ignoring the effect of our filter.
/// </summary>
/// <returns>
/// The source sets of the entries in <c>trans</c>, accumulated in order
/// until the accumulated set becomes non-empty; an empty set if no entry
/// contributes any characters.
/// </returns>
protected internal override UnicodeSet HandleGetSourceSet()
{
    UnicodeSet sourceSet = new UnicodeSet();
    int index = 0;
    while (index < trans.Length)
    {
        sourceSet.AddAll(trans[index].GetSourceSet());

        // Take the example of Hiragana-Latin. This is really
        // Hiragana-Katakana; Katakana-Latin. The source sets of those two
        // steps are roughly [:Hiragana:] and [:Katakana:], but the source
        // set of the whole transliterator is actually [:Hiragana:] ONLY --
        // that is, the first non-empty source set.
        // This is a heuristic, and not 100% reliable.
        if (!sourceSet.IsEmpty())
        {
            return sourceSet;
        }

        index++;
    }

    return sourceSet;
}