Пример #1
0
	    // ---------------------------------------------------------------------------------
	    //
	    // ScanSet: Construct a UnicodeSet from the text at the current scan
	    // position, then advance the scan position to the first character
	    // after the set.
	    //
	    // A new RBBI setref node referring to the set is pushed onto the node
	    // stack.
	    //
	    // The scan position is normally under the control of the state machine
	    // that controls rule parsing. UnicodeSets, however, are parsed by
	    // the UnicodeSet constructor, not by the RBBI rule parser.
	    //
	    // ---------------------------------------------------------------------------------
	    internal void ScanSet() {
	        // Remember where the set pattern begins; this becomes the first
	        // position recorded on the new setref node.
	        int setStart = fScanIndex;
	        ILOG.J2CsMapping.Text.ParsePosition parsePos = new ILOG.J2CsMapping.Text.ParsePosition(fScanIndex);

	        // The UnicodeSet constructor does the parsing and reports, through
	        // parsePos, how far into the rule text it consumed.
	        UnicodeSet parsedSet = null;
	        try {
	            parsedSet = new UnicodeSet(fRB.fRules, parsePos, fSymbolTable,
	                    IBM.ICU.Text.UnicodeSet.IGNORE_SPACE);
	        } catch (Exception) { // TODO: catch fewer exception types.
	            // Any failure inside the UnicodeSet constructor is reported as
	            // an RBBI rule builder "malformed set" error instead.
	            // NOTE(review): if Error() can return instead of throwing,
	            // parsedSet is still null when it is dereferenced below --
	            // confirm that Error() always throws.
	            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_MALFORMED_SET);
	        }

	        // An empty set is reported as an error: it is almost certainly not
	        // what the rule author intended, and rejecting it here avoids
	        // corner cases in the later tree manipulation code.
	        // TODO: this shouldn't be an error; it does happen.
	        if (parsedSet.IsEmpty()) {
	            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
	        }

	        // Step the RBBI scanner over the set pattern one character at a
	        // time. Assigning fScanIndex directly would throw off the line/char
	        // positions that are maintained for error reporting.
	        int setEnd = parsePos.GetIndex();
	        while (fNextIndex < setEnd) {
	            NextCharLL();
	        }

	        // Push a setref node covering the text that was just consumed.
	        RBBINode setRefNode = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
	        setRefNode.fFirstPos = setStart;
	        setRefNode.fLastPos = fNextIndex;
	        int textLength = (setRefNode.fLastPos) - (setRefNode.fFirstPos);
	        setRefNode.fText = fRB.fRules.Substring(setRefNode.fFirstPos, textLength);

	        // FindSetFor() serves several purposes here:
	        // - Adopts storage for the UnicodeSet and becomes responsible for
	        //   deleting it.
	        // - Maintains the collection of all sets in use, needed later for
	        //   establishing character categories for the run time engine.
	        // - Eliminates multiple instances of the same set.
	        // - Creates a new uset node if necessary (if this isn't a duplicate).
	        FindSetFor(setRefNode.fText, setRefNode, parsedSet);
	    }
Пример #2
0
        /// <exclude/>
        /// <summary>
        /// Return the set of all characters that may be modified by this
        /// Transliterator, ignoring the effect of our filter.
        /// </summary>
        ///
        protected internal override UnicodeSet HandleGetSourceSet()
        {
            // Accumulates source sets from the component transliterators, in
            // order, until the accumulated set first becomes non-empty.
            UnicodeSet sourceSet = new UnicodeSet();

            for (int index = 0; index < trans.Length; index++)
            {
                sourceSet.AddAll(trans[index].GetSourceSet());

                // Take Hiragana-Latin as an example. It is really
                // Hiragana-Katakana followed by Katakana-Latin, whose source
                // sets are roughly [:Hiragana:] and [:Katakana:]. The source
                // set of the whole transliterator, however, is [:Hiragana:]
                // ONLY -- that is, the first non-empty source set.
                //
                // This is a heuristic, and not 100% reliable.
                if (sourceSet.IsEmpty())
                {
                    continue;
                }

                return sourceSet;
            }

            // Every component contributed an empty source set (or there are
            // no components at all).
            return sourceSet;
        }