Ejemplo n.º 1
0
 /// <summary>
 /// Checks whether this set has at least one character or string in
 /// common with the given set.
 /// </summary>
 /// <param name="set">This set.</param>
 /// <param name="s">Set whose characters and strings are looked for in this set.</param>
 /// <returns>True if the condition is met.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
 /// <draft>ICU4N 60.1</draft>
 /// <provisional>This API might change or be removed in a future release.</provisional>
 public static bool ContainsSome(this UnicodeSet set, UnicodeSet s)
 {
     // A null receiver is rejected explicitly; otherwise the call is forwarded to the set.
     return set == null
         ? throw new ArgumentNullException(nameof(set))
         : set.ContainsSome(s);
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Unions the set of all characters that may be output by this object
 /// into the given set. This implementation adds nothing.
 /// </summary>
 ///
 /// <param name="toUnionTo">the set into which to union the output characters</param>
 public virtual void AddReplacementSetTo(UnicodeSet toUnionTo)
 {
     // Intentionally empty.
     // This replacer outputs the source text between matchStart and
     // matchLimit, which depends on the input and therefore cannot be
     // computed here. The two options are to add nothing or to add ALL
     // characters; adding nothing was judged the more useful choice.
 }
Ejemplo n.º 3
0
        public void ToChar()
        {
            // Pattern made of four adjacent ranges plus the multi-character string "TZ".
            const string rangePattern = "[A-GH-NO-ST-Z{TZ}]";

            IEnumerable<string> actual = UnicodeSet.ToCharacters(rangePattern);

            // Every letter A..Z individually, followed by the string element.
            IEnumerable<string> wanted = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z TZ".Split(' ');
            Assert.That(actual, Is.EqualTo(wanted));
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Keeps only those elements of this set that also occur in the
 /// specified set; every other element is removed. After the call this
 /// set holds the <i>intersection</i> of the two sets.
 /// </summary>
 /// <param name="set">This set.</param>
 /// <param name="c">Set that defines which elements this set will retain.</param>
 /// <returns>The value returned by the forwarded <c>RetainAll</c> call.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
 /// <stable>ICU 2.0</stable>
 public static UnicodeSet RetainAll(this UnicodeSet set, UnicodeSet c)
 {
     // A null receiver is rejected explicitly; otherwise the call is forwarded to the set.
     return set == null
         ? throw new ArgumentNullException(nameof(set))
         : set.RetainAll(c);
 }
Ejemplo n.º 5
0
        public void ToPattern()
        {
            // Input lists the string element "TZ" between "S" and "T".
            List<string> elements = new List<string>(
                "A B C D E F G H I J K L M N O P Q R S TZ T U V W X Y Z".Split(' '));

            string pattern = UnicodeSet.ToPattern(elements);

            // The letters are expected to collapse into one range, with the string in braces.
            const string wanted = "[A-Z{TZ}]";
            Assert.That(pattern, Is.EqualTo(wanted));
        }
Ejemplo n.º 6
0
 /// <summary>
 /// Adds all characters in the given range to this set (uses preferred naming convention).
 /// </summary>
 /// <param name="set">This set.</param>
 /// <param name="start">Where the range of characters to add starts.</param>
 /// <param name="end">Where the range of characters to add ends.</param>
 /// <returns>A reference to this object.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
 /// <draft>ICU4N 60.1</draft>
 /// <provisional>This API might change or be removed in a future release.</provisional>
 public static UnicodeSet AddAll(this UnicodeSet set, int start, int end)
 {
     // A null receiver is rejected explicitly; otherwise the call is forwarded to the set.
     return set == null
         ? throw new ArgumentNullException(nameof(set))
         : set.AddAll(start, end);
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Adds the strings of a collection to this <see cref="UnicodeSet"/>.
 /// Uses standard naming convention.
 /// </summary>
 /// <param name="set">This set.</param>
 /// <param name="source">Collection whose strings are added to this set.</param>
 /// <returns>A reference to this object.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
 /// <draft>ICU4N 60.1</draft>
 /// <provisional>This API might change or be removed in a future release.</provisional>
 // ICU4N specific overload to optimize for string
 public static UnicodeSet AddAll(this UnicodeSet set, IEnumerable <string> source)
 {
     // A null receiver is rejected explicitly; otherwise the call is forwarded to the set.
     return set == null
         ? throw new ArgumentNullException(nameof(set))
         : set.AddAll(source);
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Creates an entry that stores the given ID, ID block vector, data
 /// vector and compound filter without copying them.
 /// </summary>
 /// <param name="theID">Value stored in <c>ID</c>.</param>
 /// <param name="theIDBlockVector">Value stored in <c>idBlockVector</c>.</param>
 /// <param name="theDataVector">Value stored in <c>dataVector</c>.</param>
 /// <param name="theCompoundFilter">Value stored in <c>compoundFilter</c>.</param>
 public CompoundRBTEntry(String theID,
                         ArrayList theIDBlockVector,
                         ArrayList theDataVector,
                         UnicodeSet theCompoundFilter)
 {
     // Plain field capture; the arguments are kept by reference.
     this.ID = theID;
     this.idBlockVector = theIDBlockVector;
     this.dataVector = theDataVector;
     this.compoundFilter = theCompoundFilter;
 }
Ejemplo n.º 9
0
        /// <summary>
        /// Adds the characters selected by this transliterator's filter to
        /// <paramref name="sourceSet"/>. <paramref name="targetSet"/> is not modified.
        /// </summary>
        /// <seealso cref="Transliterator.AddSourceTargetSet(UnicodeSet, UnicodeSet, UnicodeSet)"/>
        public override void AddSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)
        {
            // GetFilterAsUnicodeSet yields this object's filter intersected with the
            // input filter (per the original note); that result is what gets added.
            // Nothing is contributed to the target set.
            sourceSet.AddAll(GetFilterAsUnicodeSet(inputFilter));
        }
Ejemplo n.º 10
0
        public void ToCharacters_UpperPlane()
        {
            // A single code point outside the Basic Multilingual Plane.
            const int supplementaryCodePoint = 0x00010D00;
            string pattern = @"[\U00010D00]";

            string first = UnicodeSet.ToCharacters(pattern).First();

            // The element must be the surrogate-pair string for that code point.
            Assert.That(first, Is.EqualTo("𐴀"));
            Assert.That(char.ConvertToUtf32(first, 0), Is.EqualTo(supplementaryCodePoint));
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Adds the contents of the UnicodeSet (as strings) to a collection.
 /// </summary>
 /// <typeparam name="T">Collection type.</typeparam>
 /// <param name="set">This set.</param>
 /// <param name="target">Collection to add into.</param>
 /// <returns>The value returned by the forwarded <c>AddAllTo</c> call.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
 /// <draft>ICU4N 60.1</draft>
 /// <provisional>This API might change or be removed in a future release.</provisional>
 public static T AddAllTo <T>(this UnicodeSet set, T target) where T : ICollection <string>
 {
     // A null receiver is rejected explicitly; otherwise the call is forwarded to the set.
     return set == null
         ? throw new ArgumentNullException(nameof(set))
         : set.AddAllTo(target);
 }
Ejemplo n.º 12
0
 /// <summary>
 /// Complements the specified character in this set: the character is
 /// removed if it is in this set, and added if it is not.
 /// </summary>
 /// <param name="set">This set.</param>
 /// <param name="c">The character to complement.</param>
 /// <returns>The value returned by the forwarded <c>Complement</c> call.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="set"/> is <c>null</c>.</exception>
 /// <draft>ICU4N 60.1</draft>
 /// <provisional>This API might change or be removed in a future release.</provisional>
 public static UnicodeSet Complement(this UnicodeSet set, int c)
 {
     // A null receiver is rejected explicitly; otherwise the call is forwarded to the set.
     return set == null
         ? throw new ArgumentNullException(nameof(set))
         : set.Complement(c);
 }
Ejemplo n.º 13
0
        public void SinglePatternToChar()
        {
            // Smallest interesting case: a set holding exactly one character.
            const string singleCharPattern = "[A]";
            IEnumerable<string> actual = UnicodeSet.ToCharacters(singleCharPattern);

            IEnumerable<string> wanted = new[] { "A" };
            Assert.That(actual, Is.EqualTo(wanted));
        }
Ejemplo n.º 14
0
 /// <summary>
 /// Adds the contents of <c>matchIterator</c> to a set via <c>AddAll</c>
 /// and returns that set.
 /// </summary>
 /// <param name="result">Set to add into; when <c>null</c>, a new empty set is created.</param>
 /// <returns>The set that was added into (the argument, or the newly created set).</returns>
 public UnicodeSet GetMatchSet(UnicodeSet result)
 {
     // Fall back to a fresh set when the caller did not supply one.
     UnicodeSet destination = result == null ? new UnicodeSet() : result;

     AddAll(matchIterator, destination);
     return destination;
 }
Ejemplo n.º 15
0
	    // ---------------------------------------------------------------------------------
	    //
	    // ScanSet: constructs a UnicodeSet from the rule text at the current scan
	    // position and advances the scan position to the first character after
	    // the set.
	    //
	    // A new RBBI setRef node referring to the set is pushed onto the node
	    // stack.
	    //
	    // Rule parsing is normally driven by the state machine; UnicodeSets are
	    // the exception, since they are parsed by the UnicodeSet constructor and
	    // not by the RBBI rule parser.
	    //
	    // ---------------------------------------------------------------------------------
	    internal void ScanSet() {
	        UnicodeSet parsedSet = null;
	        ILOG.J2CsMapping.Text.ParsePosition pos = new ILOG.J2CsMapping.Text.ParsePosition(fScanIndex);
	        int startPos = fScanIndex;
	
	        try {
	            parsedSet = new UnicodeSet(fRB.fRules, pos, fSymbolTable,
	                    IBM.ICU.Text.UnicodeSet.IGNORE_SPACE);
	        } catch (Exception) { // TODO: catch fewer exception types.
	            // UnicodeSet failures are reported as RBBI rule builder errors
	            // so that they carry location info.
	            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_MALFORMED_SET);
	        }
	
	        // The set must contain at least one code point. An empty set is
	        // reported as an error: it is almost certainly not what the user
	        // wanted, and rejecting it avoids corner cases in the later tree
	        // manipulation code.
	        // TODO: this shouldn't be an error; it does happen.
	        if (parsedSet.IsEmpty()) {
	            Error(IBM.ICU.Text.RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
	        }
	
	        // Step the RBBI parse position over the UnicodeSet pattern one
	        // character at a time. Assigning fScanIndex directly would throw
	        // off the line/char positions kept for error reporting.
	        int patternEnd = pos.GetIndex();
	        while (fNextIndex < patternEnd) {
	            NextCharLL();
	        }
	
	        RBBINode setRefNode = PushNewNode(IBM.ICU.Text.RBBINode.setRef);
	        setRefNode.fFirstPos = startPos;
	        setRefNode.fLastPos = fNextIndex;
	        setRefNode.fText = fRB.fRules.Substring(setRefNode.fFirstPos,(setRefNode.fLastPos)-(setRefNode.fFirstPos));
	
	        // FindSetFor() serves several purposes here:
	        // - adopts storage for the UnicodeSet and becomes responsible for it;
	        // - maintains the collection of all sets in use, needed later to
	        //   establish character categories for the run time engine;
	        // - eliminates multiple instances of the same set;
	        // - creates a new uset node if necessary (if this isn't a duplicate).
	        FindSetFor(setRefNode.fText, setRefNode, parsedSet);
	    }
Ejemplo n.º 16
0
 /// <summary>
 /// Returns the next token in this string tokenizer's string after first
 /// replacing the delimiter set with the characters of <paramref name="delim"/>.
 /// The current position is advanced beyond the recognized token, and the
 /// new delimiter set remains the default after this call.
 /// </summary>
 ///
 /// <param name="delim">the new delimiters; <c>null</c> or an empty string selects <c>EMPTY_DELIMITER_</c>.</param>
 /// <returns>the next token, after switching to the new delimiter set.</returns>
 /// <exception cref="NoSuchElementException">if there are no more tokens in this tokenizer's string.</exception>
 /// @stable ICU 2.4
 public String NextToken(String delim)
 {
     bool hasDelimiters = delim != null && delim.Length > 0;

     // Start from the empty delimiter set and replace it only when
     // the caller actually supplied delimiter characters.
     m_delimiters_ = EMPTY_DELIMITER_;
     if (hasDelimiters)
     {
         m_delimiters_ = new UnicodeSet();
         m_delimiters_.AddAll(delim);
     }

     String token = NextToken(m_delimiters_);
     return token;
 }
Ejemplo n.º 17
0
        /// <summary>
        /// Adds the characters selected by this transliterator's filter to
        /// <paramref name="sourceSet"/>. <paramref name="targetSet"/> is not modified.
        /// </summary>
        /// <seealso cref="Transliterator.AddSourceTargetSet(UnicodeSet, UnicodeSet, UnicodeSet)"/>
#pragma warning disable 672
        public override void AddSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)
        {
#pragma warning restore 672
#pragma warning disable 612, 618
            // The filter is combined with the input filter by GetFilterAsUnicodeSet
            // (per the original note); warnings 612/618 are silenced for that call.
            UnicodeSet effectiveFilter = GetFilterAsUnicodeSet(inputFilter);
#pragma warning restore 612, 618

            // Only the source set receives characters; the target is left alone.
            sourceSet.AddAll(effectiveFilter);
        }
Ejemplo n.º 18
0
 /// <summary>
 /// Registers a set under a variable name: the set's string form is added
 /// as a symbol and a code-point pick for the set is added under the same
 /// name. A <c>null</c> set is ignored.
 /// </summary>
 /// <param name="variable">Name under which the symbol and the pick are added.</param>
 /// <param name="set">Set to register; may be <c>null</c>.</param>
 /// <returns>This instance, to allow chaining.</returns>
 public BNF AddSet(String variable, UnicodeSet set)
 {
     // Nothing to register without a set.
     if (set == null)
     {
         return this;
     }

     String pattern = set.ToString();
     t.AddSymbol(variable, pattern, 0, pattern.Length);
     AddPick(variable, IBM.ICU.Charset.Pick.CodePointMthd(set));
     return this;
 }
Ejemplo n.º 19
0
        // might want to add to UnicodeSet
        /// <summary>
        /// Concatenates the strings produced by iterating over <paramref name="set"/>
        /// with a <c>UnicodeSetIterator</c>, in iteration order.
        /// </summary>
        /// <param name="set">Set to iterate.</param>
        /// <returns>The concatenation of every element's string.</returns>
        private String getList(UnicodeSet set)
        {
            StringBuffer joined = new StringBuffer();
            UnicodeSetIterator cursor = new UnicodeSetIterator(set);

            // Next() advances the iterator and reports whether an element is available.
            while (cursor.Next())
            {
                joined.Append(cursor.GetString());
            }

            return joined.ToString();
        }
Ejemplo n.º 20
0
        /// <exclude/>
        /// <summary>
        /// Returns the set of all characters that may be generated as replacement
        /// text by this transliterator, built as the union of the target sets
        /// of every entry in <c>trans</c>.
        /// </summary>
        ///
        /// <returns>A new set holding the combined target sets.</returns>
        public override UnicodeSet GetTargetSet()
        {
            UnicodeSet combined = new UnicodeSet();
            int index = 0;

            // This is a heuristic, and not 100% reliable.
            while (index < trans.Length)
            {
                combined.AddAll(trans[index].GetTargetSet());
                ++index;
            }

            return combined;
        }
Ejemplo n.º 21
0
 /// <summary>
 /// Returns the ending offset found by repeatedly matching
 /// <paramref name="testSet"/> against the text, starting at
 /// <paramref name="offset"/>, until a position is found that doesn't match
 /// or the end of the text is reached.
 /// </summary>
 ///
 /// <param name="str0">Text to scan.</param>
 /// <param name="offset">Offset in <paramref name="str0"/> at which scanning starts.</param>
 /// <param name="testSet">Set whose <c>MatchesAt</c> decides whether a position matches.</param>
 /// <returns>The first offset at or after <paramref name="offset"/> at which
 /// <paramref name="testSet"/> does not match, or the offset at which scanning stopped
 /// because the text ended.</returns>
 public int Span(String str0, int offset, UnicodeSet testSet)
 {
     // Bound the scan by the text length: past the end there is nothing to match.
     while (offset < str0.Length)
     {
         int newOffset = testSet.MatchesAt(str0, offset);

         // A negative result means "no match at this position".
         // NOTE(review): a non-negative result is assumed to be the offset just
         // past the match - confirm against UnicodeSet.MatchesAt.
         // A result that does not move forward is also treated as the end of
         // the span so the loop can never stall.
         if (newOffset < 0 || newOffset <= offset)
         {
             return offset;
         }

         // Continue after the match. The original never updated offset,
         // so a successful match made this loop spin forever.
         offset = newOffset;
     }
     return offset;
 }
Ejemplo n.º 22
0
        /// <summary>
        /// Associates a value with every code point of a set by walking the set
        /// range by range; otherwise like put.
        /// </summary>
        ///
        /// <param name="codepoints">Set of code points to assign.</param>
        /// <param name="value_ren">Value passed to <c>_putAll</c> for each range.</param>
        /// <returns>this (for chaining)</returns>
        public UnicodeMap PutAll(UnicodeSet codepoints, Object value_ren)
        {
            // TODO optimize
            for (UnicodeSetIterator ranges = new UnicodeSetIterator(codepoints); ranges.NextRange();)
            {
                // Each step exposes one range as [codepoint, codepointEnd].
                _putAll(ranges.codepoint, ranges.codepointEnd, value_ren);
            }

            return this;
        }
Ejemplo n.º 23
0
        //
        // RBBISymbolTable::lookupMatcher. This function from the abstract symbol
        // table interface maps a single stand-in character to a Unicode Set.
        // The Unicode Set code uses this mechanism so that all references to
        // the same $variable name refer to a single common Unicode Set instance.
        //
        // This implementation cheats a little and does not maintain a map of
        // stand-in chars to sets. Instead, it takes advantage of the fact that
        // the UnicodeSet constructor will always call this function right after
        // calling lookup(), so it only needs to remember which set to return
        // between these two calls.
        //
        // Returns the cached set (and clears the cache) when ch is 0xffff;
        // returns null for any other value.
        public virtual UnicodeMatcher LookupMatcher(int ch)
        {
            if (ch != 0xffff)
            {
                return null;
            }

            // Hand out the remembered set exactly once.
            UnicodeSet remembered = fCachedSetLookup;
            fCachedSetLookup = null;
            return remembered;
        }
Ejemplo n.º 24
0
 /// <summary>
 /// Adds a starting or ending string character to the <see cref="spanNotSet"/>
 /// so that a character span ends before any string.
 /// </summary>
 /// <param name="c">Character to add.</param>
 private void AddToSpanNotSet(int c)
 {
     // spanNotSet has no storage of its own while it is null or is the
     // very same object as spanSet.
     bool sharesSpanSet = Utility.SameObjects(spanNotSet, null)
                          || Utility.SameObjects(spanNotSet, spanSet);

     if (sharesSpanSet)
     {
         if (spanSet.Contains(c))
         {
             // Nothing to do: the character is already covered by spanSet.
             return;
         }

         // Work on a thawed clone of spanSet before adding to it.
         spanNotSet = spanSet.CloneAsThawed();
     }

     spanNotSet.Add(c);
 }
Ejemplo n.º 25
0
	    // ----------------------------------------------------------------------------------------
	    //
	    // FindSetFor: given a String,
	    // - finds the corresponding Unicode Set (uset node), creating one if
	    //   necessary;
	    // - sets fLeftChild of the caller's node (should be a setRef node) to
	    //   the uset node.
	    // A hash table of uset nodes is maintained so that the same node is
	    // always used for the same string.
	    // If a "to adopt" set is provided and this key has not been seen before,
	    // the provided set is used for the new node.
	    // If no set is provided: the string "any" (kAny) yields a set with all
	    // chars; otherwise the set holds just the char at the start of the
	    // string.
	    //
	    // ----------------------------------------------------------------------------------------
	    internal void FindSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {
	
	        // First check whether a set is already cached for this string.
	        // If so, the cached node is reused and the caller's set is not needed.
	        RBBIRuleScanner.RBBISetTableEl  cached =
	                (RBBIRuleScanner.RBBISetTableEl ) ILOG.J2CsMapping.Collections.Collections.Get(fSetTable,s);
	        if (cached != null) {
	            node.fLeftChild = cached.val;
	            IBM.ICU.Impl.Assert.Assrt(node.fLeftChild.fType == IBM.ICU.Text.RBBINode.uset);
	            return;
	        }
	
	        // Not seen before. Without a prebuilt set from the caller,
	        // create the UnicodeSet now.
	        if (setToAdopt == null) {
	            if (s.Equals(kAny)) {
	                setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
	            } else {
	                int first = IBM.ICU.Text.UTF16.CharAt(s, 0);
	                setToAdopt = new UnicodeSet(first, first);
	            }
	        }
	
	        // Create the uset node that refers to this UnicodeSet and make it
	        // the child of the caller's setReference node.
	        RBBINode usetNode = new RBBINode(IBM.ICU.Text.RBBINode.uset);
	        usetNode.fInputSet = setToAdopt;
	        usetNode.fParent = node;
	        node.fLeftChild = usetNode;
	        usetNode.fText = s;
	
	        // Record the node in the list of all uset nodes.
	        ILOG.J2CsMapping.Collections.Generics.Collections.Add(fRB.fUSetNodes,usetNode);
	
	        // Remember the node in the set hash table, keyed by its string.
	        RBBIRuleScanner.RBBISetTableEl  entry = new RBBIRuleScanner.RBBISetTableEl ();
	        entry.key = s;
	        entry.val = usetNode;
	        ILOG.J2CsMapping.Collections.Collections.Put(fSetTable,entry.key,entry);
	    }
Ejemplo n.º 26
0
 /// <summary>
 /// Returns the next token in this string tokenizer's string after first
 /// replacing the delimiter set with <paramref name="delim"/>. The current
 /// position is advanced beyond the recognized token, and the new
 /// delimiter set remains the default after this call.
 /// </summary>
 ///
 /// <param name="delim">the new delimiters.</param>
 /// <returns>the next token, after switching to the new delimiter set.</returns>
 /// <exception cref="NoSuchElementException">if there are no more tokens in this tokenizer's string.</exception>
 /// @stable ICU 2.4
 public String NextToken(UnicodeSet delim)
 {
     // Install the new delimiter set, then let CheckDelimiters process it.
     m_delimiters_ = delim;
     CheckDelimiters();

     // Reset the token offset and size to -1.
     m_tokenSize_   = -1;
     m_tokenOffset_ = -1;

     // When delimiters are not returned as tokens, move the next offset
     // to the next non-delimiter position first.
     if (m_returnDelimiters_ == false)
     {
         m_nextOffset_ = GetNextNonDelimiter(m_nextOffset_);
     }

     String token = NextToken();
     return token;
 }
Ejemplo n.º 27
0
 /// <summary>
 /// Makes the character classes disjoint: removes <c>QUOTERS</c> and
 /// <c>whiteSpace</c> from <c>syntax</c>, removes <c>QUOTERS</c> from
 /// <c>whiteSpace</c>, then rebuilds <c>non_string</c> as the union of
 /// <c>syntax</c> and <c>whiteSpace</c>. Sets are cloned before being
 /// modified.
 /// </summary>
 private void FixSets()
 {
     bool syntaxOverlaps = syntax.ContainsSome(QUOTERS) || syntax.ContainsSome(whiteSpace);
     if (syntaxOverlaps)
     {
         // Modify a clone, never the set that was handed in.
         var withoutQuoters = ((UnicodeSet)syntax.Clone()).RemoveAll(QUOTERS);
         syntax = withoutQuoters.RemoveAll(whiteSpace);
     }

     bool whiteSpaceOverlaps = whiteSpace.ContainsSome(QUOTERS);
     if (whiteSpaceOverlaps)
     {
         whiteSpace = ((UnicodeSet)whiteSpace.Clone()).RemoveAll(QUOTERS);
     }

     // Everything that is either syntax or white space.
     non_string = new UnicodeSet(syntax).AddAll(whiteSpace);
 }
Ejemplo n.º 28
0
 /// <summary>
 /// Returns the ending offset found by skipping positions that do not match
 /// <paramref name="testSet"/>, starting at <paramref name="offset"/>, until a
 /// position is found that does match or the end of the text is reached.
 /// </summary>
 ///
 /// <param name="str0">Text to scan.</param>
 /// <param name="offset">Offset in <paramref name="str0"/> at which scanning starts.</param>
 /// <param name="testSet">Set whose <c>MatchesAt</c> decides whether a position matches.</param>
 /// <returns>The first offset at or after <paramref name="offset"/> at which
 /// <paramref name="testSet"/> matches, or the offset at which scanning stopped
 /// because the text ended without a match.</returns>
 public int SpanNot(String str0, int offset, UnicodeSet testSet)
 {
     // Bound the scan by the text length. The original loop had no such
     // bound, so offset ran past the end of the text when nothing matched.
     while (offset < str0.Length)
     {
         // A non-negative result means the set matches at this position.
         if (testSet.MatchesAt(str0, offset) >= 0)
         {
             return offset;
         }

         ++offset;     // try next character position
         // we don't have to worry about surrogates for this.
     }
     return offset;
 }
Ejemplo n.º 29
0
 /* Utilities */

 /// <summary>
 /// Compares two sets and, when they differ, logs both one-sided
 /// differences through <c>TestFmwk.Errln</c>.
 /// </summary>
 /// <param name="here">Test log of the caller; not used by this method.</param>
 /// <param name="set1">First set; its surplus is logged under "UnicodeMap - HashMap".</param>
 /// <param name="set2">Second set; its surplus is logged under "HashMap - UnicodeMap".</param>
 /// <returns><c>true</c> if the sets are equal; otherwise <c>false</c>.</returns>
 public static bool VerifySetsIdentical(AbstractTestLog here, UnicodeSet set1, UnicodeSet set2)
 {
     if (!set1.Equals(set2))
     {
         TestFmwk.Errln("Sets differ:");

         // Elements present only in set1. Copies are used so the inputs stay intact.
         TestFmwk.Errln("UnicodeMap - HashMap");
         UnicodeSet onlyInFirst = new UnicodeSet(set1);
         TestFmwk.Errln(onlyInFirst.RemoveAll(set2).ToPattern(true));

         // Elements present only in set2.
         TestFmwk.Errln("HashMap - UnicodeMap");
         UnicodeSet onlyInSecond = new UnicodeSet(set2);
         TestFmwk.Errln(onlyInSecond.RemoveAll(set1).ToPattern(true));

         return false;
     }

     return true;
 }
Ejemplo n.º 30
0
        /// <summary>
        /// Loads the "idna_rules" test resource bundle and builds the mapping
        /// transform and the label-separator, prohibited and unassigned sets from it.
        /// </summary>
        private NamePrepTransform()
        {
            // Load the resource bundle from this type's assembly.
            ICUResourceBundle rules = (ICUResourceBundle)ICUResourceBundle.GetBundleInstance(
                "Dev/Data/TestData", "idna_rules", typeof(NamePrepTransform).GetTypeInfo().Assembly, true);

            // The mapping rules are the two rule strings, concatenated in this order.
            String combinedRules = bundle_MapRules(rules);

            mapTransform      = new MapTransform("CaseMap", combinedRules, 0 /*Transliterator.FORWARD*/);
            labelSeparatorSet = new UnicodeSet(rules.GetString("LabelSeparatorSet"));
            prohibitedSet     = new UnicodeSet(rules.GetString("ProhibitedSet"));
            unassignedSet     = new UnicodeSet(rules.GetString("UnassignedSet"));

            // Local helper: "MapNoNormalization" followed by "MapNFKC".
            String bundle_MapRules(ICUResourceBundle source)
            {
                String mapRules = source.GetString("MapNoNormalization");
                mapRules += source.GetString("MapNFKC");
                return mapRules;
            }
        }
Ejemplo n.º 31
0
        /// <summary>
        /// Builds the character-class sets <c>UCSCHAR</c>, <c>IUNRESERVED</c>,
        /// <c>IPCHAR</c> and <c>XRI_PCHAR</c>.
        /// </summary>
        static Characters()
        {
            // ucschar as defined by RFC 3987, section 2.2:
            //   %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
            //   / %x10000-1FFFD / %x20000-2FFFD / ... / %xD0000-DFFFD
            //   / %xE1000-EFFFD
            UCSCHAR = new UnicodeSet();
            UCSCHAR.add(0xA0, 0xD7FF);
            UCSCHAR.add(0xF900, 0xFDCF);
            UCSCHAR.add(0xFDF0, 0xFFEF);

            // Supplementary planes 1 through 13 contribute N0000-NFFFD each.
            // The previous lower bounds were missing a hex digit (0x1000 rather
            // than 0x10000, etc.), which pulled surrogates, BMP private-use
            // characters and noncharacters into the set.
            for (int plane = 0x1; plane <= 0xD; plane++)
            {
                int planeStart = plane << 16;
                UCSCHAR.add(planeStart, planeStart + 0xFFFD);
            }

            // Plane 14 is only included from E1000 upward.
            UCSCHAR.add(0xE1000, 0xEFFFD);

            // iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
            IUNRESERVED = new UnicodeSet();
            IUNRESERVED.addAll("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~");
            IUNRESERVED.addAll(UCSCHAR);

            // ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
            IPCHAR = new UnicodeSet();
            IPCHAR.addAll(IUNRESERVED);
            IPCHAR.add(0x25); // '%'
            IPCHAR.addAll(SUB_DELIMS); // sub-delims
            IPCHAR.add(0x3A); // ':'
            IPCHAR.add(0x40); // '@'

            // xri-pchar = iunreserved / pct-encoded / xri-sub-delims / ":"
            XRI_PCHAR = new UnicodeSet();
            XRI_PCHAR.addAll(IUNRESERVED);
            XRI_PCHAR.add(0x25); // '%'
            XRI_PCHAR.addAll(XRI_SUB_DELIMS);
            XRI_PCHAR.add(0x3A); // ':'
        }