Beispiel #1
0
        /// <summary>
        /// Rewrites every part of a character class as a range and returns those
        /// ranges ordered by their lower bound.
        /// </summary>
        /// <param name="e">Parts to convert; each one must be a CharPartSingle or a CharPartRange.</param>
        /// <returns>
        /// A new list of ranges sorted ascending by From. Overlapping or adjacent
        /// ranges are not merged.
        /// </returns>
        /// <exception cref="ArgumentException">A part is of any other type.</exception>
        private List<CharPartRange> canonicalized(List<CharClassPart> e)
        {
            List<CharPartRange> ranges = new List<CharPartRange>();

            foreach (CharClassPart part in e)
            {
                if (part is CharPartSingle)
                {
                    // A single character is the degenerate range [Ch, Ch].
                    CharPartSingle single = (CharPartSingle)part;
                    ranges.Add(new CharPartRange(single.Ch, single.Ch));
                    continue;
                }

                if (!(part is CharPartRange))
                {
                    throw new ArgumentException(part.ToString());
                }

                // Ranges written backwards are flipped so that From <= To;
                // ranges that are already ordered are reused as-is.
                CharPartRange range = (CharPartRange)part;
                ranges.Add(range.From > range.To
                    ? new CharPartRange(range.To, range.From)
                    : range);
            }

            // Order by lower bound only; the upper bound plays no part in the ordering.
            ranges.Sort(delegate(CharPartRange x, CharPartRange y)
            {
                return x.From.CompareTo(y.From);
            });

            return ranges;
        }
Beispiel #2
0
        /// <summary>
        /// Builds one CharRange transition for each part of a character class.
        /// </summary>
        /// <param name="cc">
        /// A character class that has already been canonicalized, i.e. every part
        /// is a CharPartRange.
        /// </param>
        /// <returns>One CharRange(From, To) per part, in the order of cc.Parts.</returns>
        /// <exception cref="System.ArgumentException">A part is not a CharPartRange.</exception>
        private List<Trans> charRange(CharClass cc)
        {
            List<Trans> ret = new List<Trans>();

            foreach (CharClassPart part in cc.Parts)
            {
                CharPartRange cr = part as CharPartRange;
                if (cr == null)
                {
                    // Report a violated precondition explicitly instead of letting the
                    // dereference below fail with an uninformative NullReferenceException.
                    throw new System.ArgumentException(
                        "charRange expects a canonicalized CharClass, but found a part that is not a CharPartRange: " + part);
                }

                ret.Add(new CharRange(cr.From, cr.To));
            }

            return ret;
        }
Beispiel #3
0
        /// <summary>
        /// Converts a negated character class such as [^a-zA-Z] into a CharClass made
        /// of the ranges that lie between the excluded ranges.
        /// </summary>
        /// <param name="e">The negated character class whose parts are to be excluded.</param>
        /// <returns>A CharClass holding CharPartRange parts, in ascending order.</returns>
        private CharClass canonicalized(NotCharClass e)
        {
            // We start with the 'any char' range, 0 to 0xffff, and cut out each
            // excluded range in turn.
            //
            // todo:
            // Treating 'Any' as 0..0xffff (the maximum value of a 16-bit char) ignores
            // everything beyond 16 bits. If a more advanced library is used later,
            // 'Any' and 'Not' need a more general, encoding-independent treatment.

            // The loop below relies on 'excluded' being sorted by From, which the
            // list overload of canonicalized(..) provides. That overload does not
            // merge overlapping ranges, so overlap has to be tolerated here.
            List<CharPartRange> excluded = canonicalized(e.Parts);

            List<CharClassPart> result = new List<CharClassPart>();
            int start = 0, end = 0xffff;

            foreach (CharPartRange r in excluded)
            {
                // The bounds are kept as signed ints so that From - 1 can go negative
                // and To + 1 does not wrap around.
                // NOTE(review): valid(..) is defined elsewhere; this presumably relies
                // on it rejecting empty or out-of-range spans such as b == -1 - confirm.
                int a = start, b = r.From - 1;
                if (valid(a, b))
                {
                    result.Add(new CharPartRange((char)a, (char)b));
                }

                // Only ever advance 'start'. A range nested inside or overlapping an
                // earlier one (e.g. [^a-zb-c]) ends before the current 'start'; moving
                // 'start' back would re-include characters that were already excluded.
                int next = r.To + 1;
                if (next > start)
                {
                    start = next;
                }
            }

            // Whatever remains after the last excluded range, up to 0xffff.
            int aa = start, bb = end;

            if (valid(aa, bb))
            {
                result.Add(new CharPartRange((char)aa, (char)bb));
            }

            return new CharClass(result);
        }