/// <summary>
/// Converts a negated character class such as <c>[^a-zA-Z]</c> into an equivalent
/// positive <see cref="CharClass"/> consisting only of <see cref="CharPartRange"/> parts.
/// </summary>
/// <remarks>
/// The method starts from the "any char" range 0..0xffff and keeps the gaps that lie
/// between the excluded ranges.
///
/// TODO: "any char" is modelled as 0..0xffff (the maximum value of a 16-bit char), so
/// anything outside that range is ignored. If a more capable text library is adopted later,
/// 'Any' and 'Not' need an encoding-independent treatment.
/// </remarks>
/// <param name="e">The negated character class whose parts are to be excluded.</param>
/// <returns>A character class holding the ranges that are not excluded by <paramref name="e"/>.</returns>
private CharClass canonicalized(NotCharClass e)
{
    // The sweep below relies on 'excluded' being sorted; that ordering is expected to be
    // provided by the other canonicalized(..) overload.
    List<CharPartRange> excluded = canonicalized(e.Parts);
    List<CharClassPart> result = new List<CharClassPart>();

    int start = 0, end = 0xffff;
    for (int i = 0; i < excluded.Count; ++i)
    {
        CharPartRange r = excluded[i];

        // a and b must be SIGNED ints so that From - 1 can become negative
        // and To + 1 does not wrap around.
        int a = start, b = r.From - 1;
        if (valid(a, b))
        {
            result.Add(new CharPartRange((char)a, (char)b));
        }

        // Only ever move the cursor forward. If this range is nested in (or overlaps)
        // an earlier one, r.To + 1 may lie before the current cursor; stepping back
        // would re-admit characters that an earlier range already excluded.
        int next = r.To + 1;
        if (next > start)
        {
            start = next;
        }
    }

    // Whatever remains after the last excluded range, up to the end of the char space.
    int aa = start, bb = end;
    if (valid(aa, bb))
    {
        result.Add(new CharPartRange((char)aa, (char)bb));
    }

    return new CharClass(result);
}
/// <summary>
/// Recursively translates a regular-expression tree into an NFA fragment.
/// Sub-fragments are built first and then connected with epsilon transitions.
/// </summary>
/// <param name="_e">The expression node to translate.</param>
/// <returns>
/// An <see cref="FA"/> for <paramref name="_e"/>; every branch produces a fragment
/// with one start state and (presumably, given the use of onlyAcceptingState) a single
/// accepting state.
/// </returns>
/// <exception cref="System.NotImplementedException">Thrown for <c>ByName</c> nodes.</exception>
/// <exception cref="Exception">Thrown when the node type is not one of the handled kinds.</exception>
public FA nfaFromRegEx(RExpr _e)
{
    if (_e is ByName)
    {
        throw new System.NotImplementedException("nfaFromRegEx/ByName not implemented");
    }
    else if (_e is CharClass)
    {
        // start --[ranges]--> end
        CharClass e = _e as CharClass;
        e = canonicalized(e);
        int start = newState();
        int end = newState();
        return fa(start, end).trans(start, end, charRange(e));
    }
    else if (_e is NotCharClass)
    {
        // A negated class is first rewritten as a positive class, then handled the same way.
        NotCharClass negated = _e as NotCharClass;
        CharClass e = canonicalized(negated);
        int start = newState();
        int end = newState();
        return fa(start, end).trans(start, end, charRange(e));
    }
    else if (_e is Oring)
    {
        // Alternation: a fresh start fans out to every option, every option joins a fresh end.
        Oring e = _e as Oring;
        List<FA> opts = Utils.list(e.Exprs.Select(a => { return nfaFromRegEx(a); }));
        List<int> startEpsilons = Utils.list(opts.Select(a => { return a.startState; }));
        List<int> endEpsilons = Utils.list(opts.Select(a => { return onlyAcceptingState(a); }));
        int start = newState();
        int end = newState();
        FA _fa = fa(start, end).merge(opts);
        for (int i = 0; i < opts.Count; ++i)
        {
            _fa.trans(start, startEpsilons[i], epsilon());
            _fa.trans(endEpsilons[i], end, epsilon());
        }
        return _fa;
    }
    else if (_e is Plus)
    {
        // One or more: a back-edge from accept to start allows repetition, no bypass.
        Plus e = _e as Plus;
        FA inner = nfaFromRegEx(e.Expr);
        inner.trans(onlyAcceptingState(inner), inner.startState, epsilon());
        return inner;
    }
    else if (_e is RXSeq)
    {
        RXSeq e = _e as RXSeq;
        if (e.Exprs.Count == 0)
        {
            // An empty sequence (e.g. produced by an empty Str) matches the empty string;
            // without this guard opts[0] below would throw.
            int start = newState();
            int end = newState();
            return fa(start, end).trans(start, end, epsilon());
        }
        if (e.Exprs.Count == 1)
        {
            return nfaFromRegEx(e.Exprs[0]);
        }
        List<FA> opts = Utils.list(e.Exprs.Select(a => { return nfaFromRegEx(a); }));
        FA _fa = fa(opts[0].startState, onlyAcceptingState(Utils.last(opts))).merge(opts);
        // Chain each fragment's accepting state to the next fragment's start state.
        for (int i = 0; i < opts.Count - 1; ++i)
        {
            int prev = onlyAcceptingState(opts[i]);
            int next = opts[i + 1].startState;
            _fa.trans(prev, next, epsilon());
        }
        return _fa;
    }
    else if (_e is Star)
    {
        // Zero or more. The bypass edge must leave from a state that cannot be re-entered
        // part-way through a match and must arrive at a state with no outgoing edges.
        // The inner fragment's own start/accept do not guarantee that (a sequence reuses
        // its first child's start state, which may carry a loop back-edge), so the inner
        // fragment is wrapped in fresh endpoints, the same way the Oring branch does it.
        Star e = _e as Star;
        FA inner = nfaFromRegEx(e.Expr);
        int innerStart = inner.startState;
        int innerEnd = onlyAcceptingState(inner);
        int start = newState();
        int end = newState();
        List<FA> wrapped = new List<FA>();
        wrapped.Add(inner);
        FA _fa = fa(start, end).merge(wrapped);
        _fa.trans(start, innerStart, epsilon());
        _fa.trans(innerEnd, end, epsilon());
        _fa.trans(innerEnd, innerStart, epsilon()); // repeat
        _fa.trans(start, end, epsilon());           // skip entirely
        return _fa;
    }
    else if (_e is Question)
    {
        // Zero or one. Fresh endpoints for the same reason as in the Star branch:
        // the bypass edge must not be reachable after input has already been consumed.
        Question e = _e as Question;
        FA inner = nfaFromRegEx(e.Expr);
        int innerStart = inner.startState;
        int innerEnd = onlyAcceptingState(inner);
        int start = newState();
        int end = newState();
        List<FA> wrapped = new List<FA>();
        wrapped.Add(inner);
        FA _fa = fa(start, end).merge(wrapped);
        _fa.trans(start, innerStart, epsilon());
        _fa.trans(innerEnd, end, epsilon());
        _fa.trans(start, end, epsilon());           // skip entirely
        return _fa;
    }
    else if (_e is Str)
    {
        // A literal string becomes a sequence of single-character classes.
        Str e = _e as Str;
        List<RExpr> seq = new List<RExpr>();
        // TODO: iterating char by char does not handle surrogate pairs etc.
        for (int i = 0; i < e.Value.Length; ++i)
        {
            char c = e.Value[i];
            List<CharClassPart> v = Utils.list1((CharClassPart) new CharPartRange(c, c));
            CharClass cc = new CharClass(v);
            seq.Add(cc);
        }
        return nfaFromRegEx(new RXSeq(seq));
    }
    else
    {
        throw new Exception("OptionNotHandledException(_e)");
    }
}