/// <summary>
/// Creates the NFA that recognizes a single unicode code point
/// </summary>
/// <param name="node">The AST node whose value holds the code point literal</param>
/// <returns>
/// The NFA matching the UTF-16 encoding of the code point,
/// or the result of BuildEpsilonNFA() when the value is rejected
/// </returns>
private NFA BuildNFAFromCodepoint(ASTNode node)
{
    // skip the two leading characters of the literal; the remainder is read as hexadecimal
    string hexDigits = node.Value.Substring(2);
    int codePointValue = Convert.ToInt32(hexDigits, 16);

    // reject negative values, the surrogate range and anything past U+10FFFF
    bool isSurrogate = codePointValue >= 0xD800 && codePointValue <= 0xDFFF;
    bool isInRange = codePointValue >= 0 && codePointValue < 0x110000;
    if (isSurrogate || !isInRange)
    {
        OnError(node.Position, "The value U+{0} is not a supported unicode code point", codePointValue.ToString("X"));
        return BuildEpsilonNFA();
    }

    UnicodeCodePoint codePoint = new UnicodeCodePoint(codePointValue);
    NFA result = NFA.NewMinimal();
    char[] units = codePoint.GetUTF16();
    if (units.Length != 1)
    {
        // two code units: chain them through an intermediate state
        NFAState afterFirstUnit = result.AddNewState();
        result.StateEntry.AddTransition(new CharSpan(units[0], units[0]), afterFirstUnit);
        afterFirstUnit.AddTransition(new CharSpan(units[1], units[1]), result.StateExit);
    }
    else
    {
        // a single code unit goes straight from entry to exit
        result.StateEntry.AddTransition(new CharSpan(units[0], units[0]), result.StateExit);
    }
    return result;
}
/// <summary>
/// Creates the NFA that accepts any single character, encoded either as one
/// non-surrogate UTF-16 code unit or as a surrogate pair
/// </summary>
/// <returns>The equivalent NFA</returns>
private static NFA BuildNFAFromAny()
{
    NFA result = NFA.NewMinimal();

    // one code unit on either side of the surrogate range
    result.StateEntry.AddTransition(new CharSpan('\u0000', '\uD7FF'), result.StateExit);
    result.StateEntry.AddTransition(new CharSpan('\uE000', '\uFFFF'), result.StateExit);

    // a high surrogate followed by a low surrogate
    NFAState awaitingLowSurrogate = result.AddNewState();
    result.StateEntry.AddTransition(new CharSpan('\uD800', '\uDBFF'), awaitingLowSurrogate);
    awaitingLowSurrogate.AddTransition(new CharSpan('\uDC00', '\uDFFF'), result.StateExit);

    return result;
}
/// <summary>
/// Builds a NFA from a piece of text
/// </summary>
/// <remarks>
/// The text is matched one UTF-16 code unit at a time through a chain of states.
/// When matching is case-insensitive (globally, or because the literal starts with '~'),
/// each letter gets a second transition on its opposite-case counterpart.
/// Case mapping uses the invariant culture so that the produced automaton
/// does not depend on the culture of the machine running the generator.
/// </remarks>
/// <param name="node">An AST node representing a NFA</param>
/// <returns>The equivalent NFA</returns>
private NFA BuildNFAFromText(ASTNode node)
{
    // start with an automaton whose exit is its entry; the exit is pushed forward for each character
    NFA automata = NFA.NewMinimal();
    automata.StateExit = automata.StateEntry;

    // build the raw piece of text
    string value = node.Value;
    bool insensitive = caseInsensitive;
    // ordinal check of the first character (no culture-sensitive string comparison)
    if (value.Length > 0 && value[0] == '~')
    {
        // strip the '~' marker along with the enclosing delimiters
        insensitive = true;
        value = value.Substring(2, value.Length - 3);
    }
    else
    {
        // strip the enclosing delimiters
        value = value.Substring(1, value.Length - 2);
    }
    value = ReplaceEscapees(value);

    // build the result
    foreach (char c in value)
    {
        NFAState temp = automata.AddNewState();
        if (insensitive && char.IsLetter(c))
        {
            // invariant mapping: the current culture must not change the generated lexer
            char c2 = char.IsLower(c) ? char.ToUpperInvariant(c) : char.ToLowerInvariant(c);
            automata.StateExit.AddTransition(new CharSpan(c, c), temp);
            automata.StateExit.AddTransition(new CharSpan(c2, c2), temp);
        }
        else
        {
            automata.StateExit.AddTransition(new CharSpan(c, c), temp);
        }
        automata.StateExit = temp;
    }
    return automata;
}
/// <summary>
/// Checks the index returned by NFA.AddNewState for a set equal to one already
/// in the list and for a set that is not in the list yet
/// </summary>
public void AddNewStateTest()
{
    var knownStates = new List<HashSet<int>>
    {
        new HashSet<int> { 1, 2, 3 },
        new HashSet<int> { 1, 4 },
        new HashSet<int> { 2, 3 },
        new HashSet<int> { 3, 4 }
    };

    // same content as the entry at index 1
    var candidate = new HashSet<int> { 1, 4 };
    int indexOfExisting = NFA.AddNewState(ref candidate, ref knownStates);
    Assert.AreEqual(1, indexOfExisting);

    // content that matches none of the four initial entries
    candidate = new HashSet<int> { 1, 3 };
    int indexOfAdded = NFA.AddNewState(ref candidate, ref knownStates);
    Assert.AreEqual(4, indexOfAdded);
    Assert.IsTrue(knownStates.Contains(candidate));
}
/// <summary>
/// Builds a NFA from a character class
/// </summary>
/// <remarks>
/// Only plane 0 characters are accepted inside the class; a surrogate code unit
/// is reported through OnError and the result of BuildEpsilonNFA() is returned.
/// For a negated class (leading '^') the automaton matches every plane 0 code unit
/// outside the listed spans, plus every surrogate pair.
/// </remarks>
/// <param name="node">An AST node representing a NFA</param>
/// <returns>The equivalent NFA</returns>
private NFA BuildNFAFromClass(ASTNode node)
{
    // extract the value, stripping the enclosing delimiters
    string value = node.Value;
    value = value.Substring(1, value.Length - 2);
    bool positive = true;
    if (value.Length > 0 && value[0] == '^')
    {
        value = value.Substring(1);
        positive = false;
    }

    // build the character spans
    List<CharSpan> spans = new List<CharSpan>();
    for (int i = 0; i != value.Length;)
    {
        // read the first full unicode character
        CharValue b = GetCharValue(value, i);
        i += b.length;
        if (b.chars[0] >= 0xD800 && b.chars[0] <= 0xDFFF)
        {
            OnError(node.Position, "Unsupported non-plane 0 Unicode character ({0}) in character class", new String(b.chars));
            return BuildEpsilonNFA();
        }
        if ((i <= value.Length - 2) && (value[i] == '-'))
        {
            // this is a range, match the '-' (a trailing '-' is not a range: something must follow it)
            i++;
            CharValue e = GetCharValue(value, i);
            i += e.length;
            if (e.chars[0] >= 0xD800 && e.chars[0] <= 0xDFFF)
            {
                OnError(node.Position, "Unsupported non-plane 0 Unicode character ({0}) in character class", new String(e.chars));
                return BuildEpsilonNFA();
            }
            char begin = b.chars.Length == 1 ? b.chars[0] : b.chars[1];
            char end = e.chars.Length == 1 ? e.chars[0] : e.chars[1];
            if (begin < 0xD800 && end > 0xDFFF)
            {
                // the range straddles the surrogate code units: keep only the parts on each side
                spans.Add(new CharSpan(begin, (char)0xD7FF));
                spans.Add(new CharSpan((char)0xE000, end));
            }
            else
            {
                spans.Add(new CharSpan(begin, end));
            }
        }
        else
        {
            // this is a normal character
            char begin = b.chars.Length == 1 ? b.chars[0] : b.chars[1];
            spans.Add(new CharSpan(begin, begin));
        }
    }

    // build the result
    NFA automata = NFA.NewMinimal();
    if (positive)
    {
        foreach (CharSpan span in spans)
        {
            automata.StateEntry.AddTransition(span, automata.StateExit);
        }
    }
    else
    {
        // NOTE(review): the sweep below assumes CharSpan.Compare orders spans by their Begin value - confirm
        spans.Sort(new System.Comparison<CharSpan>(CharSpan.Compare));
        // first code unit not yet known to be excluded;
        // an int so that a span ending on 0xFFFF yields 0x10000 instead of wrapping to 0
        int next = 0;
        for (int i = 0; i != spans.Count; i++)
        {
            int spanBegin = spans[i].Begin;
            int spanEnd = spans[i].End;
            if (spanBegin > next)
            {
                // everything between the previous excluded span and this one is accepted
                AddPlane0GapToNFA(automata, next, spanBegin - 1);
            }
            // never move backwards: overlapping or nested spans must not re-open excluded characters
            if (spanEnd + 1 > next)
            {
                next = spanEnd + 1;
            }
        }
        if (next <= 0xFFFF)
        {
            // remaining tail of plane 0, 0xFFFF included
            AddPlane0GapToNFA(automata, next, 0xFFFF);
        }
        // surrogate pairs
        NFAState intermediate = automata.AddNewState();
        automata.StateEntry.AddTransition(new CharSpan((char)0xD800, (char)0xDBFF), intermediate);
        intermediate.AddTransition(new CharSpan((char)0xDC00, (char)0xDFFF), automata.StateExit);
    }
    return automata;
}

/// <summary>
/// Adds entry-to-exit transitions for the code units in [first, last],
/// leaving out the surrogate code units 0xD800 to 0xDFFF
/// </summary>
/// <param name="automata">The target NFA</param>
/// <param name="first">The first code unit of the range (inclusive)</param>
/// <param name="last">The last code unit of the range (inclusive), at most 0xFFFF</param>
private static void AddPlane0GapToNFA(NFA automata, int first, int last)
{
    // part of the range below the surrogates
    int lowerLast = last < 0xD7FF ? last : 0xD7FF;
    if (first <= lowerLast)
    {
        automata.StateEntry.AddTransition(new CharSpan((char)first, (char)lowerLast), automata.StateExit);
    }
    // part of the range above the surrogates
    int upperFirst = first > 0xE000 ? first : 0xE000;
    if (upperFirst <= last)
    {
        automata.StateEntry.AddTransition(new CharSpan((char)upperFirst, (char)last), automata.StateExit);
    }
}
/// <summary>
/// Adds a unicode character span to an existing NFA automaton
/// </summary>
/// <remarks>
/// Plane 0 parts of the span become direct entry-to-exit transitions on single UTF-16 code units.
/// Parts outside plane 0 become two-step paths: a high surrogate to an intermediate state,
/// then a low surrogate to the exit state.
/// </remarks>
/// <param name="automata">The target NFA</param>
/// <param name="span">The unicode span to add</param>
private static void AddUnicodeSpanToNFA(NFA automata, UnicodeSpan span)
{
    char[] b = span.Begin.GetUTF16();
    char[] e = span.End.GetUTF16();

    if (span.IsPlane0)
    {
        // this span is entirely in plane 0
        automata.StateEntry.AddTransition(new CharSpan(b[0], e[0]), automata.StateExit);
    }
    else if (span.Begin.IsPlane0)
    {
        // this span has only a part in plane 0
        if (b[0] < 0xD800)
        {
            automata.StateEntry.AddTransition(new CharSpan(b[0], (char)0xD7FF), automata.StateExit);
            automata.StateEntry.AddTransition(new CharSpan((char)0xE000, (char)0xFFFF), automata.StateExit);
        }
        else
        {
            automata.StateEntry.AddTransition(new CharSpan(b[0], (char)0xFFFF), automata.StateExit);
        }
        // the rest runs from the first surrogate pair (0xD800 0xDC00) up to the end of the span;
        // only the last high surrogate is limited to e[1], the earlier ones accept every low surrogate
        AddSurrogatePairRangeToNFA(automata, (char)0xD800, (char)0xDC00, e[0], e[1]);
    }
    else
    {
        // this span has no part in plane 0
        AddSurrogatePairRangeToNFA(automata, b[0], b[1], e[0], e[1]);
    }
}

/// <summary>
/// Adds to an NFA the two-step paths matching every surrogate pair between
/// (highBegin, lowBegin) and (highEnd, lowEnd), both included
/// </summary>
/// <param name="automata">The target NFA</param>
/// <param name="highBegin">The high surrogate of the first pair</param>
/// <param name="lowBegin">The low surrogate of the first pair</param>
/// <param name="highEnd">The high surrogate of the last pair</param>
/// <param name="lowEnd">The low surrogate of the last pair</param>
private static void AddSurrogatePairRangeToNFA(NFA automata, char highBegin, char lowBegin, char highEnd, char lowEnd)
{
    if (highBegin == highEnd)
    {
        // same first surrogate: a single path with the low surrogate bounded on both sides
        NFAState intermediate = automata.AddNewState();
        automata.StateEntry.AddTransition(new CharSpan(highBegin, highBegin), intermediate);
        intermediate.AddTransition(new CharSpan(lowBegin, lowEnd), automata.StateExit);
        return;
    }

    // lower part: the first high surrogate, from lowBegin up to the last low surrogate
    NFAState lower = automata.AddNewState();
    automata.StateEntry.AddTransition(new CharSpan(highBegin, highBegin), lower);
    lower.AddTransition(new CharSpan(lowBegin, (char)0xDFFF), automata.StateExit);

    if (highEnd != highBegin + 1)
    {
        // intermediate part: high surrogates strictly between the two ends accept every low surrogate
        NFAState middle = automata.AddNewState();
        automata.StateEntry.AddTransition(new CharSpan((char)(highBegin + 1), (char)(highEnd - 1)), middle);
        middle.AddTransition(new CharSpan((char)0xDC00, (char)0xDFFF), automata.StateExit);
    }

    // upper part: the last high surrogate, from the first low surrogate up to lowEnd
    NFAState upper = automata.AddNewState();
    automata.StateEntry.AddTransition(new CharSpan(highEnd, highEnd), upper);
    upper.AddTransition(new CharSpan((char)0xDC00, lowEnd), automata.StateExit);
}