Esempio n. 1
0
        /// <summary>
        /// Builds a NFA that matches a single unicode code point
        /// </summary>
        /// <param name="node">An AST node whose value is a code point literal: a two-character prefix (presumably "U+") followed by hexadecimal digits</param>
        /// <returns>The equivalent NFA, or the result of BuildEpsilonNFA() when the code point is not supported (an error is reported in that case)</returns>
        private NFA BuildNFAFromCodepoint(ASTNode node)
        {
            // drop the two-character prefix to keep only the hexadecimal digits
            string value = node.Value;
            value = value.Substring(2, value.Length - 2);

            // TryParse (rather than Convert.ToInt32) so that malformed or over-long digit
            // sequences are reported through OnError instead of escaping as an exception
            int cpValue;
            if (!int.TryParse(value, System.Globalization.NumberStyles.AllowHexSpecifier, System.Globalization.CultureInfo.InvariantCulture, out cpValue))
            {
                OnError(node.Position, "The value U+{0} is not a supported unicode code point", value);
                return BuildEpsilonNFA();
            }

            // reject negative values (8-digit literals with the top bit set), the surrogate block and anything beyond U+10FFFF
            if (cpValue < 0 || (cpValue >= 0xD800 && cpValue <= 0xDFFF) || cpValue >= 0x110000)
            {
                OnError(node.Position, "The value U+{0} is not a supported unicode code point", cpValue.ToString("X"));
                return BuildEpsilonNFA();
            }
            UnicodeCodePoint cp = new UnicodeCodePoint(cpValue);

            // build the NFA
            NFA automata = NFA.NewMinimal();
            char[] data = cp.GetUTF16();
            if (data.Length == 1)
            {
                // a single UTF-16 code unit: one transition from entry to exit
                automata.StateEntry.AddTransition(new CharSpan(data[0], data[0]), automata.StateExit);
            }
            else
            {
                // two UTF-16 code units: chain them through an intermediate state
                NFAState intermediate = automata.AddNewState();
                automata.StateEntry.AddTransition(new CharSpan(data[0], data[0]), intermediate);
                intermediate.AddTransition(new CharSpan(data[1], data[1]), automata.StateExit);
            }
            return automata;
        }
Esempio n. 2
0
        /// <summary>
        /// Builds a NFA that accepts any one character: either a single UTF-16 code unit
        /// outside of the surrogate block, or a high surrogate followed by a low surrogate
        /// </summary>
        /// <returns>The equivalent NFA</returns>
        private static NFA BuildNFAFromAny()
        {
            NFA result = NFA.NewMinimal();

            // single code units located below and above the surrogate block
            CharSpan belowSurrogates = new CharSpan('\u0000', '\uD7FF');
            CharSpan aboveSurrogates = new CharSpan('\uE000', '\uFFFF');
            result.StateEntry.AddTransition(belowSurrogates, result.StateExit);
            result.StateEntry.AddTransition(aboveSurrogates, result.StateExit);

            // surrogate pairs: a high surrogate leads to an extra state, a low surrogate leaves it
            NFAState afterHighSurrogate = result.AddNewState();
            CharSpan highSurrogates = new CharSpan('\uD800', '\uDBFF');
            CharSpan lowSurrogates = new CharSpan('\uDC00', '\uDFFF');
            result.StateEntry.AddTransition(highSurrogates, afterHighSurrogate);
            afterHighSurrogate.AddTransition(lowSurrogates, result.StateExit);

            return result;
        }
Esempio n. 3
0
        /// <summary>
        /// Builds a NFA from a piece of text: a chain of states with one transition per character
        /// </summary>
        /// <param name="node">An AST node whose value is the delimited text, optionally prefixed with '~' to force case-insensitive matching</param>
        /// <returns>The equivalent NFA</returns>
        private NFA BuildNFAFromText(ASTNode node)
        {
            NFA automata = NFA.NewMinimal();

            // start with an empty chain; the exit state is moved forward for each character
            automata.StateExit = automata.StateEntry;

            // build the raw piece of text
            string value = node.Value;
            bool insensitive = caseInsensitive;

            if (value.Length > 0 && value[0] == '~')
            {
                // drop the '~' marker and the delimiter on each side
                insensitive = true;
                value = value.Substring(2, value.Length - 3);
            }
            else
            {
                // drop the delimiter on each side
                value = value.Substring(1, value.Length - 2);
            }
            value = ReplaceEscapees(value);

            // build the result
            foreach (char c in value)
            {
                NFAState next = automata.AddNewState();
                automata.StateExit.AddTransition(new CharSpan(c, c), next);
                if (insensitive && char.IsLetter(c))
                {
                    // invariant casing keeps the produced automaton independent from the
                    // culture of the machine running the compiler (e.g. Turkish 'i')
                    char other = char.IsLower(c) ? char.ToUpperInvariant(c) : char.ToLowerInvariant(c);
                    // letters without a distinct other case would only duplicate the transition above
                    if (other != c)
                    {
                        automata.StateExit.AddTransition(new CharSpan(other, other), next);
                    }
                }
                automata.StateExit = next;
            }
            return automata;
        }
Esempio n. 4
0
        public void AddNewStateTest()
        {
            // four known states, stored at indices 0 to 3
            List<HashSet<int>> knownStates = new List<HashSet<int>>
            {
                new HashSet<int> { 1, 2, 3 },
                new HashSet<int> { 1, 4 },
                new HashSet<int> { 2, 3 },
                new HashSet<int> { 3, 4 }
            };

            // a set with the same content as the one at index 1 is expected to yield index 1
            HashSet<int> candidate = new HashSet<int> { 1, 4 };
            var index = NFA.AddNewState(ref candidate, ref knownStates);
            Assert.AreEqual(1, index);

            // a set not present yet is expected to yield index 4 and to be part of the list afterwards
            candidate = new HashSet<int> { 1, 3 };
            index = NFA.AddNewState(ref candidate, ref knownStates);
            Assert.AreEqual(4, index);
            Assert.IsTrue(knownStates.Contains(candidate));
        }
Esempio n. 5
0
        /// <summary>
        /// Builds a NFA from a character class
        /// </summary>
        /// <param name="node">An AST node whose value is the class text; its first and last characters are dropped and a leading '^' in the remainder negates the class</param>
        /// <returns>The equivalent NFA, or the result of BuildEpsilonNFA() when the class contains a character encoded with surrogates (an error is reported in that case)</returns>
        private NFA BuildNFAFromClass(ASTNode node)
        {
            // extract the value
            string value = node.Value;

            value = value.Substring(1, value.Length - 2);
            bool positive = true;

            if (value.Length > 0 && value[0] == '^')
            {
                value = value.Substring(1);
                positive = false;
            }
            // build the character spans
            List<CharSpan> spans = new List<CharSpan>();

            for (int i = 0; i != value.Length;)
            {
                // read the first full unicode character
                CharValue b = GetCharValue(value, i);
                i += b.length;
                if (b.chars[0] >= 0xD800 && b.chars[0] <= 0xDFFF)
                {
                    OnError(node.Position, "Unsupported non-plane 0 Unicode character ({0}) in character class", new String(b.chars));
                    return BuildEpsilonNFA();
                }
                if ((i <= value.Length - 2) && (value[i] == '-'))
                {
                    // this is a range, match the '-'
                    i++;
                    CharValue e = GetCharValue(value, i);
                    i += e.length;
                    if (e.chars[0] >= 0xD800 && e.chars[0] <= 0xDFFF)
                    {
                        OnError(node.Position, "Unsupported non-plane 0 Unicode character ({0}) in character class", new String(e.chars));
                        return BuildEpsilonNFA();
                    }
                    char begin = b.chars.Length == 1 ? b.chars[0] : b.chars[1];
                    char end = e.chars.Length == 1 ? e.chars[0] : e.chars[1];
                    if (begin < 0xD800 && end > 0xDFFF)
                    {
                        // the range straddles the surrogate block: keep only the parts on each side of it
                        spans.Add(new CharSpan(begin, (char)0xD7FF));
                        spans.Add(new CharSpan((char)0xE000, end));
                    }
                    else
                    {
                        spans.Add(new CharSpan(begin, end));
                    }
                }
                else
                {
                    // this is a normal character
                    char begin = b.chars.Length == 1 ? b.chars[0] : b.chars[1];
                    spans.Add(new CharSpan(begin, begin));
                }
            }

            // build the result
            NFA automata = NFA.NewMinimal();

            if (positive)
            {
                foreach (CharSpan span in spans)
                {
                    automata.StateEntry.AddTransition(span, automata.StateExit);
                }
                return automata;
            }

            // negated class: sweep the sorted spans and accept the gaps between them
            // (relies on CharSpan.Compare ordering the spans by their first character)
            spans.Sort(new System.Comparison<CharSpan>(CharSpan.Compare));
            // first code unit not yet covered by an excluded span;
            // an int so that a span ending on 0xFFFF moves it to 0x10000 instead of wrapping to 0
            int next = 0;
            foreach (CharSpan span in spans)
            {
                if (span.Begin > next)
                {
                    AddComplementSpanToNFA(automata, next, span.Begin - 1);
                }
                // only ever move forward so that overlapping or nested spans cannot re-open an excluded range
                if (span.End + 1 > next)
                {
                    next = span.End + 1;
                }
            }
            if (next <= 0xFFFF)
            {
                // everything after the last excluded span, including 0xFFFF itself
                AddComplementSpanToNFA(automata, next, 0xFFFF);
            }
            // surrogate pairs are always accepted by a negated class
            NFAState intermediate = automata.AddNewState();
            automata.StateEntry.AddTransition(new CharSpan((char)0xD800, (char)0xDBFF), intermediate);
            intermediate.AddTransition(new CharSpan((char)0xDC00, (char)0xDFFF), automata.StateExit);
            return automata;
        }

        /// <summary>
        /// Adds entry-to-exit transitions for the code units in [first, last], leaving out the surrogate block 0xD800-0xDFFF
        /// </summary>
        /// <param name="automata">The target NFA</param>
        /// <param name="first">The first code unit of the range (0 to 0xFFFF, not greater than last)</param>
        /// <param name="last">The last code unit of the range (0 to 0xFFFF)</param>
        private static void AddComplementSpanToNFA(NFA automata, int first, int last)
        {
            if (first < 0xD800)
            {
                // part located below the surrogate block
                int upper = last < 0xD800 ? last : 0xD7FF;
                automata.StateEntry.AddTransition(new CharSpan((char)first, (char)upper), automata.StateExit);
            }
            if (last > 0xDFFF)
            {
                // part located above the surrogate block
                int lower = first > 0xDFFF ? first : 0xE000;
                automata.StateEntry.AddTransition(new CharSpan((char)lower, (char)last), automata.StateExit);
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Adds a unicode character span to an existing NFA automaton
        /// </summary>
        /// <param name="automata">The target NFA</param>
        /// <param name="span">The unicode span to add</param>
        private static void AddUnicodeSpanToNFA(NFA automata, UnicodeSpan span)
        {
            char[] b = span.Begin.GetUTF16();
            char[] e = span.End.GetUTF16();

            if (span.IsPlane0)
            {
                // this span is entirely in plane 0
                automata.StateEntry.AddTransition(new CharSpan(b[0], e[0]), automata.StateExit);
            }
            else if (span.Begin.IsPlane0)
            {
                // this span has only a part in plane 0: from its beginning up to 0xFFFF
                if (b[0] < 0xD800)
                {
                    automata.StateEntry.AddTransition(new CharSpan(b[0], (char)0xD7FF), automata.StateExit);
                    automata.StateEntry.AddTransition(new CharSpan((char)0xE000, (char)0xFFFF), automata.StateExit);
                }
                else
                {
                    automata.StateEntry.AddTransition(new CharSpan(b[0], (char)0xFFFF), automata.StateExit);
                }
                // the remaining part runs from the very first surrogate pair (0xD800 0xDC00) up to the end of the span;
                // only the last high surrogate restricts its low surrogate, the ones before it take the full low range
                AddSurrogatePairsToNFA(automata, (char)0xD800, (char)0xDC00, e[0], e[1]);
            }
            else
            {
                // this span has no part in plane 0
                AddSurrogatePairsToNFA(automata, b[0], b[1], e[0], e[1]);
            }
        }

        /// <summary>
        /// Adds to a NFA the paths matching every surrogate pair between (highBegin, lowBegin) and (highEnd, lowEnd), both included
        /// </summary>
        /// <param name="automata">The target NFA</param>
        /// <param name="highBegin">The high surrogate of the first pair</param>
        /// <param name="lowBegin">The low surrogate of the first pair</param>
        /// <param name="highEnd">The high surrogate of the last pair</param>
        /// <param name="lowEnd">The low surrogate of the last pair</param>
        private static void AddSurrogatePairsToNFA(NFA automata, char highBegin, char lowBegin, char highEnd, char lowEnd)
        {
            if (highBegin == highEnd)
            {
                // same first surrogate: a single path with a restricted low surrogate range
                NFAState intermediate = automata.AddNewState();
                automata.StateEntry.AddTransition(new CharSpan(highBegin, highBegin), intermediate);
                intermediate.AddTransition(new CharSpan(lowBegin, lowEnd), automata.StateExit);
                return;
            }

            // lower part: the first high surrogate, with the low surrogates from lowBegin upward
            NFAState lower = automata.AddNewState();
            automata.StateEntry.AddTransition(new CharSpan(highBegin, highBegin), lower);
            lower.AddTransition(new CharSpan(lowBegin, (char)0xDFFF), automata.StateExit);

            if (highEnd != highBegin + 1)
            {
                // intermediate part: the high surrogates strictly in between accept every low surrogate
                NFAState middle = automata.AddNewState();
                automata.StateEntry.AddTransition(new CharSpan((char)(highBegin + 1), (char)(highEnd - 1)), middle);
                middle.AddTransition(new CharSpan((char)0xDC00, (char)0xDFFF), automata.StateExit);
            }

            // upper part: the last high surrogate, with the low surrogates up to lowEnd
            NFAState upper = automata.AddNewState();
            automata.StateEntry.AddTransition(new CharSpan(highEnd, highEnd), upper);
            upper.AddTransition(new CharSpan((char)0xDC00, lowEnd), automata.StateExit);
        }