Example #1
0
        public override Core.Tokenization.Token Recognize(string s, int from, bool allowTokenBundles, ref int consumedLength)
        {
            // Tries every pattern at position 'from' and returns the best claim, or null.
            // Selection rule: the longest match wins; among equally long matches the highest
            // priority wins (first one seen on a priority tie). With allowTokenBundles set,
            // equally long matches are collected into a TokenBundle instead of competing.
            //
            // TODO this could test whether the match is followed by whitespace or non-word characters (punctuation etc) -
            //  that would be simpler than inlcuding the respective constraints in the RX

            Token bestToken    = null;
            int   bestLength   = 0;
            int   bestPriority = 0;

            for (int i = 0; i < _Patterns.Count; ++i)
            {
                CharacterSet first = _Patterns[i].First;

                // Cheap pre-filter: reject via the pattern's FIRST set before running the regex.
                // NOTE if the requirement that m.Index == from is dropped, we cannot use FIRST any more
                if (first != null && from < s.Length && !first.Contains(s[from]))
                {
                    continue;
                }

                System.Text.RegularExpressions.Match m = _Patterns[i].Regex.Match(s, from);
                if (m == null || !m.Success || m.Index != from)
                {
                    continue;
                }

                if (!VerifyContextConstraints(s, m.Index + m.Value.Length))
                {
                    continue;
                }

                Token candidate = CreateToken(m.Value, m.Groups);
                // TODO set other token values?
                if (candidate == null || m.Length == 0)
                {
                    continue;
                }

                // longest wins, if two matches are found with equal length, the one with
                //  higher prio wins, or if both have same prio, first match wins
                bool takesOver = (m.Length > bestLength) ||
                                 bestToken == null ||
                                 (m.Length == bestLength && _Patterns[i].Priority > bestPriority && !allowTokenBundles);

                if (takesOver)
                {
                    bestLength   = m.Length;
                    bestToken    = candidate;
                    bestPriority = _Patterns[i].Priority;
                }
                else if (allowTokenBundles && m.Length == bestLength)
                {
                    // Equally long claim while bundling: fold it into a TokenBundle.
                    TokenBundle bundle = bestToken as TokenBundle;
                    if (bundle == null)
                    {
                        bundle    = new TokenBundle(bestToken, bestPriority);
                        bestToken = bundle;
                    }

                    bundle.Add(candidate, _Patterns[i].Priority);
                    bestPriority = Math.Max(bestPriority, _Patterns[i].Priority);
                }
            }

            if (bestToken == null)
            {
                // No pattern claimed anything; consumedLength is deliberately left untouched.
                return(null);
            }

            consumedLength = bestLength;
            return(bestToken);
        }
Example #2
0
        private List <Core.Tokenization.Token> TokenizeInternal(string s,
                                                                int currentRun,
                                                                bool createWhitespaceTokens,
                                                                bool allowTokenBundles)
        {
            // Tokenizes s by repeatedly (1) consuming a run of whitespace (optionally emitted
            // as a Whitespace token) and (2) letting every registered recognizer claim a prefix
            // at the current position. The longest claim wins; among equally long claims the
            // highest-priority recognizer wins. If no recognizer claims anything, a run of
            // characters sharing one Unicode category is emitted as a fallback Word token.
            //
            // s:                      input segment to tokenize
            // currentRun:             run index recorded in each token's Span
            // createWhitespaceTokens: when true, whitespace runs become tokens; otherwise skipped
            // allowTokenBundles:      passed through to each recognizer; also gates the
            //                         (currently disabled, see allowBundlesOfDifferentType)
            //                         cross-recognizer bundling below
            List <Token> result = new List <Token>();

            int p    = 0;
            int sLen = s.Length;

            while (p < sLen)
            {
                int start = p;

                // consume a run of whitespace
                while (p < sLen && System.Char.IsWhiteSpace(s, p))
                {
                    ++p;
                }

                if (p > start)
                {
                    if (createWhitespaceTokens)
                    {
                        Token t = new SimpleToken(s.Substring(start, p - start), TokenType.Whitespace);
                        t.Span = new SegmentRange(currentRun, start, p - 1);
                        result.Add(t);
                    }
                    start = p;
                }
                if (p >= sLen)
                {
                    break;
                }

                // test which recognizer claims the longest prefix

                Recognizer winningRecognizer = null;
                int        winningLength     = 0;
                Token      winningToken      = null;

                const bool allowBundlesOfDifferentType = false;

                for (int r = 0; r < _Parameters.Count; ++r)
                {
                    Recognizer rec            = _Parameters[r];
                    int        consumedLength = 0;
                    Token      t = rec.Recognize(s, start, allowTokenBundles, ref consumedLength);

                    if (t != null)
                    {
                        if (winningRecognizer == null ||
                            (winningLength < consumedLength && !(winningRecognizer.OverrideFallbackRecognizer && rec.IsFallbackRecognizer)))
                        {
                            winningToken      = t;
                            winningRecognizer = rec;
                            winningLength     = consumedLength;
                            p = start + consumedLength;
                        }
                        else if (allowTokenBundles && allowBundlesOfDifferentType && winningLength == consumedLength)
                        {
                            // bundle equally long claims coming from different recognizers
                            Core.Tokenization.TokenBundle winningBundle
                                = winningToken as Core.Tokenization.TokenBundle;

                            if (winningBundle == null)
                            {
                                winningBundle = new TokenBundle(winningToken, winningRecognizer.Priority);
                                winningToken  = winningBundle;
                            }
                            // BUGFIX: previously the new token t was dropped entirely when the
                            // bundle was freshly created, and otherwise it was added with the old
                            // winner's priority instead of its own recognizer's priority
                            // (compare the bundling code in Recognize, which uses the new
                            // pattern's priority).
                            winningBundle.Add(t, rec.Priority);

                            System.Diagnostics.Debug.Assert(p == start + consumedLength);
                        }
                        else if (winningLength == consumedLength && winningRecognizer.Priority < rec.Priority)
                        {
                            // same length, but lower priority - highest prio wins
                            // BUGFIX: the equal-length guard was missing here, so a SHORTER match
                            // from a higher-priority recognizer could override a longer match,
                            // violating the longest-prefix rule established above.
                            winningToken      = t;
                            winningRecognizer = rec;
                            winningLength     = consumedLength;
                            p = start + consumedLength;
                        }
                    }
                }

                if (winningToken == null)
                {
                    // none of the recognizers claimed any input, or there were no recognizers set up.
                    // ultimate fallback required: group by same Unicode category
                    // TODO scanning on just the category is too fine - we may want to group coarser categories together
                    System.Globalization.UnicodeCategory cat = System.Char.GetUnicodeCategory(s, start);
                    while (p < sLen && System.Char.GetUnicodeCategory(s, p) == cat)
                    {
                        ++p;
                    }
                    winningLength = p - start;
                    // TODO distinguish result token type depending on the category
                    winningToken      = new SimpleToken(s.Substring(start, p - start), TokenType.Word);
                    winningRecognizer = null;
                }
                else if (winningToken is TokenBundle)
                {
                    // convert single-element token bundles to single tokens
                    TokenBundle tb = winningToken as TokenBundle;
                    if (tb.Count == 1)
                    {
                        winningToken = tb[0].Token;
                    }
                }

                System.Diagnostics.Debug.Assert(winningLength > 0);
                System.Diagnostics.Debug.Assert(winningToken != null);

                winningToken.Span = new SegmentRange(currentRun, start, p - 1);

                result.Add(winningToken);
            }

            return(result);
        }