Example #1
        /// <summary>
        /// Initializes a new TokenResult and validates the supplied values.
        /// </summary>
        /// <param name="definition">The token definition the token was matched against</param>
        /// <param name="token">The matched token text</param>
        /// <param name="originalString">The original input string the token was found in</param>
        /// <param name="index">The start index of the token within the original string</param>
        public TokenResult(ITokenDefinition definition, string token, string originalString, int index)
        {
            Index          = index;
            OriginalString = originalString ?? throw new ArgumentNullException(nameof(originalString));
            Definition     = definition ?? throw new ArgumentNullException(nameof(definition));
            Token          = token ?? throw new ArgumentNullException(nameof(token));

            if (!Definition.IsToken(Token))
            {
                throw new InvalidTokenException(token);
            }

            if (index < 0 || index > originalString.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }
        }
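A rough usage sketch follows. The LiteralTokenDefinition class below is hypothetical, written against only the ITokenDefinition members visible on this page, so treat it as an illustration rather than the library's real API:

        // Hypothetical ITokenDefinition implementation for illustration only; it covers
        // just the members used in these examples. Requires using System and
        // System.Text.RegularExpressions.
        sealed class LiteralTokenDefinition : ITokenDefinition
        {
            private readonly string _literal;

            public LiteralTokenDefinition(int tokenId, string literal)
            {
                _literal = literal;
                TokenID = tokenId;
                MatchingRegularExpression = new Regex(Regex.Escape(literal));
            }

            public int TokenID { get; }
            public Regex MatchingRegularExpression { get; }

            public bool IsToken(string candidate) => candidate == _literal;

            // Not exercised by the TokenResult constructor; stubbed for the sketch.
            public IToken CreateTokenFromDefinition() => throw new NotImplementedException();
        }

        // "+" satisfies IsToken and index 1 lies inside "a+b", so construction succeeds.
        var plusDefinition = new LiteralTokenDefinition(1, "+");
        var result = new TokenResult(plusDefinition, "+", "a+b", 1);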
Example #2
        /// <summary>
        /// Initializes a new TokenExtractor for the given token definition.
        /// </summary>
        public TokenExtractor(ITokenDefinition definition) =>
            Definition = definition ?? throw new ArgumentNullException(nameof(definition));
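The expression-bodied constructor above only guards against a null definition. Reusing the hypothetical LiteralTokenDefinition from the previous sketch, a call is a one-liner; passing null instead would trip the ArgumentNullException guard:

        var extractor = new TokenExtractor(new LiteralTokenDefinition(1, "+"));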
Example #3
        /// <summary>
        /// Tokenizes the value set in ToTokenize and returns a Queue of IToken objects.
        /// </summary>
        /// <exception cref="NoTokenDefinitionsSpecifiedException">Thrown when no TokenDefinition list is specified</exception>
        /// <returns>Queue of IToken objects representing the tokenized version of the input string in ToTokenize</returns>
        public Queue<IToken> Tokenize()
        {
            if (Parser.TokenDefinitions == null)
            {
                throw new NoTokenDefinitionsSpecifiedException("No Token Definitions Found.");
            }

            foreach (ITokenDefinition currentTokenDefinition in Parser.TokenDefinitions)
            {
                if (currentTokenDefinition.TokenID == (int)BuildInTokenID.UntokenizedLiteralString)
                {
                    _untokenizedLiteralStringToken = currentTokenDefinition;
                    continue;
                }
                if (currentTokenDefinition.TokenID == (int)BuildInTokenID.EOF)
                {
                    _eofToken = currentTokenDefinition;
                    continue;
                }
            }

            // all clear. Now walk all token definitions, run their regular expressions and collect the
            // resulting MatchCollections. Store every token found in a sorted list keyed on its starting
            // character index; the result queue is then built from that sorted order. Badly written
            // regular expressions can produce overlapping tokens, which the parser should handle.
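            // For example: given input "a==b", a definition matching "=" and one matching "=="
            // both produce a match at index 1; the length check in the loop below keeps the
            // longer "==" token there, and overlap removal (when enabled) later drops the
            // stray "=" match at index 2.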

            // create sorted list of result tokens
            SortedList<int, IToken> tokensFound = new SortedList<int, IToken>(_stringToTokenize.Length / 10);

            // start the tokenization process:
            // walk all token definitions and match each regular expression against the string to tokenize.
            // Every match is converted into a token belonging to its token definition, and each token is
            // then added to the sorted list with its start index as the key.
            foreach (ITokenDefinition currentToken in Parser.TokenDefinitions)
            {
                if ((currentToken.TokenID == (int)BuildInTokenID.UntokenizedLiteralString) ||
                    (currentToken.TokenID == (int)BuildInTokenID.EOF))
                {
                    // skip this token definition
                    continue;
                }

                MatchCollection matchesFound = currentToken.MatchingRegularExpression.Matches(_stringToTokenize);
                if (matchesFound.Count > 0)
                {
                    // found matches; convert them to tokens unless a token is already present at
                    // the same position, which can happen because we're using regular expressions.
                    foreach (Match matchedSnippet in matchesFound)
                    {
                        // create a token using the factory object. TokenID and RelatedTokenDefinition are already filled in.
                        IToken toAdd = currentToken.CreateTokenFromDefinition();
                        toAdd.LiteralMatchedTokenText = matchedSnippet.Value;
                        toAdd.StartIndexInInputStream = matchedSnippet.Index;

                        if (!tokensFound.ContainsKey(matchedSnippet.Index))
                        {
                            // no token found on this spot. Add it.
                            // add to sorted list
                            tokensFound.Add(matchedSnippet.Index, toAdd);
                        }
                        else
                        {
                            // there is already a token at this spot. If that token is at least as long
                            // as the current token, keep it; otherwise replace it with the current token.
                            if (tokensFound[matchedSnippet.Index].LiteralMatchedTokenText.Length < toAdd.LiteralMatchedTokenText.Length)
                            {
                                tokensFound[matchedSnippet.Index] = toAdd;
                            }
                        }
                    }
                }
            }

            // now walk the resulting sorted list. Insert UntokenizedLiteralString tokens for all unmatched
            // characters in the input stream while transforming the sorted list into a queue.
            Queue<IToken> toReturn = new Queue<IToken>(tokensFound.Count);

            int currentIndex             = 0;
            int firstIndexWithoutOverlap = 0;

            foreach (KeyValuePair<int, IToken> pair in tokensFound)
            {
                IToken currentToken = pair.Value;

                if (_removeOverlappedTokens)
                {
                    // check if the start index of this token falls inside a previous token,
                    // using firstIndexWithoutOverlap
                    if (currentToken.StartIndexInInputStream < firstIndexWithoutOverlap)
                    {
                        // this token is overlapped by a previous token; skip it so it is
                        // not added to the queue
                        continue;
                    }
                }
                // compare the token's start index with the current index. If there is a gap,
                // the snippet formed by the gap is an UntokenizedLiteralString token.
                if (currentIndex < currentToken.StartIndexInInputStream)
                {
                    // there is a gap; add an UntokenizedLiteralString token first.
                    IToken untokenizedLiteralString = _untokenizedLiteralStringToken.CreateTokenFromDefinition();
                    untokenizedLiteralString.LiteralMatchedTokenText = _stringToTokenize.Substring(currentIndex, (currentToken.StartIndexInInputStream - currentIndex));
                    untokenizedLiteralString.StartIndexInInputStream = currentIndex;
                    // add it directly to queue
                    toReturn.Enqueue(untokenizedLiteralString);
                }
                currentIndex             = currentToken.StartIndexInInputStream + currentToken.LiteralMatchedTokenText.Length;
                firstIndexWithoutOverlap = currentIndex;
                toReturn.Enqueue(currentToken);
            }

            // check if there is an unmatched snippet behind the last token found...
            if (currentIndex < _stringToTokenize.Length)
            {
                // there is a gap; add an UntokenizedLiteralString token first.
                IToken untokenizedLiteralString = _untokenizedLiteralStringToken.CreateTokenFromDefinition();
                untokenizedLiteralString.LiteralMatchedTokenText = _stringToTokenize.Substring(currentIndex, (_stringToTokenize.Length - currentIndex));
                untokenizedLiteralString.StartIndexInInputStream = currentIndex;
                // add it directly to queue
                toReturn.Enqueue(untokenizedLiteralString);
            }


            // Add the EOF token, by definition token ID 0, to the queue at the end.
            IToken eofToken = _eofToken.CreateTokenFromDefinition();

            toReturn.Enqueue(eofToken);

            // Done, return the queue
            return toReturn;
        }
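A sketch of how a caller might drive Tokenize(). The tokenizer construction and the way ToTokenize and Parser are populated are not shown on this page, so those parts are assumptions; only the queue handling is grounded in the method above:

        // Hypothetical setup: the tokenizer type's constructor and the ToTokenize
        // property setter are assumed, as they are not shown in this example.
        tokenizer.ToTokenize = "price==10";
        Queue<IToken> tokens = tokenizer.Tokenize();

        // Walk the queue. Unmatched snippets come back as UntokenizedLiteralString
        // tokens, and the final token is the EOF token (token ID 0 by definition).
        while (tokens.Count > 0)
        {
            IToken token = tokens.Dequeue();
            Console.WriteLine($"{token.StartIndexInInputStream}: '{token.LiteralMatchedTokenText}'");
        }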