Example #1
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '.' && tokenizer.Current != '…'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;
            var EndPosition   = StartPosition;

            var Count         = 0;
            var FoundEllipsis = false;

            if (tokenizer.Current == '…')
            {
                FoundEllipsis = true;
                EndPosition   = tokenizer.Index;
                tokenizer.Consume();
            }
            else
            {
                while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
                {
                    if (tokenizer.Current == '.')
                    {
                        ++Count;
                        FoundEllipsis |= Count >= 3;
                        EndPosition    = tokenizer.Index;
                        if (FoundEllipsis)
                        {
                            tokenizer.Consume();
                            break;
                        }
                    }
                    tokenizer.Consume();
                }
            }
            if (!FoundEllipsis)
            {
                return(null);
            }

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Ellipsis,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
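
For orientation, a rough usage sketch of the finder above. The EllipsisFinder class name, the public IsMatch wrapper around IsMatchImpl, and the TokenizableStream<char>(char[]) constructor are assumptions, not shown in this listing:

        // Hypothetical check: the finder accepts a literal '…', three or more
        // periods, or periods separated only by whitespace.
        var Finder = new EllipsisFinder();
        var Match = Finder.IsMatch(new TokenizableStream<char>(". . . and so on".ToCharArray()));
        // Expected: an Ellipsis token whose span covers the three spaced periods;
        // a lone "." or ".." would return null instead.
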
Example #2
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);

            if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's' && TempSlice[0] != 'w')
            {
                return(null);
            }

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            Result = Result.StripRight(".");

            tokenizer.Index = StartPosition + Result.Length;

            if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute) ||
                (!Result.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("http:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("https:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("www.", StringComparison.OrdinalIgnoreCase)))
            {
                return(null);
            }

            EndPosition = tokenizer.Index - 1;

            var TempResult = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Url,
                       TempResult
                       ));
        }
Example #3
 /// <summary>
 /// The actual implementation of the IsMatch done by the individual classes.
 /// </summary>
 /// <param name="tokenizer">The tokenizer.</param>
 /// <returns>The token.</returns>
 protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
 {
     if (!tokenizer.End() && tokenizer.Current == Character)
     {
         var StartPos = tokenizer.Index;
         tokenizer.Consume();
         var EndPos = tokenizer.Index;
         return(new Token(
                    EndPos,
                    StartPos,
                    TokenType,
                    Character.ToString()
                    ));
     }
     return(null);
 }
Example #4
        /// <summary>
        /// Gets the tokens.
        /// </summary>
        /// <param name="tokenizableStream">The tokenizable stream.</param>
        /// <param name="tokenFinders">The token finders.</param>
        /// <returns>The tokens.</returns>
        private IEnumerable<Token> GetTokens(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
        {
            var CurrentToken = Next(tokenizableStream, tokenFinders);

            while (CurrentToken != null)
            {
                yield return(CurrentToken);

                CurrentToken = Next(tokenizableStream, tokenFinders);
            }
            yield return(new Token(
                             tokenizableStream.Index,
                             tokenizableStream.Index,
                             TokenType.EOF,
                             string.Empty
                             ));
        }
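
The Next helper that drives this loop does not appear in any of these listings. A plausible sketch of what it might do, offered purely as an assumption (including the assumed public IsMatch method on IEnglishTokenFinder):

        private static Token Next(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
        {
            // Assumed behavior: try each finder in order and return the first
            // token produced, or null when the stream is exhausted or nothing matches.
            if (tokenizableStream.End())
            {
                return null;
            }
            foreach (var Finder in tokenFinders)
            {
                var Result = Finder.IsMatch(tokenizableStream);
                if (Result != null)
                {
                    return Result;
                }
            }
            return null;
        }
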
Example #5
        /// <summary>
        /// Gets the tokens.
        /// </summary>
        /// <param name="tokenizableStream">The tokenizable stream.</param>
        /// <param name="tokenFinders">The token finders.</param>
        /// <returns>The tokens.</returns>
        private IEnumerable<Token> GetTokens(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
        {
            var CurrentToken = Next(tokenizableStream, tokenFinders);

            while (CurrentToken != null)
            {
                yield return(CurrentToken);

                CurrentToken = Next(tokenizableStream, tokenFinders);
            }
            yield return(new Token
            {
                EndPosition = tokenizableStream.Index,
                StartPosition = tokenizableStream.Index,
                TokenType = TokenType.EOF,
                Value = string.Empty
            });
        }
Example #6
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
            {
                tokenizer.Consume();
            }

            if (tokenizer.Current != '@')
            {
                return(null);
            }

            tokenizer.Consume();

            bool EmailFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
            {
                if (tokenizer.Current == '.' && !char.IsLetter(tokenizer.Peek(1)))
                {
                    break;
                }
                EmailFound |= tokenizer.Current == '.';
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Email,
                       Result
                       ));
        }
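
A rough sketch of how this finder might be exercised. EmailFinder, the public IsMatch wrapper, and the TokenizableStream<char>(char[]) constructor are assumed names used only for illustration:

        var Match = new EmailFinder().IsMatch(
            new TokenizableStream<char>("user@example.com, thanks".ToCharArray()));
        // Expected: a TokenType.Email token with Value "user@example.com"; the
        // domain loop stops at ',' because it is neither a letter, digit, '-' nor '.'.
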
Example #7
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);

            if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's')
            {
                return(null);
            }

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            Result = Result.StripRight(".");

            tokenizer.Index = StartPosition + Result.Length;

            if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute))
            {
                return(null);
            }

            EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Url,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #8
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !Symbols.ContainsKey(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;
            var Value         = tokenizer.Current;

            tokenizer.Consume();

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = Symbols[Value],
                Value = new string(new char[] { Value })
            });
        }
Example #9
            /// <summary>
            /// The actual implementation of the IsMatch done by the individual classes.
            /// </summary>
            /// <param name="tokenizer">The tokenizer.</param>
            /// <returns>The token.</returns>
            protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
            {
                if (tokenizer.End() || tokenizer.Current != '.')
                {
                    return(null);
                }

                var StartPosition = tokenizer.Index;
                var EndPosition   = StartPosition;

                var    Count         = 0;
                var    FoundEllipsis = false;
                string FinalValue    = "";

                while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
                {
                    FinalValue += tokenizer.Current;
                    if (tokenizer.Current == '.')
                    {
                        ++Count;
                        FoundEllipsis |= Count >= 3;
                        EndPosition    = tokenizer.Index;
                    }
                    tokenizer.Consume();
                }
                if (!FoundEllipsis)
                {
                    return(null);
                }

                return(new Token
                {
                    EndPosition = EndPosition,
                    StartPosition = StartPosition,
                    TokenType = TokenType.Ellipsis,
                    Value = FinalValue.Trim()
                });
            }
Example #10
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || char.IsWhiteSpace(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }
            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Other,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #11
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Word,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #12
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !RomanNumeralCharacters.Contains(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            ConsumeNumbers(tokenizer, RomanNumeralCharacters);

            if (!tokenizer.End() && char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            if (Result == "I" || !Validate(Result))
            {
                return(null);
            }

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Number,
                       Result
                       )
            {
                NormalizedValue = ConvertToNumber(Result),
                ReplacementValue = "<NUMBER>"
            });
        }
Example #13
 /// <summary>
 /// Tokenizes the specified text.
 /// </summary>
 /// <param name="text">The text.</param>
 /// <returns>The tokenized version of the text.</returns>
 public Token[] Tokenize(TokenizableStream<char> text)
 {
     return(GetTokens(text, TokenFinders.OrderBy(x => x.Order).ToArray()).ToArray());
 }
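
A usage sketch for this entry point. The EnglishTokenizer class name and the TokenizableStream<char>(char[]) constructor are assumptions; only the Tokenize signature and the Token properties are taken from the listings above:

     var Tokens = new EnglishTokenizer().Tokenize(
         new TokenizableStream<char>("Visit https://example.com today...".ToCharArray()));
     foreach (var CurrentToken in Tokens)
     {
         // Each token carries its type, value, and character span.
         System.Console.WriteLine($"{CurrentToken.TokenType}: '{CurrentToken.Value}' ({CurrentToken.StartPosition}-{CurrentToken.EndPosition})");
     }
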
Example #14
 /// <summary>
 /// The actual implementation of the IsMatch done by the individual classes.
 /// </summary>
 /// <param name="tokenizer">The tokenizer.</param>
 /// <returns>The token.</returns>
 protected abstract Token? IsMatchImpl(TokenizableStream<char> tokenizer);
Example #15
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            bool CharacterFound = true;
            int  PeriodCount    = 0;

            while (CharacterFound)
            {
                CharacterFound = false;
                while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
                {
                    CharacterFound = true;
                    tokenizer.Consume();
                }

                if (tokenizer.Current == '.' && CharacterFound)
                {
                    tokenizer.Consume();
                    ++PeriodCount;
                }
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            if (PeriodCount > 1)
            {
                return(new Token(
                           EndPosition,
                           StartPosition,
                           TokenType.Abbreviation,
                           Result
                           ));
            }

            var UpperResult = Result.ToUpperInvariant();

            if (Result == UpperResult && Result.Length <= 4 && Result.Length > 1)
            {
                return(new Token(
                           EndPosition,
                           StartPosition,
                           TokenType.Abbreviation,
                           Result
                           ));
            }

            if (!CommonAbbreviations.Any(x => x == UpperResult))
            {
                return(null);
            }

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Abbreviation,
                       Result
                       ));
        }
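
For orientation, the three accepting paths above in a short sketch. AbbreviationFinder and IsMatch are assumed names, and the content of CommonAbbreviations is not shown here, so the last case is hedged:

        var Finder = new AbbreviationFinder();
        // "U.S.A." -> more than one period is consumed, so the first return path fires.
        // "NASA"   -> already upper case and 2-4 characters long, so the second path fires.
        // Anything else matches only when its upper-cased form appears in
        // CommonAbbreviations; otherwise the finder returns null.
        var Match = Finder.IsMatch(new TokenizableStream<char>("U.S.A. based".ToCharArray()));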