Example #1
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);

            if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's' && TempSlice[0] != 'w')
            {
                return(null);
            }

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            Result = Result.StripRight(".");

            tokenizer.Index = StartPosition + Result.Length;

            if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute) ||
                (!Result.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("http:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("https:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("www.", StringComparison.OrdinalIgnoreCase)))
            {
                return(null);
            }

            EndPosition = tokenizer.Index - 1;

            var TempResult = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Url,
                       TempResult
                       ));
        }
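
The example above relies on a `StripRight` string extension that is not part of this listing; it is used to drop trailing periods before the URL is validated and before the index is rewound. A minimal sketch of what it presumably does, with the name and signature assumed from the call site:

        // Hypothetical sketch of the StripRight extension used above (assumed behavior:
        // remove any trailing occurrences of the given characters).
        public static class StringExtensions
        {
            public static string StripRight(this string value, string characters)
            {
                return value.TrimEnd(characters.ToCharArray());
            }
        }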
Example #2
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsNumber(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            ConsumeNumbers(tokenizer);

            var PeekCharacter = tokenizer.Peek(1);

            if (tokenizer.Current == '.' && (char.IsNumber(PeekCharacter) || PeekCharacter == ','))
            {
                tokenizer.Consume();
                ConsumeNumbers(tokenizer);
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Number,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
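
`ConsumeNumbers` is not shown in this listing. Judging from the call sites (here with just the tokenizer, and with an allowed-character set in the Roman numeral example further down), it presumably consumes a run of matching characters; a minimal sketch under that assumption:

        // Hypothetical sketch of ConsumeNumbers as used above: consume consecutive
        // digit characters. The set-based overload is sketched after the Roman
        // numeral example below.
        private static void ConsumeNumbers(TokenizableStream<char> tokenizer)
        {
            while (!tokenizer.End() && char.IsNumber(tokenizer.Current))
            {
                tokenizer.Consume();
            }
        }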
Example #3
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '@'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            tokenizer.Consume();

            bool UsernameFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '_'))
            {
                UsernameFound = true;
                tokenizer.Consume();
            }

            if (!UsernameFound)
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Username,
                       Result
                       ));
        }
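
This finder consumes the leading '@' and can still return null if no username characters follow, so it only works if the caller rewinds the stream on a failed match. The base class is not included in this listing; a minimal sketch of the `IsMatch` wrapper implied by the doc comments, assuming it snapshots and restores `Index`:

        // Hypothetical sketch of the IsMatch wrapper implied by the doc comments.
        // It restores the index so that finders which consume characters and then
        // return null leave the stream exactly where they found it.
        public Token? IsMatch(TokenizableStream<char> tokenizer)
        {
            var StartIndex = tokenizer.Index;
            var Result = IsMatchImpl(tokenizer);
            if (Result is null)
            {
                tokenizer.Index = StartIndex;
            }
            return Result;
        }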
Example #4
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Word,
                       Result
                       ));
        }
Example #5
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !IsEmoji(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && IsEmoji(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Emoji,
                       Result
                       )
            {
                ReplacementValue = "<SYM>"
            });
        }
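
`IsEmoji` is not defined in this listing, and since it takes a single `char` it can only cover symbols in the Basic Multilingual Plane (most emoji are encoded as surrogate pairs). A rough sketch of what such a check might look like, purely as an assumption:

        // Hypothetical sketch of IsEmoji. A lone char cannot represent emoji outside
        // the BMP, so this only catches common symbol/dingbat ranges and anything
        // Unicode classifies as an "other symbol".
        private static bool IsEmoji(char character)
        {
            return (character >= '\u2600' && character <= '\u2BFF')
                || char.GetUnicodeCategory(character) == System.Globalization.UnicodeCategory.OtherSymbol;
        }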
Example #6
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '\r' && tokenizer.Current != '\n'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.NewLine,
                       Result
                       )
            {
                ReplacementValue = "<WHITE_SPACE>"
            });
        }
Example #7
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '#'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            tokenizer.Consume();

            bool HashTagFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '_'))
            {
                HashTagFound = true;
                tokenizer.Consume();
            }

            if (!HashTagFound)
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.HashTag,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #8
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);

            if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's')
            {
                return(null);
            }

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            Result = Result.StripRight(".");

            tokenizer.Index = StartPosition + Result.Length;

            if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute))
            {
                return(null);
            }

            EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Url,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #9
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '.' && tokenizer.Current != '…'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;
            var EndPosition   = StartPosition;

            var Count         = 0;
            var FoundEllipsis = false;

            if (tokenizer.Current == '…')
            {
                FoundEllipsis = true;
                EndPosition   = tokenizer.Index;
                tokenizer.Consume();
            }
            else
            {
                while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
                {
                    if (tokenizer.Current == '.')
                    {
                        ++Count;
                        FoundEllipsis |= Count >= 3;
                        EndPosition    = tokenizer.Index;
                        if (FoundEllipsis)
                        {
                            tokenizer.Consume();
                            break;
                        }
                    }
                    tokenizer.Consume();
                }
            }
            if (!FoundEllipsis)
            {
                return(null);
            }

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Ellipsis,
                       Result
                       )
            {
                ReplacementValue = "<SYM>"
            });
        }
Example #10
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
            {
                tokenizer.Consume();
            }

            if (tokenizer.Current != '@')
            {
                return(null);
            }

            tokenizer.Consume();

            bool EmailFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
            {
                if (tokenizer.Current == '.' && !char.IsLetter(tokenizer.Peek(1)))
                {
                    break;
                }
                EmailFound |= tokenizer.Current == '.';
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Email,
                       Result
                       ));
        }
Example #11
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || char.IsWhiteSpace(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }
            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Other,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #12
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !RomanNumeralCharacters.Contains(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            ConsumeNumbers(tokenizer, RomanNumeralCharacters);

            if (!tokenizer.End() && char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            if (Result == "I" || !Validate(Result))
            {
                return(null);
            }

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Number,
                       Result
                       )
            {
                NormalizedValue = ConvertToNumber(Result),
                ReplacementValue = "<NUMBER>"
            });
        }
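
This example depends on several members that are not shown here: `RomanNumeralCharacters`, a set-based `ConsumeNumbers` overload, and the `Validate`/`ConvertToNumber` helpers. A minimal sketch of the first two, assumed from how they are called (the validation and conversion logic is omitted):

        // Hypothetical sketches assumed from the calls above.
        private static readonly HashSet<char> RomanNumeralCharacters = new HashSet<char>
        {
            'I', 'V', 'X', 'L', 'C', 'D', 'M'
        };

        // Consume characters for as long as they belong to the supplied set.
        private static void ConsumeNumbers(TokenizableStream<char> tokenizer, ISet<char> allowed)
        {
            while (!tokenizer.End() && allowed.Contains(tokenizer.Current))
            {
                tokenizer.Consume();
            }
        }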
Example #13
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '\r' && tokenizer.Current != '\n'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.NewLine,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example #14
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            bool CharacterFound = true;
            int  PeriodCount    = 0;

            while (CharacterFound)
            {
                CharacterFound = false;
                while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
                {
                    CharacterFound = true;
                    tokenizer.Consume();
                }

                if (tokenizer.Current == '.' && CharacterFound)
                {
                    tokenizer.Consume();
                    ++PeriodCount;
                }
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            if (PeriodCount > 1)
            {
                return(new Token(
                           EndPosition,
                           StartPosition,
                           TokenType.Abbreviation,
                           Result
                           ));
            }

            var UpperResult = Result.ToUpperInvariant();

            if (Result == UpperResult && Result.Length <= 4 && Result.Length > 1)
            {
                return(new Token(
                           EndPosition,
                           StartPosition,
                           TokenType.Abbreviation,
                           Result
                           ));
            }

            if (!CommonAbbreviations.Any(x => x == UpperResult))
            {
                return(null);
            }

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Abbreviation,
                       Result
                       ));
        }
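
Three cases are accepted here: a word containing more than one embedded period (for example "e.g."), a short all-uppercase word of two to four characters (for example "NASA"), and otherwise only words found in a `CommonAbbreviations` collection, whose contents are not part of this listing. A purely illustrative guess at that collection; entries are compared against the upper-cased match, so they would be stored in upper case:

        // Hypothetical illustration of the CommonAbbreviations collection checked above.
        // The real contents are not shown in this listing.
        private static readonly string[] CommonAbbreviations =
        {
            "DR.", "MR.", "MRS.", "MS.", "JR.", "SR.", "VS.", "ETC."
        };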