Example No. 1
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '\r' && tokenizer.Current != '\n'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.NewLine,
                       Result
                       )
            {
                ReplacementValue = "<WHITE_SPACE>"
            });
        }
Example No. 2
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '#'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            tokenizer.Consume();

            bool HashTagFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '_'))
            {
                HashTagFound = true;
                tokenizer.Consume();
            }

            if (!HashTagFound)
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.HashTag,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example No. 3
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !IsEmoji(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && IsEmoji(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Emoji,
                       Result
                       )
            {
                ReplacementValue = "<SYM>"
            });
        }
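
Note: IsEmoji is a helper referenced above but not shown in these examples. A rough, purely illustrative single-character heuristic might look like the sketch below; real emoji detection generally has to handle surrogate pairs and variation selectors, so the real helper almost certainly differs.

        private static bool IsEmoji(char character)
        {
            // Most emoji sit outside the Basic Multilingual Plane (surrogate pairs)
            // or fall into the "other symbol" Unicode category.
            return char.IsSurrogate(character)
                || System.Globalization.CharUnicodeInfo.GetUnicodeCategory(character) == System.Globalization.UnicodeCategory.OtherSymbol;
        }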
Example No. 4
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Word,
                       Result
                       ));
        }
Example No. 5
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '@'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            tokenizer.Consume();

            bool UsernameFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '_'))
            {
                UsernameFound = true;
                tokenizer.Consume();
            }

            if (!UsernameFound)
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Username,
                       Result
                       ));
        }
Example No. 6
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '.' && tokenizer.Current != '…'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;
            var EndPosition   = StartPosition;

            var Count         = 0;
            var FoundEllipsis = false;

            if (tokenizer.Current == '…')
            {
                FoundEllipsis = true;
                EndPosition   = tokenizer.Index;
                tokenizer.Consume();
            }
            else
            {
                while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
                {
                    if (tokenizer.Current == '.')
                    {
                        ++Count;
                        FoundEllipsis |= Count >= 3;
                        EndPosition    = tokenizer.Index;
                        if (FoundEllipsis)
                        {
                            tokenizer.Consume();
                            break;
                        }
                    }
                    tokenizer.Consume();
                }
            }
            if (!FoundEllipsis)
            {
                return(null);
            }

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Ellipsis,
                       Result
                       )
            {
                ReplacementValue = "<SYM>"
            });
        }
Example No. 7
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);

            if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's' && TempSlice[0] != 'w')
            {
                return(null);
            }

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            Result = Result.StripRight(".");

            tokenizer.Index = StartPosition + Result.Length;

            if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute) ||
                (!Result.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("http:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("https:", StringComparison.OrdinalIgnoreCase) &&
                 !Result.StartsWith("www.", StringComparison.OrdinalIgnoreCase)))
            {
                return(null);
            }

            EndPosition = tokenizer.Index - 1;

            var TempResult = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Url,
                       TempResult
                       ));
        }
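
Note: StripRight is an extension method used above but not shown here, and the library's actual implementation may differ. A minimal stand-in that trims the supplied characters from the end of the string could look like this (StringExtensionsSketch is a hypothetical name for illustration only).

        public static class StringExtensionsSketch
        {
            public static string StripRight(this string value, string charactersToStrip)
            {
                // Trim any of the supplied characters from the right-hand side of the string.
                return value.TrimEnd(charactersToStrip.ToCharArray());
            }
        }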
Example No. 8
 /// <summary>
 /// Consumes the numbers.
 /// </summary>
 /// <param name="tokenizer">The tokenizer.</param>
 /// <param name="romanNumeralChars">The roman numeral chars.</param>
 private static void ConsumeNumbers(TokenizableStream<char> tokenizer, HashSet<char> romanNumeralChars)
 {
     while (!tokenizer.End() && romanNumeralChars.Contains(tokenizer.Current))
     {
         tokenizer.Consume();
     }
 }
Example No. 9
        /// <summary>
        /// Determines whether the next set of items on the stream matches this finder.
        /// </summary>
        /// <param name="stream">The stream.</param>
        /// <returns>The token.</returns>
        public Token IsMatch(TokenizableStream<char> stream)
        {
            if (stream.End())
            {
                return(new Token
                {
                    StartPosition = stream.Index,
                    EndPosition = stream.Index,
                    TokenType = TokenType.EOF,
                    Value = string.Empty
                });
            }

            stream.TakeSnapshot();

            var Match = IsMatchImpl(stream);

            if (Match == null)
            {
                stream.RollbackSnapshot();
            }
            else
            {
                stream.CommitSnapshot();
            }

            return(Match);
        }
Example No. 10
 private static void ConsumeNumbers(TokenizableStream<char> tokenizer)
 {
     while (!tokenizer.End() && (char.IsNumber(tokenizer.Current) || tokenizer.Current == ','))
     {
         tokenizer.Consume();
     }
 }
Example No. 11
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsNumber(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            ConsumeNumbers(tokenizer);

            var PeekCharacter = tokenizer.Peek(1);

            if (tokenizer.Current == '.' && (char.IsNumber(PeekCharacter) || PeekCharacter == ','))
            {
                tokenizer.Consume();
                ConsumeNumbers(tokenizer);
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Number,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example No. 12
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !Symbols.ContainsKey(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;
            var Value         = tokenizer.Current;

            tokenizer.Consume();

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(new char[] { Value });

            return(new Token(
                       EndPosition,
                       StartPosition,
                       Symbols[Value],
                       Result
                       )
            {
                ReplacementValue = "<SYM>"
            });
        }
Example No. 13
        /// <summary>
        /// Determines whether the next set of items on the stream matches this finder.
        /// </summary>
        /// <param name="stream">The stream.</param>
        /// <returns>The token.</returns>
        public Token? IsMatch(TokenizableStream<char> stream)
        {
            if (stream.End())
            {
                return(new Token(
                           stream.Index,
                           stream.Index,
                           TokenType.EOF,
                           string.Empty
                           ));
            }

            stream.TakeSnapshot();

            var Match = IsMatchImpl(stream);

            if (Match == null)
            {
                stream.RollbackSnapshot();
            }
            else
            {
                stream.CommitSnapshot();
            }

            return(Match);
        }
Example No. 14
        /// <summary>
        /// Gets the next token or null if there isn't one.
        /// </summary>
        /// <param name="tokenizableStream">The tokenizable stream.</param>
        /// <param name="tokenFinders">The token finders.</param>
        /// <returns>The next token.</returns>
        private static Token Next(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
        {
            if (tokenizableStream.End())
            {
                return(null);
            }

            return(tokenFinders.Select(x => x.IsMatch(tokenizableStream)).FirstOrDefault(x => x != null));
        }
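
For context, finders like the ones above are normally driven in a loop until the stream is exhausted. The Tokenize method below is a hypothetical sketch, not part of these examples; it assumes the same Token, TokenizableStream<char>, and IEnglishTokenFinder types, plus System.Linq and System.Collections.Generic in scope.

        private static IEnumerable<Token> Tokenize(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
        {
            while (!tokenizableStream.End())
            {
                // First finder that matches wins; finders that fail roll the stream back.
                var Match = tokenFinders.Select(x => x.IsMatch(tokenizableStream)).FirstOrDefault(x => x != null);
                if (Match is null)
                {
                    // Nothing matched: consume one character so the loop always advances.
                    tokenizableStream.Consume();
                    continue;
                }
                yield return Match;
            }
        }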
Example No. 15
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
            {
                tokenizer.Consume();
            }

            if (tokenizer.Current != '@')
            {
                return(null);
            }

            tokenizer.Consume();

            bool EmailFound = false;

            while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
            {
                if (tokenizer.Current == '.' && !char.IsLetter(tokenizer.Peek(1)))
                {
                    break;
                }
                EmailFound |= tokenizer.Current == '.';
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Email,
                       Result
                       ));
        }
Example No. 16
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);

            if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's')
            {
                return(null);
            }

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            Result = Result.StripRight(".");

            tokenizer.Index = StartPosition + Result.Length;

            if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute))
            {
                return(null);
            }

            EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Url,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example No. 17
            /// <summary>
            /// The actual implementation of the IsMatch done by the individual classes.
            /// </summary>
            /// <param name="tokenizer">The tokenizer.</param>
            /// <returns>The token.</returns>
            protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
            {
                if (tokenizer.End() || tokenizer.Current != '.')
                {
                    return(null);
                }

                var StartPosition = tokenizer.Index;
                var EndPosition   = StartPosition;

                var    Count         = 0;
                var    FoundEllipsis = false;
                string FinalValue    = "";

                while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
                {
                    FinalValue += tokenizer.Current;
                    if (tokenizer.Current == '.')
                    {
                        ++Count;
                        FoundEllipsis |= Count >= 3;
                        EndPosition    = tokenizer.Index;
                    }
                    tokenizer.Consume();
                }
                if (!FoundEllipsis)
                {
                    return(null);
                }

                return(new Token
                {
                    EndPosition = EndPosition,
                    StartPosition = StartPosition,
                    TokenType = TokenType.Ellipsis,
                    Value = FinalValue.Trim()
                });
            }
Example No. 18
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || char.IsWhiteSpace(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
            {
                tokenizer.Consume();
            }
            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.Other,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example No. 19
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || (tokenizer.Current != '\r' && tokenizer.Current != '\n'))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'))
            {
                tokenizer.Consume();
            }

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = TokenType.NewLine,
                Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
            });
        }
Example No. 20
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !RomanNumeralCharacters.Contains(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            ConsumeNumbers(tokenizer, RomanNumeralCharacters);

            if (!tokenizer.End() && char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            if (Result == "I" || !Validate(Result))
            {
                return(null);
            }

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Number,
                       Result
                       )
            {
                NormalizedValue = ConvertToNumber(Result),
                ReplacementValue = "<NUMBER>"
            });
        }
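
Note: Validate and ConvertToNumber are referenced above but not shown in these examples. A purely illustrative subtractive-notation conversion (the real method's name is the only thing taken from the source; its signature and return type may well differ) could look like this.

        private static int ConvertToNumber(string romanNumeral)
        {
            // Map each Roman numeral character to its value; unknown characters count as 0.
            static int ValueOf(char character) => character switch
            {
                'I' => 1, 'V' => 5, 'X' => 10, 'L' => 50,
                'C' => 100, 'D' => 500, 'M' => 1000,
                _ => 0
            };

            var Total = 0;
            for (int x = 0; x < romanNumeral.Length; ++x)
            {
                var Current = ValueOf(romanNumeral[x]);
                // Subtractive notation: a smaller value before a larger one is subtracted (e.g. IV = 4).
                if (x + 1 < romanNumeral.Length && Current < ValueOf(romanNumeral[x + 1]))
                {
                    Total -= Current;
                }
                else
                {
                    Total += Current;
                }
            }
            return Total;
        }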
Example No. 21
 /// <summary>
 /// The actual implementation of the IsMatch done by the individual classes.
 /// </summary>
 /// <param name="tokenizer">The tokenizer.</param>
 /// <returns>The token.</returns>
 protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
 {
     if (!tokenizer.End() && tokenizer.Current == Character)
     {
         var StartPos = tokenizer.Index;
         tokenizer.Consume();
         var EndPos = tokenizer.Index;
         return(new Token(
                    EndPos,
                    StartPos,
                    TokenType,
                    Character.ToString()
                    ));
     }
     return(null);
 }
Example No. 22
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !Symbols.ContainsKey(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;
            var Value         = tokenizer.Current;

            tokenizer.Consume();

            var EndPosition = tokenizer.Index - 1;

            return(new Token
            {
                EndPosition = EndPosition,
                StartPosition = StartPosition,
                TokenType = Symbols[Value],
                Value = new string(new char[] { Value })
            });
        }
Example No. 23
        /// <summary>
        /// The actual implementation of the IsMatch done by the individual classes.
        /// </summary>
        /// <param name="tokenizer">The tokenizer.</param>
        /// <returns>The token.</returns>
        protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
        {
            if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
            {
                return(null);
            }

            var StartPosition = tokenizer.Index;

            bool CharacterFound = true;
            int  PeriodCount    = 0;

            while (CharacterFound)
            {
                CharacterFound = false;
                while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
                {
                    CharacterFound = true;
                    tokenizer.Consume();
                }

                if (tokenizer.Current == '.' && CharacterFound)
                {
                    tokenizer.Consume();
                    ++PeriodCount;
                }
            }

            var EndPosition = tokenizer.Index - 1;

            var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());

            if (PeriodCount > 1)
            {
                return(new Token(
                           EndPosition,
                           StartPosition,
                           TokenType.Abbreviation,
                           Result
                           ));
            }

            var UpperResult = Result.ToUpperInvariant();

            if (Result == UpperResult && Result.Length <= 4 && Result.Length > 1)
            {
                return(new Token(
                           EndPosition,
                           StartPosition,
                           TokenType.Abbreviation,
                           Result
                           ));
            }

            if (!CommonAbbreviations.Any(x => x == UpperResult))
            {
                return(null);
            }

            return(new Token(
                       EndPosition,
                       StartPosition,
                       TokenType.Abbreviation,
                       Result
                       ));
        }