/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a URL: the run of non-whitespace characters at the current position,
/// accepted only when it is a well formed URI and begins with ftp:, sftp:,
/// http:, https:, or www.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token.</returns>
protected override Token?IsMatchImpl(TokenizableStream <char> tokenizer)
{
    // Bail out unless at least 8 characters remain (shortest useful prefix check below).
    if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
    {
        return(null);
    }
    var StartPosition = tokenizer.Index;
    // Cheap pre-filter: every accepted scheme/prefix starts with 'f', 'h', 's' or 'w'.
    // NOTE(review): assumes Slice(start, end) is inclusive of both endpoints — confirm.
    var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);
    if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's' && TempSlice[0] != 'w')
    {
        return(null);
    }
    // Consume everything up to the next whitespace character.
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Strip trailing sentence punctuation ('.') and rewind the tokenizer so the
    // stripped characters can be re-examined by later matchers.
    Result = Result.StripRight(".");
    tokenizer.Index = StartPosition + Result.Length;
    // Must be well formed AND carry one of the recognized scheme/www prefixes.
    // NOTE(review): on this failure path the tokenizer index stays advanced —
    // presumably the caller resets position on a null match; verify.
    if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute)
        || (!Result.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("https:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("www.", StringComparison.OrdinalIgnoreCase)))
    {
        return(null);
    }
    // Re-read the (possibly shortened) span so the token text matches the rewound index.
    EndPosition = tokenizer.Index - 1;
    var TempResult = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return(new Token(
        EndPosition,
        StartPosition,
        TokenType.Url,
        TempResult
    ));
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of digits, optionally followed by a decimal/grouping portion
/// (e.g. "1234" or "12.34").
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The number token, or null when the current character is not a digit.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Declared Token? for consistency with the other matchers: null is a valid result.
    if (tokenizer.End() || !char.IsNumber(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    ConsumeNumbers(tokenizer);
    // Accept a '.' only when followed by another digit or a grouping comma,
    // so a sentence-ending period is not swallowed into the number.
    // NOTE(review): assumes Current/Peek are safe to read at End() — confirm
    // TokenizableStream semantics.
    var PeekCharacter = tokenizer.Peek(1);
    if (tokenizer.Current == '.' && (char.IsNumber(PeekCharacter) || PeekCharacter == ','))
    {
        tokenizer.Consume();
        ConsumeNumbers(tokenizer);
    }
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Number,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a "@username" mention: an '@' followed by at least one letter,
/// digit, or underscore.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The username token, or null when no match is found.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || tokenizer.Current != '@')
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    tokenizer.Consume(); // Skip past the '@'.
    var NameLength = 0;
    while (!tokenizer.End())
    {
        var Current = tokenizer.Current;
        if (!char.IsLetter(Current) && !char.IsNumber(Current) && Current != '_')
        {
            break;
        }
        ++NameLength;
        tokenizer.Consume();
    }
    // A bare '@' with no name characters is not a username.
    if (NameLength == 0)
    {
        return null;
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Username,
        Result
    );
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a word: a letter followed by any run of letters, apostrophes,
/// or hyphens.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The word token, or null when the current character is not a letter.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    while (!tokenizer.End())
    {
        var Current = tokenizer.Current;
        var IsWordCharacter = char.IsLetter(Current) || Current == '\'' || Current == '-';
        if (!IsWordCharacter)
        {
            break;
        }
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Word,
        Text
    );
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of one or more consecutive emoji characters (as determined
/// by IsEmoji) and replaces them with the "&lt;SYM&gt;" placeholder.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The emoji token, or null when the current character is not an emoji.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !IsEmoji(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End() && IsEmoji(tokenizer.Current));
    var EndPosition = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Emoji,
        Text
    )
    {
        ReplacementValue = "<SYM>"
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of one or more CR/LF characters and replaces it with the
/// "&lt;WHITE_SPACE&gt;" placeholder.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The new-line token, or null when the current character is not CR or LF.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || (tokenizer.Current != '\r' && tokenizer.Current != '\n'))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'));
    var EndPosition = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.NewLine,
        Text
    )
    {
        ReplacementValue = "<WHITE_SPACE>"
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a "#hashtag": a '#' followed by at least one letter, digit,
/// or underscore.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The hash-tag token, or null when no match is found.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Declared Token? for consistency with the other matchers: null is a valid result.
    if (tokenizer.End() || (tokenizer.Current != '#'))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    tokenizer.Consume(); // Skip past the '#'.
    bool HashTagFound = false;
    while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '_'))
    {
        HashTagFound = true;
        tokenizer.Consume();
    }
    // A bare '#' with no tag characters is not a hash tag.
    if (!HashTagFound)
    {
        return null;
    }
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.HashTag,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a URL: the run of non-whitespace characters at the current
/// position, accepted when it is a well formed URI string.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The URL token, or null when no match is found.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Declared Token? for consistency with the other matchers: null is a valid result.
    // Bail out unless at least 8 characters remain (shortest useful prefix check below).
    if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    // Cheap pre-filter: the recognized schemes start with 'f', 'h' or 's'.
    // NOTE(review): assumes Slice(start, end) is inclusive of both endpoints — confirm.
    var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);
    if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's')
    {
        return null;
    }
    // Consume everything up to the next whitespace character.
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Strip trailing sentence punctuation ('.') and rewind the tokenizer so the
    // stripped characters can be re-examined by later matchers.
    Result = Result.StripRight(".");
    tokenizer.Index = StartPosition + Result.Length;
    if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute))
    {
        return null;
    }
    // Re-read the (possibly shortened) span so the token text matches the rewound index.
    EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Url,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an ellipsis: either the single '…' character, or three '.'
/// characters that may be separated by whitespace (e.g. "..." or ". . .").
/// Replaced with the "&lt;SYM&gt;" placeholder.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token.</returns>
protected override Token?IsMatchImpl(TokenizableStream <char> tokenizer)
{
    if (tokenizer.End() || (tokenizer.Current != '.' && tokenizer.Current != '…'))
    {
        return(null);
    }
    var StartPosition = tokenizer.Index;
    var EndPosition = StartPosition;
    var Count = 0;
    var FoundEllipsis = false;
    if (tokenizer.Current == '…')
    {
        // Single-character ellipsis: consume it and we are done.
        FoundEllipsis = true;
        EndPosition = tokenizer.Index;
        tokenizer.Consume();
    }
    else
    {
        // Scan over dots and intervening whitespace, counting the dots.
        // A third dot completes the ellipsis; EndPosition tracks the last dot seen,
        // so interleaved whitespace never extends the token past the final '.'.
        while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
        {
            if (tokenizer.Current == '.')
            {
                ++Count;
                FoundEllipsis |= Count >= 3;
                EndPosition = tokenizer.Index;
                if (FoundEllipsis)
                {
                    tokenizer.Consume();
                    break;
                }
            }
            tokenizer.Consume();
        }
    }
    // Fewer than three dots: no match.
    // NOTE(review): on this path the tokenizer index stays advanced past the
    // scanned dots/whitespace — presumably the caller resets position; verify.
    if (!FoundEllipsis)
    {
        return(null);
    }
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return(new Token(
        EndPosition,
        StartPosition,
        TokenType.Ellipsis,
        Result
    )
    {
        ReplacementValue = "<SYM>"
    });
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an email address: a letter-initiated local part, an '@', and a
/// domain containing at least one '.' (e.g. "user@example.com").
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The email token, or null when no match is found.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    // Local part: letters, digits, '-' and '.'.
    while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
    {
        tokenizer.Consume();
    }
    // NOTE(review): assumes Current is safe to read at End() — confirm
    // TokenizableStream semantics.
    if (tokenizer.Current != '@')
    {
        return null;
    }
    tokenizer.Consume();
    bool EmailFound = false;
    // Domain part: stop at a '.' not followed by a letter (sentence punctuation).
    while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
    {
        if (tokenizer.Current == '.' && !char.IsLetter(tokenizer.Peek(1)))
        {
            break;
        }
        EmailFound |= tokenizer.Current == '.';
        tokenizer.Consume();
    }
    // Require at least one '.' inside the domain ("a@b" is not an address).
    // Fix: EmailFound was previously computed but never checked, so dotless
    // domains were accepted.
    if (!EmailFound)
    {
        return null;
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Email,
        Result
    );
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Catch-all matcher: consumes the run of non-whitespace characters at the
/// current position and labels it TokenType.Other.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token, or null when the current character is whitespace.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Declared Token? for consistency with the other matchers: null is a valid result.
    if (tokenizer.End() || char.IsWhiteSpace(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Other,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a Roman numeral: a run of numeral characters that validates, is
/// not a bare "I", and is not immediately followed by another letter.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The number token, or null when no valid numeral is found.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !RomanNumeralCharacters.Contains(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    ConsumeNumbers(tokenizer, RomanNumeralCharacters);
    // A trailing letter means this is an ordinary word (e.g. "Mix"), not a numeral.
    if (!tokenizer.End() && char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var EndPosition = tokenizer.Index - 1;
    var Candidate = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Reject the lone "I" (almost always the pronoun) and anything that fails validation.
    if (Candidate == "I" || !Validate(Candidate))
    {
        return null;
    }
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Number,
        Candidate
    )
    {
        NormalizedValue = ConvertToNumber(Candidate),
        ReplacementValue = "<NUMBER>"
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of one or more CR/LF characters as a new-line token.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The new-line token, or null when the current character is not CR or LF.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Declared Token? for consistency with the other matchers: null is a valid result.
    if (tokenizer.End() || (tokenizer.Current != '\r' && tokenizer.Current != '\n'))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.NewLine,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an abbreviation: either a multi-part dotted form ("e.g.", "U.S.A."),
/// a short all-uppercase word (2-4 letters, e.g. "NASA"), or an entry in the
/// CommonAbbreviations list.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token.</returns>
protected override Token?IsMatchImpl(TokenizableStream <char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return(null);
    }
    var StartPosition = tokenizer.Index;
    bool CharacterFound = true;
    int PeriodCount = 0;
    // Consume alternating letter-groups and periods: each pass eats one run of
    // letters/'/'-' characters, then (if a '.' follows) the period, and loops
    // again so dotted forms like "e.g." are captured as a single span.
    while (CharacterFound)
    {
        CharacterFound = false;
        while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
        {
            CharacterFound = true;
            tokenizer.Consume();
        }
        // NOTE(review): Current is read here without an End() guard — assumes
        // TokenizableStream.Current is safe (returns a sentinel) at End; confirm.
        if (tokenizer.Current == '.' && CharacterFound)
        {
            tokenizer.Consume();
            ++PeriodCount;
        }
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Two or more periods => dotted abbreviation ("e.g.", "U.S.A.").
    if (PeriodCount > 1)
    {
        return(new Token(
            EndPosition,
            StartPosition,
            TokenType.Abbreviation,
            Result
        ));
    }
    var UpperResult = Result.ToUpperInvariant();
    // Short all-caps word (2-4 characters) => acronym-style abbreviation.
    if (Result == UpperResult && Result.Length <= 4 && Result.Length > 1)
    {
        return(new Token(
            EndPosition,
            StartPosition,
            TokenType.Abbreviation,
            Result
        ));
    }
    // Otherwise only match if the word is a known common abbreviation.
    // NOTE(review): on this null path the tokenizer index stays advanced —
    // presumably the caller resets position on a failed match; verify.
    if (!CommonAbbreviations.Any(x => x == UpperResult))
    {
        return(null);
    }
    return(new Token(
        EndPosition,
        StartPosition,
        TokenType.Abbreviation,
        Result
    ));
}