/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of one or more newline characters ('\r' and/or '\n').
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The newline token, or null when the cursor is not on a newline.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Reject unless the cursor sits on a CR or LF character.
    if (tokenizer.End())
        return null;
    var Lead = tokenizer.Current;
    if (Lead != '\r' && Lead != '\n')
        return null;

    var First = tokenizer.Index;

    // Swallow the whole run of newline characters (covers \r\n, \n, repeats).
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'));

    var Last = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(First, Last).ToArray());
    return new Token(
        Last,
        First,
        TokenType.NewLine,
        Text)
    {
        ReplacementValue = "<WHITE_SPACE>"
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a hash tag: '#' followed by at least one letter, digit, or underscore.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The hash tag token, or null when no hash tag starts at the cursor.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // A hash tag must begin with '#'.
    if (tokenizer.End() || tokenizer.Current != '#')
        return null;

    var First = tokenizer.Index;
    tokenizer.Consume();

    // Count the tag body; at least one character must follow the '#'.
    var BodyLength = 0;
    while (!tokenizer.End())
    {
        var Current = tokenizer.Current;
        if (!char.IsLetter(Current) && !char.IsNumber(Current) && Current != '_')
            break;
        ++BodyLength;
        tokenizer.Consume();
    }
    if (BodyLength == 0)
        return null;

    var Last = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = Last,
        StartPosition = First,
        TokenType = TokenType.HashTag,
        Value = new string(tokenizer.Slice(First, Last).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of one or more emoji characters.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The emoji token, or null when the cursor is not on an emoji.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Must start on an emoji character.
    if (tokenizer.End() || !IsEmoji(tokenizer.Current))
        return null;

    var First = tokenizer.Index;
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End() && IsEmoji(tokenizer.Current));

    var Last = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(First, Last).ToArray());
    return new Token(
        Last,
        First,
        TokenType.Emoji,
        Text)
    {
        ReplacementValue = "<SYM>"
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a word: a leading letter followed by letters, apostrophes, or hyphens.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The word token, or null when the cursor is not on a letter.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Words must begin with a letter.
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
        return null;

    var First = tokenizer.Index;
    // Interior characters may also be apostrophes or hyphens (e.g. contractions
    // and hyphenated words).
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End()
        && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'));

    var Last = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(First, Last).ToArray());
    return new Token(
        Last,
        First,
        TokenType.Word,
        Text);
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a username: '@' followed by at least one letter, digit, or underscore.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The username token, or null when no username starts at the cursor.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // A username must begin with '@'.
    if (tokenizer.End() || tokenizer.Current != '@')
        return null;

    var First = tokenizer.Index;
    tokenizer.Consume();

    // Count the name body; at least one character must follow the '@'.
    var NameLength = 0;
    while (!tokenizer.End())
    {
        var Current = tokenizer.Current;
        if (!char.IsLetter(Current) && !char.IsNumber(Current) && Current != '_')
            break;
        ++NameLength;
        tokenizer.Consume();
    }
    if (NameLength == 0)
        return null;

    var Last = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(First, Last).ToArray());
    return new Token(
        Last,
        First,
        TokenType.Username,
        Text);
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches either the single Unicode ellipsis character ('…') or a run of three
/// or more periods, optionally separated by whitespace (e.g. ". . .").
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The ellipsis token, or null when no ellipsis starts at the cursor.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Fast reject: must start with '.' or the single-character ellipsis.
    if (tokenizer.End() || (tokenizer.Current != '.' && tokenizer.Current != '…'))
    {
        return (null);
    }
    var StartPosition = tokenizer.Index;
    var EndPosition = StartPosition;
    var Count = 0;
    var FoundEllipsis = false;
    if (tokenizer.Current == '…')
    {
        // The single-character form is a complete match on its own.
        FoundEllipsis = true;
        EndPosition = tokenizer.Index;
        tokenizer.Consume();
    }
    else
    {
        // Scan dots and interleaved whitespace; the match succeeds on the
        // third '.', and the loop stops right after consuming that dot.
        while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
        {
            if (tokenizer.Current == '.')
            {
                ++Count;
                FoundEllipsis |= Count >= 3;
                // EndPosition tracks the last '.' seen, never trailing whitespace.
                EndPosition = tokenizer.Index;
                if (FoundEllipsis)
                {
                    tokenizer.Consume();
                    break;
                }
            }
            tokenizer.Consume();
        }
    }
    if (!FoundEllipsis)
    {
        // Fewer than three dots: not an ellipsis.
        return (null);
    }
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return (new Token(
        EndPosition,
        StartPosition,
        TokenType.Ellipsis,
        Result)
    {
        ReplacementValue = "<SYM>"
    });
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a URL with an ftp/sftp/http/https scheme or a "www." prefix.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The URL token, or null when no URL starts at the cursor.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Require at least 8 remaining characters for the shortest recognizable URL.
    if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
        return null;

    var First = tokenizer.Index;
    var Lead = tokenizer.Slice(First, First + 7);
    // Cheap first-character screen: ftp/sftp/http(s)/www all start with one of these.
    if (Lead[0] != 'f' && Lead[0] != 'h' && Lead[0] != 's' && Lead[0] != 'w')
        return null;

    // Take everything up to the next whitespace as the candidate URL.
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
        tokenizer.Consume();

    var Candidate = new string(tokenizer.Slice(First, tokenizer.Index - 1).ToArray());
    // Trailing periods are sentence punctuation, not part of the URL.
    Candidate = Candidate.StripRight(".");
    tokenizer.Index = First + Candidate.Length;

    // Accept only well-formed candidates that carry a known scheme or "www." prefix.
    var Accepted = Uri.IsWellFormedUriString(Candidate, UriKind.RelativeOrAbsolute)
        && (Candidate.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase)
            || Candidate.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase)
            || Candidate.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            || Candidate.StartsWith("https:", StringComparison.OrdinalIgnoreCase)
            || Candidate.StartsWith("www.", StringComparison.OrdinalIgnoreCase));
    if (!Accepted)
        return null;

    var Last = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(First, Last).ToArray());
    return new Token(
        Last,
        First,
        TokenType.Url,
        Text);
}
/// <summary>
/// Consumes the numbers.
/// Advances the stream past every consecutive roman numeral character.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <param name="romanNumeralChars">The roman numeral chars.</param>
private static void ConsumeNumbers(TokenizableStream<char> tokenizer, HashSet<char> romanNumeralChars)
{
    for (; !tokenizer.End() && romanNumeralChars.Contains(tokenizer.Current); tokenizer.Consume())
    {
    }
}
/// <summary>
/// Determines whether the next set of item on the stream matches this finder.
/// Returns an EOF token when the stream is exhausted; otherwise delegates to
/// IsMatchImpl under a snapshot so a failed match leaves the stream untouched.
/// </summary>
/// <param name="stream">The stream.</param>
/// <returns>The token, or null when this finder does not match.</returns>
public Token IsMatch(TokenizableStream<char> stream)
{
    // An exhausted stream always yields an EOF token.
    if (stream.End())
    {
        return new Token
        {
            StartPosition = stream.Index,
            EndPosition = stream.Index,
            TokenType = TokenType.EOF,
            Value = string.Empty
        };
    }

    // Snapshot first: commit on success, roll back on failure.
    stream.TakeSnapshot();
    var Result = IsMatchImpl(stream);
    if (Result is null)
        stream.RollbackSnapshot();
    else
        stream.CommitSnapshot();
    return Result;
}
/// <summary>
/// Advances the stream past consecutive digits and ',' group separators.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
private static void ConsumeNumbers(TokenizableStream<char> tokenizer)
{
    while (!tokenizer.End())
    {
        var Current = tokenizer.Current;
        if (!char.IsNumber(Current) && Current != ',')
            break;
        tokenizer.Consume();
    }
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a number, including ',' group separators and an optional decimal part.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The number token, or null when the cursor is not on a digit.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Must start on a digit.
    if (tokenizer.End() || !char.IsNumber(tokenizer.Current))
        return null;

    var First = tokenizer.Index;
    ConsumeNumbers(tokenizer);

    // A '.' followed by another digit (or a ',') marks a decimal portion.
    var Next = tokenizer.Peek(1);
    if (tokenizer.Current == '.' && (char.IsNumber(Next) || Next == ','))
    {
        tokenizer.Consume();
        ConsumeNumbers(tokenizer);
    }

    var Last = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = Last,
        StartPosition = First,
        TokenType = TokenType.Number,
        Value = new string(tokenizer.Slice(First, Last).ToArray())
    };
}
/// <summary>
/// Determines whether [is match implementation] [the specified tokenizer].
/// Matches a single symbol character registered in the Symbols table.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The symbol token, or null when the cursor is not on a known symbol.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Single dictionary lookup resolves both membership and the token type.
    if (tokenizer.End() || !Symbols.TryGetValue(tokenizer.Current, out var Kind))
        return null;

    var First = tokenizer.Index;
    var Symbol = tokenizer.Current;
    tokenizer.Consume();

    return new Token(
        tokenizer.Index - 1,
        First,
        Kind,
        Symbol.ToString())
    {
        ReplacementValue = "<SYM>"
    };
}
/// <summary>
/// Determines whether the next set of item on the stream matches this finder.
/// Returns an EOF token when the stream is exhausted; otherwise delegates to
/// IsMatchImpl under a snapshot so a failed match leaves the stream untouched.
/// </summary>
/// <param name="stream">The stream.</param>
/// <returns>The token, or null when this finder does not match.</returns>
public Token? IsMatch(TokenizableStream<char> stream)
{
    // An exhausted stream always yields an EOF token.
    if (stream.End())
    {
        return new Token(
            stream.Index,
            stream.Index,
            TokenType.EOF,
            string.Empty);
    }

    // Snapshot first: roll back on failure, commit on success.
    stream.TakeSnapshot();
    var Result = IsMatchImpl(stream);
    if (Result is null)
    {
        stream.RollbackSnapshot();
        return null;
    }
    stream.CommitSnapshot();
    return Result;
}
/// <summary>
/// Gets the next token or null if their isn't one.
/// </summary>
/// <param name="tokenizableStream">The tokenizable stream.</param>
/// <param name="tokenFinders">The token finders.</param>
/// <returns>The next token.</returns>
private static Token Next(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
{
    if (tokenizableStream.End())
        return null;

    // The first finder that produces a token wins; array order is priority.
    foreach (var Finder in tokenFinders)
    {
        var Match = Finder.IsMatch(tokenizableStream);
        if (Match != null)
            return Match;
    }
    return null;
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a simple email address: a letter-led local part, an '@', and a
/// domain containing at least one '.' that is followed by a letter.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The email token, or null when no email address starts at the cursor.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    // Local part: letters, digits, '-' and '.'.
    while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
    {
        tokenizer.Consume();
    }
    if (tokenizer.End() || tokenizer.Current != '@')
    {
        return null;
    }
    tokenizer.Consume();
    bool EmailFound = false;
    // Domain part: stop before a '.' that is not followed by a letter
    // (e.g. the sentence period in "mail me at a@b.com.").
    while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || char.IsNumber(tokenizer.Current) || tokenizer.Current == '-' || tokenizer.Current == '.'))
    {
        if (tokenizer.Current == '.' && !char.IsLetter(tokenizer.Peek(1)))
        {
            break;
        }
        EmailFound |= tokenizer.Current == '.';
        tokenizer.Consume();
    }
    // BUG FIX: the dot-in-domain flag was computed but never checked, so text
    // like "user@localhost" (no '.' in the domain) was accepted as an email.
    // Sibling finders (hash tag, username) check their equivalent flags.
    if (!EmailFound)
    {
        return null;
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Email,
        Result);
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a URL with an ftp/sftp/http/https scheme.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The URL token, or null when no URL starts at the cursor.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Require at least 8 remaining characters for the shortest recognizable URL.
    if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    var TempSlice = tokenizer.Slice(StartPosition, StartPosition + 7);
    // Cheap first-character screen: ftp/sftp/http(s) all start with one of these.
    if (TempSlice[0] != 'f' && TempSlice[0] != 'h' && TempSlice[0] != 's')
    {
        return null;
    }
    // Take everything up to the next whitespace as the candidate URL.
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Trailing periods are sentence punctuation, not part of the URL.
    Result = Result.StripRight(".");
    tokenizer.Index = StartPosition + Result.Length;
    // BUG FIX: UriKind.RelativeOrAbsolute deems nearly any whitespace-free
    // string "well formed", so ordinary words starting with f/h/s were being
    // tokenized as URLs. Also require a known scheme prefix, consistent with
    // the other URL finder in this file.
    if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute)
        || (!Result.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("https:", StringComparison.OrdinalIgnoreCase)))
    {
        return null;
    }
    EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Url,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches three or more periods, optionally separated by whitespace.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The ellipsis token, or null when no ellipsis starts at the cursor.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Must begin with a period.
    if (tokenizer.End() || tokenizer.Current != '.')
        return null;

    var First = tokenizer.Index;
    var Last = First;
    var DotCount = 0;
    var Collected = "";

    // Gather dots and interleaved whitespace; three dots make an ellipsis.
    while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
    {
        Collected += tokenizer.Current;
        if (tokenizer.Current == '.')
        {
            ++DotCount;
            // Last tracks the final '.' seen, never trailing whitespace.
            Last = tokenizer.Index;
        }
        tokenizer.Consume();
    }
    if (DotCount < 3)
        return null;

    return new Token
    {
        EndPosition = Last,
        StartPosition = First,
        TokenType = TokenType.Ellipsis,
        Value = Collected.Trim()
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Fallback finder: matches any run of non-whitespace characters.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token, or null when the cursor is on whitespace or at the end.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || char.IsWhiteSpace(tokenizer.Current))
        return null;

    var First = tokenizer.Index;
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current));

    var Last = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = Last,
        StartPosition = First,
        TokenType = TokenType.Other,
        Value = new string(tokenizer.Slice(First, Last).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a run of one or more newline characters ('\r' and/or '\n').
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The newline token, or null when the cursor is not on a newline.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Reject unless the cursor sits on a CR or LF character.
    if (tokenizer.End())
        return null;
    var Lead = tokenizer.Current;
    if (Lead != '\r' && Lead != '\n')
        return null;

    var First = tokenizer.Index;
    // Swallow the whole run of newline characters (covers \r\n, \n, repeats).
    do
    {
        tokenizer.Consume();
    }
    while (!tokenizer.End() && (tokenizer.Current == '\r' || tokenizer.Current == '\n'));

    var Last = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = Last,
        StartPosition = First,
        TokenType = TokenType.NewLine,
        Value = new string(tokenizer.Slice(First, Last).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a roman numeral and normalizes it to its numeric value.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The number token, or null when no valid roman numeral starts here.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Must start on a roman numeral character.
    if (tokenizer.End() || !RomanNumeralCharacters.Contains(tokenizer.Current))
        return null;

    var First = tokenizer.Index;
    ConsumeNumbers(tokenizer, RomanNumeralCharacters);

    // A letter immediately after the run means this is part of a larger word.
    if (!tokenizer.End() && char.IsLetter(tokenizer.Current))
        return null;

    var Last = tokenizer.Index - 1;
    var Text = new string(tokenizer.Slice(First, Last).ToArray());

    // A lone "I" is rejected (likely the English pronoun), as are invalid sequences.
    if (Text == "I" || !Validate(Text))
        return null;

    return new Token(
        Last,
        First,
        TokenType.Number,
        Text)
    {
        NormalizedValue = ConvertToNumber(Text),
        ReplacementValue = "<NUMBER>"
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches the single configured character.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token, or null when the cursor is not on the character.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || tokenizer.Current != Character)
    {
        return null;
    }
    var StartPos = tokenizer.Index;
    tokenizer.Consume();
    // BUG FIX: EndPosition is inclusive everywhere else in this tokenizer
    // (every other finder uses Index - 1 after consuming); the previous code
    // used Index, which pointed one character past the matched symbol.
    var EndPos = tokenizer.Index - 1;
    return new Token(
        EndPos,
        StartPos,
        TokenType,
        Character.ToString());
}
/// <summary>
/// Determines whether [is match implementation] [the specified tokenizer].
/// Matches a single symbol character registered in the Symbols table.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The symbol token, or null when the cursor is not on a known symbol.</returns>
protected override Token IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Single dictionary lookup resolves both membership and the token type.
    if (tokenizer.End() || !Symbols.TryGetValue(tokenizer.Current, out var Kind))
        return null;

    var First = tokenizer.Index;
    var Symbol = tokenizer.Current;
    tokenizer.Consume();

    return new Token
    {
        EndPosition = tokenizer.Index - 1,
        StartPosition = First,
        TokenType = Kind,
        Value = Symbol.ToString()
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an abbreviation in one of three forms: a dotted sequence with more
/// than one period ("e.g.", "U.S.A."), a 2-4 character all-caps run, or an
/// entry from the CommonAbbreviations list.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The abbreviation token, or null when no abbreviation starts here.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Must start on a letter.
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return (null);
    }
    var StartPosition = tokenizer.Index;
    bool CharacterFound = true;
    int PeriodCount = 0;
    // Alternate between letter runs and single periods ("U.S.A." style).
    // Each outer pass consumes one letter run plus at most one trailing '.';
    // the loop ends when a pass finds no letters.
    while (CharacterFound)
    {
        CharacterFound = false;
        while (!tokenizer.End() && (char.IsLetter(tokenizer.Current) || tokenizer.Current == '\'' || tokenizer.Current == '-'))
        {
            CharacterFound = true;
            tokenizer.Consume();
        }
        // NOTE(review): Current is read here without an End() guard — this
        // assumes the stream yields a non-'.' sentinel at the end; confirm
        // against TokenizableStream.Current.
        if (tokenizer.Current == '.' && CharacterFound)
        {
            tokenizer.Consume();
            ++PeriodCount;
        }
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Two or more periods is unambiguously a dotted abbreviation.
    if (PeriodCount > 1)
    {
        return (new Token(
            EndPosition,
            StartPosition,
            TokenType.Abbreviation,
            Result));
    }
    var UpperResult = Result.ToUpperInvariant();
    // Short all-caps runs (2-4 chars) are treated as acronyms.
    if (Result == UpperResult && Result.Length <= 4 && Result.Length > 1)
    {
        return (new Token(
            EndPosition,
            StartPosition,
            TokenType.Abbreviation,
            Result));
    }
    // Otherwise it must appear in the known-abbreviation list.
    if (!CommonAbbreviations.Any(x => x == UpperResult))
    {
        return (null);
    }
    return (new Token(
        EndPosition,
        StartPosition,
        TokenType.Abbreviation,
        Result));
}