/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches either the single Unicode ellipsis character ('…') or a run of three
/// periods, which may be separated by whitespace (e.g. ". . .").
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The ellipsis token, or null when no ellipsis starts at the current position.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || (tokenizer.Current != '.' && tokenizer.Current != '…'))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    var EndPosition = StartPosition;
    var Count = 0;
    var FoundEllipsis = false;
    if (tokenizer.Current == '…')
    {
        // The single-character ellipsis is a complete match on its own.
        FoundEllipsis = true;
        EndPosition = tokenizer.Index;
        tokenizer.Consume();
    }
    else
    {
        // Count periods, allowing whitespace between them; stop as soon as
        // the third period is seen so trailing text is left for other finders.
        while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
        {
            if (tokenizer.Current == '.')
            {
                ++Count;
                FoundEllipsis |= Count >= 3;
                EndPosition = tokenizer.Index;
                if (FoundEllipsis)
                {
                    tokenizer.Consume();
                    break;
                }
            }
            tokenizer.Consume();
        }
    }
    if (!FoundEllipsis)
    {
        return null;
    }
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Ellipsis,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a URL: a whitespace-delimited run of characters (minus trailing periods)
/// that is a well-formed URI starting with ftp:, sftp:, http:, https:, or www.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The URL token, or null when the current position does not start a URL.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Require at least 8 remaining characters — the shortest accepted prefix
    // plus some host text (e.g. "http://x").
    if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    // Cheap first-character screen before doing any real work: all accepted
    // prefixes (ftp/sftp/http/https/www) start with one of these letters.
    var FirstCharacter = tokenizer.Current;
    if (FirstCharacter != 'f' && FirstCharacter != 'h' && FirstCharacter != 's' && FirstCharacter != 'w')
    {
        return null;
    }
    // Take everything up to the next whitespace as the URL candidate.
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Drop sentence-ending periods and rewind the stream so they can be
    // tokenized separately.
    Result = Result.StripRight(".");
    tokenizer.Index = StartPosition + Result.Length;
    // NOTE(review): when validation below fails, the tokenizer index has
    // already been advanced — presumably the caller resets it on a null
    // result; confirm against the dispatching code.
    if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute)
        || (!Result.StartsWith("ftp:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("sftp:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("https:", StringComparison.OrdinalIgnoreCase)
            && !Result.StartsWith("www.", StringComparison.OrdinalIgnoreCase)))
    {
        return null;
    }
    // The index now sits just past Result, so Result is exactly the token
    // text — no need to slice the stream a second time.
    return new Token(
        tokenizer.Index - 1,
        StartPosition,
        TokenType.Url,
        Result);
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches exactly one occurrence of the configured character.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token for the configured character, or null when it is not next in the stream.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Guard clause: bail out unless the configured character is next.
    if (tokenizer.End() || tokenizer.Current != Character)
    {
        return null;
    }
    var StartPos = tokenizer.Index;
    tokenizer.Consume();
    return new Token(
        tokenizer.Index,
        StartPos,
        TokenType,
        Character.ToString());
}
/// <summary>
/// Gets the tokens.
/// </summary>
/// <param name="tokenizableStream">The tokenizable stream.</param>
/// <param name="tokenFinders">The token finders.</param>
/// <returns>The tokens, terminated by an EOF token.</returns>
private IEnumerable<Token> GetTokens(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
{
    // Pull tokens until the finders stop producing them.
    for (var NextToken = Next(tokenizableStream, tokenFinders);
         NextToken != null;
         NextToken = Next(tokenizableStream, tokenFinders))
    {
        yield return NextToken;
    }
    // Always close the stream with an explicit EOF marker.
    yield return new Token(
        tokenizableStream.Index,
        tokenizableStream.Index,
        TokenType.EOF,
        string.Empty);
}
/// <summary>
/// Gets the tokens.
/// </summary>
/// <param name="tokenizableStream">The tokenizable stream.</param>
/// <param name="tokenFinders">The token finders.</param>
/// <returns>The tokens, terminated by an EOF token.</returns>
private IEnumerable<Token> GetTokens(TokenizableStream<char> tokenizableStream, IEnglishTokenFinder[] tokenFinders)
{
    // Keep yielding until the finders stop producing tokens.
    Token CurrentToken;
    while ((CurrentToken = Next(tokenizableStream, tokenFinders)) != null)
    {
        yield return CurrentToken;
    }
    // Always close the stream with an explicit EOF marker.
    yield return new Token
    {
        EndPosition = tokenizableStream.Index,
        StartPosition = tokenizableStream.Index,
        TokenType = TokenType.EOF,
        Value = string.Empty
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an email address of the form local-part@domain, where the domain
/// must contain at least one period.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The email token, or null when the current position does not start an email address.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    // Consume the local part: letters, digits, '-' and '.'.
    while (!tokenizer.End()
        && (char.IsLetter(tokenizer.Current)
            || char.IsNumber(tokenizer.Current)
            || tokenizer.Current == '-'
            || tokenizer.Current == '.'))
    {
        tokenizer.Consume();
    }
    // Guard against reading Current when the stream ended inside the local part.
    if (tokenizer.End() || tokenizer.Current != '@')
    {
        return null;
    }
    tokenizer.Consume();
    bool EmailFound = false;
    // Consume the domain; stop at a '.' that is not followed by a letter so a
    // sentence-ending period is not swallowed into the token.
    while (!tokenizer.End()
        && (char.IsLetter(tokenizer.Current)
            || char.IsNumber(tokenizer.Current)
            || tokenizer.Current == '-'
            || tokenizer.Current == '.'))
    {
        if (tokenizer.Current == '.' && !char.IsLetter(tokenizer.Peek(1)))
        {
            break;
        }
        EmailFound |= tokenizer.Current == '.';
        tokenizer.Consume();
    }
    // A domain with no period (e.g. "user@host") is not a valid address.
    if (!EmailFound)
    {
        return null;
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Email,
        Result);
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a URL: a whitespace-delimited run of characters (minus trailing
/// periods) that forms a well-formed URI.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The URL token, or null when the current position does not start a URL.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Require at least 8 remaining characters (e.g. "http://x").
    if (tokenizer.End() || tokenizer.Index + 8 > tokenizer.Length)
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    // Cheap first-character screen (ftp/http/https/sftp all start with one of
    // these) before doing any real work.
    var FirstCharacter = tokenizer.Current;
    if (FirstCharacter != 'f' && FirstCharacter != 'h' && FirstCharacter != 's')
    {
        return null;
    }
    // Take everything up to the next whitespace as the URL candidate.
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    // Drop sentence-ending periods and rewind the stream so they can be
    // tokenized separately.
    Result = Result.StripRight(".");
    tokenizer.Index = StartPosition + Result.Length;
    // NOTE(review): RelativeOrAbsolute accepts many plain words; the sibling
    // finder also checks the scheme prefix — confirm whether that is wanted here.
    if (!Uri.IsWellFormedUriString(Result, UriKind.RelativeOrAbsolute))
    {
        return null;
    }
    // The index now sits just past Result, so Result is exactly the token
    // text — no need to slice the stream a second time.
    return new Token
    {
        EndPosition = tokenizer.Index - 1,
        StartPosition = StartPosition,
        TokenType = TokenType.Url,
        Value = Result
    };
}
/// <summary>
/// Determines whether [is match implementation] [the specified tokenizer].
/// Matches a single character that appears in the Symbols lookup table.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The symbol token, or null when the current character is not a known symbol.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    // Single lookup via TryGetValue instead of ContainsKey + indexer.
    if (tokenizer.End() || !Symbols.TryGetValue(tokenizer.Current, out var SymbolType))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    var Value = tokenizer.Current;
    tokenizer.Consume();
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = SymbolType,
        Value = Value.ToString()
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an ellipsis: three or more periods, which may be separated by
/// whitespace (e.g. ". . .").
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The ellipsis token, or null when no ellipsis starts at the current position.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || tokenizer.Current != '.')
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    var EndPosition = StartPosition;
    var Count = 0;
    var FoundEllipsis = false;
    string FinalValue = "";
    // Accumulate periods and interior whitespace; EndPosition tracks the last
    // period seen so trailing whitespace is excluded from the span.
    while (!tokenizer.End() && (tokenizer.Current == '.' || char.IsWhiteSpace(tokenizer.Current)))
    {
        FinalValue += tokenizer.Current;
        if (tokenizer.Current == '.')
        {
            ++Count;
            FoundEllipsis |= Count >= 3;
            EndPosition = tokenizer.Index;
        }
        tokenizer.Consume();
    }
    if (!FoundEllipsis)
    {
        return null;
    }
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Ellipsis,
        // Trim already returns a string; no need to re-wrap it in new string(...).
        Value = FinalValue.Trim()
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Catch-all matcher: takes any run of non-whitespace characters as an
/// "Other" token.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The token, or null when the stream has ended or the current character is whitespace.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || char.IsWhiteSpace(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    while (!tokenizer.End() && !char.IsWhiteSpace(tokenizer.Current))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Other,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a word: starts with a letter and continues through letters,
/// apostrophes, and hyphens (e.g. "don't", "well-known").
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The word token, or null when the current position does not start a word.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    while (!tokenizer.End()
        && (char.IsLetter(tokenizer.Current)
            || tokenizer.Current == '\''
            || tokenizer.Current == '-'))
    {
        tokenizer.Consume();
    }
    var EndPosition = tokenizer.Index - 1;
    return new Token
    {
        EndPosition = EndPosition,
        StartPosition = StartPosition,
        TokenType = TokenType.Word,
        Value = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray())
    };
}
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches a Roman numeral (e.g. "XIV"), rejecting the bare "I" and any run
/// that continues into further letters or fails validation.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The number token, or null when no Roman numeral starts at the current position.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !RomanNumeralCharacters.Contains(tokenizer.Current))
    {
        return null;
    }
    var FirstIndex = tokenizer.Index;
    ConsumeNumbers(tokenizer, RomanNumeralCharacters);
    // If the run continues into other letters it is a word, not a numeral.
    if (!tokenizer.End() && char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var LastIndex = tokenizer.Index - 1;
    var NumeralText = new string(tokenizer.Slice(FirstIndex, LastIndex).ToArray());
    // A lone "I" is almost always the pronoun, and anything that fails
    // validation is not a numeral at all.
    if (NumeralText == "I" || !Validate(NumeralText))
    {
        return null;
    }
    return new Token(
        LastIndex,
        FirstIndex,
        TokenType.Number,
        NumeralText)
    {
        NormalizedValue = ConvertToNumber(NumeralText),
        ReplacementValue = "<NUMBER>"
    };
}
/// <summary>
/// Tokenizes the specified text.
/// </summary>
/// <param name="text">The text.</param>
/// <returns>The tokenized version of the text.</returns>
public Token[] Tokenize(TokenizableStream<char> text) =>
    GetTokens(text, TokenFinders.OrderBy(x => x.Order).ToArray()).ToArray();
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Each finder examines the stream at its current position and either consumes
/// the matched characters and returns a token, or returns null to let the next
/// finder try.
/// </summary>
/// <param name="tokenizer">The tokenizer positioned at the candidate match.</param>
/// <returns>The token, or null when this finder does not match at the current position.</returns>
protected abstract Token?IsMatchImpl(TokenizableStream <char> tokenizer);
/// <summary>
/// The actual implementation of the IsMatch done by the individual classes.
/// Matches an abbreviation: either a dotted form with more than one period
/// (e.g. "e.g."), a short all-caps word of 2-4 letters (e.g. "NASA"), or an
/// entry in the CommonAbbreviations list.
/// </summary>
/// <param name="tokenizer">The tokenizer.</param>
/// <returns>The abbreviation token, or null when no abbreviation starts at the current position.</returns>
protected override Token? IsMatchImpl(TokenizableStream<char> tokenizer)
{
    if (tokenizer.End() || !char.IsLetter(tokenizer.Current))
    {
        return null;
    }
    var StartPosition = tokenizer.Index;
    bool CharacterFound = true;
    int PeriodCount = 0;
    // Alternate letter runs and single periods ("U.S.A" style); stop when a
    // letter run is not followed by a period.
    while (CharacterFound)
    {
        CharacterFound = false;
        while (!tokenizer.End()
            && (char.IsLetter(tokenizer.Current)
                || tokenizer.Current == '\''
                || tokenizer.Current == '-'))
        {
            CharacterFound = true;
            tokenizer.Consume();
        }
        // Guard against reading Current when the letters ran to the end of
        // the stream.
        if (!tokenizer.End() && tokenizer.Current == '.' && CharacterFound)
        {
            tokenizer.Consume();
            ++PeriodCount;
        }
    }
    var EndPosition = tokenizer.Index - 1;
    var Result = new string(tokenizer.Slice(StartPosition, EndPosition).ToArray());
    var UpperResult = Result.ToUpperInvariant();
    // An abbreviation is: dotted with 2+ periods, OR short all-caps (2-4
    // letters), OR a known common abbreviation. One construction site
    // replaces the original three identical returns.
    var IsDotted = PeriodCount > 1;
    var IsShortAllCaps = Result == UpperResult && Result.Length <= 4 && Result.Length > 1;
    if (!IsDotted && !IsShortAllCaps && !CommonAbbreviations.Any(x => x == UpperResult))
    {
        return null;
    }
    return new Token(
        EndPosition,
        StartPosition,
        TokenType.Abbreviation,
        Result);
}