/// <summary>
/// Handles the <c>ReadText</c> state. Emits either the tag-opening identifier found at the
/// current index, or the run of text up to the next <c>StartTagIdentifier</c>.
/// </summary>
/// <param name="tokenizerArg">
/// Tokenizer state. Tokens are appended to its <c>Output</c>; its <c>Status</c> is changed only
/// when a tag-opening identifier is found at the current index.
/// </param>
/// <returns>
/// The index at which tokenizing should resume, or -1 when the remainder of the input was
/// emitted as text because no further <c>StartTagIdentifier</c> exists.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is null.</exception>
private static int ReadTextNode(TokenizerArgument tokenizerArg)
{
    if (tokenizerArg == null)
    {
        throw new ArgumentNullException(nameof(tokenizerArg));
    }

    var input = tokenizerArg.Input;
    var index = tokenizerArg.Index;
    var startTagIdentifierLength = StartTagIdentifier.Length;
    var startTagIdentifierVariation1Length = StartTagIdentifierVariation1.Length;

    // Variation1 is tested before the plain identifier (presumably "</" vs "<" - confirm against
    // the constant definitions). The bounds check is inclusive so that an identifier sitting at
    // the very end of the input is still recognised.
    if (index + startTagIdentifierVariation1Length <= input.Length
        && string.CompareOrdinal(input, index, StartTagIdentifierVariation1, 0, startTagIdentifierVariation1Length) == 0)
    {
        tokenizerArg.Output.Add(StartTagIdentifierVariation1);
        tokenizerArg.Status = ParseStatus.ReadEndTag;
        return index + startTagIdentifierVariation1Length;
    }

    // Bounds-checked so a short tail of input can never make the comparison throw.
    if (index + startTagIdentifierLength <= input.Length
        && string.CompareOrdinal(input, index, StartTagIdentifier, 0, startTagIdentifierLength) == 0)
    {
        tokenizerArg.Output.Add(StartTagIdentifier);
        tokenizerArg.Status = ParseStatus.ReadStartTag;
        return index + startTagIdentifierLength;
    }

    // Plain text: emit everything up to the next tag opener. The search is ordinal so that it
    // agrees with the ordinal comparisons above; a case-insensitive hit that the checks above
    // then reject would leave the tokenizer stuck on the same index.
    var nextIndex = input.IndexOf(StartTagIdentifier, index, StringComparison.Ordinal);
    tokenizerArg.Output.Add(
        nextIndex == -1
            ? input.Substring(index)
            : input.Substring(index, nextIndex - index));

    // -1 is passed through deliberately; it signals that the input is exhausted.
    return nextIndex;
}
/// <summary>
/// Handles the <c>ReadEndTag</c> state. Emits the tag name, then the <c>EndTagIdentifier</c>
/// if it is the next non-whitespace content.
/// </summary>
/// <param name="tokenizerArg">
/// Tokenizer state. Tokens are appended to its <c>Output</c>; its <c>Status</c> becomes
/// <c>ReadText</c> only when the <c>EndTagIdentifier</c> is consumed.
/// </param>
/// <returns>The index at which tokenizing should resume.</returns>
/// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is null.</exception>
private static int ReadEndTag(TokenizerArgument tokenizerArg)
{
    if (tokenizerArg == null)
    {
        throw new ArgumentNullException(nameof(tokenizerArg));
    }

    // The tag name runs from the first non-whitespace character to wherever
    // GetIndexOfNextTagIdentifier stops.
    var nameStart = GetIndexOfNextNonWhitespaceChar(tokenizerArg.Input, tokenizerArg.Index);
    var nameEnd = GetIndexOfNextTagIdentifier(tokenizerArg.Input, nameStart, EndTagIdentifier);
    tokenizerArg.Output.Add(tokenizerArg.Input.Substring(nameStart, nameEnd - nameStart));

    var position = GetIndexOfNextNonWhitespaceChar(tokenizerArg.Input, nameEnd);
    var closerLength = EndTagIdentifier.Length;

    // Nothing left to inspect: stay in the current state.
    if (position >= tokenizerArg.Input.Length)
    {
        return position;
    }

    var candidate = tokenizerArg.Input.Substring(position, closerLength);
    if (!candidate.Equals(EndTagIdentifier))
    {
        return position;
    }

    tokenizerArg.Output.Add(EndTagIdentifier);
    tokenizerArg.Status = ParseStatus.ReadText;
    return position + closerLength;
}
/// <summary>
/// Handles the <c>ReadAttributeValue</c> state. Emits the attribute value (quoted or unquoted)
/// and then consumes a tag-closing identifier if one follows immediately.
/// </summary>
/// <param name="tokenizerArg">
/// Tokenizer state. Tokens are appended to its <c>Output</c>; its <c>Status</c> becomes
/// <c>ReadText</c> when a tag-closing identifier is consumed and <c>ReadAttributeName</c> otherwise.
/// </param>
/// <returns>The index at which tokenizing should resume.</returns>
/// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is null.</exception>
private static int ReadAttributeValue(TokenizerArgument tokenizerArg)
{
    if (tokenizerArg == null)
    {
        throw new ArgumentNullException(nameof(tokenizerArg));
    }

    var input = tokenizerArg.Input;
    var index = GetIndexOfNextNonWhitespaceChar(input, tokenizerArg.Index);
    var valueStartIndex = index;

    // Inspect the first character of the value to decide between quoted and unquoted parsing.
    var firstCharacter = index < input.Length ? input.Substring(index, 1) : string.Empty;
    var isDoubleQuotes = firstCharacter.Equals(DoubleQuotesIdentifier);
    var isSingleQuotes = firstCharacter.Equals(SingleQuotesIdentifier);

    if (isDoubleQuotes || isSingleQuotes)
    {
        index = GetIndexOfNextDelimiter(
            input,
            index + 1,
            isDoubleQuotes ? DoubleQuotesIdentifier : SingleQuotesIdentifier);

        // The "- 2" removes the opening and closing quote, which presumes the helper returns the
        // index just past the closing quote (TODO confirm against GetIndexOfNextDelimiter).
        // The length is clamped so that a quote with nothing after it (e.g. input ending in
        // an opening quote) yields an empty value instead of a negative Substring length.
        var valueLength = Math.Max(0, index - valueStartIndex - 2);
        tokenizerArg.Output.Add(input.Substring(valueStartIndex + 1, valueLength));
    }
    else
    {
        index = GetIndexOfNextTagIdentifier(input, index, EndTagIdentifierVariation1);
        tokenizerArg.Output.Add(input.Substring(valueStartIndex, index - valueStartIndex));
        index = GetIndexOfNextNonWhitespaceChar(input, index);
    }

    // Default continuation; overridden below when the tag is closed right after the value.
    tokenizerArg.Status = ParseStatus.ReadAttributeName;

    var endTagIdentifierLength = EndTagIdentifier.Length;
    var endTagIdentifierVariation1Length = EndTagIdentifierVariation1.Length;

    // Bounds are derived from the identifier lengths rather than a hard-coded offset, so the
    // comparisons cannot run past the end of the input.
    if (index + endTagIdentifierVariation1Length <= input.Length
        && string.CompareOrdinal(input, index, EndTagIdentifierVariation1, 0, endTagIdentifierVariation1Length) == 0)
    {
        tokenizerArg.Output.Add(EndTagIdentifierVariation1);
        tokenizerArg.Status = ParseStatus.ReadText;
        index += endTagIdentifierVariation1Length;
    }
    else if (index + endTagIdentifierLength <= input.Length
        && string.CompareOrdinal(input, index, EndTagIdentifier, 0, endTagIdentifierLength) == 0)
    {
        tokenizerArg.Output.Add(EndTagIdentifier);
        tokenizerArg.Status = ParseStatus.ReadText;
        index += endTagIdentifierLength;
    }

    return index;
}
/// <summary>
/// Handles the <c>ReadAttributeName</c> state. Emits the attribute name and then consumes
/// whichever recognised identifier follows it, if any.
/// </summary>
/// <param name="tokenizerArg">
/// Tokenizer state. Tokens are appended to its <c>Output</c>; its <c>Status</c> becomes
/// <c>ReadText</c> after a tag-closing identifier and <c>ReadAttributeValue</c> after an
/// <c>EqualityIdentifier</c>, and is otherwise left unchanged.
/// </param>
/// <returns>The index at which tokenizing should resume.</returns>
/// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is null.</exception>
private static int ReadAttributeName(TokenizerArgument tokenizerArg)
{
    if (tokenizerArg == null)
    {
        throw new ArgumentNullException(nameof(tokenizerArg));
    }

    var source = tokenizerArg.Input;

    // The name spans from the first non-whitespace character to wherever
    // GetIndexOfNextTagIdentifier stops.
    var nameStart = GetIndexOfNextNonWhitespaceChar(source, tokenizerArg.Index);
    var nameEnd = GetIndexOfNextTagIdentifier(source, nameStart, EndTagIdentifierVariation1 + EqualityIdentifier);
    tokenizerArg.Output.Add(source.Substring(nameStart, nameEnd - nameStart));

    var position = GetIndexOfNextNonWhitespaceChar(source, nameEnd);

    var closeLength = EndTagIdentifier.Length;
    var selfCloseLength = EndTagIdentifierVariation1.Length;
    var equalityLength = EqualityIdentifier.Length;
    var slashLength = SlashIdentifier.Length;

    // Variation1 is checked first and needs at least one more character after the current one.
    if (position + 1 < source.Length
        && string.Equals(source.Substring(position, selfCloseLength), EndTagIdentifierVariation1))
    {
        tokenizerArg.Output.Add(EndTagIdentifierVariation1);
        tokenizerArg.Status = ParseStatus.ReadText;
        return position + selfCloseLength;
    }

    // Every remaining check requires at least one character to look at.
    if (position >= source.Length)
    {
        return position;
    }

    if (string.Equals(source.Substring(position, closeLength), EndTagIdentifier))
    {
        tokenizerArg.Output.Add(EndTagIdentifier);
        tokenizerArg.Status = ParseStatus.ReadText;
        return position + closeLength;
    }

    if (string.Equals(source.Substring(position, equalityLength), EqualityIdentifier))
    {
        tokenizerArg.Output.Add(EqualityIdentifier);
        tokenizerArg.Status = ParseStatus.ReadAttributeValue;
        return position + equalityLength;
    }

    // A lone slash is skipped without emitting a token or changing state.
    if (string.Equals(source.Substring(position, slashLength), SlashIdentifier))
    {
        return position + slashLength;
    }

    return position;
}
/// <summary>
/// Tokenises the HTML input string by repeatedly dispatching to the processor registered
/// for the current <see cref="ParseStatus"/>.
/// </summary>
/// <param name="input">The HTML text to tokenise. Null or whitespace-only input yields no tokens.</param>
/// <returns>
/// The tokens in the order they were produced. Tokenising stops when the end of the input is
/// reached, when a processor returns a negative index, when the current status has no registered
/// processor, or when a processor makes no progress.
/// </returns>
private StringCollection GetTokens(string input)
{
    var tokens = new StringCollection();
    if (input.IsNullOrWhiteSpace())
    {
        return tokens;
    }

    var nodeProcessorMappings = new Dictionary<ParseStatus, ProcessNode>
    {
        [ParseStatus.ReadText] = ReadTextNode,
        [ParseStatus.ReadStartTag] = ReadStartTag,
        [ParseStatus.ReadEndTag] = ReadEndTag,
        [ParseStatus.ReadAttributeName] = ReadAttributeName,
        [ParseStatus.ReadAttributeValue] = ReadAttributeValue
    };

    var index = 0;
    var status = ParseStatus.ReadText;
    while (index < input.Length)
    {
        ProcessNode processor;
        if (!nodeProcessorMappings.TryGetValue(status, out processor))
        {
            // No processor for this status: nothing can advance the index, so continuing
            // would spin forever. Return what has been collected so far.
            break;
        }

        var tokenizerArg = new TokenizerArgument(input, index, status, tokens);
        var nextIndex = processor.Invoke(tokenizerArg);
        var nextStatus = tokenizerArg.Status;

        // A negative index (-1 from ReadTextNode) signals that the input is exhausted.
        if (nextIndex < 0)
        {
            break;
        }

        // Same position and same status means the next iteration would repeat this one
        // exactly, appending tokens without end. Stop instead of looping forever.
        if (nextIndex == index && nextStatus == status)
        {
            break;
        }

        index = nextIndex;
        status = nextStatus;
    }

    return tokens;
}