Пример #1
0
        /// <summary>
        /// Processes the input at the argument's current index while the tokenizer is reading text.
        /// </summary>
        /// <remarks>
        /// Exactly one token is appended to <c>Output</c> per call:
        /// <list type="bullet">
        /// <item><description>
        /// <c>StartTagIdentifierVariation1</c> when the input matches it at the index; the status becomes
        /// <c>ParseStatus.ReadEndTag</c>.
        /// </description></item>
        /// <item><description>
        /// <c>StartTagIdentifier</c> when the input matches it at the index; the status becomes
        /// <c>ParseStatus.ReadStartTag</c>.
        /// </description></item>
        /// <item><description>
        /// Otherwise the text running up to the next <c>StartTagIdentifier</c> (or to the end of the input
        /// when there is none); the status is left unchanged.
        /// </description></item>
        /// </list>
        /// </remarks>
        /// <param name="tokenizerArg">Supplies the input, the index to read from, the token list and the status to update.</param>
        /// <returns>
        /// The index just past a matched identifier, or the index of the next <c>StartTagIdentifier</c> after a
        /// run of text; <c>-1</c> when no further <c>StartTagIdentifier</c> exists in the input.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is <c>null</c>.</exception>
        private static int ReadTextNode(TokenizerArgument tokenizerArg)
        {
            if (tokenizerArg == null)
            {
                throw new ArgumentNullException(nameof(tokenizerArg));
            }

            var input = tokenizerArg.Input;
            var index = tokenizerArg.Index;

            // The variation is tested before the plain identifier, as in the original ordering
            // (presumably because the plain identifier is a prefix of the variation - confirm against the constants).
            // CompareOrdinal compares at most the requested number of characters and simply reports a
            // mismatch when fewer remain, so the variation is now also recognised when it is the very
            // last thing in the input (the previous "index + length < input.Length" guard rejected that).
            if (string.CompareOrdinal(input, index, StartTagIdentifierVariation1, 0, StartTagIdentifierVariation1.Length) == 0)
            {
                tokenizerArg.Output.Add(StartTagIdentifierVariation1);
                tokenizerArg.Status = ParseStatus.ReadEndTag;

                return index + StartTagIdentifierVariation1.Length;
            }

            // Same range comparison here: no Substring call that could run past the end of the input.
            if (string.CompareOrdinal(input, index, StartTagIdentifier, 0, StartTagIdentifier.Length) == 0)
            {
                tokenizerArg.Output.Add(StartTagIdentifier);
                tokenizerArg.Status = ParseStatus.ReadStartTag;

                return index + StartTagIdentifier.Length;
            }

            // Plain text: take everything up to the next start identifier. The search is ordinal so it
            // agrees with the ordinal match tests above and cannot report a match at the current index.
            var nextIndex = input.IndexOf(StartTagIdentifier, index, StringComparison.Ordinal);

            tokenizerArg.Output.Add(nextIndex == -1
                ? input.Substring(index)
                : input.Substring(index, nextIndex - index));

            // -1 is passed through unchanged when the rest of the input was plain text.
            return nextIndex;
        }
Пример #2
0
        /// <summary>
        /// Reads the name of an end tag and, if it is followed by <c>EndTagIdentifier</c>, consumes that too.
        /// </summary>
        /// <remarks>
        /// The tag name is always appended to <c>Output</c>. <c>EndTagIdentifier</c> is appended and the status
        /// switched to <c>ParseStatus.ReadText</c> only when the identifier is found at the position reported by
        /// <c>GetIndexOfNextNonWhitespaceChar</c> after the name.
        /// </remarks>
        /// <param name="tokenizerArg">Supplies the input, the index to read from, the token list and the status to update.</param>
        /// <returns>The index at which the next read should start.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is <c>null</c>.</exception>
        private static int ReadEndTag(TokenizerArgument tokenizerArg)
        {
            if (tokenizerArg == null)
            {
                throw new ArgumentNullException(nameof(tokenizerArg));
            }

            var text = tokenizerArg.Input;

            // Locate the tag name between the leading whitespace and the next tag identifier.
            var nameStart = GetIndexOfNextNonWhitespaceChar(text, tokenizerArg.Index);
            var nameEnd   = GetIndexOfNextTagIdentifier(text, nameStart, EndTagIdentifier);

            tokenizerArg.Output.Add(text.Substring(nameStart, nameEnd - nameStart));

            var cursor      = GetIndexOfNextNonWhitespaceChar(text, nameEnd);
            var closeLength = EndTagIdentifier.Length;

            // Nothing left to inspect: the status stays as it was.
            if (cursor >= text.Length)
            {
                return cursor;
            }

            // Something other than the closing identifier follows the name: leave it for the next read.
            if (!text.Substring(cursor, closeLength).Equals(EndTagIdentifier))
            {
                return cursor;
            }

            tokenizerArg.Output.Add(EndTagIdentifier);
            tokenizerArg.Status = ParseStatus.ReadText;

            return cursor + closeLength;
        }
Пример #3
0
        /// <summary>
        /// Reads an attribute value, quoted or unquoted, and then any tag terminator that follows it.
        /// </summary>
        /// <remarks>
        /// The value is appended to <c>Output</c> and the status is set to <c>ParseStatus.ReadAttributeName</c>.
        /// If <c>EndTagIdentifierVariation1</c> or <c>EndTagIdentifier</c> is found at the resulting position it
        /// is appended as well and the status becomes <c>ParseStatus.ReadText</c>.
        /// </remarks>
        /// <param name="tokenizerArg">Supplies the input, the index to read from, the token list and the status to update.</param>
        /// <returns>The index at which the next read should start.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is <c>null</c>.</exception>
        private static int ReadAttributeValue(TokenizerArgument tokenizerArg)
        {
            if (tokenizerArg == null)
            {
                throw new ArgumentNullException(nameof(tokenizerArg));
            }

            var text       = tokenizerArg.Input;
            var valueStart = GetIndexOfNextNonWhitespaceChar(text, tokenizerArg.Index);

            // The first character of the value decides whether it is quoted.
            var opener = string.Empty;
            if (valueStart < text.Length)
            {
                opener = text.Substring(valueStart, 1);
            }

            var openedWithDouble = opener.Equals(DoubleQuotesIdentifier);
            var openedWithSingle = opener.Equals(SingleQuotesIdentifier);

            int cursor;

            if (!openedWithDouble && !openedWithSingle)
            {
                // Unquoted value: it runs up to the position reported by GetIndexOfNextTagIdentifier.
                cursor = GetIndexOfNextTagIdentifier(text, valueStart, EndTagIdentifierVariation1);
                tokenizerArg.Output.Add(text.Substring(valueStart, cursor - valueStart));
                cursor = GetIndexOfNextNonWhitespaceChar(text, cursor);
            }
            else
            {
                // Quoted value: search from just after the opening quote for the same kind of quote.
                // NOTE(review): the "- 2" assumes GetIndexOfNextDelimiter returns the index after the
                // closing quote - confirm against that helper.
                var quote = openedWithDouble ? DoubleQuotesIdentifier : SingleQuotesIdentifier;

                cursor = GetIndexOfNextDelimiter(text, valueStart + 1, quote);
                tokenizerArg.Output.Add(text.Substring(valueStart + 1, cursor - valueStart - 2));
            }

            tokenizerArg.Status = ParseStatus.ReadAttributeName;

            var closeLength     = EndTagIdentifier.Length;
            var selfCloseLength = EndTagIdentifierVariation1.Length;

            // End of input: no terminator can follow.
            if (cursor >= text.Length)
            {
                return cursor;
            }

            if (cursor + 1 < text.Length &&
                text.Substring(cursor, selfCloseLength).Equals(EndTagIdentifierVariation1))
            {
                tokenizerArg.Output.Add(EndTagIdentifierVariation1);
                tokenizerArg.Status = ParseStatus.ReadText;

                return cursor + selfCloseLength;
            }

            if (text.Substring(cursor, closeLength).Equals(EndTagIdentifier))
            {
                tokenizerArg.Output.Add(EndTagIdentifier);
                tokenizerArg.Status = ParseStatus.ReadText;

                return cursor + closeLength;
            }

            return cursor;
        }
Пример #4
0
        /// <summary>
        /// Reads an attribute name and then whichever recognised identifier follows it.
        /// </summary>
        /// <remarks>
        /// The attribute name is always appended to <c>Output</c>. What follows decides the rest:
        /// <list type="bullet">
        /// <item><description>
        /// <c>EndTagIdentifierVariation1</c> or <c>EndTagIdentifier</c>: the identifier is appended and the
        /// status becomes <c>ParseStatus.ReadText</c>.
        /// </description></item>
        /// <item><description>
        /// <c>EqualityIdentifier</c>: it is appended and the status becomes <c>ParseStatus.ReadAttributeValue</c>.
        /// </description></item>
        /// <item><description>
        /// <c>SlashIdentifier</c>: it is skipped without adding a token or changing the status.
        /// </description></item>
        /// <item><description>
        /// Anything else, or end of input: nothing further is consumed.
        /// </description></item>
        /// </list>
        /// </remarks>
        /// <param name="tokenizerArg">Supplies the input, the index to read from, the token list and the status to update.</param>
        /// <returns>The index at which the next read should start.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="tokenizerArg"/> is <c>null</c>.</exception>
        private static int ReadAttributeName(TokenizerArgument tokenizerArg)
        {
            if (tokenizerArg == null)
            {
                throw new ArgumentNullException(nameof(tokenizerArg));
            }

            var text = tokenizerArg.Input;

            // The name runs from the first non-whitespace position to the position reported by
            // GetIndexOfNextTagIdentifier for the combined terminator/equality identifiers.
            var nameStart = GetIndexOfNextNonWhitespaceChar(text, tokenizerArg.Index);
            var nameEnd   = GetIndexOfNextTagIdentifier(text, nameStart, EndTagIdentifierVariation1 + EqualityIdentifier);

            tokenizerArg.Output.Add(text.Substring(nameStart, nameEnd - nameStart));

            var cursor = GetIndexOfNextNonWhitespaceChar(text, nameEnd);

            var closeLength     = EndTagIdentifier.Length;
            var selfCloseLength = EndTagIdentifierVariation1.Length;
            var equalsLength    = EqualityIdentifier.Length;
            var slashLength     = SlashIdentifier.Length;

            // End of input: none of the identifiers can follow.
            if (cursor >= text.Length)
            {
                return cursor;
            }

            if (cursor + 1 < text.Length &&
                text.Substring(cursor, selfCloseLength).Equals(EndTagIdentifierVariation1))
            {
                tokenizerArg.Output.Add(EndTagIdentifierVariation1);
                tokenizerArg.Status = ParseStatus.ReadText;

                return cursor + selfCloseLength;
            }

            if (text.Substring(cursor, closeLength).Equals(EndTagIdentifier))
            {
                tokenizerArg.Output.Add(EndTagIdentifier);
                tokenizerArg.Status = ParseStatus.ReadText;

                return cursor + closeLength;
            }

            if (text.Substring(cursor, equalsLength).Equals(EqualityIdentifier))
            {
                tokenizerArg.Output.Add(EqualityIdentifier);
                tokenizerArg.Status = ParseStatus.ReadAttributeValue;

                return cursor + equalsLength;
            }

            if (text.Substring(cursor, slashLength).Equals(SlashIdentifier))
            {
                // A lone slash is stepped over; no token is emitted and the status is kept.
                return cursor + slashLength;
            }

            return cursor;
        }
Пример #5
0
        /// <summary>
        /// This will tokenise the HTML input string.
        /// </summary>
        /// <remarks>
        /// Tokenising starts at index 0 in the <c>ParseStatus.ReadText</c> state. On every step the processor
        /// registered for the current status is invoked; it appends tokens, returns the next index and may
        /// change the status. The loop stops when the index reaches the end of the input, when a processor
        /// returns <c>-1</c>, or when the current status has no registered processor.
        /// </remarks>
        /// <param name="input">The text to tokenise.</param>
        /// <returns>
        /// The tokens in the order the processors appended them; an empty collection when
        /// <c>input.IsNullOrWhiteSpace()</c> reports <c>true</c>.
        /// </returns>
        private StringCollection GetTokens(string input)
        {
            var tokens = new StringCollection();

            if (input.IsNullOrWhiteSpace())
            {
                return(tokens);
            }

            var nodeProcessorMappings = new Dictionary <ParseStatus, ProcessNode>
            {
                [ParseStatus.ReadText]           = ReadTextNode,
                [ParseStatus.ReadStartTag]       = ReadStartTag,
                [ParseStatus.ReadEndTag]         = ReadEndTag,
                [ParseStatus.ReadAttributeName]  = ReadAttributeName,
                [ParseStatus.ReadAttributeValue] = ReadAttributeValue
            };

            var index  = 0;
            var status = ParseStatus.ReadText;

            while (index < input.Length)
            {
                ProcessNode processNode;

                if (!nodeProcessorMappings.TryGetValue(status, out processNode))
                {
                    // Without a processor neither the index nor the status can change, so staying in
                    // the loop (as the previous ContainsKey check did) would spin forever. Stop and
                    // return what has been collected so far.
                    break;
                }

                var tokenizerArg = new TokenizerArgument(input, index, status, tokens);

                index  = processNode(tokenizerArg);
                status = tokenizerArg.Status;

                // A processor reports -1 when there is nothing further to read.
                if (index == -1)
                {
                    break;
                }
            }

            return(tokens);
        }