/// <summary>
/// Runs the character-level state machine, starting from the current value of
/// <c>m_ParseState</c> and reading input through <c>NextChar()</c> until a stopping
/// point is reached.
/// </summary>
/// <remarks>
/// The method returns when one of the following happens:
/// a node is finished via <c>EndNode</c>; a Text or Comment node has just been started;
/// whitespace terminates a tag or identifier name (the state becomes
/// <c>BetweenAttributes</c>); or <c>m_ParseState</c> holds a value that no case handles.
/// Inside the switch, <c>break</c> means "stop parsing and return" (it leaves the switch and
/// hits the loop-level <c>break</c>), while <c>continue</c> means "read the next character".
/// </remarks>
[SuppressMessage("Microsoft.Maintainability", "CA1502")] // Most of the complexity is due to simple switch statements. Not sure if it would truly be less complex if it was split into different methods...
private void Parse()
{
    // When resuming between attributes, snapshot the current line number and position
    // into the attribute position fields.
    if (m_ParseState == HtmlParseState.BetweenAttributes)
    {
        m_AttributeLineNumber = m_LineNumber;
        m_AttributeLinePosition = m_LinePosition;
    }

    // Keep track of the previous character to find empty element nodes.
    char? previousChar = null;
    bool setPreviousChar = false; // only set it after the first loop iteration
    char currentChar = '\0';      // initialized only to satisfy definite assignment

    while (true)
    {
        // All node types end at the '>' character with the exception of the Text node.
        // So, for the Text parse state, peek at the next character rather than read it.
        // If it is the '<' character, end the node.
        // The peeked value is compared directly (no narrowing cast) so that an
        // end-of-input sentinel such as -1 cannot overflow in a checked build.
        // NOTE(review): assumes Peek() returns an int like TextReader.Peek - confirm.
        if (m_ParseState == HtmlParseState.Text && m_Reader.Peek() == '<')
        {
            EndNode(previousChar);
            break; // done parsing the Text node
        }

        if (setPreviousChar)
        {
            previousChar = currentChar;
        }

        currentChar = NextChar();
        setPreviousChar = true;

        switch (m_ParseState)
        {
            case HtmlParseState.AttributeAfterEquals:
                if (currentChar == '>')
                {
                    AddAttribute();
                    EndNode(previousChar);
                    break;
                }
                else if (!Char.IsWhiteSpace(currentChar))
                {
                    if (currentChar == '\'' || currentChar == '"')
                    {
                        // Remember which quote opened the value so the same one closes it.
                        m_QuoteChar = currentChar;
                        m_ParseState = HtmlParseState.QuotedAttributeValue;
                    }
                    else
                    {
                        // Non-quoted attribute value.
                        AppendToAttributeValue(currentChar);
                        m_ParseState = HtmlParseState.AttributeValue;
                    }
                }
                continue;

            case HtmlParseState.AttributeBeforeEquals:
                if (!Char.IsWhiteSpace(currentChar))
                {
                    if (currentChar == '>')
                    {
                        AddAttribute();
                        EndNode(previousChar);
                        break;
                    }
                    else if (currentChar == '=')
                    {
                        m_ParseState = HtmlParseState.AttributeAfterEquals;
                    }
                    else
                    {
                        // No '=' followed the previous name: store that attribute as-is
                        // and begin a new attribute name with this character.
                        AddAttribute();
                        StartNewAttribute();
                        AppendToAttributeName(currentChar);
                        m_ParseState = HtmlParseState.AttributeName;
                    }
                }
                continue;

            case HtmlParseState.AttributeName:
                if (Char.IsWhiteSpace(currentChar))
                {
                    m_ParseState = HtmlParseState.AttributeBeforeEquals;
                }
                else if (currentChar == '=')
                {
                    m_ParseState = HtmlParseState.AttributeAfterEquals;
                }
                else if (currentChar == '>')
                {
                    AddAttribute();
                    EndNode(previousChar);
                    break;
                }
                else
                {
                    AppendToAttributeName(currentChar);
                }
                continue;

            case HtmlParseState.AttributeValue:
                if (Char.IsWhiteSpace(currentChar))
                {
                    AddAttribute();
                    m_ParseState = HtmlParseState.BetweenAttributes;
                }
                else if (currentChar == '>')
                {
                    AddAttribute();
                    EndNode(previousChar);
                    break;
                }
                else
                {
                    AppendToAttributeValue(currentChar);
                }
                continue;

            case HtmlParseState.BetweenAttributes:
                if (!Char.IsWhiteSpace(currentChar))
                {
                    if (currentChar == '>')
                    {
                        EndNode(previousChar);
                        break;
                    }
                    else
                    {
                        StartNewAttribute();
                        AppendToAttributeName(currentChar);
                        m_ParseState = HtmlParseState.AttributeName;
                    }
                }
                continue;

            case HtmlParseState.Comment:
                if (currentChar == '-')
                {
                    char secondChar = NextChar();
                    if (secondChar == '-')
                    {
                        char thirdChar = NextChar();

                        // Count the '-' signs beyond the first two; they belong to the
                        // comment value whether or not the comment ends here.
                        int count = 0;
                        while (thirdChar == '-')
                        {
                            thirdChar = NextChar();
                            count++;
                        }

                        // Check if the end of the comment has been reached.
                        if (thirdChar == '>')
                        {
                            while (count-- > 0)
                            {
                                AppendToValue('-');
                            }
                            EndNode(previousChar);
                            break;
                        }
                        else
                        {
                            // Not a terminator after all: the two leading dashes are
                            // part of the value too, followed by the character read.
                            count += 2;
                            while (count-- > 0)
                            {
                                AppendToValue('-');
                            }
                            AppendToValue(thirdChar);
                        }
                    }
                    else
                    {
                        // A single dash followed by something else is ordinary content.
                        AppendToValue(currentChar);
                        AppendToValue(secondChar);
                    }
                }
                else
                {
                    AppendToValue(currentChar);
                }
                continue;

            // Identifier, start-tag and end-tag names are accumulated the same way.
            case HtmlParseState.Identifier:
            case HtmlParseState.Tag:
            case HtmlParseState.EndTag:
                if (currentChar == '>')
                {
                    EndNode(previousChar);
                    break;
                }
                else if (Char.IsWhiteSpace(currentChar))
                {
                    // The name is complete; return so the next call resumes at the attributes.
                    m_ParseState = HtmlParseState.BetweenAttributes;
                    break;
                }
                else
                {
                    m_Name.Append(currentChar);
                }
                continue;

            case HtmlParseState.QuotedAttributeValue:
                if (currentChar == m_QuoteChar && currentChar != '\0')
                {
                    AddAttribute();
                    m_ParseState = HtmlParseState.BetweenAttributes;
                }
                else
                {
                    AppendToAttributeValue(currentChar);
                }
                continue;

            case HtmlParseState.Text:
                // The end of this node type was checked above as a special case.
                AppendToValue(currentChar);
                continue;

            case HtmlParseState.None:
                if (currentChar != '<')
                {
                    m_ParseState = HtmlParseState.Text;
                    StartNode(HtmlNodeType.Text);
                    AppendToValue(currentChar);
                    break;
                }
                else
                {
                    m_ParseState = HtmlParseState.BeginTag;
                }
                continue;

            case HtmlParseState.BeginTag:
                if (currentChar == '!')
                {
                    char secondChar = NextChar();
                    if (secondChar == '-')
                    {
                        char thirdChar = NextChar();
                        if (thirdChar == '-')
                        {
                            // "<!--" opens a comment.
                            m_ParseState = HtmlParseState.Comment;
                            StartNode(HtmlNodeType.Comment);
                            break;
                        }
                        else if (thirdChar == '>')
                        {
                            // "<!->" is a complete identifier node whose value is "-".
                            StartNode(HtmlNodeType.Identifier);
                            AppendToValue(secondChar);
                            EndNode(previousChar);
                            break;
                        }
                        else
                        {
                            // "<!-x": previously no branch matched, so both characters
                            // were dropped and the parser stayed in BeginTag without a
                            // node. Handle it like "<!x" below: an identifier whose name
                            // starts with the characters already consumed.
                            StartNode(HtmlNodeType.Identifier);
                            m_Name.Append(secondChar);
                            m_Name.Append(thirdChar);
                            m_ParseState = HtmlParseState.Identifier;
                        }
                    }
                    else if (secondChar == '>')
                    {
                        // "<!>" is an empty identifier node.
                        StartNode(HtmlNodeType.Identifier);
                        EndNode(previousChar);
                        break;
                    }
                    else
                    {
                        StartNode(HtmlNodeType.Identifier);
                        m_Name.Append(secondChar);
                        m_ParseState = HtmlParseState.Identifier;
                    }
                }
                else if (Char.IsLetter(currentChar))
                {
                    StartNode(HtmlNodeType.Element);
                    m_ParseState = HtmlParseState.Tag;
                    m_Name.Append(currentChar);
                }
                else if (currentChar == '/')
                {
                    StartNode(HtmlNodeType.EndElement);
                    m_ParseState = HtmlParseState.EndTag;
                }
                else
                {
                    // If a number or symbol appears after a '<', IE treats it
                    // as text instead of an element.
                    m_ParseState = HtmlParseState.Text;
                    StartNode(HtmlNodeType.Text);
                    AppendToValue('<');
                    AppendToValue(currentChar);
                    break;
                }
                continue;
        }

        // Reached only when a case used 'break' (or no case matched): stop parsing.
        break;
    }
}
/// <summary>
/// Completes the node currently being parsed and resets the parse state to
/// <c>HtmlParseState.None</c>.
/// </summary>
/// <param name="previousChar">
/// The character the caller recorded as preceding the end of the node, or <c>null</c>
/// when no such character was recorded.
/// </param>
private void EndNode(char? previousChar)
{
    // Only an element node that ended right after a '/' counts as an empty element;
    // every other combination clears the flag.
    bool endedWithSlash = previousChar.HasValue && previousChar.Value == '/';
    IsEmptyElement = endedWithSlash && NodeType == HtmlNodeType.Element;

    m_ParseState = HtmlParseState.None;
}