/// <summary>
/// Does the actual parsing. Perform this immediately
/// after creating the parser object.
/// </summary>
/// <remarks>
/// Reads <paramref name="reader"/> one character at a time and drives a state
/// machine over the <c>state</c> field. <c>doc.StartDocument()</c> is called once
/// up front. In non-HTML mode the method returns as soon as <c>nested</c> is back
/// at zero after a closing or self-closing tag; reaching end of input in
/// non-HTML mode is reported through <c>ThrowException</c> with the
/// "missing.end.tag" message. In HTML mode end of input ends the document.
/// </remarks>
/// <param name="reader">Character source; consumed via <c>Read()</c> until it returns -1 or parsing stops.</param>
private void Go(TextReader reader)
{
    doc.StartDocument();
    while (true)
    {
        // read a new character
        if (previousCharacter == -1)
        {
            character = reader.Read();
        }
        // or re-examine the previous character
        else
        {
            character = previousCharacter;
            previousCharacter = -1;
        }

        // the end of the file was reached
        if (character == -1)
        {
            if (html)
            {
                if (state == TEXT)
                {
                    Flush();
                }
                doc.EndDocument();
            }
            else
            {
                ThrowException(MessageLocalization.GetComposedMessage("missing.end.tag"));
            }
            return;
        }

        // dealing with \n and \r
        if (eol)
        {
            // The previous character was a CR that has already been reported
            // as '\n' and counted as a line; swallow the LF of a CR LF pair.
            eol = false;
            if (character == '\n')
            {
                continue;
            }
            // Anything else (including another CR) is processed normally below,
            // so it is counted in lines/columns and a second CR is normalised too.
        }
        if (character == '\n')
        {
            lines++;
            columns = 0;
        }
        else if (character == '\r')
        {
            eol = true;
            character = '\n';
            lines++;
            columns = 0;
        }
        else
        {
            columns++;
        }

        switch (state)
        {
            // we are in an unknown state before there's actual content
            case UNKNOWN:
                if (character == '<')
                {
                    SaveState(TEXT);
                    state = TAG_ENCOUNTERED;
                }
                break;

            // we can encounter any content
            case TEXT:
                if (character == '<')
                {
                    Flush();
                    SaveState(state);
                    state = TAG_ENCOUNTERED;
                }
                else if (character == '&')
                {
                    SaveState(state);
                    entity.Length = 0;
                    state = ENTITY;
                    // the entity contributes text, so whitespace after it must be kept
                    nowhite = true;
                }
                else if (character == ' ')
                {
                    // a space is kept only directly after non-whitespace content
                    // (same outcome in HTML and XML mode)
                    if (nowhite)
                    {
                        text.Append(' ');
                    }
                    nowhite = false;
                }
                else if (Char.IsWhiteSpace((char)character))
                {
                    if (html)
                    {
                        // totally ignore other whitespace
                    }
                    else
                    {
                        if (nowhite)
                        {
                            text.Append((char)character);
                        }
                        nowhite = false;
                    }
                }
                else
                {
                    text.Append((char)character);
                    nowhite = true;
                }
                break;

            // we have just seen a < and are wondering what we are looking at
            // <foo>, </foo>, <!-- ... -->, etc.
            case TAG_ENCOUNTERED:
                InitTag();
                if (character == '/')
                {
                    state = IN_CLOSETAG;
                }
                else if (character == '?')
                {
                    // drop the state saved when '<' was seen; PI restores the one below it
                    RestoreState();
                    state = PI;
                }
                else
                {
                    text.Append((char)character);
                    state = EXAMIN_TAG;
                }
                break;

            // we are processing something like this <foo ... >.
            // It could still be a <!-- ... --> or something.
            case EXAMIN_TAG:
                if (character == '>')
                {
                    DoTag();
                    ProcessTag(true);
                    InitTag();
                    state = RestoreState();
                }
                else if (character == '/')
                {
                    state = SINGLE_TAG;
                }
                else if (character == '-' && text.ToString().Equals("!-"))
                {
                    Flush();
                    state = COMMENT;
                }
                else if (character == '[' && text.ToString().Equals("![CDATA"))
                {
                    Flush();
                    state = CDATA;
                }
                else if (character == 'E' && text.ToString().Equals("!DOCTYP"))
                {
                    Flush();
                    state = PI;
                }
                else if (char.IsWhiteSpace((char)character))
                {
                    DoTag();
                    state = TAG_EXAMINED;
                }
                else
                {
                    text.Append((char)character);
                }
                break;

            // we know the name of the tag now.
            case TAG_EXAMINED:
                if (character == '>')
                {
                    ProcessTag(true);
                    InitTag();
                    state = RestoreState();
                }
                else if (character == '/')
                {
                    state = SINGLE_TAG;
                }
                else if (char.IsWhiteSpace((char)character))
                {
                    // empty
                }
                else
                {
                    text.Append((char)character);
                    state = ATTRIBUTE_KEY;
                }
                break;

            // we are processing a closing tag: e.g. </foo>
            case IN_CLOSETAG:
                if (character == '>')
                {
                    DoTag();
                    ProcessTag(false);
                    if (!html && nested == 0)
                    {
                        // NOTE(review): unlike SINGLE_TAG, this path returns without
                        // calling doc.EndDocument() - confirm whether that is intended.
                        return;
                    }
                    state = RestoreState();
                }
                else
                {
                    if (!char.IsWhiteSpace((char)character))
                    {
                        text.Append((char)character);
                    }
                }
                break;

            // we have just seen something like this: <foo a="b"/
            // and are looking for the final >.
            case SINGLE_TAG:
                if (character != '>')
                {
                    ThrowException(MessageLocalization.GetComposedMessage("expected.gt.for.tag.lt.1.gt", tag));
                }
                DoTag();
                ProcessTag(true);
                ProcessTag(false);
                InitTag();
                if (!html && nested == 0)
                {
                    doc.EndDocument();
                    return;
                }
                state = RestoreState();
                break;

            // we are processing CDATA
            case CDATA:
                // Terminator is "]]>": compare the last two buffered characters
                // directly (ordinal, no string allocation per '>').
                if (character == '>'
                    && text.Length >= 2
                    && text[text.Length - 1] == ']'
                    && text[text.Length - 2] == ']')
                {
                    text.Length = text.Length - 2;
                    Flush();
                    state = RestoreState();
                }
                else
                {
                    text.Append((char)character);
                }
                break;

            // we are processing a comment. We are inside
            // the <!-- .... --> looking for the -->.
            case COMMENT:
                // Terminator is "-->": ordinal check of the last two buffered characters.
                if (character == '>'
                    && text.Length >= 2
                    && text[text.Length - 1] == '-'
                    && text[text.Length - 2] == '-')
                {
                    text.Length = text.Length - 2;
                    Flush();
                    state = RestoreState();
                }
                else
                {
                    text.Append((char)character);
                }
                break;

            // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
            case PI:
                if (character == '>')
                {
                    state = RestoreState();
                    if (state == TEXT)
                    {
                        state = UNKNOWN;
                    }
                }
                break;

            // we are processing an entity, e.g. &lt;, &raquo;, etc.
            case ENTITY:
                if (character == ';')
                {
                    state = RestoreState();
                    String cent = entity.ToString();
                    entity.Length = 0;
                    char ce = EntitiesToUnicode.DecodeEntity(cent);
                    if (ce == '\0')
                    {
                        // not decoded: keep the reference verbatim
                        text.Append('&').Append(cent).Append(';');
                    }
                    else
                    {
                        text.Append(ce);
                    }
                }
                else if ((character != '#'
                          && (character < '0' || character > '9')
                          && (character < 'a' || character > 'z')
                          && (character < 'A' || character > 'Z'))
                         || entity.Length >= 7)
                {
                    // not a plausible entity (illegal character, or 7 characters already
                    // collected): emit what was collected as literal text and
                    // re-examine the current character in the restored state
                    state = RestoreState();
                    previousCharacter = character;
                    text.Append('&').Append(entity.ToString());
                    entity.Length = 0;
                }
                else
                {
                    entity.Append((char)character);
                }
                break;

            // We are processing the quoted right-hand side of an element's attribute.
            // A quoteCharacter of ' ' marks an unquoted HTML attribute value.
            case QUOTE:
                if (html && quoteCharacter == ' ' && character == '>')
                {
                    Flush();
                    ProcessTag(true);
                    InitTag();
                    state = RestoreState();
                }
                else if (html && quoteCharacter == ' ' && char.IsWhiteSpace((char)character))
                {
                    Flush();
                    state = TAG_EXAMINED;
                }
                else if (html && quoteCharacter == ' ')
                {
                    text.Append((char)character);
                }
                else if (character == quoteCharacter)
                {
                    Flush();
                    state = TAG_EXAMINED;
                }
                else if (" \r\n\u0009".IndexOf((char)character) >= 0)
                {
                    // whitespace inside a quoted value is stored as a single space character
                    text.Append(' ');
                }
                else if (character == '&')
                {
                    SaveState(state);
                    state = ENTITY;
                    entity.Length = 0;
                }
                else
                {
                    text.Append((char)character);
                }
                break;

            // we are reading an attribute name
            case ATTRIBUTE_KEY:
                if (char.IsWhiteSpace((char)character))
                {
                    Flush();
                    state = ATTRIBUTE_EQUAL;
                }
                else if (character == '=')
                {
                    Flush();
                    state = ATTRIBUTE_VALUE;
                }
                else if (html && character == '>')
                {
                    text.Length = 0;
                    ProcessTag(true);
                    InitTag();
                    state = RestoreState();
                }
                else
                {
                    text.Append((char)character);
                }
                break;

            // we have read an attribute name and are looking for '='
            case ATTRIBUTE_EQUAL:
                if (character == '=')
                {
                    state = ATTRIBUTE_VALUE;
                }
                else if (char.IsWhiteSpace((char)character))
                {
                    // empty
                }
                else if (html && character == '>')
                {
                    text.Length = 0;
                    ProcessTag(true);
                    InitTag();
                    state = RestoreState();
                }
                else if (html && character == '/')
                {
                    Flush();
                    state = SINGLE_TAG;
                }
                else if (html)
                {
                    Flush();
                    text.Append((char)character);
                    state = ATTRIBUTE_KEY;
                }
                else
                {
                    ThrowException(MessageLocalization.GetComposedMessage("error.in.attribute.processing"));
                }
                break;

            // we have seen '=' and are looking for the start of the value
            case ATTRIBUTE_VALUE:
                if (character == '"' || character == '\'')
                {
                    quoteCharacter = character;
                    state = QUOTE;
                }
                else if (char.IsWhiteSpace((char)character))
                {
                    // empty
                }
                else if (html && character == '>')
                {
                    Flush();
                    ProcessTag(true);
                    InitTag();
                    state = RestoreState();
                }
                else if (html)
                {
                    // unquoted HTML attribute value
                    text.Append((char)character);
                    quoteCharacter = ' ';
                    state = QUOTE;
                }
                else
                {
                    ThrowException(MessageLocalization.GetComposedMessage("error.in.attribute.processing"));
                }
                break;
        }
    }
}
/// <summary>
/// Does the actual parsing. Perform this immediately
/// after creating the parser object.
/// </summary>
/// <remarks>
/// Reads <paramref name="reader"/> one character at a time and drives a state
/// machine over the <c>State</c> field. <c>Doc.StartDocument()</c> is called once
/// up front. In non-HTML mode the method returns as soon as <c>Nested</c> is back
/// at zero after a closing or self-closing tag; reaching end of input in
/// non-HTML mode is reported through <c>throwException</c> ("Missing end tag").
/// In HTML mode end of input ends the document.
/// </remarks>
/// <param name="reader">Character source; consumed via <c>Read()</c> until it returns -1 or parsing stops.</param>
private void go(TextReader reader)
{
    Doc.StartDocument();
    while (true)
    {
        // read a new character
        if (PreviousCharacter == -1)
        {
            Character = reader.Read();
        }
        // or re-examine the previous character
        else
        {
            Character = PreviousCharacter;
            PreviousCharacter = -1;
        }

        // the end of the file was reached
        if (Character == -1)
        {
            if (Html)
            {
                if (State == Text)
                {
                    flush();
                }
                Doc.EndDocument();
            }
            else
            {
                throwException("Missing end tag");
            }
            return;
        }

        // dealing with \n and \r
        if (Eol)
        {
            // The previous character was a CR that has already been reported
            // as '\n' and counted as a line; swallow the LF of a CR LF pair.
            Eol = false;
            if (Character == '\n')
            {
                continue;
            }
            // Anything else (including another CR) is processed normally below,
            // so it is counted in Lines/Columns and a second CR is normalised too.
        }
        if (Character == '\n')
        {
            Lines++;
            Columns = 0;
        }
        else if (Character == '\r')
        {
            Eol = true;
            Character = '\n';
            Lines++;
            Columns = 0;
        }
        else
        {
            Columns++;
        }

        switch (State)
        {
            // we are in an unknown state before there's actual content
            case Unknown:
                if (Character == '<')
                {
                    saveState(Text);
                    State = TagEncountered;
                }
                break;

            // we can encounter any content
            case Text:
                if (Character == '<')
                {
                    flush();
                    saveState(State);
                    State = TagEncountered;
                }
                else if (Character == '&')
                {
                    saveState(State);
                    entity.Length = 0;
                    State = Entity;
                    // the entity contributes text, so whitespace directly after it
                    // must be kept (otherwise "a &amp; b" would lose the second space)
                    Nowhite = true;
                }
                else if (char.IsWhiteSpace((char)Character))
                {
                    // whitespace is kept only directly after non-whitespace content
                    if (Nowhite)
                    {
                        text.Append((char)Character);
                    }
                    Nowhite = false;
                }
                else
                {
                    text.Append((char)Character);
                    Nowhite = true;
                }
                break;

            // we have just seen a < and are wondering what we are looking at
            // <foo>, </foo>, <!-- ... -->, etc.
            case TagEncountered:
                initTag();
                if (Character == '/')
                {
                    State = InClosetag;
                }
                else if (Character == '?')
                {
                    // drop the state saved when '<' was seen; Pi restores the one below it
                    restoreState();
                    State = Pi;
                }
                else
                {
                    text.Append((char)Character);
                    State = ExaminTag;
                }
                break;

            // we are processing something like this <foo ... >.
            // It could still be a <!-- ... --> or something.
            case ExaminTag:
                if (Character == '>')
                {
                    doTag();
                    processTag(true);
                    initTag();
                    State = restoreState();
                }
                else if (Character == '/')
                {
                    State = SingleTag;
                }
                else if (Character == '-' && text.ToString().Equals("!-"))
                {
                    flush();
                    State = Comment;
                }
                else if (Character == '[' && text.ToString().Equals("![CDATA"))
                {
                    flush();
                    State = Cdata;
                }
                else if (Character == 'E' && text.ToString().Equals("!DOCTYP"))
                {
                    flush();
                    State = Pi;
                }
                else if (char.IsWhiteSpace((char)Character))
                {
                    doTag();
                    State = TagExamined;
                }
                else
                {
                    text.Append((char)Character);
                }
                break;

            // we know the name of the tag now.
            case TagExamined:
                if (Character == '>')
                {
                    processTag(true);
                    initTag();
                    State = restoreState();
                }
                else if (Character == '/')
                {
                    State = SingleTag;
                }
                else if (char.IsWhiteSpace((char)Character))
                {
                    // empty
                }
                else
                {
                    text.Append((char)Character);
                    State = AttributeKey;
                }
                break;

            // we are processing a closing tag: e.g. </foo>
            case InClosetag:
                if (Character == '>')
                {
                    doTag();
                    processTag(false);
                    if (!Html && Nested == 0)
                    {
                        // NOTE(review): unlike SingleTag, this path returns without
                        // calling Doc.EndDocument() - confirm whether that is intended.
                        return;
                    }
                    State = restoreState();
                }
                else
                {
                    if (!char.IsWhiteSpace((char)Character))
                    {
                        text.Append((char)Character);
                    }
                }
                break;

            // we have just seen something like this: <foo a="b"/
            // and are looking for the final >.
            case SingleTag:
                if (Character != '>')
                {
                    throwException($"Expected > for tag: <{Tag}/>");
                }
                doTag();
                processTag(true);
                processTag(false);
                initTag();
                if (!Html && Nested == 0)
                {
                    Doc.EndDocument();
                    return;
                }
                State = restoreState();
                break;

            // we are processing CDATA
            case Cdata:
                // Terminator is "]]>": compare the last two buffered characters
                // directly (ordinal, no string allocation per '>').
                if (Character == '>'
                    && text.Length >= 2
                    && text[text.Length - 1] == ']'
                    && text[text.Length - 2] == ']')
                {
                    text.Length = text.Length - 2;
                    flush();
                    State = restoreState();
                }
                else
                {
                    text.Append((char)Character);
                }
                break;

            // we are processing a comment. We are inside
            // the <!-- .... --> looking for the -->.
            case Comment:
                // Terminator is "-->": ordinal check of the last two buffered characters.
                if (Character == '>'
                    && text.Length >= 2
                    && text[text.Length - 1] == '-'
                    && text[text.Length - 2] == '-')
                {
                    text.Length = text.Length - 2;
                    flush();
                    State = restoreState();
                }
                else
                {
                    text.Append((char)Character);
                }
                break;

            // We are inside one of these <? ... ?> or one of these <!DOCTYPE ... >
            case Pi:
                if (Character == '>')
                {
                    State = restoreState();
                    if (State == Text)
                    {
                        State = Unknown;
                    }
                }
                break;

            // we are processing an entity, e.g. &lt;, &raquo;, etc.
            case Entity:
                if (Character == ';')
                {
                    State = restoreState();
                    var cent = entity.ToString();
                    entity.Length = 0;
                    var ce = EntitiesToUnicode.DecodeEntity(cent);
                    if (ce == '\0')
                    {
                        // not decoded: keep the reference verbatim
                        text.Append('&').Append(cent).Append(';');
                    }
                    else
                    {
                        text.Append(ce);
                    }
                }
                else if ((Character != '#'
                          && (Character < '0' || Character > '9')
                          && (Character < 'a' || Character > 'z')
                          && (Character < 'A' || Character > 'Z'))
                         || entity.Length >= 7)
                {
                    // not a plausible entity (illegal character, or 7 characters already
                    // collected): emit what was collected as literal text and
                    // re-examine the current character in the restored state
                    State = restoreState();
                    PreviousCharacter = Character;
                    text.Append('&').Append(entity);
                    entity.Length = 0;
                }
                else
                {
                    entity.Append((char)Character);
                }
                break;

            // We are processing the quoted right-hand side of an element's attribute.
            // A QuoteCharacter of ' ' marks an unquoted HTML attribute value.
            case Quote:
                if (Html && QuoteCharacter == ' ' && Character == '>')
                {
                    flush();
                    processTag(true);
                    initTag();
                    State = restoreState();
                }
                else if (Html && QuoteCharacter == ' ' && char.IsWhiteSpace((char)Character))
                {
                    flush();
                    State = TagExamined;
                }
                else if (Html && QuoteCharacter == ' ')
                {
                    text.Append((char)Character);
                }
                else if (Character == QuoteCharacter)
                {
                    flush();
                    State = TagExamined;
                }
                else if (" \r\n\u0009".IndexOf((char)Character) >= 0)
                {
                    // whitespace inside a quoted value is stored as a single space character;
                    // IndexOf(char) is an ordinal search and avoids a per-character string
                    text.Append(' ');
                }
                else if (Character == '&')
                {
                    saveState(State);
                    State = Entity;
                    entity.Length = 0;
                }
                else
                {
                    text.Append((char)Character);
                }
                break;

            // we are reading an attribute name
            case AttributeKey:
                if (char.IsWhiteSpace((char)Character))
                {
                    flush();
                    State = AttributeEqual;
                }
                else if (Character == '=')
                {
                    flush();
                    State = AttributeValue;
                }
                else if (Html && Character == '>')
                {
                    text.Length = 0;
                    processTag(true);
                    initTag();
                    State = restoreState();
                }
                else
                {
                    text.Append((char)Character);
                }
                break;

            // we have read an attribute name and are looking for '='
            case AttributeEqual:
                if (Character == '=')
                {
                    State = AttributeValue;
                }
                else if (char.IsWhiteSpace((char)Character))
                {
                    // empty
                }
                else if (Html && Character == '>')
                {
                    text.Length = 0;
                    processTag(true);
                    initTag();
                    State = restoreState();
                }
                else if (Html && Character == '/')
                {
                    flush();
                    State = SingleTag;
                }
                else if (Html)
                {
                    flush();
                    text.Append((char)Character);
                    State = AttributeKey;
                }
                else
                {
                    throwException("Error in attribute processing.");
                }
                break;

            // we have seen '=' and are looking for the start of the value
            case AttributeValue:
                if (Character == '"' || Character == '\'')
                {
                    QuoteCharacter = Character;
                    State = Quote;
                }
                else if (char.IsWhiteSpace((char)Character))
                {
                    // empty
                }
                else if (Html && Character == '>')
                {
                    flush();
                    processTag(true);
                    initTag();
                    State = restoreState();
                }
                else if (Html)
                {
                    // unquoted HTML attribute value
                    text.Append((char)Character);
                    QuoteCharacter = ' ';
                    State = Quote;
                }
                else
                {
                    throwException("Error in attribute processing");
                }
                break;
        }
    }
}