/// <summary>
/// Parses the value part of an attribute: an optional '=' (white space is
/// allowed on either side of it) followed by a quoted value, an unquoted
/// value, or a server instruction starting with a less-than sign.
/// Does not consume the greater-than sign that ends the start tag.
/// </summary>
/// <param name="name">Attribute name; only used for the URL and script attribute special cases.</param>
/// <param name="foldCase">When true, characters classified as UPPERCASE are folded to lower case.</param>
/// <param name="isempty">Set to true when an unquoted value of a non-URL attribute is terminated by a slash followed by a greater-than sign.</param>
/// <param name="pdelim">
/// Receives the quote character that delimited the value, or the double
/// quote character when the value was not quoted. For a server instruction
/// it receives the result of ParseServerInstruction().
/// </param>
/// <returns>
/// The value text; null when no '=' follows the name, when an unquoted value
/// is empty, or when a server instruction produced no text.
/// </returns>
public virtual string ParseValue(string name, bool foldCase, MutableBoolean isempty, MutableInteger pdelim)
{
    /* scheme prefix that exempts a URL attribute from the missing-quote heuristic */
    const string javascriptScheme = "javascript:";

    int len;
    int start;
    short map;
    bool seenGt = false;
    bool munge = true;
    int c;
    int delim, quotewarning;
    string val;

    delim = 0;
    pdelim.Val = '"';

    /*
      Henry Zrepa reports that some folk are using the embed element with
      script attributes where newlines are significant and must be preserved
    */
    if (Options.LiteralAttribs)
    {
        munge = false;
    }

    /* skip white space before the '=' */
    for (;;)
    {
        c = Input.ReadChar();

        if (c == StreamIn.END_OF_STREAM)
        {
            Input.UngetChar(c);
            break;
        }

        map = Map((char) c);

        if ((map & WHITE) == 0)
        {
            break;
        }
    }

    /*
      c should be '=' if there is a value;
      other legal possibilities are white space, '/' and '>'
    */
    if (c != '=')
    {
        Input.UngetChar(c);
        return null;
    }

    /* skip white space after '=' */
    for (;;)
    {
        c = Input.ReadChar();

        if (c == StreamIn.END_OF_STREAM)
        {
            Input.UngetChar(c);
            break;
        }

        map = Map((char) c);

        if ((map & WHITE) == 0)
        {
            break;
        }
    }

    /* check for quote marks */
    if (c == '"' || c == '\'')
    {
        delim = c;
    }
    else if (c == '<')
    {
        /* server instruction used as the value: collect it, then release the lexer buffer space again */
        start = Lexsize;
        AddCharToLexer(c);
        pdelim.Val = ParseServerInstruction();
        len = Lexsize - start;
        Lexsize = start;
        return (len > 0 ? GetString(Lexbuf, start, len) : null);
    }
    else
    {
        /* first character of an unquoted value: let the main loop read it again */
        Input.UngetChar(c);
    }

    /* and read the value string, checking for the closing quote mark if one is needed */
    quotewarning = 0;
    start = Lexsize;
    c = '\x0000';

    for (;;)
    {
        /* character handled by the previous iteration (after any munging to ' ') */
        int lastc = c;
        c = Input.ReadChar();

        if (c == StreamIn.END_OF_STREAM)
        {
            Report.AttrError(this, Token, null, Report.UNEXPECTED_END_OF_FILE);
            Input.UngetChar(c);
            break;
        }

        if (delim == (char) 0)
        {
            /* unquoted value */
            if (c == '>')
            {
                Input.UngetChar(c);
                break;
            }

            if (c == '"' || c == '\'')
            {
                Report.AttrError(this, Token, null, Report.UNEXPECTED_QUOTEMARK);
                break;
            }

            if (c == '<')
            {
                /*
                  The '<' is reported but deliberately kept as part of the value.
                  NOTE(review): the code reported is UNEXPECTED_GT although the
                  character seen is '<' - confirm this is the intended message.
                */
                Report.AttrError(this, Token, null, Report.UNEXPECTED_GT);
            }

            /*
              For cases like <br clear=all/> need to avoid treating /> as
              part of the attribute value, however care is needed to avoid
              so treating <a href=http://www.acme.com/> in this way, which
              would map the <a> tag to <a href="http://www.acme.com"/>
            */
            if (c == '/')
            {
                /* peek ahead in case of /> */
                c = Input.ReadChar();

                if (c == '>' && !AttributeTable.DefaultAttributeTable.IsUrl(name))
                {
                    isempty.Val = true;
                    Input.UngetChar(c);
                    break;
                }

                /* unget peeked char */
                Input.UngetChar(c);
                c = '/';
            }
        }
        else
        {
            /* delim is '\'' or '"' */
            if (c == delim)
            {
                break;
            }

            /* treat CRLF, CR and LF as single line break */
            if (c == '\r')
            {
                c = Input.ReadChar();
                if (c != '\n')
                {
                    Input.UngetChar(c);
                }

                c = '\n';
            }

            if (c == '\n' || c == '<' || c == '>')
            {
                ++quotewarning;
            }

            if (c == '>')
            {
                seenGt = true;
            }
        }

        if (c == '&')
        {
            AddCharToLexer(c);
            ParseEntity(0);
            continue;
        }

        /*
          kludge for JavaScript attribute values
          with line continuations in string literals
        */
        if (c == '\\')
        {
            c = Input.ReadChar();

            if (c != '\n')
            {
                Input.UngetChar(c);
                c = '\\';
            }
        }

        map = Map((char) c);

        if ((map & WHITE) != 0)
        {
            /* white space terminates an unquoted value */
            if (delim == (char) 0)
            {
                break;
            }

            if (munge)
            {
                /* collapse runs of white space into a single space */
                c = ' ';

                if (lastc == ' ')
                {
                    continue;
                }
            }
        }
        else if (foldCase && (map & UPPERCASE) != 0)
        {
            c += ('a' - 'A');
        }

        AddCharToLexer(c);
    }

    if (quotewarning > 10 && seenGt && munge)
    {
        /*
          there is almost certainly a missing trailing quote mark
          as we have seen too many newlines, < or > characters.

          an exception is made for Javascript attributes and the
          javascript URL scheme which may legitimately include < and >
        */

        /*
          Only look at the scheme prefix when that many bytes were actually
          collected: munging can shrink the value well below the prefix length,
          and reading a fixed 11 bytes would then run past the end of the value.
        */
        int collected = Lexsize - start;

        if (!AttributeTable.DefaultAttributeTable.IsScript(name)
            && !(AttributeTable.DefaultAttributeTable.IsUrl(name)
                 && collected >= javascriptScheme.Length
                 && GetString(Lexbuf, start, javascriptScheme.Length).Equals(javascriptScheme)))
        {
            Report.Error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
        }
    }

    len = Lexsize - start;
    Lexsize = start;

    /* a quoted value may legitimately be empty; an unquoted empty value means "no value" */
    if (len > 0 || delim != 0)
    {
        val = GetString(Lexbuf, start, len);
    }
    else
    {
        val = null;
    }

    /* note delimiter if given */
    pdelim.Val = delim != 0 ? delim : '"';

    return val;
}
/// <summary>
/// Reads the next attribute name from the input.
/// Consumes the greater-than sign that terminates a start tag when it is
/// met before any attribute name.
/// </summary>
/// <param name="isempty">Set to true when a slash followed by a greater-than sign is met instead of a name.</param>
/// <param name="asp">Cleared on entry; receives the result of ParseAsp() when a less-than sign followed by '%' is met.</param>
/// <param name="php">Cleared on entry; receives the result of ParsePhp() when a less-than sign followed by '?' is met.</param>
/// <returns>The attribute name, or null when no name was read.</returns>
public virtual string ParseAttribute(MutableBoolean isempty, MutableObject asp, MutableObject php)
{
    /* forget any server-side markup noted by an earlier call */
    asp.Object = null;
    php.Object = null;

    int ch;

    /* step over white space in front of the attribute */
    while (true)
    {
        ch = Input.ReadChar();

        if (ch == '/')
        {
            int afterSlash = Input.ReadChar();

            if (afterSlash == '>')
            {
                isempty.Val = true;
                return null;
            }

            /* a lone '/' starts the name; give back the character after it */
            Input.UngetChar(afterSlash);
            break;
        }

        if (ch == '>')
        {
            return null;
        }

        if (ch == '<')
        {
            int afterLt = Input.ReadChar();

            if (afterLt == '%')
            {
                asp.Object = ParseAsp();
                return null;
            }

            if (afterLt == '?')
            {
                php.Object = ParsePhp();
                return null;
            }

            Input.UngetChar(afterLt);
            Report.AttrError(this, Token, null, Report.UNEXPECTED_GT);
            return null;
        }

        if (ch == '"' || ch == '\'')
        {
            /* stray quote mark: report it and keep looking */
            Report.AttrError(this, Token, null, Report.UNEXPECTED_QUOTEMARK);
            continue;
        }

        if (ch == StreamIn.END_OF_STREAM)
        {
            Report.AttrError(this, Token, null, Report.UNEXPECTED_END_OF_FILE);
            Input.UngetChar(ch);
            return null;
        }

        if ((Map((char) ch) & WHITE) == 0)
        {
            break;
        }
    }

    int nameStart = Lexsize;

    /* collect the name; ch already holds its first character */
    while (true)
    {
        /* '=' is pushed back for ParseValue(); '>', '<' and end of input are pushed back for the caller */
        bool terminator = ch == '=' || ch == '>' || ch == '<' || ch == StreamIn.END_OF_STREAM;
        if (terminator)
        {
            Input.UngetChar(ch);
            break;
        }

        short charClass = Map((char) ch);

        /* white space ends the name and is consumed */
        if ((charClass & WHITE) != 0)
        {
            break;
        }

        /* what should be done about non-namechar characters? */
        /* currently these are incorporated into the attr name */

        if (!Options.XmlTags && (charClass & UPPERCASE) != 0)
        {
            ch += 'a' - 'A';
        }

        AddCharToLexer(ch);
        ch = Input.ReadChar();
    }

    /* the length is derived from the lexer buffer rather than counted per character */
    int nameLength = Lexsize - nameStart;

    string attributeName = null;
    if (nameLength > 0)
    {
        attributeName = GetString(Lexbuf, nameStart, nameLength);
    }

    /* release the buffer space used while collecting the name */
    Lexsize = nameStart;
    return attributeName;
}
/// <summary>
/// Parses all attributes of the current start tag and returns them as a
/// linked list in which each new entry is created with the previous list
/// as its first constructor argument.
/// Swallows the closing greater-than sign.
/// </summary>
/// <param name="isempty">Passed through to ParseAttribute() and ParseValue(), which set it for empty-element syntax.</param>
/// <returns>The most recently created AttVal, or null when no attribute was found.</returns>
public virtual AttVal ParseAttrs(MutableBoolean isempty)
{
    MutableInteger quote = new MutableInteger();
    MutableObject aspHolder = new MutableObject();
    MutableObject phpHolder = new MutableObject();
    AttVal head = null;

    for (; !EndOfInput();)
    {
        string attrName = ParseAttribute(isempty, aspHolder, phpHolder);

        if (attrName != null)
        {
            string attrValue = ParseValue(attrName, false, isempty, quote);

            if (!IsValidAttrName(attrName))
            {
                /* invalid names are reported and dropped */
                Report.AttrError(this, Token, attrValue, Report.BAD_ATTRIBUTE_VALUE);
                continue;
            }

            AttVal named = new AttVal(head, null, null, null, quote.Val, attrName, attrValue);
            named.Dict = AttributeTable.DefaultAttributeTable.FindAttribute(named);
            head = named;
            continue;
        }

        /* no name: check if attributes are created by ASP markup */
        if (aspHolder.Object != null)
        {
            head = new AttVal(head, null, (Node) aspHolder.Object, null, '\x0000', null, null);
            continue;
        }

        /* ... or by PHP markup */
        if (phpHolder.Object != null)
        {
            head = new AttVal(head, null, null, (Node) phpHolder.Object, '\x0000', null, null);
            continue;
        }

        /* neither a name nor server-side markup: the attribute list is finished */
        break;
    }

    return head;
}
/*
  modes for GetToken()

  MixedContent   -- for elements which don't accept PCDATA
  Preformatted   -- white space preserved as is
  IgnoreMarkup   -- for CDATA elements such as script, style
*/
/// <summary>
/// Returns the next token from the input as a Node.
/// A pushed-back token or an inserted inline element is returned first when
/// one is pending; otherwise characters are read and dispatched on the
/// current lexer State until a complete token has been recognised.
/// </summary>
/// <param name="mode">
/// Lexing mode; compared against IGNORE_WHITESPACE, MIXED_CONTENT,
/// PREFORMATTED and IGNORE_MARKUP to decide how white space and markup
/// are treated.
/// </param>
/// <returns>
/// The next token, or null when the input ends and there is neither pending
/// text nor an unterminated comment to return.
/// </returns>
public virtual Node GetToken(short mode)
{
    int c;
    int badcomment = 0;
    var isempty = new MutableBoolean();

    if (Pushed)
    {
        /* duplicate inlines in preference to pushed text nodes when appropriate */
        if (Token.Type != Node.TEXT_NODE || (Insert == - 1 && Inode == null))
        {
            Pushed = false;
            return Token;
        }
    }

    /* at start of block elements, unclosed inline
       elements are inserted into the token stream */
    if (Insert != - 1 || Inode != null)
    {
        return InsertedToken();
    }

    Lines = Input.CursorLine;
    Columns = Input.CursorColumn;
    Waswhite = false;

    Txtstart = Lexsize;
    Txtend = Lexsize;

    while (true)
    {
        c = Input.ReadChar();
        if (c == StreamIn.END_OF_STREAM)
        {
            break;
        }

        if (Insertspace && mode != IGNORE_WHITESPACE)
        {
            AddCharToLexer(' ');
            Waswhite = true;
            Insertspace = false;
        }

        /* treat \r\n as \n and \r as \n */
        if (c == '\r')
        {
            c = Input.ReadChar();

            if (c != '\n')
            {
                Input.UngetChar(c);
            }

            c = '\n';
        }

        /* every character is appended first; the states below remove it again when it is not wanted */
        AddCharToLexer(c);

        short map;

        switch (State)
        {
            case LEX_CONTENT:
                map = Map((char) c);

                /*
                  Discard white space if appropriate. Its cheaper
                  to do this here rather than in parser methods
                  for elements that don't have mixed content.
                */
                if (((map & WHITE) != 0) && (mode == IGNORE_WHITESPACE) && Lexsize == Txtstart + 1)
                {
                    --Lexsize;
                    Waswhite = false;
                    Lines = Input.CursorLine;
                    Columns = Input.CursorColumn;
                    continue;
                }

                if (c == '<')
                {
                    State = LEX_GT;
                    continue;
                }

                if ((map & WHITE) != 0)
                {
                    /* was previous char white? */
                    if (Waswhite)
                    {
                        if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
                        {
                            --Lexsize;
                            Lines = Input.CursorLine;
                            Columns = Input.CursorColumn;
                        }
                    }
                    /* prev char wasn't white */
                    else
                    {
                        Waswhite = true;

                        if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
                        {
                            ChangeChar((byte) ' ');
                        }
                    }

                    continue;
                }

                if (c == '&' && mode != IGNORE_MARKUP)
                {
                    ParseEntity(mode);
                }

                /* this is needed to avoid trimming trailing whitespace */
                if (mode == IGNORE_WHITESPACE)
                {
                    mode = MIXED_CONTENT;
                }

                Waswhite = false;
                continue;

            case LEX_GT:
                /* the previous character was '<'; c decides what kind of markup follows */
                if (c == '/')
                {
                    c = Input.ReadChar();
                    if (c == StreamIn.END_OF_STREAM)
                    {
                        Input.UngetChar(c);
                        continue;
                    }

                    AddCharToLexer(c);
                    map = Map((char) c);

                    if ((map & LETTER) != 0)
                    {
                        /* drop the three characters just appended: '<', '/' and the letter */
                        Lexsize -= 3;
                        Txtend = Lexsize;
                        Input.UngetChar(c);
                        State = LEX_ENDTAG;
                        Lexbuf[Lexsize] = (byte) '\x0000'; /* debug */
                        Input.CursorColumn -= 2;

                        /* if some text before the </ return it now */
                        if (Txtend > Txtstart)
                        {
                            /* trim space char before end tag */
                            if (mode == IGNORE_WHITESPACE && Lexbuf[Lexsize - 1] == (byte) ' ')
                            {
                                Lexsize -= 1;
                                Txtend = Lexsize;
                            }

                            Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                            return Token;
                        }

                        continue; /* no text so keep going */
                    }

                    /* otherwise treat as CDATA */
                    Waswhite = false;
                    State = LEX_CONTENT;
                    continue;
                }

                if (mode == IGNORE_MARKUP)
                {
                    /* otherwise treat as CDATA */
                    Waswhite = false;
                    State = LEX_CONTENT;
                    continue;
                }

                /*
                  look out for comments, doctype or marked sections
                  this isn't quite right, but its getting there ...
                */
                if (c == '!')
                {
                    c = Input.ReadChar();

                    if (c == '-')
                    {
                        c = Input.ReadChar();

                        if (c == '-')
                        {
                            State = LEX_COMMENT; /* comment */
                            Lexsize -= 2;
                            Txtend = Lexsize;

                            /* if some text before < return it now */
                            if (Txtend > Txtstart)
                            {
                                Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                                return Token;
                            }

                            Txtstart = Lexsize;
                            continue;
                        }

                        Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
                    }
                    else if (c == 'd' || c == 'D')
                    {
                        State = LEX_DOCTYPE; /* doctype */
                        Lexsize -= 2;
                        Txtend = Lexsize;
                        mode = IGNORE_WHITESPACE;

                        /* skip until white space or '>' */
                        for (;;)
                        {
                            c = Input.ReadChar();

                            if (c == StreamIn.END_OF_STREAM || c == '>')
                            {
                                Input.UngetChar(c);
                                break;
                            }

                            map = Map((char) c);

                            if ((map & WHITE) == 0)
                            {
                                continue;
                            }

                            /* and skip to end of whitespace */
                            for (;;)
                            {
                                c = Input.ReadChar();

                                if (c == StreamIn.END_OF_STREAM || c == '>')
                                {
                                    Input.UngetChar(c);
                                    break;
                                }

                                map = Map((char) c);

                                if ((map & WHITE) != 0)
                                {
                                    continue;
                                }

                                Input.UngetChar(c);
                                break;
                            }

                            break;
                        }

                        /* if some text before < return it now */
                        if (Txtend > Txtstart)
                        {
                            Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                            return Token;
                        }

                        Txtstart = Lexsize;
                        continue;
                    }
                    else if (c == '[')
                    {
                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
                        Lexsize -= 2;
                        State = LEX_SECTION;
                        Txtend = Lexsize;

                        /* if some text before < return it now */
                        if (Txtend > Txtstart)
                        {
                            Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                            return Token;
                        }

                        Txtstart = Lexsize;
                        continue;
                    }

                    /* otherwise swallow chars up to and including next '>' */
                    while (true)
                    {
                        c = Input.ReadChar();
                        if (c == '>')
                        {
                            break;
                        }

                        /* same end-of-input sentinel as every other check in this method */
                        if (c == StreamIn.END_OF_STREAM)
                        {
                            Input.UngetChar(c);
                            break;
                        }
                    }

                    Lexsize -= 2;
                    Lexbuf[Lexsize] = (byte) '\x0000';
                    State = LEX_CONTENT;
                    continue;
                }

                /*
                  processing instructions
                */
                if (c == '?')
                {
                    Lexsize -= 2;
                    State = LEX_PROCINSTR;
                    Txtend = Lexsize;

                    /* if some text before < return it now */
                    if (Txtend > Txtstart)
                    {
                        Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                        return Token;
                    }

                    Txtstart = Lexsize;
                    continue;
                }

                /* Microsoft ASP's e.g. <% ... server-code ... %> */
                if (c == '%')
                {
                    Lexsize -= 2;
                    State = LEX_ASP;
                    Txtend = Lexsize;

                    /* if some text before < return it now */
                    if (Txtend > Txtstart)
                    {
                        Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                        return Token;
                    }

                    Txtstart = Lexsize;
                    continue;
                }

                /* Netscapes JSTE e.g. <# ... server-code ... #> */
                if (c == '#')
                {
                    Lexsize -= 2;
                    State = LEX_JSTE;
                    Txtend = Lexsize;

                    /* if some text before < return it now */
                    if (Txtend > Txtstart)
                    {
                        Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                        return Token;
                    }

                    Txtstart = Lexsize;
                    continue;
                }

                map = Map((char) c);

                /* check for start tag */
                if ((map & LETTER) != 0)
                {
                    Input.UngetChar(c); /* push back letter */
                    Lexsize -= 2; /* discard "<" + letter */
                    Txtend = Lexsize;
                    State = LEX_STARTTAG; /* ready to read tag name */

                    /* if some text before < return it now */
                    if (Txtend > Txtstart)
                    {
                        Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
                        return Token;
                    }

                    continue; /* no text so keep going */
                }

                /* otherwise treat as CDATA */
                State = LEX_CONTENT;
                Waswhite = false;
                continue;

            case LEX_ENDTAG:
                Txtstart = Lexsize - 1;
                Input.CursorColumn += 2;
                c = ParseTagName();
                Token = NewNode(Node.END_TAG, Lexbuf, Txtstart, Txtend, GetString(Lexbuf, Txtstart, Txtend - Txtstart));
                Lexsize = Txtstart;
                Txtend = Txtstart;

                /* skip to '>' */
                while (c != '>')
                {
                    c = Input.ReadChar();

                    if (c == StreamIn.END_OF_STREAM)
                    {
                        break;
                    }
                }

                if (c == StreamIn.END_OF_STREAM)
                {
                    Input.UngetChar(c);
                    continue;
                }

                State = LEX_CONTENT;
                Waswhite = false;
                return Token; /* the endtag token */

            case LEX_STARTTAG:
                Txtstart = Lexsize - 1; /* set txtstart to first letter */
                c = ParseTagName();
                isempty.Val = false;
                AttVal attributes = null;

                /* created as a plain start tag; promoted below if the attribute parser saw empty-element syntax */
                Token = NewNode(Node.START_TAG, Lexbuf, Txtstart, Txtend, GetString(Lexbuf, Txtstart, Txtend - Txtstart));

                /* parse attributes, consuming closing ">" */
                if (c != '>')
                {
                    if (c == '/')
                    {
                        Input.UngetChar(c);
                    }

                    attributes = ParseAttrs(isempty);
                }

                if (isempty.Val)
                {
                    Token.Type = Node.START_END_TAG;
                }

                Token.Attributes = attributes;
                Lexsize = Txtstart;
                Txtend = Txtstart;

                /* swallow newline following start tag */
                /* special check needed for CRLF sequence */
                /* this doesn't apply to empty elements */
                if (ExpectsContent(Token) || Token.Tag == Options.TagTable.TagBr)
                {
                    c = Input.ReadChar();

                    if (c == '\r')
                    {
                        c = Input.ReadChar();

                        if (c != '\n')
                        {
                            Input.UngetChar(c);
                        }
                    }
                    else if (c != '\n' && c != '\f')
                    {
                        Input.UngetChar(c);
                    }

                    Waswhite = true; /* to swallow leading whitespace */
                }
                else
                {
                    Waswhite = false;
                }

                State = LEX_CONTENT;

                if (Token.Tag == null)
                {
                    Report.Error(this, null, Token, Report.UNKNOWN_ELEMENT);
                }
                else if (!Options.XmlTags)
                {
                    Versions &= Token.Tag.Versions;

                    if ((Token.Tag.Versions & HtmlVersion.Proprietary) != 0)
                    {
                        if (!Options.MakeClean && (Token.Tag == Options.TagTable.TagNobr || Token.Tag == Options.TagTable.TagWbr))
                        {
                            Report.Warning(this, null, Token, Report.PROPRIETARY_ELEMENT);
                        }
                    }

                    if (Token.Tag.CheckAttribs != null)
                    {
                        Token.CheckUniqueAttributes(this);
                        Token.Tag.CheckAttribs.Check(this, Token);
                    }
                    else
                    {
                        Token.CheckAttributes(this);
                    }
                }

                return Token; /* return start tag */

            case LEX_COMMENT:
                if (c != '-')
                {
                    continue;
                }

                c = Input.ReadChar();
                AddCharToLexer(c);

                if (c != '-')
                {
                    continue;
                }

                /* two consecutive '-' seen: a following '>' ends the comment */
                while (true)
                {
                    c = Input.ReadChar();

                    if (c == '>')
                    {
                        if (badcomment != 0)
                        {
                            Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
                        }

                        Txtend = Lexsize - 2; // AQ 8Jul2000
                        Lexbuf[Lexsize] = (byte) '\x0000';
                        State = LEX_CONTENT;
                        Waswhite = false;
                        Token = NewNode(Node.COMMENT_TAG, Lexbuf, Txtstart, Txtend);

                        /* now look for a line break */
                        c = Input.ReadChar();

                        if (c == '\r')
                        {
                            c = Input.ReadChar();

                            if (c != '\n')
                            {
                                Token.Linebreak = true;
                            }
                        }

                        if (c == '\n')
                        {
                            Token.Linebreak = true;
                        }
                        else
                        {
                            Input.UngetChar(c);
                        }

                        return Token;
                    }

                    /* note position of first such error in the comment */
                    if (badcomment == 0)
                    {
                        Lines = Input.CursorLine;
                        Columns = Input.CursorColumn - 3;
                    }

                    badcomment++;

                    if (Options.FixComments)
                    {
                        Lexbuf[Lexsize - 2] = (byte) '=';
                    }

                    AddCharToLexer(c);

                    /* if '-' then look for '>' to end the comment */
                    if (c != '-')
                    {
                        break;
                    }
                }

                /* otherwise continue to look for --> */
                Lexbuf[Lexsize - 2] = (byte) '=';
                continue;

            case LEX_DOCTYPE:
                map = Map((char) c);

                /* collapse runs of white space inside the doctype */
                if ((map & WHITE) != 0)
                {
                    if (Waswhite)
                    {
                        Lexsize -= 1;
                    }

                    Waswhite = true;
                }
                else
                {
                    Waswhite = false;
                }

                if (c != '>')
                {
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.DOC_TYPE_TAG, Lexbuf, Txtstart, Txtend);

                /* make a note of the version named by the doctype */
                Doctype = FindGivenVersion(Token);
                return Token;

            case LEX_PROCINSTR:
                /* a processing instruction whose first three characters are "php" is lexed as PHP */
                if (Lexsize - Txtstart == 3)
                {
                    if ((GetString(Lexbuf, Txtstart, 3)).Equals("php"))
                    {
                        State = LEX_PHP;
                        continue;
                    }
                }

                if (Options.XmlPIs)
                {
                    /* insist on ?> as terminator */
                    if (c != '?')
                    {
                        continue;
                    }

                    /* now look for '>' */
                    c = Input.ReadChar();

                    if (c == StreamIn.END_OF_STREAM)
                    {
                        Report.Warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
                        Input.UngetChar(c);
                        continue;
                    }

                    AddCharToLexer(c);
                }

                if (c != '>')
                {
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.PROC_INS_TAG, Lexbuf, Txtstart, Txtend);
                return Token;

            case LEX_ASP:
                if (c != '%')
                {
                    continue;
                }

                /* now look for '>' */
                c = Input.ReadChar();

                if (c != '>')
                {
                    Input.UngetChar(c);
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.ASP_TAG, Lexbuf, Txtstart, Txtend);
                return Token;

            case LEX_JSTE:
                if (c != '#')
                {
                    continue;
                }

                /* now look for '>' */
                c = Input.ReadChar();

                if (c != '>')
                {
                    Input.UngetChar(c);
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.JSTE_TAG, Lexbuf, Txtstart, Txtend);
                return Token;

            case LEX_PHP:
                if (c != '?')
                {
                    continue;
                }

                /* now look for '>' */
                c = Input.ReadChar();

                if (c != '>')
                {
                    Input.UngetChar(c);
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.PHP_TAG, Lexbuf, Txtstart, Txtend);
                return Token;

            case LEX_SECTION:
                if (c == '[')
                {
                    /* a section that starts with exactly "CDATA[" switches to CDATA lexing */
                    if (Lexsize == (Txtstart + 6) && (GetString(Lexbuf, Txtstart, 6)).Equals("CDATA["))
                    {
                        State = LEX_CDATA;
                        Lexsize -= 6;
                        continue;
                    }
                }

                if (c != ']')
                {
                    continue;
                }

                /* now look for '>' */
                c = Input.ReadChar();

                if (c != '>')
                {
                    Input.UngetChar(c);
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.SECTION_TAG, Lexbuf, Txtstart, Txtend);
                return Token;

            case LEX_CDATA:
                if (c != ']')
                {
                    continue;
                }

                /* now look for ']' */
                c = Input.ReadChar();

                if (c != ']')
                {
                    Input.UngetChar(c);
                    continue;
                }

                /* now look for '>' */
                c = Input.ReadChar();

                if (c != '>')
                {
                    Input.UngetChar(c);
                    continue;
                }

                Lexsize -= 1;
                Txtend = Lexsize;
                Lexbuf[Lexsize] = (byte) '\x0000';
                State = LEX_CONTENT;
                Waswhite = false;
                Token = NewNode(Node.CDATA_TAG, Lexbuf, Txtstart, Txtend);
                return Token;
        }
    }

    /* end of input reached: flush whatever is still pending */
    if (State == LEX_CONTENT)
    {
        /* text string */
        Txtend = Lexsize;

        if (Txtend > Txtstart)
        {
            Input.UngetChar(c);

            /* drop a single trailing space */
            if (Lexbuf[Lexsize - 1] == (byte) ' ')
            {
                Lexsize -= 1;
                Txtend = Lexsize;
            }

            Token = NewNode(Node.TEXT_NODE, Lexbuf, Txtstart, Txtend);
            return Token;
        }
    }
    else if (State == LEX_COMMENT)
    {
        /* comment that was never closed */
        if (c == StreamIn.END_OF_STREAM)
        {
            Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
        }

        Txtend = Lexsize;
        Lexbuf[Lexsize] = (byte) '\x0000';
        State = LEX_CONTENT;
        Waswhite = false;
        Token = NewNode(Node.COMMENT_TAG, Lexbuf, Txtstart, Txtend);
        return Token;
    }

    return null;
}