/*
 * Parses the attribute list of the start tag currently being read and
 * returns it as a chain of AttVal nodes, or null when no attribute was
 * accepted.
 *
 * ParseAttribute() is called repeatedly until it returns null without
 * having produced an ASP or PHP node; as ParseAttribute() reads the
 * closing '>' itself, the '>' is swallowed by the time this method returns.
 *
 * isempty -- handed through to ParseAttribute()/ParseValue(), which set
 *            it to true when the tag turns out to end in "/>".
 *
 * Each accepted entry is constructed with the current head of the chain
 * as its first argument and then becomes the new head.
 */
public virtual AttVal ParseAttrs(MutableBoolean isempty)
{
    AttVal list = null;
    MutableInteger delim = new MutableInteger();
    MutableObject asp = new MutableObject();
    MutableObject php = new MutableObject();

    while (!EndOfInput())
    {
        string attribute = ParseAttribute(isempty, asp, php);

        if (attribute == null)
        {
            /* check if attributes are created by ASP markup */
            if (asp.Object != null)
            {
                list = new AttVal(list, null, (Node) asp.Object, null, '\x0000', null, null);
                continue;
            }

            /* check if attributes are created by PHP markup */
            if (php.Object != null)
            {
                list = new AttVal(list, null, null, (Node) php.Object, '\x0000', null, null);
                continue;
            }

            /* nothing more to read for this tag */
            break;
        }

        string val = ParseValue(attribute, false, isempty, delim);

        /* attribute cannot be null here: the null case was handled above */
        if (IsValidAttrName(attribute))
        {
            AttVal av = new AttVal(list, null, null, null, delim.Val, attribute, val);
            av.Dict = AttributeTable.DefaultAttributeTable.FindAttribute(av);
            list = av;
        }
        else
        {
            /*
             * An attribute with an invalid name is reported and left out of
             * the chain.
             * NOTE(review): a throw-away AttVal used to be allocated here and
             * never used; it was dropped on the assumption that the AttVal
             * constructor has no side effects -- confirm.
             */
            Report.AttrError(this, token, val, Report.BAD_ATTRIBUTE_VALUE);
        }
    }

    return list;
}
/*
 * Reads the next attribute name of the tag being parsed.
 *
 * Returns the name, or null when no name could be read. Null is returned
 * when:
 *   - "/>" was read (isempty.Val is set to true),
 *   - '>' was read (so the '>' terminating the start tag is consumed here),
 *   - a '<' was read: "<%" stores the result of ParseAsp() in asp.Object,
 *     "<?" stores the result of ParsePhp() in php.Object, anything else is
 *     pushed back and reported as UNEXPECTED_GT,
 *   - the end of the input was reached (reported, then pushed back),
 *   - the collected name is empty.
 *
 * asp.Object and php.Object are cleared on entry. The name is collected
 * at the end of the lexer buffer and removed from it again before
 * returning.
 */
public virtual string ParseAttribute(MutableBoolean isempty, MutableObject asp, MutableObject php)
{
    int ch;
    short flags;

    /* forget server-side nodes reported by an earlier call */
    asp.Object = null;
    php.Object = null;

    /* skip white space before the attribute */
    while (true)
    {
        ch = input.ReadChar();

        if (ch == '/')
        {
            int next = input.ReadChar();
            if (next == '>')
            {
                isempty.Val = true;
                return null;
            }

            /* a lone '/': push the peeked char back and let '/' start the name */
            input.UngetChar(next);
            break;
        }

        if (ch == '>')
        {
            return null;
        }

        if (ch == '<')
        {
            int next = input.ReadChar();
            if (next == '%')
            {
                asp.Object = ParseAsp();
            }
            else if (next == '?')
            {
                php.Object = ParsePhp();
            }
            else
            {
                input.UngetChar(next);
                Report.AttrError(this, token, null, Report.UNEXPECTED_GT);
            }
            return null;
        }

        if (ch == '"' || ch == '\'')
        {
            /* stray quote mark: report it and keep skipping */
            Report.AttrError(this, token, null, Report.UNEXPECTED_QUOTEMARK);
            continue;
        }

        if (ch == StreamIn.EndOfStream)
        {
            Report.AttrError(this, token, null, Report.UNEXPECTED_END_OF_FILE);
            input.UngetChar(ch);
            return null;
        }

        flags = MAP((char) ch);
        if ((flags & WHITE) == 0)
        {
            break;
        }
    }

    int nameStart = lexsize;

    /* collect the name; ch already holds its first character */
    while (true)
    {
        /* '=' and '>' are pushed back for ParseValue()/the caller, as are '<' and end of input */
        bool terminator = ch == '=' || ch == '>' || ch == '<' || ch == StreamIn.EndOfStream;
        if (terminator)
        {
            input.UngetChar(ch);
            break;
        }

        flags = MAP((char) ch);
        if ((flags & WHITE) != 0)
        {
            break;
        }

        /* non-namechar characters are currently incorporated into the name */

        /* fold upper case to lower case unless XML tags are being parsed */
        if (!Options.XmlTags && (flags & UPPERCASE) != 0)
        {
            ch += 'a' - 'A';
        }

        AddCharToLexer(ch);
        ch = input.ReadChar();
    }

    /* the length is derived from the lexer position rather than counted per character */
    int nameLength = lexsize - nameStart;
    string attrName = null;
    if (nameLength > 0)
    {
        attrName = GetString(lexbuf, nameStart, nameLength);
    }

    /* drop the name from the lexer buffer again */
    lexsize = nameStart;
    return attrName;
}
/*
 * Parses the value of the attribute called 'name'. Values start with "="
 * or " = " etc. The '>' at the end of the start tag is not consumed.
 *
 * name     -- attribute name; used to recognise URL and script attributes.
 * foldCase -- when true, upper case characters of the value are folded to
 *             lower case.
 * isempty  -- set to true when an unquoted value is followed by "/>" and
 *             the attribute is not a URL attribute.
 * pdelim   -- receives the quote character that delimited the value, '"'
 *             when the value was unquoted, or the result of
 *             ParseServerInstruction() when the value starts with '<'.
 *
 * Returns the value text; null when there is no '=' after the name or when
 * an unquoted value is empty. A quoted empty value yields the result of
 * GetString() for a length of zero.
 *
 * The value is collected at the end of the lexer buffer and removed from it
 * again before returning.
 */
public virtual string ParseValue(string name, bool foldCase, MutableBoolean isempty, MutableInteger pdelim)
{
    const string JavascriptScheme = "javascript:";

    int len;
    int start;
    short map;
    bool seenGt = false;
    int c;
    int lastc;
    int delim = 0;
    int quotewarning = 0;
    string val;

    pdelim.Val = (int) '"';

    /*
     * Henry Zrepa reports that some folk are using the embed element with
     * script attributes where newlines are significant and must be preserved
     */
    bool munge = !Options.LiteralAttribs;

    /* skip white space before the '=' */
    for (; ; )
    {
        c = input.ReadChar();
        if (c == StreamIn.EndOfStream)
        {
            input.UngetChar(c);
            break;
        }

        map = MAP((char) c);
        if ((map & WHITE) == 0)
        {
            break;
        }
    }

    /*
     * c should be '=' if there is a value; other legal possibilities are
     * white space, '/' and '>'
     */
    if (c != '=')
    {
        input.UngetChar(c);
        return null;
    }

    /* skip white space after '=' */
    for (; ; )
    {
        c = input.ReadChar();
        if (c == StreamIn.EndOfStream)
        {
            input.UngetChar(c);
            break;
        }

        map = MAP((char) c);
        if ((map & WHITE) == 0)
        {
            break;
        }
    }

    /* check for quote marks */
    if (c == '"' || c == '\'')
    {
        delim = c;
    }
    else if (c == '<')
    {
        /* value is a server instruction: collect it and hand it back as is */
        start = lexsize;
        AddCharToLexer(c);
        pdelim.Val = ParseServerInstruction();
        len = lexsize - start;
        lexsize = start;
        return (len > 0 ? GetString(lexbuf, start, len) : null);
    }
    else
    {
        input.UngetChar(c);
    }

    /* and read the value string, checking for the quote mark if needed */
    start = lexsize;
    c = '\x0000';

    for (; ; )
    {
        lastc = c; /* track last character */
        c = input.ReadChar();

        if (c == StreamIn.EndOfStream)
        {
            Report.AttrError(this, token, null, Report.UNEXPECTED_END_OF_FILE);
            input.UngetChar(c);
            break;
        }

        if (delim == (char) 0)
        {
            /* unquoted value */
            if (c == '>')
            {
                input.UngetChar(c);
                break;
            }

            if (c == '"' || c == '\'')
            {
                Report.AttrError(this, token, null, Report.UNEXPECTED_QUOTEMARK);
                break;
            }

            if (c == '<')
            {
                /* reported only; the '<' stays part of the value */
                Report.AttrError(this, token, null, Report.UNEXPECTED_GT);
            }

            /*
             * For cases like <br clear=all/> need to avoid treating /> as
             * part of the attribute value, however care is needed to avoid
             * so treating <a href=http://www.acme.com/> in this way, which
             * would map the <a> tag to <a href="http://www.acme.com"/>
             */
            if (c == '/')
            {
                /* peek ahead in case of /> */
                c = input.ReadChar();
                if (c == '>' && !AttributeTable.DefaultAttributeTable.IsUrl(name))
                {
                    isempty.Val = true;
                    input.UngetChar(c);
                    break;
                }

                /* unget peeked char */
                input.UngetChar(c);
                c = '/';
            }
        }
        else
        {
            /* delim is '\'' or '"' */
            if (c == delim)
            {
                break;
            }

            /* treat CRLF, CR and LF as single line break */
            if (c == '\r')
            {
                c = input.ReadChar();
                if (c != '\n')
                {
                    input.UngetChar(c);
                }
                c = '\n';
            }

            if (c == '\n' || c == '<' || c == '>')
            {
                ++quotewarning;
            }

            if (c == '>')
            {
                seenGt = true;
            }
        }

        if (c == '&')
        {
            AddCharToLexer(c);
            ParseEntity((short) 0);
            continue;
        }

        /*
         * kludge for JavaScript attribute values with line continuations
         * in string literals
         */
        if (c == '\\')
        {
            c = input.ReadChar();
            if (c != '\n')
            {
                input.UngetChar(c);
                c = '\\';
            }
        }

        map = MAP((char) c);
        if ((map & WHITE) != 0)
        {
            /* white space ends an unquoted value */
            if (delim == (char) 0)
            {
                break;
            }

            if (munge)
            {
                /* replace white space by ' ' and skip it when the previous char was ' ' */
                c = ' ';
                if (lastc == ' ')
                {
                    continue;
                }
            }
        }
        else if (foldCase && (map & UPPERCASE) != 0)
        {
            c += (int) ('a' - 'A');
        }

        AddCharToLexer(c);
    }

    if (quotewarning > 10 && seenGt && munge)
    {
        /*
         * there is almost certainly a missing trailing quote mark as we
         * have seen too many newlines, < or > characters.
         *
         * an exception is made for Javascript attributes and the javascript
         * URL scheme which may legitimately include < and >
         *
         * The scheme prefix is only read when that many bytes were written
         * for this value, so bytes beyond the current lexer position are
         * never inspected.
         */
        int written = lexsize - start;
        if (!AttributeTable.DefaultAttributeTable.IsScript(name)
            && !(AttributeTable.DefaultAttributeTable.IsUrl(name)
                 && written >= JavascriptScheme.Length
                 && (GetString(lexbuf, start, JavascriptScheme.Length)).Equals(JavascriptScheme)))
        {
            Report.Error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
        }
    }

    len = lexsize - start;
    lexsize = start;

    if (len > 0 || delim != 0)
    {
        val = GetString(lexbuf, start, len);
    }
    else
    {
        val = null;
    }

    /* note delimiter if given */
    if (delim != 0)
    {
        pdelim.Val = delim;
    }
    else
    {
        pdelim.Val = (int) '"';
    }

    return val;
}
/*
 * Returns the next token from the input as a Node, or null when the input
 * is exhausted and there is no pending text or comment to return.
 *
 * modes for GetToken():
 *   MixedContent     -- for elements which don't accept PCDATA
 *   Preformatted     -- white space preserved as is
 *   IgnoreMarkup     -- for CDATA elements such as script, style
 *   IgnoreWhitespace -- white space at the very start of the text is
 *                       discarded
 *
 * A token that was pushed back is returned again, and tokens for unclosed
 * inline elements are produced through InsertedToken() before any new
 * input is read. Otherwise characters are read one at a time, appended to
 * the lexer buffer and dispatched on the lexer state. Text collected before
 * a piece of markup is returned as a text node first; the markup itself is
 * tokenised by the following call, as the state is kept between calls.
 */
public virtual Node GetToken(short mode)
{
    short map;
    int c = 0;
    int badcomment = 0;
    MutableBoolean isempty = new MutableBoolean();
    AttVal attributes;

    if (pushed)
    {
        /* duplicate inlines in preference to pushed text nodes when appropriate */
        if (token.Type != Node.TextNode || (insert == -1 && inode == null))
        {
            pushed = false;
            return token;
        }
    }

    /*
     * at start of block elements, unclosed inline elements are inserted
     * into the token stream
     */
    if (insert != -1 || inode != null)
    {
        return InsertedToken();
    }

    /* remember where this token starts in the input */
    lines = input.curline;
    columns = input.curcol;
    waswhite = false;

    txtstart = lexsize;
    txtend = lexsize;

    while (true)
    {
        c = input.ReadChar();
        if (c == StreamIn.EndOfStream)
        {
            break;
        }

        if (insertspace && mode != IgnoreWhitespace)
        {
            AddCharToLexer(' ');
            waswhite = true;
            insertspace = false;
        }

        /* treat \r\n as \n and \r as \n */
        if (c == '\r')
        {
            c = input.ReadChar();
            if (c != '\n')
            {
                input.UngetChar(c);
            }
            c = '\n';
        }

        AddCharToLexer(c);

        switch (state)
        {
            case LEX_CONTENT:
                map = MAP((char) c);

                /*
                 * Discard white space if appropriate. It's cheaper to do
                 * this here rather than in parser methods for elements
                 * that don't have mixed content.
                 */
                if (((map & WHITE) != 0) && (mode == IgnoreWhitespace) && lexsize == txtstart + 1)
                {
                    --lexsize;
                    waswhite = false;
                    lines = input.curline;
                    columns = input.curcol;
                    continue;
                }

                if (c == '<')
                {
                    state = LEX_GT;
                    continue;
                }

                if ((map & WHITE) != 0)
                {
                    /* was previous char white? */
                    if (waswhite)
                    {
                        if (mode != Preformatted && mode != IgnoreMarkup)
                        {
                            --lexsize;
                            lines = input.curline;
                            columns = input.curcol;
                        }
                    }
                    else
                    {
                        /* prev char wasn't white */
                        waswhite = true;
                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
                        {
                            ChangeChar((byte) ' ');
                        }
                    }
                    continue;
                }
                else if (c == '&' && mode != IgnoreMarkup)
                {
                    ParseEntity(mode);
                }

                /* this is needed to avoid trimming trailing whitespace */
                if (mode == IgnoreWhitespace)
                {
                    mode = MixedContent;
                }

                waswhite = false;
                continue;

            case LEX_GT:
                /* the previous character was '<' */
                if (c == '/')
                {
                    c = input.ReadChar();
                    if (c == StreamIn.EndOfStream)
                    {
                        input.UngetChar(c);
                        continue;
                    }

                    AddCharToLexer(c);
                    map = MAP((char) c);

                    if ((map & LETTER) != 0)
                    {
                        /* drop '<', '/' and the letter from the buffer; the letter is re-read by the end tag state */
                        lexsize -= 3;
                        txtend = lexsize;
                        input.UngetChar(c);
                        state = LEX_ENDTAG;
                        lexbuf[lexsize] = (byte) '\x0000'; /* debug */
                        input.curcol -= 2;

                        /* if some text before the end tag return it now */
                        if (txtend > txtstart)
                        {
                            /* trim space char before end tag */
                            if (mode == IgnoreWhitespace && lexbuf[lexsize - 1] == (byte) ' ')
                            {
                                lexsize -= 1;
                                txtend = lexsize;
                            }

                            token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                            return token;
                        }

                        continue; /* no text so keep going */
                    }

                    /* otherwise treat as CDATA */
                    waswhite = false;
                    state = LEX_CONTENT;
                    continue;
                }

                if (mode == IgnoreMarkup)
                {
                    /* otherwise treat as CDATA */
                    waswhite = false;
                    state = LEX_CONTENT;
                    continue;
                }

                /*
                 * look out for comments, doctype or marked sections;
                 * this isn't quite right, but it's getting there ...
                 */
                if (c == '!')
                {
                    c = input.ReadChar();
                    if (c == '-')
                    {
                        c = input.ReadChar();
                        if (c == '-')
                        {
                            state = LEX_COMMENT; /* comment */
                            lexsize -= 2;
                            txtend = lexsize;

                            /* if some text before < return it now */
                            if (txtend > txtstart)
                            {
                                token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                                return token;
                            }

                            txtstart = lexsize;
                            continue;
                        }

                        Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
                    }
                    else if (c == 'd' || c == 'D')
                    {
                        state = LEX_DOCTYPE; /* doctype */
                        lexsize -= 2;
                        txtend = lexsize;
                        mode = IgnoreWhitespace;

                        /* skip until white space or '>' */
                        for (; ; )
                        {
                            c = input.ReadChar();
                            if (c == StreamIn.EndOfStream || c == '>')
                            {
                                input.UngetChar(c);
                                break;
                            }

                            map = MAP((char) c);
                            if ((map & WHITE) == 0)
                            {
                                continue;
                            }

                            /* and skip to end of whitespace */
                            for (; ; )
                            {
                                c = input.ReadChar();
                                if (c == StreamIn.EndOfStream || c == '>')
                                {
                                    input.UngetChar(c);
                                    break;
                                }

                                map = MAP((char) c);
                                if ((map & WHITE) != 0)
                                {
                                    continue;
                                }

                                input.UngetChar(c);
                                break;
                            }

                            break;
                        }

                        /* if some text before < return it now */
                        if (txtend > txtstart)
                        {
                            token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                            return token;
                        }

                        txtstart = lexsize;
                        continue;
                    }
                    else if (c == '[')
                    {
                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
                        lexsize -= 2;
                        state = LEX_SECTION;
                        txtend = lexsize;

                        /* if some text before < return it now */
                        if (txtend > txtstart)
                        {
                            token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                            return token;
                        }

                        txtstart = lexsize;
                        continue;
                    }

                    /* otherwise swallow chars up to and including next '>' */
                    while (true)
                    {
                        c = input.ReadChar();
                        if (c == '>')
                        {
                            break;
                        }

                        /* same end-of-input sentinel as everywhere else in this method */
                        if (c == StreamIn.EndOfStream)
                        {
                            input.UngetChar(c);
                            break;
                        }
                    }

                    lexsize -= 2;
                    lexbuf[lexsize] = (byte) '\x0000';
                    state = LEX_CONTENT;
                    continue;
                }

                /* processing instructions */
                if (c == '?')
                {
                    lexsize -= 2;
                    state = LEX_PROCINSTR;
                    txtend = lexsize;

                    /* if some text before < return it now */
                    if (txtend > txtstart)
                    {
                        token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                        return token;
                    }

                    txtstart = lexsize;
                    continue;
                }

                /* Microsoft ASP's e.g. <% ... server-code ... %> */
                if (c == '%')
                {
                    lexsize -= 2;
                    state = LEX_ASP;
                    txtend = lexsize;

                    /* if some text before < return it now */
                    if (txtend > txtstart)
                    {
                        token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                        return token;
                    }

                    txtstart = lexsize;
                    continue;
                }

                /* Netscapes JSTE e.g. <# ... server-code ... #> */
                if (c == '#')
                {
                    lexsize -= 2;
                    state = LEX_JSTE;
                    txtend = lexsize;

                    /* if some text before < return it now */
                    if (txtend > txtstart)
                    {
                        token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                        return token;
                    }

                    txtstart = lexsize;
                    continue;
                }

                map = MAP((char) c);

                /* check for start tag */
                if ((map & LETTER) != 0)
                {
                    input.UngetChar(c); /* push back letter */
                    lexsize -= 2; /* discard "<" + letter */
                    txtend = lexsize;
                    state = LEX_STARTTAG; /* ready to read tag name */

                    /* if some text before < return it now */
                    if (txtend > txtstart)
                    {
                        token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
                        return token;
                    }

                    continue; /* no text so keep going */
                }

                /* otherwise treat as CDATA */
                state = LEX_CONTENT;
                waswhite = false;
                continue;

            case LEX_ENDTAG:
                txtstart = lexsize - 1;
                /* undo the column adjustment made when the end tag was detected */
                input.curcol += 2;
                c = ParseTagName();
                token = NewNode(Node.EndTag, lexbuf, txtstart, txtend, GetString(lexbuf, txtstart, txtend - txtstart));
                lexsize = txtstart;
                txtend = txtstart;

                /* skip to '>' */
                while (c != '>')
                {
                    c = input.ReadChar();
                    if (c == StreamIn.EndOfStream)
                    {
                        break;
                    }
                }

                if (c == StreamIn.EndOfStream)
                {
                    input.UngetChar(c);
                    continue;
                }

                state = LEX_CONTENT;
                waswhite = false;
                return token; /* the endtag token */

            case LEX_STARTTAG:
                txtstart = lexsize - 1; /* set txtstart to first letter */
                c = ParseTagName();
                isempty.Val = false;
                attributes = null;

                /* created as a plain start tag; switched to StartEndTag below if "/>" is found */
                token = NewNode(Node.StartTag, lexbuf, txtstart, txtend, GetString(lexbuf, txtstart, txtend - txtstart));

                /* parse attributes, consuming closing ">" */
                if (c != '>')
                {
                    if (c == '/')
                    {
                        input.UngetChar(c);
                    }

                    attributes = ParseAttrs(isempty);
                }

                if (isempty.Val)
                {
                    token.Type = Node.StartEndTag;
                }

                token.Attributes = attributes;
                lexsize = txtstart;
                txtend = txtstart;

                /* swallow newline following start tag */
                /* special check needed for CRLF sequence */
                /* this doesn't apply to empty elements */
                if (ExpectsContent(token) || token.Tag == Options.tt.TagBr)
                {
                    c = input.ReadChar();
                    if (c == '\r')
                    {
                        c = input.ReadChar();
                        if (c != '\n')
                        {
                            input.UngetChar(c);
                        }
                    }
                    else if (c != '\n' && c != '\f')
                    {
                        input.UngetChar(c);
                    }

                    waswhite = true; /* to swallow leading whitespace */
                }
                else
                {
                    waswhite = false;
                }

                state = LEX_CONTENT;

                if (token.Tag == null)
                {
                    Report.Error(this, null, token, Report.UNKNOWN_ELEMENT);
                }
                else if (!Options.XmlTags)
                {
                    versions &= token.Tag.Versions;

                    if ((token.Tag.Versions & HtmlVersion.Proprietary) != 0)
                    {
                        if (!Options.MakeClean && (token.Tag == Options.tt.TagNobr || token.Tag == Options.tt.TagWbr))
                        {
                            Report.Warning(this, null, token, Report.PROPRIETARY_ELEMENT);
                        }
                    }

                    if (token.Tag.CheckAttribs != null)
                    {
                        token.CheckUniqueAttributes(this);
                        token.Tag.CheckAttribs.Check(this, token);
                    }
                    else
                    {
                        token.CheckAttributes(this);
                    }
                }

                return token; /* return start tag */

            case LEX_COMMENT:
                if (c != '-')
                {
                    continue;
                }

                c = input.ReadChar();
                AddCharToLexer(c);
                if (c != '-')
                {
                    continue;
                }

                /* "--" seen: look for the '>' that ends the comment */
                while (true)
                {
                    c = input.ReadChar();

                    if (c == '>')
                    {
                        if (badcomment != 0)
                        {
                            Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
                        }

                        /* leave the trailing "--" out of the comment text */
                        txtend = lexsize - 2;
                        lexbuf[lexsize] = (byte) '\x0000';
                        state = LEX_CONTENT;
                        waswhite = false;
                        token = NewNode(Node.CommentTag, lexbuf, txtstart, txtend);

                        /* now look for a line break */
                        c = input.ReadChar();
                        if (c == '\r')
                        {
                            c = input.ReadChar();
                            if (c != '\n')
                            {
                                token.Linebreak = true;
                            }
                        }

                        if (c == '\n')
                        {
                            token.Linebreak = true;
                        }
                        else
                        {
                            input.UngetChar(c);
                        }

                        return token;
                    }

                    /* note position of first such error in the comment */
                    if (badcomment == 0)
                    {
                        lines = input.curline;
                        columns = input.curcol - 3;
                    }

                    badcomment++;

                    if (Options.FixComments)
                    {
                        lexbuf[lexsize - 2] = (byte) '=';
                    }

                    AddCharToLexer(c);

                    /* if '-' then look for '>' to end the comment */
                    if (c != '-')
                    {
                        break;
                    }
                }

                /* otherwise continue to look for --> */
                lexbuf[lexsize - 2] = (byte) '=';
                continue;

            case LEX_DOCTYPE:
                map = MAP((char) c);

                /* collapse runs of white space inside the doctype */
                if ((map & WHITE) != 0)
                {
                    if (waswhite)
                    {
                        lexsize -= 1;
                    }

                    waswhite = true;
                }
                else
                {
                    waswhite = false;
                }

                if (c != '>')
                {
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.DocTypeTag, lexbuf, txtstart, txtend);

                /* make a note of the version named by the doctype */
                doctype = FindGivenVersion(token);
                return token;

            case LEX_PROCINSTR:
                /* a processing instruction whose first three characters are "php" is handled as PHP */
                if (lexsize - txtstart == 3)
                {
                    if ((GetString(lexbuf, txtstart, 3)).Equals("php"))
                    {
                        state = LEX_PHP;
                        continue;
                    }
                }

                if (Options.XmlPIs)
                {
                    /* insist on ?> as terminator */
                    if (c != '?')
                    {
                        continue;
                    }

                    /* now look for '>' */
                    c = input.ReadChar();
                    if (c == StreamIn.EndOfStream)
                    {
                        Report.Warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
                        input.UngetChar(c);
                        continue;
                    }

                    AddCharToLexer(c);
                }

                if (c != '>')
                {
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.ProcInsTag, lexbuf, txtstart, txtend);
                return token;

            case LEX_ASP:
                if (c != '%')
                {
                    continue;
                }

                /* now look for '>' */
                c = input.ReadChar();
                if (c != '>')
                {
                    input.UngetChar(c);
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.AspTag, lexbuf, txtstart, txtend);
                return token;

            case LEX_JSTE:
                if (c != '#')
                {
                    continue;
                }

                /* now look for '>' */
                c = input.ReadChar();
                if (c != '>')
                {
                    input.UngetChar(c);
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.JsteTag, lexbuf, txtstart, txtend);
                return token;

            case LEX_PHP:
                if (c != '?')
                {
                    continue;
                }

                /* now look for '>' */
                c = input.ReadChar();
                if (c != '>')
                {
                    input.UngetChar(c);
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.PhpTag, lexbuf, txtstart, txtend);
                return token;

            case LEX_SECTION:
                if (c == '[')
                {
                    /* a section that starts with "CDATA[" is a CDATA section; drop that prefix */
                    if (lexsize == (txtstart + 6) && (GetString(lexbuf, txtstart, 6)).Equals("CDATA["))
                    {
                        state = LEX_CDATA;
                        lexsize -= 6;
                        continue;
                    }
                }

                if (c != ']')
                {
                    continue;
                }

                /* now look for '>' */
                c = input.ReadChar();
                if (c != '>')
                {
                    input.UngetChar(c);
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.SectionTag, lexbuf, txtstart, txtend);
                return token;

            case LEX_CDATA:
                if (c != ']')
                {
                    continue;
                }

                /* now look for ']' */
                c = input.ReadChar();
                if (c != ']')
                {
                    input.UngetChar(c);
                    continue;
                }

                /* now look for '>' */
                c = input.ReadChar();
                if (c != '>')
                {
                    input.UngetChar(c);
                    continue;
                }

                lexsize -= 1;
                txtend = lexsize;
                lexbuf[lexsize] = (byte) '\x0000';
                state = LEX_CONTENT;
                waswhite = false;
                token = NewNode(Node.CDATATag, lexbuf, txtstart, txtend);
                return token;
        }
    }

    /* end of input reached */
    if (state == LEX_CONTENT)
    {
        /* text string */
        txtend = lexsize;

        if (txtend > txtstart)
        {
            input.UngetChar(c);

            /* trim one trailing space */
            if (lexbuf[lexsize - 1] == (byte) ' ')
            {
                lexsize -= 1;
                txtend = lexsize;
            }

            token = NewNode(Node.TextNode, lexbuf, txtstart, txtend);
            return token;
        }
    }
    else if (state == LEX_COMMENT)
    {
        /* comment that was never closed */
        if (c == StreamIn.EndOfStream)
        {
            Report.Warning(this, null, null, Report.MALFORMED_COMMENT);
        }

        txtend = lexsize;
        lexbuf[lexsize] = (byte) '\x0000';
        state = LEX_CONTENT;
        waswhite = false;
        token = NewNode(Node.CommentTag, lexbuf, txtstart, txtend);
        return token;
    }

    return null;
}