/*
 * Decodes one UTF-8 style sequence whose lead byte is str[start] and stores
 * the decoded value in ch.Val.
 *
 * The number of leading 1 bits in the lead byte selects the sequence length:
 *   110X XXXX  two bytes      1110 XXXX  three bytes    1111 0XXX  four bytes
 *   1111 10XX  five bytes     1111 110X  six bytes
 * Every other lead byte (0XXX XXXX, 10XX XXXX, 0xFE, 0xFF) is treated as a
 * one-byte character and stored unchanged.
 *
 * Returns one less than the number of bytes used by the character, i.e. the
 * number of successor bytes consumed, so a caller can add it to its index.
 *
 * Successor bytes are expected to have the form 10XX XXXX but are not
 * validated; only their low six bits are used.
 */
public static int GetUTF8(byte[] str, int start, MutableInteger ch)
{
    int lead = str[start] & 0xFF;

    /* count the run of 1 bits at the top of the lead byte */
    int ones = 0;
    for (int mask = 0x80; mask != 0 && (lead & mask) != 0; mask >>= 1)
    {
        ones++;
    }

    /* anything other than 2..6 leading ones is handled as a single byte */
    if (ones < 2 || ones > 6)
    {
        ch.Val = lead;
        return 0;
    }

    /* payload bits of the lead byte are those below the terminating 0 bit */
    int value = lead & (0x7F >> ones);

    int stop = start + ones;
    for (int pos = start + 1; pos < stop; pos++)
    {
        value = (value << 6) | (str[pos] & 0x3F);
    }

    ch.Val = value;
    return ones - 1;
}
/*
 * Adds informational messages about the document type to lexer.messages.
 *
 * lexer   - supplies HtmlVersionName() and receives the messages.
 * doctype - the doctype node, or null when there is none.
 *
 * When doctype is not null, the text found between the first and the second
 * double-quote character inside doctype.Textarray[Start..End) is collected
 * and reported with the "doctype_given" message. A "report_version" message
 * is always added afterwards, using "HTML proprietary" when the lexer has no
 * version name.
 */
public static void ReportVersion(Lexer lexer, Node doctype)
{
    string vers = lexer.HtmlVersionName();

    if (doctype != null)
    {
        MutableInteger decoded = new MutableInteger();
        StringBuilder docTypeStr = new StringBuilder();
        int state = 0; /* number of double quotes seen so far */

        for (int i = doctype.Start; i < doctype.End; ++i)
        {
            int c = ((int) doctype.Textarray[i]) & 0xFF; // Convert to unsigned.

            /* look for UTF-8 multibyte character; the bytes are unsigned,
               so a lead byte shows up as a value above 0x7F, never as a
               negative number */
            if (c > 0x7F)
            {
                i += PPrint.GetUTF8(doctype.Textarray, i, decoded);
                c = decoded.Val;
            }

            if (c == '"')
            {
                ++state;
            }
            else if (state == 1)
            {
                /* only text inside the first quoted string is kept */
                docTypeStr.Append((char) c);
            }
        }

        lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("doctype_given"), docTypeStr), MessageLevel.Info));
    }

    lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("report_version"), (vers != null ? vers : "HTML proprietary")), MessageLevel.Info));
}
/*
 * Adds informational messages about the document type to lexer.messages.
 *
 * lexer   - supplies HtmlVersionName() and receives the messages.
 * doctype - the doctype node, or null when there is none.
 *
 * When doctype is not null, the text found between the first and the second
 * double-quote character inside doctype.Textarray[Start..End) is collected
 * and reported with the "doctype_given" message. A "report_version" message
 * is always added afterwards, using "HTML proprietary" when the lexer has no
 * version name.
 */
public static void ReportVersion(Lexer lexer, Node doctype)
{
    string vers = lexer.HtmlVersionName();

    if (doctype != null)
    {
        MutableInteger decoded = new MutableInteger();
        StringBuilder docTypeStr = new StringBuilder();
        int state = 0; /* number of double quotes seen so far */

        for (int i = doctype.Start; i < doctype.End; ++i)
        {
            int c = ((int) doctype.Textarray[i]) & 0xFF; // Convert to unsigned.

            /* look for UTF-8 multibyte character; the bytes are unsigned,
               so a lead byte shows up as a value above 0x7F, never as a
               negative number */
            if (c > 0x7F)
            {
                i += PPrint.GetUTF8(doctype.Textarray, i, decoded);
                c = decoded.Val;
            }

            if (c == '"')
            {
                ++state;
            }
            else if (state == 1)
            {
                /* only text inside the first quoted string is kept */
                docTypeStr.Append((char) c);
            }
        }

        lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("doctype_given"), docTypeStr), MessageLevel.Info));
    }

    lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("report_version"), (vers != null ? vers : "HTML proprietary")), MessageLevel.Info));
}
/*
 * Maps non-breaking spaces (code point 160) to regular spaces in every text
 * node reachable from node: the node itself, its following siblings (via
 * Next) and, recursively, their children (via Content).
 *
 * lexer - passed through to the recursive calls; not otherwise used here.
 * node  - first node of the sibling chain to process; may be null.
 *
 * The text is decoded from node.Textarray and re-encoded into the same
 * array, starting at node.Start. Replacing a multibyte non-breaking space
 * by a one-byte space makes the text shorter, so node.End is moved back to
 * the end of the rewritten text; otherwise the stale bytes left behind the
 * write position would still count as part of the node.
 */
private void NormalizeSpaces(Lexer lexer, Node node)
{
    while (node != null)
    {
        if (node.Content != null)
        {
            NormalizeSpaces(lexer, node.Content);
        }

        if (node.Type == Node.TextNode)
        {
            MutableInteger c = new MutableInteger();
            int p = node.Start; /* write position; never ahead of the read position i */

            for (int i = node.Start; i < node.End; ++i)
            {
                c.Val = (int) node.Textarray[i];

                /* look for UTF-8 multibyte character */
                if (c.Val > 0x7F)
                {
                    i += PPrint.GetUTF8(node.Textarray, i, c);
                }

                if (c.Val == 160)
                {
                    c.Val = ' ';
                }

                /* presumably PutUTF8 returns the offset just past the bytes
                   it wrote - it is used here as the next write position */
                p = PPrint.PutUTF8(node.Textarray, p, c.Val);
            }

            /* drop the left-over bytes behind the compacted text */
            node.End = p;
        }

        node = node.Next;
    }
}
/*
 * Prints the bytes textarray[start..end) as text.
 *
 * Each byte, or each multibyte UTF-8 sequence decoded with GetUTF8, is handed
 * to PrintChar as one value together with mode. A '\n' is not printed;
 * it flushes the current line with FlushLine instead. Before each character
 * the line is wrapped with WrapLine once indent + linelen has reached the
 * configured wrap length.
 *
 * The line buffer is uint not char so we can hold Unicode values unencoded.
 * The translation to UTF-8 is deferred to the outc routine called to flush
 * the line buffer.
 */
private void PrintText(Out fout, int mode, int indent, byte[] textarray, int start, int end)
{
    MutableInteger decoded = new MutableInteger();
    int pos = start;

    while (pos < end)
    {
        if (indent + linelen >= _options.WrapLen)
        {
            WrapLine(fout, indent);
        }

        int ch = textarray[pos] & 0xFF; /* treat the byte as unsigned */

        /* values above 0x7F start a UTF-8 multibyte character */
        if (ch > 0x7F)
        {
            pos += GetUTF8(textarray, pos, decoded);
            ch = decoded.Val;
        }

        /* step past the (last) byte of this character */
        pos++;

        if (ch == '\n')
        {
            FlushLine(fout, indent);
        }
        else
        {
            PrintChar(ch, mode);
        }
    }
}
/*
 * Prints "=" followed by the attribute value val enclosed in delim.
 *
 * fout      - output passed on to the line wrapping/flushing helpers.
 * indent    - current indentation, used for the wrap-length checks.
 * val       - attribute value; when null only "=" and the two delimiters
 *             are written.
 * delim     - quote character to use; 0 selects '"'.
 * wrappable - when true the value is printed in NORMAL mode and may be
 *             wrapped at spaces; otherwise PREFORMATTED mode is used.
 *
 * Inside the value:
 *  - the delimiter character itself is always written as an entity
 *    (&quot; for '"', &#39; otherwise) so it cannot end the value early;
 *  - the other quote character is written as an entity only when
 *    _options.QuoteMarks is set, and toggles InString;
 *  - '\n' flushes the line instead of being printed;
 *  - UTF-8 multibyte sequences are decoded and printed as one character.
 */
private void PrintAttrValue(Out fout, int indent, string val, int delim, bool wrappable)
{
    int c;
    MutableInteger ci = new MutableInteger();
    bool wasinstring = false;
    byte[] valueChars = null;
    int i;
    int mode = (wrappable ? (int) (NORMAL | ATTRIBVALUE) : (int) (PREFORMATTED | ATTRIBVALUE));

    if (val != null)
    {
        valueChars = Lexer.GetBytes(val);
    }

    /* look for ASP, Tango or PHP instructions for computed attribute value */
    if (valueChars != null && valueChars.Length >= 5 && valueChars[0] == '<')
    {
        /* "<%", "<@" or "<?php"; the bytes are compared directly so no
           temporary copy of the whole value is needed */
        bool isPhp = valueChars[1] == '?' && valueChars[2] == 'p' && valueChars[3] == 'h' && valueChars[4] == 'p';

        if (valueChars[1] == '%' || valueChars[1] == '@' || isPhp)
        {
            mode |= CDATA;
        }
    }

    if (delim == 0)
    {
        delim = '"';
    }

    AddC('=', linelen++);

    /* don't wrap after "=" for xml documents */
    if (!_options.XmlOut)
    {
        if (indent + linelen < _options.WrapLen)
        {
            wraphere = linelen;
        }

        if (indent + linelen >= _options.WrapLen)
        {
            WrapLine(fout, indent);
        }

        if (indent + linelen < _options.WrapLen)
        {
            wraphere = linelen;
        }
        else
        {
            CondFlushLine(fout, indent);
        }
    }

    AddC(delim, linelen++);

    if (val != null)
    {
        InString = false;

        i = 0;
        while (i < valueChars.Length)
        {
            c = ((int) valueChars[i]) & 0xFF; // Convert to unsigned.

            /* remember the last space that still fits as a wrap point */
            if (wrappable && c == ' ' && indent + linelen < _options.WrapLen)
            {
                wraphere = linelen;
                wasinstring = InString;
            }

            if (wrappable && wraphere > 0 && indent + linelen >= _options.WrapLen)
            {
                WrapAttrVal(fout, indent, wasinstring);
            }

            if (c == delim)
            {
                /* the delimiter must never appear literally inside the value */
                string entity = (c == '"' ? "&quot;" : "&#39;");

                for (int j = 0; j < entity.Length; j++)
                {
                    AddC(entity[j], linelen++);
                }

                ++i;
                continue;
            }
            else if (c == '"')
            {
                if (_options.QuoteMarks)
                {
                    AddC('&', linelen++);
                    AddC('q', linelen++);
                    AddC('u', linelen++);
                    AddC('o', linelen++);
                    AddC('t', linelen++);
                    AddC(';', linelen++);
                }
                else
                {
                    AddC('"', linelen++);
                }

                if (delim == '\'')
                {
                    InString = !InString;
                }

                ++i;
                continue;
            }
            else if (c == '\'')
            {
                if (_options.QuoteMarks)
                {
                    AddC('&', linelen++);
                    AddC('#', linelen++);
                    AddC('3', linelen++);
                    AddC('9', linelen++);
                    AddC(';', linelen++);
                }
                else
                {
                    AddC('\'', linelen++);
                }

                if (delim == '"')
                {
                    InString = !InString;
                }

                ++i;
                continue;
            }

            /* look for UTF-8 multibyte character */
            if (c > 0x7F)
            {
                i += GetUTF8(valueChars, i, ci);
                c = ci.Val;
            }

            ++i;

            if (c == '\n')
            {
                FlushLine(fout, indent);
                continue;
            }

            PrintChar(c, mode);
        }
    }

    InString = false;
    AddC(delim, linelen++);
}
/*
 * Reads attributes until ParseAttribute yields nothing more or the input
 * ends, and returns them as a linked AttVal list (null when none was
 * accepted). Each new entry is constructed with the previous list as its
 * first argument, so the returned entry is the one parsed last.
 *
 * isempty - handed to ParseAttribute and ParseValue.
 *
 * A null attribute name accompanied by an ASP or PHP node becomes a list
 * entry carrying that node. An attribute whose name fails IsValidAttrName
 * is reported through Report.AttrError and left out of the list.
 *
 * swallows closing '>'
 */
public virtual AttVal ParseAttrs(MutableBoolean isempty)
{
    MutableInteger delim = new MutableInteger();
    MutableObject asp = new MutableObject();
    MutableObject php = new MutableObject();
    AttVal head = null;

    while (!EndOfInput())
    {
        string attribute = ParseAttribute(isempty, asp, php);

        if (attribute == null)
        {
            /* check if attributes are created by ASP markup */
            if (asp.Object != null)
            {
                head = new AttVal(head, null, (Node) asp.Object, null, '\x0000', null, null);
                continue;
            }

            /* check if attributes are created by PHP markup */
            if (php.Object != null)
            {
                head = new AttVal(head, null, null, (Node) php.Object, '\x0000', null, null);
                continue;
            }

            /* nothing left to parse */
            break;
        }

        string val = ParseValue(attribute, false, isempty, delim);

        /* attribute is known to be non-null at this point */
        if (IsValidAttrName(attribute))
        {
            AttVal accepted = new AttVal(head, null, null, null, delim.Val, attribute, val);
            accepted.Dict = AttributeTable.DefaultAttributeTable.FindAttribute(accepted);
            head = accepted;
        }
        else
        {
            /* constructed as before, but never linked into the list */
            AttVal rejected = new AttVal(null, null, null, null, 0, attribute, val);
            Report.AttrError(this, token, val, Report.BAD_ATTRIBUTE_VALUE);
        }
    }

    return head;
}
/*
 * Parses the value part of an attribute from the input stream.
 *
 * Values start with "=" or " = " etc.
 * Doesn't consume the ">" at end of start tag.
 *
 * name     - attribute name; used to ask the attribute table whether the
 *            attribute is a URL or a script attribute.
 * foldCase - when true, characters flagged UPPERCASE are shifted by
 *            'a' - 'A' before being stored.
 * isempty  - set to true when an unquoted value of a non-URL attribute is
 *            followed by "/>".
 * pdelim   - receives the quote character that delimited the value, the
 *            result of ParseServerInstruction() for a value starting with
 *            '<', or '"' when the value was unquoted.
 *
 * Returns the value text, or null when no '=' follows the name or when an
 * unquoted value turns out to be empty. A quoted empty value yields a
 * string built from zero characters rather than null.
 *
 * The characters are collected in lexbuf starting at the current lexsize;
 * lexsize is reset to that start position before returning.
 */
public virtual string ParseValue(string name, bool foldCase, MutableBoolean isempty, MutableInteger pdelim)
{
    int len = 0;
    int start;
    short map;
    bool seen_gt = false;
    bool munge = true;
    int c = 0;
    int lastc, delim, quotewarning;
    string val;

    delim = 0;
    pdelim.Val = (int) '"';

    /* Henry Zrepa reports that some folk are using the embed element with
       script attributes where newlines are significant and must be
       preserved */
    if (Options.LiteralAttribs)
        munge = false;

    /* skip white space before the '=' */
    for (; ; )
    {
        c = input.ReadChar();
        if (c == StreamIn.EndOfStream)
        {
            input.UngetChar(c);
            break;
        }
        map = MAP((char) c);
        if ((map & WHITE) == 0)
        {
            break;
        }
    }

    /* c should be '=' if there is a value; other legal possibilities are
       white space, '/' and '>' */
    /* NOTE(review): on end of stream c was already pushed back by the loop
       above and is pushed back a second time here - confirm that UngetChar
       tolerates this */
    if (c != '=')
    {
        input.UngetChar(c);
        return null;
    }

    /* skip white space after '=' */
    for (; ; )
    {
        c = input.ReadChar();
        if (c == StreamIn.EndOfStream)
        {
            input.UngetChar(c);
            break;
        }
        map = MAP((char) c);
        if ((map & WHITE) == 0)
            break;
    }

    /* check for quote marks */
    if (c == '"' || c == '\'')
        delim = c;
    else if (c == '<')
    {
        /* value starts with '<': let ParseServerInstruction collect it
           into lexbuf and return whatever it appended */
        start = lexsize;
        AddCharToLexer(c);
        pdelim.Val = ParseServerInstruction();
        len = lexsize - start;
        lexsize = start;
        return (len > 0?GetString(lexbuf, start, len):null);
    }
    else
    {
        /* unquoted value: c is its first character, so put it back */
        input.UngetChar(c);
    }

    /* and read the value string, checking for the quote mark if needed */
    quotewarning = 0;
    start = lexsize;
    c = '\x0000';

    for (; ; )
    {
        lastc = c; /* track last character */
        c = input.ReadChar();
        if (c == StreamIn.EndOfStream)
        {
            Report.AttrError(this, token, null, Report.UNEXPECTED_END_OF_FILE);
            input.UngetChar(c);
            break;
        }

        /* delim == 0 means the value is not quoted */
        if (delim == (char) 0)
        {
            if (c == '>')
            {
                /* leave the '>' for the caller */
                input.UngetChar(c);
                break;
            }
            if (c == '"' || c == '\'')
            {
                Report.AttrError(this, token, null, Report.UNEXPECTED_QUOTEMARK);
                break;
            }
            if (c == '<')
            {
                /* the '<' is reported but still becomes part of the value;
                   the unget/break below were disabled by the original author */
                /* in.UngetChar(c); */
                Report.AttrError(this, token, null, Report.UNEXPECTED_GT);
                /* break; */
            }

            /* For cases like <br clear=all/> we need to avoid treating />
               as part of the attribute value; however, care is needed to
               avoid so treating <a href=http://www.acme.com/> in this way,
               which would map the <a> tag to
               <a href="http://www.acme.com"/> */
            if (c == '/')
            {
                /* peek ahead in case of /> */
                c = input.ReadChar();
                if (c == '>' && !AttributeTable.DefaultAttributeTable.IsUrl(name))
                {
                    isempty.Val = true;
                    input.UngetChar(c);
                    break;
                }
                /* unget peeked char */
                input.UngetChar(c);
                c = '/';
            }
        }
        /* delim is '\'' or '"' */
        else
        {
            if (c == delim)
            {
                break;
            }
            /* treat CRLF, CR and LF as single line break */
            if (c == '\r')
            {
                c = input.ReadChar();
                if (c != '\n')
                {
                    input.UngetChar(c);
                }
                c = '\n';
            }
            /* line breaks and angle brackets inside a quoted value hint at
               a missing closing quote; they are counted for the check after
               the loop */
            if (c == '\n' || c == '<' || c == '>')
                ++quotewarning;
            if (c == '>')
                seen_gt = true;
        }

        if (c == '&')
        {
            /* the '&' is stored first and ParseEntity takes over from there */
            AddCharToLexer(c);
            ParseEntity((short) 0);
            continue;
        }

        /* kludge for JavaScript attribute values with line continuations in
           string literals: a backslash directly followed by '\n' is dropped
           and only the '\n' is processed */
        if (c == '\\')
        {
            c = input.ReadChar();
            if (c != '\n')
            {
                input.UngetChar(c);
                c = '\\';
            }
        }

        map = MAP((char) c);
        if ((map & WHITE) != 0)
        {
            /* white space ends an unquoted value */
            if (delim == (char) 0)
                break;
            if (munge)
            {
                /* collapse runs of white space into a single space */
                c = ' ';
                if (lastc == ' ')
                    continue;
            }
        }
        else if (foldCase && (map & UPPERCASE) != 0)
            c += (int) ('a' - 'A');

        AddCharToLexer(c);
    }

    if (quotewarning > 10 && seen_gt && munge)
    {
        /* there is almost certainly a missing trailing quote mark as we
           have seen too many newlines, < or > characters. An exception is
           made for Javascript attributes and the javascript URL scheme,
           which may legitimately include < and > */
        /* NOTE(review): GetString is asked for 11 characters even when fewer
           were collected - confirm this cannot read past the stored value */
        if (!AttributeTable.DefaultAttributeTable.IsScript(name) && !(AttributeTable.DefaultAttributeTable.IsUrl(name) && (GetString(lexbuf, start, 11)).Equals("javascript:")))
            Report.Error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
    }

    /* take the collected characters back out of the lexer buffer */
    len = lexsize - start;
    lexsize = start;

    if (len > 0 || delim != 0)
    {
        val = GetString(lexbuf, start, len);
    }
    else
    {
        val = null;
    }

    /* note delimiter if given */
    if (delim != 0)
        pdelim.Val = delim;
    else
        pdelim.Val = (int) '"';

    return val;
}