		1010  A
		1011  B
		1100  C
		1101  D
		1110  E
		1111  F
		/* return one less than the number of bytes used by UTF-8 char */
		/* str points to 1st byte, *ch initialized to 1st byte */
		public static int GetUTF8(byte[] str, int start, MutableInteger ch)
		{
			/*
			 * Decodes the UTF-8 sequence whose lead byte is str[start] and stores
			 * the decoded value in ch.Val.
			 *
			 * Returns the number of continuation bytes consumed (one less than the
			 * length of the sequence), so a caller can write "i += GetUTF8(...)".
			 * A byte that matches none of the multi-byte lead patterns is stored
			 * unchanged in ch.Val and 0 is returned.
			 */
			int c, n, i, bytes;

			c = ((int) str[start]) & 0xFF; // Convert to unsigned.

			if ((c & 0xE0) == 0xC0)
			{
				/* 110X XXXX  two bytes */
				n = c & 31;
				bytes = 2;
			}
			else if ((c & 0xF0) == 0xE0)
			{
				/* 1110 XXXX  three bytes */
				n = c & 15;
				bytes = 3;
			}
			else if ((c & 0xF8) == 0xF0)
			{
				/* 1111 0XXX  four bytes */
				n = c & 7;
				bytes = 4;
			}
			else if ((c & 0xFC) == 0xF8)
			{
				/* 1111 10XX  five bytes */
				n = c & 3;
				bytes = 5;
			}
			else if ((c & 0xFE) == 0xFC)
			{
				/* 1111 110X  six bytes */
				n = c & 1;
				bytes = 6;
			}
			else
			{
				/* 0XXX XXXX one byte */
				ch.Val = c;
				return 0;
			}

			/*
			 * The lead byte may announce more bytes than the buffer still holds
			 * (input cut off in the middle of a character). Clamp the sequence
			 * length so the loop below never indexes past the end of str; the
			 * return value then reflects the bytes that were really consumed.
			 */
			int available = str.Length - start;
			if (bytes > available)
			{
				bytes = available;
			}

			/* successor bytes should have the form 10XX XXXX */
			for (i = 1; i < bytes; ++i)
			{
				c = ((int) str[start + i]) & 0xFF; // Convert to unsigned.
				n = (n << 6) | (c & 0x3F);
			}

			ch.Val = n;
			return bytes - 1;
		}
Exemple #2
        public static void ReportVersion(Lexer lexer, Node doctype)
        {
            /*
             * Adds informational messages to lexer.messages: when a doctype node
             * is supplied, the text of its first double-quoted string
             * ("doctype_given"), and always the detected HTML version
             * ("report_version", falling back to "HTML proprietary").
             */
            int            i, c;
            int            state = 0; /* number of '"' characters seen so far */
            string         vers  = lexer.HtmlVersionName();
            MutableInteger cc    = new MutableInteger();

            if (doctype != null)
            {
                StringBuilder docTypeStr = new StringBuilder();

                for (i = doctype.Start; i < doctype.End; ++i)
                {
                    c = ((int)doctype.Textarray[i]) & 0xFF; // Convert to unsigned.

                    /*
                     * look for UTF-8 multibyte character; the array holds
                     * unsigned bytes, so a lead byte is a value above 0x7F
                     * and can never be negative
                     */
                    if (c > 0x7F)
                    {
                        i += PPrint.GetUTF8(doctype.Textarray, i, cc);
                        c  = cc.Val;
                    }

                    if (c == '"')
                    {
                        ++state;
                    }
                    else if (state == 1)
                    {
                        /* only text between the first and second quote is kept */
                        docTypeStr.Append((char)c);
                    }
                }

                lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("doctype_given"), docTypeStr), MessageLevel.Info));
            }

            lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("report_version"), (vers != null ? vers : "HTML proprietary")), MessageLevel.Info));
        }
Exemple #3
        public static void ReportVersion(Lexer lexer, Node doctype)
        {
            /*
             * Adds informational messages to lexer.messages: when a doctype node
             * is supplied, the text of its first double-quoted string
             * ("doctype_given"), and always the detected HTML version
             * ("report_version", falling back to "HTML proprietary").
             */
            int i, c;
            int state = 0; /* number of '"' characters seen so far */
            string vers = lexer.HtmlVersionName();
            MutableInteger cc = new MutableInteger();

            if (doctype != null)
            {
                StringBuilder docTypeStr = new StringBuilder();

                for (i = doctype.Start; i < doctype.End; ++i)
                {
                    c = ((int) doctype.Textarray[i]) & 0xFF; // Convert to unsigned.

                    /*
                     * look for UTF-8 multibyte character; the array holds
                     * unsigned bytes, so a lead byte is a value above 0x7F
                     * and can never be negative
                     */
                    if (c > 0x7F)
                    {
                        i += PPrint.GetUTF8(doctype.Textarray, i, cc);
                        c = cc.Val;
                    }

                    if (c == '"')
                    {
                        ++state;
                    }
                    else if (state == 1)
                    {
                        /* only text between the first and second quote is kept */
                        docTypeStr.Append((char) c);
                    }
                }

                lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("doctype_given"), docTypeStr), MessageLevel.Info));
            }

            lexer.messages.Add(new TidyMessage(lexer, String.Format(GetMessage("report_version"), (vers != null ? vers : "HTML proprietary")), MessageLevel.Info));
        }
Exemple #4
        /* map non-breaking spaces to regular spaces */
        private void NormalizeSpaces(Lexer lexer, Node node)
        {
            /*
             * Visits node and all of its following siblings, recursing into
             * each one's Content first. Every text node is re-encoded in place
             * inside its Textarray with code point 160 (non-breaking space)
             * replaced by an ordinary space.
             */
            while (node != null)
            {
                if (node.Content != null)
                {
                    NormalizeSpaces(lexer, node.Content);
                }

                if (node.Type == Node.TextNode)
                {
                    MutableInteger c = new MutableInteger();
                    int p = node.Start; /* write position for the re-encoded text */

                    for (int i = node.Start; i < node.End; ++i)
                    {
                        c.Val = (int) node.Textarray[i];

                        /* look for UTF-8 multibyte character */
                        if (c.Val > 0x7F)
                        {
                            i += PPrint.GetUTF8(node.Textarray, i, c);
                        }

                        if (c.Val == 160)
                        {
                            c.Val = ' ';
                        }

                        p = PPrint.PutUTF8(node.Textarray, p, c.Val);
                    }

                    /*
                     * A non-breaking space takes two bytes in UTF-8 and a plain
                     * space only one, so the rewritten text can be shorter than
                     * the original. Trim the node to the bytes actually written;
                     * otherwise the stale tail would still belong to the node.
                     */
                    node.End = p;
                }

                node = node.Next;
            }
        }
Exemple #5
        /*
        The line buffer is uint not char so we can
        hold Unicode values unencoded. The translation
        to UTF-8 is deferred to the outc routine called
        to flush the line buffer.
        */
        private void PrintText(Out fout, int mode, int indent, byte[] textarray, int start, int end)
            /*
             * Passes the bytes textarray[start] .. textarray[end - 1] to PrintChar
             * one character at a time, using the given mode. A byte above 0x7F is
             * treated as the start of a UTF-8 sequence and decoded to a single
             * value with GetUTF8 before it is printed.
             */
            /*
             * NOTE(review): this listing has no block braces, and a single-token
             * statement (presumably "continue;") may be missing after the
             * FlushLine call below - confirm against the original file.
             */
            int i, c;
            MutableInteger ci = new MutableInteger();

            for (i = start; i < end; ++i)
                /* wrap once the indent plus the current line length reaches the wrap column */
                if (indent + linelen >= _options.WrapLen)
                    WrapLine(fout, indent);

                c = ((int) textarray[i]) & 0xFF; // Convert to unsigned.

                /* look for UTF-8 multibyte character */
                if (c > 0x7F)
                    /* GetUTF8 returns the count of extra bytes, so i skips the whole sequence */
                    i += GetUTF8(textarray, i, ci);
                    c = ci.Val;

                /* a newline in the text flushes the line buffer */
                if (c == '\n')
                    FlushLine(fout, indent);

                PrintChar(c, mode);
Exemple #6
        private void PrintAttrValue(Out fout, int indent, string val, int delim, bool wrappable)
            /*
             * Writes '=' followed by the attribute value enclosed in delim to the
             * line buffer (via AddC and PrintChar).
             *
             * val       - attribute text; when null only '=' and the two
             *             delimiters are written.
             * delim     - quote character to use; 0 selects '"'.
             * wrappable - selects NORMAL instead of PREFORMATTED output mode and
             *             allows wrap points to be recorded at spaces.
             */
            /*
             * NOTE(review): this listing has no block braces, and lines made of a
             * single token appear to be missing (for example the loop increment
             * of i, and "else" / "continue;" lines). Confirm the exact control
             * flow against the original file.
             */
            int c;
            MutableInteger ci = new MutableInteger();
            bool wasinstring = false;
            byte[] valueChars = null;
            int i;
            int mode = (wrappable?(int) (NORMAL | ATTRIBVALUE):(int) (PREFORMATTED | ATTRIBVALUE));

            if (val != null)
                valueChars = Lexer.GetBytes(val);

            /* look for ASP, Tango or PHP instructions for computed attribute value */
            if (valueChars != null && valueChars.Length >= 5 && valueChars[0] == '<')
                /* widen the bytes to chars so the first five can be compared as a string */
                char[] tmpChar;
                tmpChar = new char[valueChars.Length];
                valueChars.CopyTo(tmpChar, 0);
                if (valueChars[1] == '%' || valueChars[1] == '@' || (new string(tmpChar, 0, 5)).Equals("<?php"))
                    mode |= CDATA;

            if (delim == 0)
                delim = '"';

            AddC('=', linelen++);

            /* don't wrap after "=" for xml documents */
            if (!_options.XmlOut)
                if (indent + linelen < _options.WrapLen)
                    wraphere = linelen;

                if (indent + linelen >= _options.WrapLen)
                    WrapLine(fout, indent);

                /* NOTE(review): CondFlushLine below is presumably the "else" branch - confirm */
                if (indent + linelen < _options.WrapLen)
                    wraphere = linelen;
                    CondFlushLine(fout, indent);

            AddC(delim, linelen++);

            if (val != null)
                InString = false;

                i = 0;
                while (i < valueChars.Length)
                    c = ((int) valueChars[i]) & 0xFF; // Convert to unsigned.

                    /* a space is a possible wrap point; remember the string state at that point */
                    if (wrappable && c == ' ' && indent + linelen < _options.WrapLen)
                        wraphere = linelen;
                        wasinstring = InString;

                    if (wrappable && wraphere > 0 && indent + linelen >= _options.WrapLen)
                        WrapAttrVal(fout, indent, wasinstring);

                    /* the delimiter itself is written as an entity inside the value */
                    if (c == delim)
                        string entity;

                        entity = (c == '"'?"&quot;":"&#39;");

                        for (int j = 0; j < entity.Length; j++)
                            AddC(entity[j], linelen++);

                    /* a double quote that is not the delimiter */
                    else if (c == '"')
                        /* NOTE(review): the literal '"' below is presumably the "else" of QuoteMarks - confirm */
                        if (_options.QuoteMarks)
                            AddC('&', linelen++);
                            AddC('q', linelen++);
                            AddC('u', linelen++);
                            AddC('o', linelen++);
                            AddC('t', linelen++);
                            AddC(';', linelen++);
                            AddC('"', linelen++);

                        /* toggles the in-string flag when the value is delimited by single quotes */
                        if (delim == '\'')
                            InString = !InString;

                    /* a single quote that is not the delimiter */
                    else if (c == '\'')
                        /* NOTE(review): the literal '\'' below is presumably the "else" of QuoteMarks - confirm */
                        if (_options.QuoteMarks)
                            AddC('&', linelen++);
                            AddC('#', linelen++);
                            AddC('3', linelen++);
                            AddC('9', linelen++);
                            AddC(';', linelen++);
                            AddC('\'', linelen++);

                        /* toggles the in-string flag when the value is delimited by double quotes */
                        if (delim == '"')
                            InString = !InString;


                    /* look for UTF-8 multibyte character */
                    if (c > 0x7F)
                        i += GetUTF8(valueChars, i, ci);
                        c = ci.Val;


                    if (c == '\n')
                        FlushLine(fout, indent);

                    PrintChar(c, mode);

            /* close the value: reset the string flag and write the closing delimiter */
            InString = false;
            AddC(delim, linelen++);
		/* swallows closing '>' */
		public virtual AttVal ParseAttrs(MutableBoolean isempty)
			/*
			 * Parses attributes while EndOfInput() is false and returns the most
			 * recently created AttVal ("list"), or null if none was created.
			 * Each AttVal added to the result receives the previous value of
			 * "list" as its first constructor argument.
			 * isempty is passed through to ParseAttribute and ParseValue.
			 */
			/*
			 * NOTE(review): this listing has no block braces, and lines made of a
			 * single token (such as "else", "break;" or "continue;") appear to be
			 * missing. Confirm the exact control flow against the original file.
			 */
			AttVal av, list;
			string attribute, val;
			MutableInteger delim = new MutableInteger();
			MutableObject asp = new MutableObject();
			MutableObject php = new MutableObject();
			list = null;
			while (!EndOfInput())
				attribute = ParseAttribute(isempty, asp, php);
				/* no attribute name: ParseAttribute may have handed back an ASP or PHP node instead */
				if (attribute == null)
					/* check if attributes are created by ASP markup */
					if (asp.Object != null)
						av = new AttVal(list, null, (Node) asp.Object, null, '\x0000', null, null);
						list = av;
					/* check if attributes are created by PHP markup */
					if (php.Object != null)
						av = new AttVal(list, null, null, (Node) php.Object, '\x0000', null, null);
						list = av;
				/* delim receives the quote character used around the value */
				val = ParseValue(attribute, false, isempty, delim);
				if (attribute != null && IsValidAttrName(attribute))
					av = new AttVal(list, null, null, null, delim.Val, attribute, val);
					av.Dict = AttributeTable.DefaultAttributeTable.FindAttribute(av);
					list = av;
					/* NOTE(review): the two lines below are presumably the "else" branch (invalid name); this AttVal is not assigned to list */
					av = new AttVal(null, null, null, null, 0, attribute, val);
					Report.AttrError(this, token, val, Report.BAD_ATTRIBUTE_VALUE);
			return list;
		/* values start with "=" or " = " etc. */
		/* doesn't consume the ">" at end of start tag */
		public virtual string ParseValue(string name, bool foldCase, MutableBoolean isempty, MutableInteger pdelim)
			/*
			 * Reads the optional "= value" part that follows an attribute name.
			 *
			 * name     - attribute name; used for the IsUrl / IsScript checks.
			 * foldCase - when true, characters mapped as UPPERCASE are shifted to
			 *            lower case.
			 * isempty  - set to true when "/>" follows an unquoted value and name
			 *            is not a URL attribute.
			 * pdelim   - receives the quote character of the value; set to '"'
			 *            on entry.
			 *
			 * Returns null when the first non-white character is not '='.
			 * Otherwise returns a string taken from lexbuf, or null.
			 */
			/*
			 * NOTE(review): this listing has no block braces, and lines made of a
			 * single token appear to be missing: several "if" statements below
			 * have no visible body, and some comment text has lost its
			 * delimiters. Confirm every control-flow detail against the original
			 * file before changing this method.
			 */
			int len = 0;
			int start;
			short map;
			bool seen_gt = false;
			bool munge = true;
			int c = 0;
			int lastc, delim, quotewarning;
			string val;
			delim = 0;
			pdelim.Val = (int) '"';
			Henry Zrepa reports that some folk are using the
			embed element with script attributes where newlines
			are significant and must be preserved
			if (Options.LiteralAttribs)
				munge = false;
			/* skip white space before the '=' */
			for (; ; )
				c = input.ReadChar();
				if (c == StreamIn.EndOfStream)
				map = MAP((char) c);
				if ((map & WHITE) == 0)
			c should be '=' if there is a value
			other legal possibilities are white
			space, '/' and '>'
			if (c != '=')
				return null;
			/* skip white space after '=' */
			for (; ; )
				c = input.ReadChar();
				if (c == StreamIn.EndOfStream)
				map = MAP((char) c);
				if ((map & WHITE) == 0)
			/* check for quote marks */
			if (c == '"' || c == '\'')
				delim = c;
			/* value starts with '<': ParseServerInstruction reads it and its result becomes the delimiter */
			else if (c == '<')
				start = lexsize;
				pdelim.Val = ParseServerInstruction();
				len = lexsize - start;
				/* rewind lexsize to where the value started before returning it */
				lexsize = start;
				return (len > 0?GetString(lexbuf, start, len):null);
			and read the value string
			check for quote mark if needed
			quotewarning = 0;
			start = lexsize;
			c = '\x0000';
			for (; ; )
				lastc = c; /* track last character */
				c = input.ReadChar();
				if (c == StreamIn.EndOfStream)
					Report.AttrError(this, token, null, Report.UNEXPECTED_END_OF_FILE);
				/* unquoted value */
				if (delim == (char) 0)
					if (c == '>')
					if (c == '"' || c == '\'')
						Report.AttrError(this, token, null, Report.UNEXPECTED_QUOTEMARK);
					/* NOTE(review): the test is for '<' but the code reported is UNEXPECTED_GT - confirm this is intended */
					if (c == '<')
						/* in.UngetChar(c); */
						Report.AttrError(this, token, null, Report.UNEXPECTED_GT);
						/* break; */
					For cases like <br clear=all/> need to avoid treating /> as
					part of the attribute value, however care is needed to avoid
					so treating <a href=http://www.acme.com/> in this way, which
					would map the <a> tag to <a href="http://www.acme.com"/>
					if (c == '/')
						/* peek ahead in case of /> */
						c = input.ReadChar();
						if (c == '>' && !AttributeTable.DefaultAttributeTable.IsUrl(name))
							isempty.Val = true;
						/* unget peeked char */
						c = '/';
					/* delim is '\'' or '"' */
					if (c == delim)
					/* treat CRLF, CR and LF as single line break */
					if (c == '\r')
						c = input.ReadChar();
						if (c != '\n')
						c = '\n';
					/* NOTE(review): quotewarning is presumably incremented here; the statement is not visible */
					if (c == '\n' || c == '<' || c == '>')
					if (c == '>')
						seen_gt = true;
				if (c == '&')
					ParseEntity((short) 0);
				kludge for JavaScript attribute values
				with line continuations in string literals
				if (c == '\\')
					c = input.ReadChar();
					if (c != '\n')
						c = '\\';
				map = MAP((char) c);
				if ((map & WHITE) != 0)
					if (delim == (char) 0)
					if (munge)
						c = ' ';
						if (lastc == ' ')
				else if (foldCase && (map & UPPERCASE) != 0)
					/* shift an upper-case letter to lower case */
					c += (int) ('a' - 'A');
			if (quotewarning > 10 && seen_gt && munge)
				there is almost certainly a missing trailling quote mark
				as we have see too many newlines, < or > characters.
				an exception is made for Javascript attributes and the
				javascript URL scheme which may legitimately include < and >
				/* NOTE(review): GetString is asked for 11 bytes regardless of the value length - confirm it tolerates shorter values */
				if (!AttributeTable.DefaultAttributeTable.IsScript(name) && !(AttributeTable.DefaultAttributeTable.IsUrl(name) && (GetString(lexbuf, start, 11)).Equals("javascript:")))
					Report.Error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
			len = lexsize - start;
			/* rewind lexsize to where the value started */
			lexsize = start;
			/* NOTE(review): "val = null" is presumably the "else" branch - confirm */
			if (len > 0 || delim != 0)
				val = GetString(lexbuf, start, len);
				val = null;
			/* note delimiter if given */
			/* NOTE(review): the '"' assignment is presumably the "else" branch - confirm */
			if (delim != 0)
				pdelim.Val = delim;
				pdelim.Val = (int) '"';
			return val;