Mutable Integer (c) 1998-2000 (W3C) MIT, INRIA, Keio University See Tidy.cs for the copyright notice. Derived from HTML Tidy Release 4 Aug 2000
Пример #1
0
        /*
        Adds informational messages to lexer.Messages: first the public
        identifier found in the given doctype (the text between the first and
        second double quote of the doctype's bytes), when a doctype node is
        supplied, and then the HTML version name reported by the lexer.

        lexer   - supplies HtmlVersionName() and receives the messages
        doctype - doctype node whose Textarray[Start..End) is scanned; may be null
        */
        public static void ReportVersion(Lexer lexer, Node doctype)
        {
            int    state = 0;
            string vers  = lexer.HtmlVersionName();
            var    cc    = new MutableInteger();

            if (doctype != null)
            {
                var docTypeStr = new StringBuilder();

                int i;
                for (i = doctype.Start; i < doctype.End; ++i)
                {
                    int c = doctype.Textarray[i];

                    /*
                    look for UTF-8 multibyte character; the buffer holds
                    unsigned bytes, so a lead byte shows up as a value above
                    0x7F (it can never be negative)
                    */
                    if (c > 0x7F)
                    {
                        i += PPrint.GetUtf8(doctype.Textarray, i, cc);
                        c  = cc.Val;
                    }

                    if (c == '"')
                    {
                        /* state counts the double quotes seen so far */
                        ++state;
                    }
                    else if (state == 1)
                    {
                        /* inside the first quoted string: keep the character */
                        if (c <= 0xFFFF)
                        {
                            docTypeStr.Append((char)c);
                        }
                        else if (c <= 0x10FFFF)
                        {
                            /* supplementary plane: needs a surrogate pair */
                            docTypeStr.Append(char.ConvertFromUtf32(c));
                        }
                        else
                        {
                            /* not a Unicode code point: use the replacement character */
                            docTypeStr.Append('\uFFFD');
                        }
                    }
                }

                lexer.Messages.Add(new TidyMessage(lexer, String.Format(GetMessage("doctype_given"), docTypeStr),
                                                   MessageLevel.Info));
            }

            /* fall back to a generic label when the lexer reports no version name */
            lexer.Messages.Add(new TidyMessage(lexer,
                                               String.Format(GetMessage("report_version"),
                                                             (vers ?? "HTML proprietary")),
                                               MessageLevel.Info));
        }
Пример #2
0
        /*
        1010  A
        1011  B
        1100  C
        1101  D
        1110  E
        1111  F
        */
        /* return one less than the number of bytes used by UTF-8 char */
        /* str points to 1st byte, *ch initialized to 1st byte */
        /*
        Decodes the UTF-8 sequence whose lead byte is str[start] and stores the
        decoded value in ch.Val.

        str   - buffer holding the encoded bytes
        start - index of the lead byte
        ch    - receives the decoded value

        Returns the number of bytes consumed after the lead byte, so callers
        can add the result to their index and then advance by one as usual.

        A byte that does not match any multibyte lead pattern is stored in
        ch.Val unchanged and 0 is returned. Successor bytes are not validated;
        only their low six bits are used. When the buffer ends before the
        sequence is complete, ch.Val is set to U+FFFD and the remaining bytes
        of the buffer are reported as consumed, instead of indexing past the
        end of the array.
        */
        public static int GetUtf8(byte[] str, int start, MutableInteger ch)
        {
            int lead = str[start];
            int value;
            int length;

            if ((lead & 0xE0) == 0xC0)
            {
                /* 110X XXXX  two bytes */
                value = lead & 31;
                length = 2;
            }
            else if ((lead & 0xF0) == 0xE0)
            {
                /* 1110 XXXX  three bytes */
                value = lead & 15;
                length = 3;
            }
            else if ((lead & 0xF8) == 0xF0)
            {
                /* 1111 0XXX  four bytes */
                value = lead & 7;
                length = 4;
            }
            else if ((lead & 0xFC) == 0xF8)
            {
                /* 1111 10XX  five bytes */
                value = lead & 3;
                length = 5;
            }
            else if ((lead & 0xFE) == 0xFC)
            {
                /* 1111 110X  six bytes */
                value = lead & 1;
                length = 6;
            }
            else
            {
                /* 0XXX XXXX one byte (any other pattern is passed through as is) */
                ch.Val = lead;
                return 0;
            }

            /* a sequence cut short by the end of the buffer cannot be decoded */
            int available = str.Length - start;
            if (available < length)
            {
                ch.Val = 0xFFFD;
                return available - 1;
            }

            /* successor bytes should have the form 10XX XXXX */
            for (int k = 1; k < length; ++k)
            {
                value = (value << 6) | (str[start + k] & 0x3F);
            }

            ch.Val = value;
            return length - 1;
        }
Пример #3
0
        /*
        The line buffer is uint not char so we can
        hold Unicode values unencoded. The translation
        to UTF-8 is deferred to the outc routine called
        to flush the line buffer.
        */
        /*
        Sends the bytes textarray[start..end) to the line buffer one character
        at a time. Before each character the line is wrapped when indent plus
        the current line length has reached the configured wrap length. UTF-8
        multibyte sequences are decoded to a single value first; a newline
        flushes the current line instead of being printed.
        */
        private void PrintText(Out fout, int mode, int indent, byte[] textarray, int start, int end)
        {
            var decoded = new MutableInteger();
            int pos = start;

            while (pos < end)
            {
                if (indent + _linelen >= _options.WrapLen)
                {
                    WrapLine(fout, indent);
                }

                int ch = textarray[pos] & 0xFF;

                /* a byte above 0x7F starts a UTF-8 multibyte character */
                if (ch > 0x7F)
                {
                    pos += GetUtf8(textarray, pos, decoded);
                    ch = decoded.Val;
                }

                if (ch == '\n')
                {
                    FlushLine(fout, indent);
                }
                else
                {
                    PrintChar(ch, mode);
                }

                ++pos;
            }
        }
Пример #4
0
        /*
        Writes "=" followed by the delimited attribute value to the line buffer.

        fout      - output passed through to the line wrapping/flushing helpers
        indent    - current indentation, used for the wrap-length checks
        val       - attribute value; when null only the two delimiters are written
        delim     - quote character to use; 0 selects the double quote
        wrappable - selects NORMAL (true) or PREFORMATTED (false) print mode and
                    enables wrapping at spaces inside the value

        Occurrences of the delimiter inside the value are written as &quot; or
        &#39;. The other quote character is written as an entity only when the
        QuoteMarks option is set. Values starting with "<%", "<@" or "<?php"
        are printed with the CDATA mode flag added.
        */
        private void PrintAttrValue(Out fout, int indent, string val, int delim, bool wrappable)
        {
            var ci = new MutableInteger();
            bool wasinstring = false;
            byte[] valueChars = null;
            int mode = (wrappable ? (NORMAL | ATTRIBVALUE) : (PREFORMATTED | ATTRIBVALUE));

            if (val != null)
            {
                valueChars = Lexer.GetBytes(val);
            }

            /* look for ASP, Tango or PHP instructions for computed attribute value */
            if (valueChars != null && valueChars.Length >= 5 && valueChars[0] == '<')
            {
                /* compare the prefix bytes directly; no copy of the value is needed */
                bool isPhp = valueChars[1] == '?' && valueChars[2] == 'p' &&
                             valueChars[3] == 'h' && valueChars[4] == 'p';

                if (valueChars[1] == '%' || valueChars[1] == '@' || isPhp)
                {
                    mode |= CDATA;
                }
            }

            if (delim == 0)
            {
                delim = '"';
            }

            AddC('=', _linelen++);

            /* don't wrap after "=" for xml documents */
            if (!_options.XmlOut)
            {
                if (indent + _linelen < _options.WrapLen)
                {
                    _wraphere = _linelen;
                }

                if (indent + _linelen >= _options.WrapLen)
                {
                    WrapLine(fout, indent);
                }

                /* checked again because WrapLine may have changed the line length */
                if (indent + _linelen < _options.WrapLen)
                {
                    _wraphere = _linelen;
                }
                else
                {
                    CondFlushLine(fout, indent);
                }
            }

            AddC(delim, _linelen++);

            if (val != null)
            {
                _inString = false;

                int i = 0;
                while (valueChars != null && i < valueChars.Length)
                {
                    int c = (valueChars[i]) & 0xFF;

                    /* remember the last space that still fits as a wrap point */
                    if (wrappable && c == ' ' && indent + _linelen < _options.WrapLen)
                    {
                        _wraphere = _linelen;
                        wasinstring = _inString;
                    }

                    if (wrappable && _wraphere > 0 && indent + _linelen >= _options.WrapLen)
                    {
                        WrapAttrVal(fout, indent, wasinstring);
                    }

                    /* the delimiter itself must always be written as an entity */
                    if (c == delim)
                    {
                        string entity = (c == '"' ? "&quot;" : "&#39;");

                        for (int j = 0; j < entity.Length; j++)
                        {
                            AddC(entity[j], _linelen++);
                        }

                        ++i;
                        continue;
                    }
                    if (c == '"')
                    {
                        if (_options.QuoteMarks)
                        {
                            AddC('&', _linelen++);
                            AddC('q', _linelen++);
                            AddC('u', _linelen++);
                            AddC('o', _linelen++);
                            AddC('t', _linelen++);
                            AddC(';', _linelen++);
                        }
                        else
                        {
                            AddC('"', _linelen++);
                        }

                        /* a double quote inside a single-quoted value toggles the string state */
                        if (delim == '\'')
                        {
                            _inString = !_inString;
                        }

                        ++i;
                        continue;
                    }
                    if (c == '\'')
                    {
                        if (_options.QuoteMarks)
                        {
                            AddC('&', _linelen++);
                            AddC('#', _linelen++);
                            AddC('3', _linelen++);
                            AddC('9', _linelen++);
                            AddC(';', _linelen++);
                        }
                        else
                        {
                            AddC('\'', _linelen++);
                        }

                        /* a single quote inside a double-quoted value toggles the string state */
                        if (delim == '"')
                        {
                            _inString = !_inString;
                        }

                        ++i;
                        continue;
                    }

                    /* look for UTF-8 multibyte character */
                    if (c > 0x7F)
                    {
                        i += GetUtf8(valueChars, i, ci);
                        c = ci.Val;
                    }

                    ++i;

                    if (c == '\n')
                    {
                        FlushLine(fout, indent);
                        continue;
                    }

                    PrintChar(c, mode);
                }
            }

            _inString = false;
            AddC(delim, _linelen++);
        }
Пример #5
0
        /* map non-breaking spaces to regular spaces */
        /*
        Walks node and its following siblings, recursing into their content,
        and rewrites every text node in place so that each character with the
        value 160 (non-breaking space) becomes a regular space.

        The rewritten text can be shorter than the original, because a
        multibyte non-breaking space is replaced by a single-byte space, so
        the node's End is moved back to the end of the rewritten text.
        */
        private static void NormalizeSpaces(Node node)
        {
            while (node != null)
            {
                if (node.Content != null)
                {
                    NormalizeSpaces(node.Content);
                }

                if (node.Type == Node.TEXT_NODE)
                {
                    var c = new MutableInteger();

                    /* p is the write position; it never gets ahead of the read position i */
                    int p = node.Start;

                    for (int i = node.Start; i < node.End; ++i)
                    {
                        c.Val = node.Textarray[i];

                        /* look for UTF-8 multibyte character */
                        if (c.Val > 0x7F)
                        {
                            i += PPrint.GetUtf8(node.Textarray, i, c);
                        }

                        if (c.Val == 160)
                        {
                            c.Val = ' ';
                        }

                        p = PPrint.PutUtf8(node.Textarray, p, c.Val);
                    }

                    /* drop the stale bytes left behind when the text became shorter */
                    node.End = p;
                }

                node = node.Next;
            }
        }
Пример #6
0
        /* values start with "=" or " = " etc. */
        /* doesn't consume the ">" at end of start tag */
        /*
        Reads an optional attribute value from Input.

        name     - attribute name; only used to ask the attribute table whether
                   the attribute is a URL or a script attribute
        foldCase - when true, characters that Map flags as UPPERCASE are shifted
                   by ('a' - 'A') before being stored
        isempty  - set to true when an unquoted value is followed by "/>" and
                   the attribute is not a URL attribute
        pdelim   - receives the quote character that delimited the value, '"'
                   when the value was unquoted, or the result of
                   ParseServerInstruction() when the value starts with '<'

        Returns null when no '=' follows the name, or when an unquoted value is
        empty; otherwise the value text taken from the lexer buffer. The lexer
        buffer size (Lexsize) is restored to its starting value before returning.
        */
        public virtual string ParseValue(string name, bool foldCase, MutableBoolean isempty, MutableInteger pdelim)
        {
            int len;
            int start;
            short map;
            bool seenGt = false;
            bool munge = true;
            int c;
            int delim, quotewarning;
            string val;

            delim = 0;
            pdelim.Val = '"';

            /*
            Henry Zrepa reports that some folk are using the
            embed element with script attributes where newlines
            are significant and must be preserved
            */
            if (Options.LiteralAttribs)
                munge = false;

            /* skip white space before the '=' */

            for (;;)
            {
                c = Input.ReadChar();

                if (c == StreamIn.END_OF_STREAM)
                {
                    /* push the end-of-stream marker back before leaving the loop */
                    Input.UngetChar(c);
                    break;
                }

                map = Map((char) c);

                if ((map & WHITE) == 0)
                {
                    break;
                }
            }

            /*
            c should be '=' if there is a value
            other legal possibilities are white
            space, '/' and '>'
            */

            if (c != '=')
            {
                /* no value: hand the character back and report "no value" as null */
                Input.UngetChar(c);
                return null;
            }

            /* skip white space after '=' */

            for (;;)
            {
                c = Input.ReadChar();
                if (c == StreamIn.END_OF_STREAM)
                {
                    Input.UngetChar(c);
                    break;
                }

                map = Map((char) c);

                if ((map & WHITE) == 0)
                    break;
            }

            /* check for quote marks */

            if (c == '"' || c == '\'')
                delim = c;
            else if (c == '<')
            {
                /*
                value starts with '<': let ParseServerInstruction collect the
                text in the lexer buffer, then rewind the buffer to where the
                value began and return what was collected
                */
                start = Lexsize;
                AddCharToLexer(c);
                pdelim.Val = ParseServerInstruction();
                len = Lexsize - start;
                Lexsize = start;
                return (len > 0 ? GetString(Lexbuf, start, len) : null);
            }
            else
            {
                /* unquoted value: its first character is re-read by the loop below */
                Input.UngetChar(c);
            }

            /*
            and read the value string
            check for quote mark if needed
            */

            quotewarning = 0;
            start = Lexsize;
            c = '\x0000';

            for (;;)
            {
                /* previous character, used below to collapse runs of white space */
                int lastc = c;
                c = Input.ReadChar();

                if (c == StreamIn.END_OF_STREAM)
                {
                    Report.AttrError(this, Token, null, Report.UNEXPECTED_END_OF_FILE);
                    Input.UngetChar(c);
                    break;
                }

                /* unquoted value */
                if (delim == (char) 0)
                {
                    if (c == '>')
                    {
                        /* leave the '>' for the caller */
                        Input.UngetChar(c);
                        break;
                    }

                    if (c == '"' || c == '\'')
                    {
                        /* the stray quote ends the value and is not pushed back */
                        Report.AttrError(this, Token, null, Report.UNEXPECTED_QUOTEMARK);
                        break;
                    }

                    if (c == '<')
                    {
                        /* reported only; the '<' is kept as part of the value */
                        /* in.UngetChar(c); */
                        Report.AttrError(this, Token, null, Report.UNEXPECTED_GT);
                        /* break; */
                    }

                    /*
                    For cases like <br clear=all/> need to avoid treating /> as
                    part of the attribute value, however care is needed to avoid
                    so treating <a href=http://www.acme.com/> in this way, which
                    would map the <a> tag to <a href="http://www.acme.com"/>
                    */
                    if (c == '/')
                    {
                        /* peek ahead in case of /> */
                        c = Input.ReadChar();
                        if (c == '>' && !AttributeTable.DefaultAttributeTable.IsUrl(name))
                        {
                            isempty.Val = true;
                            Input.UngetChar(c);
                            break;
                        }

                        /* unget peeked char */
                        Input.UngetChar(c);
                        c = '/';
                    }
                }
                /* delim is '\'' or '"' */
                else
                {
                    if (c == delim)
                    {
                        /* closing quote: consumed, not stored */
                        break;
                    }

                    /* treat CRLF, CR and LF as single line break */

                    if (c == '\r')
                    {
                        c = Input.ReadChar();
                        if (c != '\n')
                        {
                            Input.UngetChar(c);
                        }

                        c = '\n';
                    }

                    /* count characters that hint at a missing closing quote */
                    if (c == '\n' || c == '<' || c == '>')
                        ++quotewarning;

                    if (c == '>')
                        seenGt = true;
                }

                if (c == '&')
                {
                    /* store the '&' and let ParseEntity handle what follows it */
                    AddCharToLexer(c);
                    ParseEntity(0);
                    continue;
                }

                /*
                kludge for JavaScript attribute values
                with line continuations in string literals
                */
                if (c == '\\')
                {
                    c = Input.ReadChar();

                    /*
                    backslash + newline: the backslash is dropped and the
                    newline goes through the white space handling below;
                    otherwise the peeked character is pushed back and the
                    backslash is kept
                    */
                    if (c != '\n')
                    {
                        Input.UngetChar(c);
                        c = '\\';
                    }
                }

                map = Map((char) c);

                if ((map & WHITE) != 0)
                {
                    /* white space ends an unquoted value */
                    if (delim == (char) 0)
                        break;

                    if (munge)
                    {
                        /* normalize to a space and drop it when it follows another space */
                        c = ' ';

                        if (lastc == ' ')
                            continue;
                    }
                }
                else if (foldCase && (map & UPPERCASE) != 0)
                    c += ('a' - 'A');

                AddCharToLexer(c);
            }

            if (quotewarning > 10 && seenGt && munge)
            {
                /*
                there is almost certainly a missing trailing quote mark
                as we have seen too many newlines, < or > characters.

                an exception is made for Javascript attributes and the
                javascript URL scheme which may legitimately include < and >
                */
                if (!AttributeTable.DefaultAttributeTable.IsScript(name) &&
                    !(AttributeTable.DefaultAttributeTable.IsUrl(name) &&
                      (GetString(Lexbuf, start, 11)).Equals("javascript:")))
                    Report.Error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
            }

            /* measure the collected value, then rewind the lexer buffer to where it began */
            len = Lexsize - start;
            Lexsize = start;

            /* a quoted value is returned even when empty; an empty unquoted value is null */
            if (len > 0 || delim != 0)
            {
                val = GetString(Lexbuf, start, len);
            }
            else
            {
                val = null;
            }

            /* note delimiter if given */
            pdelim.Val = delim != 0 ? delim : '"';

            return val;
        }
Пример #7
0
        /* swallows closing '>' */
        /*
        Parses attributes until the input ends or ParseAttribute yields neither
        a name nor an ASP/PHP node, and returns the most recently created
        AttVal. Each new AttVal is constructed with the previous one as its
        first argument.

        An attribute whose name fails IsValidAttrName is reported with
        BAD_ATTRIBUTE_VALUE and left out of the result.
        */
        public virtual AttVal ParseAttrs(MutableBoolean isempty)
        {
            var quote = new MutableInteger();
            var aspHolder = new MutableObject();
            var phpHolder = new MutableObject();

            AttVal head = null;

            for (;;)
            {
                if (EndOfInput())
                {
                    return head;
                }

                string attrName = ParseAttribute(isempty, aspHolder, phpHolder);

                if (attrName != null)
                {
                    string attrValue = ParseValue(attrName, false, isempty, quote);

                    if (!IsValidAttrName(attrName))
                    {
                        Report.AttrError(this, Token, attrValue, Report.BAD_ATTRIBUTE_VALUE);
                        continue;
                    }

                    var attr = new AttVal(head, null, null, null, quote.Val, attrName, attrValue);
                    attr.Dict = AttributeTable.DefaultAttributeTable.FindAttribute(attr);
                    head = attr;
                    continue;
                }

                /* no attribute name: the item may have come from ASP or PHP markup */
                if (aspHolder.Object != null)
                {
                    head = new AttVal(head, null, (Node) aspHolder.Object, null, '\x0000', null, null);
                }
                else if (phpHolder.Object != null)
                {
                    head = new AttVal(head, null, null, (Node) phpHolder.Object, '\x0000', null, null);
                }
                else
                {
                    return head;
                }
            }
        }
Пример #8
0
        /*
        Adds informational messages to lexer.Messages: first the public
        identifier found in the given doctype (the text between the first and
        second double quote of the doctype's bytes), when a doctype node is
        supplied, and then the HTML version name reported by the lexer.

        lexer   - supplies HtmlVersionName() and receives the messages
        doctype - doctype node whose Textarray[Start..End) is scanned; may be null
        */
        public static void ReportVersion(Lexer lexer, Node doctype)
        {
            int state = 0;
            string vers = lexer.HtmlVersionName();
            var cc = new MutableInteger();

            if (doctype != null)
            {
                var docTypeStr = new StringBuilder();

                int i;
                for (i = doctype.Start; i < doctype.End; ++i)
                {
                    int c = doctype.Textarray[i];

                    /*
                    look for UTF-8 multibyte character; the buffer holds
                    unsigned bytes, so a lead byte shows up as a value above
                    0x7F (it can never be negative)
                    */
                    if (c > 0x7F)
                    {
                        i += PPrint.GetUtf8(doctype.Textarray, i, cc);
                        c = cc.Val;
                    }

                    if (c == '"')
                    {
                        /* state counts the double quotes seen so far */
                        ++state;
                    }
                    else if (state == 1)
                    {
                        /* inside the first quoted string: keep the character */
                        if (c <= 0xFFFF)
                        {
                            docTypeStr.Append((char) c);
                        }
                        else if (c <= 0x10FFFF)
                        {
                            /* supplementary plane: needs a surrogate pair */
                            docTypeStr.Append(char.ConvertFromUtf32(c));
                        }
                        else
                        {
                            /* not a Unicode code point: use the replacement character */
                            docTypeStr.Append('\uFFFD');
                        }
                    }
                }

                lexer.Messages.Add(new TidyMessage(lexer, String.Format(GetMessage("doctype_given"), docTypeStr),
                                                   MessageLevel.Info));
            }

            /* fall back to a generic label when the lexer reports no version name */
            lexer.Messages.Add(new TidyMessage(lexer,
                                               String.Format(GetMessage("report_version"),
                                                             (vers ?? "HTML proprietary")),
                                               MessageLevel.Info));
        }