Ejemplo n.º 1
0
        /// <summary>
        /// Scans HTML markup and reports what it finds to <paramref name="w"/>: text runs, tags with
        /// their attributes, and verbatim "inlays" (comments, DOCTYPE declarations, script and style bodies).
        /// A single <c>Eof</c> call is made on the writer once the input is exhausted.
        /// </summary>
        /// <remarks>
        /// The scanner is a goto-driven state machine; every label below is one scanner state.
        /// The content of <c>script</c>, <c>style</c>, <c>iframe</c>, <c>textarea</c> and <c>title</c>
        /// elements is not scanned for tags: everything up to the matching <c>&lt;/name</c> is passed on as-is.
        /// </remarks>
        /// <param name="htmlText">The markup to scan.</param>
        /// <param name="w">Receiver of the <c>Text</c>, <c>Tag</c>, <c>Inlay</c> and <c>Eof</c> notifications.</param>
        public static void Tokenize(string htmlText, IWriter w)
        {
            var r = new CharReader(htmlText);

            htmlText = null;
            var    sb        = new StringBuilder();    // scratch buffer shared by all states; cleared after each use
            var    lastTagIx = -1;                     // index of the '<' that opened the construct being scanned
            string tagName   = null;
            string attrName  = null;                   // non-null while an attribute name waits for its optional value
            var    attrs     = new List <Attr>();
            char   attrValQuotes;

            r.Read();

            // --- Text state: accumulate characters until a '<' that really starts a tag (or end of input).
            textCur :;
            if (!r.IsEof && (r.Chr != '<' || !r.Peek().IsHtmlTagBeginning()))
            {
                sb.Append(r.Chr);
                r.Read();
                goto textCur;
            }
            if (sb.Length != 0)
            {
                var text = sb.ToString();
                sb.Clear();
                w.Text(text);
            }
            if (r.IsEof)
            {
                goto eofCur;
            }
            lastTagIx = r.Ix;
            r.Read();

            // --- Tag name state: read up to whitespace, '>' or a "/>" pair.
            tagNameCur :;
            if (!r.IsEof && !r.Chr.IsHtmlWhiteSpace() && r.Chr != '>' && (r.Chr != '/' || r.Peek() != '>'))
            {
                sb.Append(r.Chr);
                r.Read();
                goto tagNameCur;
            }
            tagName = sb.ToString();
            sb.Clear();
            if (tagName.StartsWith("!--", StringComparison.Ordinal))
            {
                tagName = null;
                goto commentsCur;
            }
            // DOCTYPE is matched case-insensitively (as the regex in the DOCTYPE state already does),
            // so that the common lowercase "<!doctype html>" is not reported as an ordinary tag.
            if (string.Equals(tagName, "!DOCTYPE", StringComparison.OrdinalIgnoreCase))
            {
                tagName = null;
                goto doctypeCur;
            }

            // --- Tag body state: between attributes. Skips whitespace and decides what comes next.
            tagBodyCur :;
            if (r.Chr.IsHtmlWhiteSpace())
            {
                r.Read();
                goto tagBodyCur;
            }
            // A pending name that is not followed by '=' (or is the lone "/" of a self-closing tag)
            // is a value-less attribute.
            if (attrName != null && (r.Chr != '=' || attrName == "/"))
            {
                attrs.Add(new Attr(attrName, null));
                attrName = null;
            }
            if (r.IsEof || r.Chr == '>')
            {
                if (r.Chr == '>')
                {
                    r.Read();
                }
                goto tagDoneCur;
            }
            if (attrName == null)
            {
                goto attrNameCur;
            }
            // Here the current character is the '=' after a pending attribute name: step over it.
            r.Read();
            goto attrPreValCur;

            // --- Attribute name state: read up to whitespace, '>', '=' or a '/' that is not the first character.
            attrNameCur :;
            if (!r.IsEof && !r.Chr.IsHtmlWhiteSpace() && r.Chr != '>' && (r.Chr != '/' || sb.Length == 0) && r.Chr != '=')
            {
                sb.Append(r.Chr);
                r.Read();
                goto attrNameCur;
            }
            attrName = sb.ToString();
            sb.Clear();
            goto tagBodyCur;

            // --- Before attribute value: skip whitespace after '=' and detect an opening quote.
            attrPreValCur :;
            if (r.Chr.IsHtmlWhiteSpace())
            {
                r.Read();
                goto attrPreValCur;
            }
            if (r.Chr == '"' || r.Chr == '\'')
            {
                attrValQuotes = r.Chr;
                r.Read();
            }
            else
            {
                attrValQuotes = '\0';    // unquoted value
            }

            // --- Attribute value state: a quoted value runs to the matching quote,
            //     an unquoted one to whitespace or '>'.
            attrValCur :;
            if (!r.IsEof && (attrValQuotes != '\0' ? r.Chr != attrValQuotes : !r.Chr.IsHtmlWhiteSpace() && r.Chr != '>'))
            {
                sb.Append(r.Chr);
                r.Read();
                goto attrValCur;
            }
            if (!r.IsEof && attrValQuotes != '\0')
            {
                r.Read();    // step over the closing quote
            }
            var attrVal = sb.ToString();

            sb.Clear();
            attrs.Add(new Attr(attrName, attrVal));
            attrName = null;
            goto tagBodyCur;

            // --- Tag complete: report it, then handle elements whose content is raw text.
            tagDoneCur :;
            // Pass a snapshot: AsReadOnly() on the working list would only wrap it, and the list is
            // cleared right below and reused for the next tag, which would alter a collection the
            // writer may have kept.
            w.Tag(tagName, new List <Attr>(attrs).AsReadOnly());
            attrs.Clear();
            var tagNameSaved = tagName;

            tagName = null;
            if (r.IsEof ||
                (!tagNameSaved.EqualsIgnoreCase("script") && !tagNameSaved.EqualsIgnoreCase("style") &&
                 !tagNameSaved.EqualsIgnoreCase("iframe") && !tagNameSaved.EqualsIgnoreCase("textarea") && !tagNameSaved.EqualsIgnoreCase("title")))
            {
                goto textCur;
            }
            // Raw-text element: locate the closing tag by name; if there is none, the content runs to the end of input.
            var fragmBeginIx = r.Ix;
            var m            = new Regex(string.Format(@"</{0}\b", tagNameSaved), RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Singleline)
                               .Match(r.Text, fragmBeginIx);
            var fragmEndIx = m.Success ? m.Index : r.Text.Length;

            if (fragmEndIx == fragmBeginIx)
            {
                goto textCur;    // empty content, nothing to report
            }
            r.SkipTo(fragmEndIx);
            if (tagNameSaved.EqualsIgnoreCase("script") || tagNameSaved.EqualsIgnoreCase("style"))
            {
                w.Inlay(r.Text.Substring(fragmBeginIx, fragmEndIx - fragmBeginIx), tagNameSaved.EqualsIgnoreCase("script") ? HtmlInlayType.Script : HtmlInlayType.Style);
            }
            else
            {
                w.Text(r.Text.Substring(fragmBeginIx, fragmEndIx - fragmBeginIx));
            }
            goto textCur;

            // --- Comment: the tag-name scan may already have run into the comment body, so the comment
            //     is cut out of the source text by index, from its '<' up to and including "-->".
            commentsCur :;
            var commentsBeginIx = lastTagIx;
            var commentsEndIx   = r.Text.IndexOf("-->", commentsBeginIx + 4, StringComparison.Ordinal);    // +4 skips "<!--"

            // An unterminated comment extends to the end of the input.
            commentsEndIx = commentsEndIx >= 0 ? commentsEndIx + 3 : r.Text.Length;
            if (!r.IsEof)
            {
                r.SkipTo(commentsEndIx);
            }
            w.Inlay(r.Text.Substring(commentsBeginIx, commentsEndIx - commentsBeginIx), HtmlInlayType.Comments);
            goto textCur;

            // --- DOCTYPE: re-match the whole declaration from its '<'; quoted parts may contain '>'.
            doctypeCur :;
            var doctypeBeginIx = lastTagIx;

            m = new Regex(@"<!DOCTYPE\s*(([^\s""'>]+|""[^""]*(""|$)|'[^']*('|$))\s*)*(>|$)", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Singleline)
                .Match(r.Text, doctypeBeginIx);
            Trace.Assert(m.Success);
            var doctypeEndIx = m.Index + m.Length;

            if (!r.IsEof)
            {
                r.SkipTo(doctypeEndIx);
            }
            w.Inlay(r.Text.Substring(doctypeBeginIx, doctypeEndIx - doctypeBeginIx), HtmlInlayType.Doctype);
            goto textCur;

            // --- End of input.
            eofCur :;
            r = null;
            w.Eof();
        }
        /// <summary>
        /// Replaces HTML character references in <paramref name="textHtml"/> with the characters they denote.
        /// Named (<c>&amp;name;</c>), decimal (<c>&amp;#123;</c>) and hexadecimal (<c>&amp;#x7B;</c>) forms are
        /// recognized; the terminating semicolon is optional.
        /// </summary>
        /// <remarks>
        /// A named reference that is not found in <c>HtmlCharCodes.CharNameToCode</c>, and any '&amp;' that does
        /// not start a reference, is copied to the output unchanged. A numeric reference whose value cannot be
        /// represented (overflow, above U+10FFFF, or U+FFFE/U+FFFF) yields U+FFFD. Code points above U+FFFF are
        /// emitted as a surrogate pair.
        /// </remarks>
        /// <param name="textHtml">Text that may contain character references.</param>
        /// <returns>The text with the recognized references decoded.</returns>
        public static string Decode(string textHtml)
        {
            var r = new CharReader(textHtml);

            textHtml = null;
            var sb      = new StringBuilder();
            var escIx   = -1;    // index of the '&' of the reference being scanned; -1 when none is pending
            var escCode = -1;    // code point of the resolved reference; -1 while unresolved

            r.Read();

            // --- Plain text: copy characters through until the next '&'.
            textCur :;
            if (!r.IsEof && r.Chr != '&')
            {
                sb.Append(r.Chr);
                r.Read();
                goto textCur;
            }
            if (r.IsEof)
            {
                goto eofCur;
            }

            // --- After '&': decide which kind of reference follows, if any.
            escIx = r.Ix;
            r.Read();
            if (r.Chr.IsHtmlEscLit())
            {
                goto escLitCur;
            }
            if (r.Chr != '#')
            {
                goto escDoneCur;
            }
            r.Read();
            if (r.Chr.IsHtmlEscNum())
            {
                goto escNumCur;
            }
            if (r.Chr != 'X' && r.Chr != 'x')
            {
                goto escDoneCur;
            }
            r.Read();
            if (r.Chr.IsHtmlEscHex())
            {
                goto escHexCur;
            }
            goto escDoneCur;

            // --- Named reference: consume the name, then look it up.
            escLitCur :;
            if (r.Chr.IsHtmlEscLit())
            {
                r.Read();
                goto escLitCur;
            }
            var escLit = r.Text.Substring(escIx + 1, r.Ix - (escIx + 1));    // +1 skips "&"

            if (!HtmlCharCodes.CharNameToCode.TryGetValue(escLit, out escCode))
            {
                escCode = -1;    // unknown name: leave the text as it is
            }
            goto escDoneCur;

            // --- Decimal reference.
            escNumCur :;
            if (r.Chr.IsHtmlEscNum())
            {
                r.Read();
                goto escNumCur;
            }
            var escNum = r.Text.Substring(escIx + 2, r.Ix - (escIx + 2));    // +2 skips "&#"

            if (!int.TryParse(escNum, NumberStyles.Integer, CultureInfo.InvariantCulture, out escCode))
            {
                escCode = 0xFFFD;
            }
            goto escRangeCur;

            // --- Hexadecimal reference.
            escHexCur :;
            if (r.Chr.IsHtmlEscHex())
            {
                r.Read();
                goto escHexCur;
            }
            var escHex = r.Text.Substring(escIx + 3, r.Ix - (escIx + 3));    // +3 skips "&#x"

            if (!int.TryParse(escHex, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out escCode))
            {
                escCode = 0xFFFD;
            }

            // --- Range check shared by both numeric forms.
            //     Hex parsing yields a negative int for eight digits with the top bit set (e.g. FFFFFFFF);
            //     such values, anything beyond the last Unicode code point, and U+FFFE/U+FFFF all become U+FFFD.
            escRangeCur :;
            if (escCode < 0 || escCode > 0x10FFFF || escCode == 0xFFFE || escCode == 0xFFFF)
            {
                escCode = 0xFFFD;
            }

            // --- Emit the result: either the decoded character or the raw text that was scanned.
            escDoneCur :;
            if (escCode >= 0)
            {
                if (escCode > char.MaxValue && escCode <= 0x10FFFF)
                {
                    // Outside the BMP a single char cannot hold the value; emit the surrogate pair.
                    sb.Append(char.ConvertFromUtf32(escCode));
                }
                else
                {
                    sb.Append((char)escCode);
                }
                escCode = -1;
                if (r.Chr == ';')
                {
                    r.Read();    // the terminating semicolon belongs to the reference
                }
                escIx = -1;
            }
            if (escIx >= 0)
            {
                // Not a reference after all: copy everything consumed since the '&' verbatim.
                sb.Append(r.Text, escIx, r.Ix - escIx);
                escIx = -1;
            }
            goto textCur;
            eofCur :;
            return(sb.ToString());
        }