/// <summary>
/// Escapes <paramref name="text"/> for embedding into HTML, replacing every character that is
/// significant in the selected context, or that <c>IsInEncoding</c> reports as not representable
/// in <paramref name="encoding"/>, with the shortest available character reference.
/// </summary>
/// <param name="text">Text to escape.</param>
/// <param name="encoding">Target encoding passed to <c>IsInEncoding</c> for every character.</param>
/// <param name="escMode">
/// One of <c>'"'</c>, <c>'\''</c>, <c>'&lt;'</c> or <c>'&gt;'</c>. The character equal to
/// <paramref name="escMode"/> is escaped; in <c>'&lt;'</c> mode only when the following character
/// satisfies <c>IsHtmlTagBeginning</c>. In <c>'&gt;'</c> mode whitespace is escaped as well, and so is
/// a leading quote (this mode presumably produces unquoted attribute values - confirm with callers).
/// </param>
/// <param name="trimWses">
/// When true, leading and trailing whitespace runs are dropped and inner runs are collapsed into a
/// single space. Must be false in <c>'&gt;'</c> mode.
/// </param>
/// <param name="leftWsesTrimmed">Set to true when whitespace preceding the first non-whitespace character was dropped.</param>
/// <param name="rightWsesTrimmed">
/// Set to true when a whitespace run at the end of the text was dropped. For whitespace-only input
/// this flag is set and <paramref name="leftWsesTrimmed"/> is not.
/// </param>
/// <returns>The escaped text.</returns>
public static string Encode(string text, Encoding encoding, char escMode, bool trimWses, out bool leftWsesTrimmed, out bool rightWsesTrimmed)
{
    Trace.Assert(escMode == '"' || escMode == '\'' || escMode == '<' || escMode == '>');
    Trace.Assert(escMode != '>' || !trimWses);

    var r = new CharReader(text);
    leftWsesTrimmed = false;
    rightWsesTrimmed = false;

    var sb = new StringBuilder();
    var isMetNonWs = false;     // becomes true once the first non-whitespace character was seen
    var isWsDeferred = false;   // a whitespace run was skipped and its fate is not decided yet

    while (true)
    {
        r.Read();
        if (r.IsEof)
        {
            // A run still pending at the end of input is trailing whitespace: drop and report it.
            rightWsesTrimmed = isWsDeferred;
            break;
        }

        if (trimWses && r.Chr.IsHtmlWhiteSpace())
        {
            isWsDeferred = true;
            continue;
        }

        if (!isMetNonWs)
        {
            // A run pending before the very first non-whitespace character is leading whitespace.
            isMetNonWs = true;
            leftWsesTrimmed = isWsDeferred;
            isWsDeferred = false;
        }

        if (isWsDeferred)
        {
            // An inner run collapses into one space.
            isWsDeferred = false;
            sb.Append(' ');
        }

        var peek = r.Peek();

        // In '>' mode a value that starts with a quote would be read back as a quoted value
        // (Tokenize in this file switches to quoted mode on a leading '"' or '\''), so the
        // leading quote has to be escaped. trimWses is off in this mode, hence an empty buffer
        // means this is the first character of the value.
        var isLeadingQuote = escMode == '>' && sb.Length == 0 && (r.Chr == '"' || r.Chr == '\'');

        var isSafe =
            (r.Chr != '&' || !peek.IsHtmlEscBeginning())
            && (r.Chr != escMode || (escMode == '<' && !peek.IsHtmlTagBeginning()))
            && (escMode != '>' || !r.Chr.IsHtmlWhiteSpace())
            && !isLeadingQuote
            && r.Chr.IsInEncoding(encoding);
        if (isSafe)
        {
            sb.Append(r.Chr);
            continue;
        }

        // Build the candidate references. The terminating ';' is emitted only when the next
        // character could otherwise be read as a continuation of the reference.
        var chrEscAsLit = r.Chr.HtmlEscAsLit();
        var litEsc = chrEscAsLit != null ? chrEscAsLit + (peek.IsHtmlEscLit() ? ";" : "") : null;
        var numEsc = r.Chr.HtmlEscAsNum() + (peek.IsHtmlEscNum() ? ";" : "");
        var hexEsc = r.Chr.HtmlEscAsHex() + (peek.IsHtmlEscHex() ? ";" : "");

        // Pick the shortest candidate; on equal length prefer named, then decimal, then hex.
        var chrEsc = numEsc;
        if (hexEsc.Length < chrEsc.Length)
        {
            chrEsc = hexEsc;
        }
        if (litEsc != null && litEsc.Length <= chrEsc.Length)
        {
            chrEsc = litEsc;
        }

        sb.Append('&').Append(chrEsc);
    }

    return sb.ToString();
}
/// <summary>
/// Splits <paramref name="htmlText"/> into a flat stream of events reported to <paramref name="w"/>:
/// <c>Text</c> for character data, <c>Tag</c> for every tag together with its attributes,
/// <c>Inlay</c> for comments, the doctype and the raw content of script/style elements,
/// and a final <c>Eof</c>.
/// </summary>
/// <remarks>
/// The method is a hand-written state machine; every label is one state.
/// A tag name is reported as written in the source, so a closing tag keeps its leading '/'.
/// A "/" right before '>' (self-closing syntax) is reported as an attribute named "/" with a null value.
/// Attributes without a value are reported with a null value.
/// The content of script, style, iframe, textarea and title elements is not tokenized: everything
/// up to the matching closing tag (or the end of the text) is reported as one piece.
/// Each <c>Tag</c> call receives its own snapshot of the attributes, so the writer may keep it.
/// </remarks>
/// <param name="htmlText">HTML source to tokenize.</param>
/// <param name="w">Receiver of the token events.</param>
public static void Tokenize(string htmlText, IWriter w)
{
    var r = new CharReader(htmlText);
    var sb = new StringBuilder();       // accumulates the token currently being scanned
    var lastTagIx = -1;                 // index of the '<' that opened the current tag
    string tagName = null;
    string attrName = null;             // attribute name read but not yet reported
    var attrs = new List<Attr>();
    char attrValQuotes;                 // quote character of the current value, '\0' when unquoted

    r.Read();

// Character data up to the next '<' that starts a tag (or up to the end of input).
textCur:;
    if (!r.IsEof && (r.Chr != '<' || !r.Peek().IsHtmlTagBeginning()))
    {
        sb.Append(r.Chr);
        r.Read();
        goto textCur;
    }
    if (sb.Length != 0)
    {
        var text = sb.ToString();
        sb.Clear();
        w.Text(text);
    }
    if (r.IsEof)
    {
        goto eofCur;
    }
    lastTagIx = r.Ix;
    r.Read();

// Tag name: runs until whitespace, '>' or a "/>" pair.
tagNameCur:;
    if (!r.IsEof && !r.Chr.IsHtmlWhiteSpace() && r.Chr != '>' && (r.Chr != '/' || r.Peek() != '>'))
    {
        sb.Append(r.Chr);
        r.Read();
        goto tagNameCur;
    }
    tagName = sb.ToString();
    sb.Clear();
    if (tagName.StartsWith("!--", StringComparison.Ordinal))
    {
        tagName = null;
        goto commentsCur;
    }
    // The doctype keyword is matched without regard to case, in line with the
    // case-insensitive doctype pattern used below; "<!doctype html>" is a doctype too.
    if (string.Equals(tagName, "!DOCTYPE", StringComparison.OrdinalIgnoreCase))
    {
        tagName = null;
        goto doctypeCur;
    }

// Between attributes: skip whitespace, flush a pending value-less attribute, detect the tag end.
tagBodyCur:;
    if (r.Chr.IsHtmlWhiteSpace())
    {
        r.Read();
        goto tagBodyCur;
    }
    if (attrName != null && (r.Chr != '=' || attrName == "/"))
    {
        // No '=' follows the name (or the name is the "/" marker): attribute without a value.
        attrs.Add(new Attr(attrName, null));
        attrName = null;
    }
    if (r.IsEof || r.Chr == '>')
    {
        if (r.Chr == '>')
        {
            r.Read();
        }
        goto tagDoneCur;
    }
    if (attrName == null)
    {
        goto attrNameCur;
    }
    // A name is pending and the current character is '=': step over it and read the value.
    r.Read();
    goto attrPreValCur;

// Attribute name: runs until whitespace, '>', '=' or a '/' that is not its first character.
attrNameCur:;
    if (!r.IsEof && !r.Chr.IsHtmlWhiteSpace() && r.Chr != '>' && (r.Chr != '/' || sb.Length == 0) && r.Chr != '=')
    {
        sb.Append(r.Chr);
        r.Read();
        goto attrNameCur;
    }
    attrName = sb.ToString();
    sb.Clear();
    goto tagBodyCur;

// After '=': skip whitespace and detect an opening quote.
attrPreValCur:;
    if (r.Chr.IsHtmlWhiteSpace())
    {
        r.Read();
        goto attrPreValCur;
    }
    if (r.Chr == '"' || r.Chr == '\'')
    {
        attrValQuotes = r.Chr;
        r.Read();
    }
    else
    {
        attrValQuotes = '\0';
    }

// Attribute value: quoted runs to the matching quote, unquoted runs to whitespace or '>'.
attrValCur:;
    if (!r.IsEof && (attrValQuotes != '\0'
        ? r.Chr != attrValQuotes
        : !r.Chr.IsHtmlWhiteSpace() && r.Chr != '>'))
    {
        sb.Append(r.Chr);
        r.Read();
        goto attrValCur;
    }
    if (!r.IsEof && attrValQuotes != '\0')
    {
        // Step over the closing quote.
        r.Read();
    }
    var attrVal = sb.ToString();
    sb.Clear();
    attrs.Add(new Attr(attrName, attrVal));
    attrName = null;
    goto tagBodyCur;

// The tag is complete: report it, then handle elements whose content is raw text.
tagDoneCur:;
    // attrs is cleared and reused for the next tag, and AsReadOnly() is only a live view over
    // the list it wraps; hand the writer a private copy so a retained collection stays intact.
    w.Tag(tagName, new List<Attr>(attrs).AsReadOnly());
    attrs.Clear();
    var tagNameSaved = tagName;
    tagName = null;

    var isScript = tagNameSaved.EqualsIgnoreCase("script");
    var isStyle = tagNameSaved.EqualsIgnoreCase("style");
    var isRawText = isScript
        || isStyle
        || tagNameSaved.EqualsIgnoreCase("iframe")
        || tagNameSaved.EqualsIgnoreCase("textarea")
        || tagNameSaved.EqualsIgnoreCase("title");
    if (r.IsEof || !isRawText)
    {
        goto textCur;
    }

    // Raw content extends to the matching closing tag, or to the end of the text when there is none.
    var fragmBeginIx = r.Ix;
    var closeTagMatch = new Regex(@"</" + tagNameSaved + @"\b", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Singleline)
        .Match(r.Text, fragmBeginIx);
    var fragmEndIx = closeTagMatch.Success ? closeTagMatch.Index : r.Text.Length;
    if (fragmEndIx == fragmBeginIx)
    {
        // Empty content: nothing to report.
        goto textCur;
    }
    r.SkipTo(fragmEndIx);
    var fragm = r.Text.Substring(fragmBeginIx, fragmEndIx - fragmBeginIx);
    if (isScript || isStyle)
    {
        w.Inlay(fragm, isScript ? HtmlInlayType.Script : HtmlInlayType.Style);
    }
    else
    {
        w.Text(fragm);
    }
    goto textCur;

// Comment: reported verbatim, delimiters included; an unterminated comment runs to the end of the text.
commentsCur:;
    var commentsBeginIx = lastTagIx;
    // Start searching right after the opening "<!--" (4 characters).
    var commentsEndIx = r.Text.IndexOf("-->", commentsBeginIx + 4, StringComparison.Ordinal);
    commentsEndIx = commentsEndIx >= 0 ? commentsEndIx + 3 : r.Text.Length;
    if (!r.IsEof)
    {
        r.SkipTo(commentsEndIx);
    }
    w.Inlay(r.Text.Substring(commentsBeginIx, commentsEndIx - commentsBeginIx), HtmlInlayType.Comments);
    goto textCur;

// Doctype: reported verbatim; the pattern also accepts a declaration cut off by the end of the text.
doctypeCur:;
    var doctypeBeginIx = lastTagIx;
    var doctypeMatch = new Regex(@"<!DOCTYPE\s*(([^\s""'>]+|""[^""]*(""|$)|'[^']*('|$))\s*)*(>|$)", RegexOptions.CultureInvariant | RegexOptions.IgnoreCase | RegexOptions.Singleline)
        .Match(r.Text, doctypeBeginIx);
    Trace.Assert(doctypeMatch.Success);
    var doctypeEndIx = doctypeMatch.Index + doctypeMatch.Length;
    if (!r.IsEof)
    {
        r.SkipTo(doctypeEndIx);
    }
    w.Inlay(r.Text.Substring(doctypeBeginIx, doctypeEndIx - doctypeBeginIx), HtmlInlayType.Doctype);
    goto textCur;

eofCur:;
    w.Eof();
}
/// <summary>
/// Replaces HTML character references in <paramref name="textHtml"/> with the characters they denote.
/// </summary>
/// <remarks>
/// Three forms are recognized after '&amp;': a name looked up in <c>HtmlCharCodes.CharNameToCode</c>,
/// "#" followed by characters accepted by <c>IsHtmlEscNum</c> (parsed as a decimal number) and
/// "#x"/"#X" followed by characters accepted by <c>IsHtmlEscHex</c> (parsed as a hexadecimal number).
/// The terminating ';' is optional. An unknown name or a malformed reference is copied to the
/// output unchanged. A numeric reference that does not fit into an int, is negative, or lies
/// outside the Unicode range (as well as U+FFFE and U+FFFF) yields U+FFFD. Code points above
/// U+FFFF are emitted as a surrogate pair; a code in the surrogate range is emitted as a single
/// UTF-16 code unit, so a pair written as two consecutive references is restored.
/// </remarks>
/// <param name="textHtml">Text that may contain character references.</param>
/// <returns>The decoded text.</returns>
public static string Decode(string textHtml)
{
    var r = new CharReader(textHtml);
    var sb = new StringBuilder();
    var escIx = -1;     // index of the '&' of the reference being scanned, -1 when none is pending
    var escCode = -1;   // code of the resolved reference, -1 when it is not resolved

    r.Read();

// Plain text up to the next '&'.
textCur:;
    if (!r.IsEof && r.Chr != '&')
    {
        sb.Append(r.Chr);
        r.Read();
        goto textCur;
    }
    if (r.IsEof)
    {
        goto eofCur;
    }

    // Classify the reference by what follows the '&'.
    escIx = r.Ix;
    r.Read();
    if (r.Chr.IsHtmlEscLit())
    {
        goto escLitCur;
    }
    if (r.Chr != '#')
    {
        goto escDoneCur;
    }
    r.Read();
    if (r.Chr.IsHtmlEscNum())
    {
        goto escNumCur;
    }
    if (r.Chr != 'X' && r.Chr != 'x')
    {
        goto escDoneCur;
    }
    r.Read();
    if (r.Chr.IsHtmlEscHex())
    {
        goto escHexCur;
    }
    goto escDoneCur;

// Named reference: the name starts right after '&'.
escLitCur:;
    if (r.Chr.IsHtmlEscLit())
    {
        r.Read();
        goto escLitCur;
    }
    var escLit = r.Text.Substring(escIx + 1, r.Ix - (escIx + 1));
    if (!HtmlCharCodes.CharNameToCode.TryGetValue(escLit, out escCode))
    {
        // TryGetValue overwrites the out argument on failure; restore the "unresolved" marker.
        escCode = -1;
    }
    goto escDoneCur;

// Decimal reference: the digits start after "&#".
escNumCur:;
    if (r.Chr.IsHtmlEscNum())
    {
        r.Read();
        goto escNumCur;
    }
    var escNum = r.Text.Substring(escIx + 2, r.Ix - (escIx + 2));
    if (!int.TryParse(escNum, NumberStyles.Integer, CultureInfo.InvariantCulture, out escCode)
        || escCode < 0
        || (escCode > 0xFFFD && escCode < 0x10000)
        || escCode > 0x10FFFF)
    {
        escCode = 0xFFFD;
    }
    goto escDoneCur;

// Hexadecimal reference: the digits start after "&#x".
escHexCur:;
    if (r.Chr.IsHtmlEscHex())
    {
        r.Read();
        goto escHexCur;
    }
    var escHex = r.Text.Substring(escIx + 3, r.Ix - (escIx + 3));
    // Eight hex digits with the high bit set parse into a negative int; treat that as out of range
    // instead of letting it fall through as an unresolved reference.
    if (!int.TryParse(escHex, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out escCode)
        || escCode < 0
        || (escCode > 0xFFFD && escCode < 0x10000)
        || escCode > 0x10FFFF)
    {
        escCode = 0xFFFD;
    }

// Emit either the decoded character or the raw text of a reference that was not resolved.
escDoneCur:;
    if (escCode >= 0)
    {
        if (escCode >= 0x10000 && escCode <= 0x10FFFF)
        {
            // Outside the Basic Multilingual Plane: needs two UTF-16 code units.
            sb.Append(char.ConvertFromUtf32(escCode));
        }
        else
        {
            sb.Append((char)escCode);
        }
        escCode = -1;
        if (r.Chr == ';')
        {
            r.Read();
        }
        escIx = -1;
    }
    if (escIx >= 0)
    {
        // Not a valid reference: copy everything scanned since the '&' as is.
        sb.Append(r.Text, escIx, r.Ix - escIx);
        escIx = -1;
    }
    goto textCur;

eofCur:;
    return sb.ToString();
}