/*
 * Fast-path substitute for Encoder.canEncode. It drastically improves performance on Android
 * post JellyBean; after KitKat the platform implementation of canEncode degrades to the point
 * of being useless. Only ASCII and UTF are short-circuited here, so performance for other
 * character sets may still be bad. More encoders can be added for common character sets if
 * they turn out to be impacted by the same Android performance issue.
 *
 * Benchmarks, OLD toHtml() impl v New (fastpath), in millis:
 *   Wiki:       1895 v 16
 *   CNN:        6378 v 55
 *   Alterslash: 3013 v 28
 *   Jsoup:       167 v 2
 *
 * charset  - the core charset classification that selects the fast path
 * c        - the single char to test
 * fallback - encoding that is queried when no fast path exists for the charset
 * returns true when the char can be written as-is in the target charset
 */
private static bool CanEncode(Entities.CoreCharset charset, char c, System.Text.Encoding fallback) {
    // todo add more charset tests if impacted by Android's bad perf in canEncode
    if (charset == Entities.CoreCharset.ascii) {
        // everything below 0x80 is representable in ASCII
        return c < 0x80;
    }
    if (charset == Entities.CoreCharset.utf) {
        // the real test is !(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)),
        // but the caller has already checked for surrogates before getting here
        return true;
    }
    // no fast path for this charset: ask the actual encoding
    return fallback.CanEncode(c);
}
// Appends an escaped copy of str to accum, walking the input once, code point by code point.
//
// This method is ugly, and does a lot, but other breakups cause rescanning and StringBuilder
// generations.
//
// accum             - builder that receives the escaped text
// str               - text to escape
// outputSettings    - supplies the escape mode (entity map) and the target charset
// inAttribute       - true when the text is an attribute value: '"' is escaped, '>' is not, and
//                     '<' is escaped only in xhtml mode
// normaliseWhite    - when true, each run of characters reported by StringUtil.IsWhitespace is
//                     collapsed to a single ' '
// stripLeadingWhite - when true (and normaliseWhite is true), whitespace before the first
//                     non-whitespace character is dropped entirely
internal static void Escape(StringBuilder accum, String str, OutputSettings outputSettings, bool inAttribute
    , bool normaliseWhite, bool stripLeadingWhite) {
    bool lastWasWhite = false;
    bool reachedNonWhite = false;
    Entities.EscapeMode escapeMode = outputSettings.EscapeMode();
    System.Text.Encoding encoder = outputSettings.Charset();
    Entities.CoreCharset coreCharset = GetCoreCharsetByName(outputSettings.Charset().Name());
    IDictionary<char, String> map = escapeMode.GetMap();
    int length = str.Length;
    int codePoint;
    // advance by one or two chars depending on whether the code point was a surrogate pair
    for (int offset = 0; offset < length; offset += iText.IO.Util.TextUtil.CharCount(codePoint)) {
        codePoint = str.CodePointAt(offset);
        if (normaliseWhite) {
            if (iText.StyledXmlParser.Jsoup.Helper.StringUtil.IsWhitespace(codePoint)) {
                if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite) {
                    // leading whitespace being stripped, or a repeat inside a whitespace run
                    continue;
                }
                accum.Append(' ');
                lastWasWhite = true;
                continue;
            }
            else {
                lastWasWhite = false;
                reachedNonWhite = true;
            }
        }
        // surrogate pairs, split implementation for efficiency on single char common case
        // (saves creating strings, char[]):
        if (codePoint < iText.IO.Util.TextUtil.CHARACTER_MIN_SUPPLEMENTARY_CODE_POINT) {
            char c = (char)codePoint;
            // html specific and required escapes:
            switch (c) {
                case '&': {
                    accum.Append("&amp;");
                    break;
                }

                case (char)0xA0: {
                    // no-break space: named entity in html modes, numeric reference in xhtml
                    if (escapeMode != Entities.EscapeMode.xhtml) {
                        accum.Append("&nbsp;");
                    }
                    else {
                        accum.Append("&#xa0;");
                    }
                    break;
                }

                case '<': {
                    // escape when in character data or when in a xml attribute val; not needed in html attr val
                    if (!inAttribute || escapeMode == Entities.EscapeMode.xhtml) {
                        accum.Append("&lt;");
                    }
                    else {
                        accum.Append(c);
                    }
                    break;
                }

                case '>': {
                    if (!inAttribute) {
                        accum.Append("&gt;");
                    }
                    else {
                        accum.Append(c);
                    }
                    break;
                }

                case '"': {
                    if (inAttribute) {
                        accum.Append("&quot;");
                    }
                    else {
                        accum.Append(c);
                    }
                    break;
                }

                default: {
                    if (CanEncode(coreCharset, c, encoder)) {
                        accum.Append(c);
                    }
                    else {
                        // single lookup: prefer the escape mode's named entity, otherwise fall
                        // back to a hexadecimal numeric character reference
                        String entityName;
                        if (map.TryGetValue(c, out entityName)) {
                            accum.Append('&').Append(entityName).Append(';');
                        }
                        else {
                            accum.Append("&#x").Append(JavaUtil.IntegerToHexString(codePoint)).Append(';');
                        }
                    }
                    break;
                }
            }
        }
        else {
            String supplementary = new String(iText.IO.Util.TextUtil.ToChars(codePoint));
            // uses fallback encoder for simplicity
            if (encoder.CanEncode(supplementary)) {
                accum.Append(supplementary);
            }
            else {
                accum.Append("&#x").Append(JavaUtil.IntegerToHexString(codePoint)).Append(';');
            }
        }
    }
}