Example #1
0
        /*
         * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
         * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
         * performance may be bad. We can add more encoders for common character sets that are impacted by performance
         * issues on Android if required.
         *
         * Benchmarks:     *
         * OLD toHtml() impl v New (fastpath) in millis
         * Wiki: 1895, 16
         * CNN: 6378, 55
         * Alterslash: 3013, 28
         * Jsoup: 167, 2
         */
        private static bool CanEncode(Entities.CoreCharset charset, char c, System.Text.Encoding fallback)
        {
            // todo add more charset tests if impacted by Android's bad perf in canEncode
            switch (charset)
            {
            case Entities.CoreCharset.ascii: {
                return(c < 0x80);
            }

            case Entities.CoreCharset.utf: {
                // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
                return(true);
            }

            default: {
                return(fallback.CanEncode(c));
            }
            }
        }
Example #2
0
        // this method is ugly, and does a lot. but other breakups cause rescanning and stringbuilder generations
        internal static void Escape(StringBuilder accum, String str, OutputSettings outputSettings, bool inAttribute
                                    , bool normaliseWhite, bool stripLeadingWhite)
        {
            bool lastWasWhite    = false;
            bool reachedNonWhite = false;

            Entities.EscapeMode        escapeMode  = outputSettings.EscapeMode();
            System.Text.Encoding       encoder     = outputSettings.Charset();
            Entities.CoreCharset       coreCharset = GetCoreCharsetByName(outputSettings.Charset().Name());
            IDictionary <char, String> map         = escapeMode.GetMap();
            int length = str.Length;
            int codePoint;

            for (int offset = 0; offset < length; offset += iText.IO.Util.TextUtil.CharCount(codePoint))
            {
                codePoint = str.CodePointAt(offset);
                if (normaliseWhite)
                {
                    if (iText.StyledXmlParser.Jsoup.Helper.StringUtil.IsWhitespace(codePoint))
                    {
                        if ((stripLeadingWhite && !reachedNonWhite) || lastWasWhite)
                        {
                            continue;
                        }
                        accum.Append(' ');
                        lastWasWhite = true;
                        continue;
                    }
                    else
                    {
                        lastWasWhite    = false;
                        reachedNonWhite = true;
                    }
                }
                // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
                if (codePoint < iText.IO.Util.TextUtil.CHARACTER_MIN_SUPPLEMENTARY_CODE_POINT)
                {
                    char c = (char)codePoint;
                    // html specific and required escapes:
                    switch (c)
                    {
                    case '&': {
                        accum.Append("&amp;");
                        break;
                    }

                    case (char)0xA0: {
                        if (escapeMode != Entities.EscapeMode.xhtml)
                        {
                            accum.Append("&nbsp;");
                        }
                        else
                        {
                            accum.Append("&#xa0;");
                        }
                        break;
                    }

                    case '<': {
                        // escape when in character data or when in a xml attribue val; not needed in html attr val
                        if (!inAttribute || escapeMode == Entities.EscapeMode.xhtml)
                        {
                            accum.Append("&lt;");
                        }
                        else
                        {
                            accum.Append(c);
                        }
                        break;
                    }

                    case '>': {
                        if (!inAttribute)
                        {
                            accum.Append("&gt;");
                        }
                        else
                        {
                            accum.Append(c);
                        }
                        break;
                    }

                    case '"': {
                        if (inAttribute)
                        {
                            accum.Append("&quot;");
                        }
                        else
                        {
                            accum.Append(c);
                        }
                        break;
                    }

                    default: {
                        if (CanEncode(coreCharset, c, encoder))
                        {
                            accum.Append(c);
                        }
                        else
                        {
                            if (map.ContainsKey(c))
                            {
                                accum.Append('&').Append(map.Get(c)).Append(';');
                            }
                            else
                            {
                                accum.Append("&#x").Append(JavaUtil.IntegerToHexString(codePoint)).Append(';');
                            }
                        }
                        break;
                    }
                    }
                }
                else
                {
                    String c = new String(iText.IO.Util.TextUtil.ToChars(codePoint));
                    if (encoder.CanEncode(c))
                    {
                        // uses fallback encoder for simplicity
                        accum.Append(c);
                    }
                    else
                    {
                        accum.Append("&#x").Append(JavaUtil.IntegerToHexString(codePoint)).Append(';');
                    }
                }
            }
        }