예제 #1
0
        /// <summary>Reads an input stream into a byte buffer and parses it into a Document with the supplied Parser.</summary>
        /// <param name="in">input stream to parse. This method does not close it; the caller must.</param>
        /// <param name="charsetName">character set of the input</param>
        /// <param name="baseUri">base URI of the document, used to resolve relative links</param>
        /// <param name="parser">
        /// alternate
        /// <see cref="iText.StyledXmlParser.Jsoup.Parser.Parser.XmlParser()">parser</see>
        /// to use.
        /// </param>
        /// <returns>the parsed Document</returns>
        /// <exception cref="System.IO.IOException">on IO error</exception>
        public static Document Load(Stream @in, String charsetName, String baseUri,
                                    iText.StyledXmlParser.Jsoup.Parser.Parser parser)
        {
            // Buffer the stream first, then hand the bytes to the charset-aware parse step.
            return ParseByteData(ReadToByteBuffer(@in), charsetName, baseUri, parser);
        }
예제 #2
0
        /// <summary>Decodes buffered bytes with the appropriate charset and parses them into a Document.</summary>
        /// <remarks>
        /// The bytes are buffered first and decoded afterwards so that the charset can be switched when the
        /// document itself declares one (a meta tag or an XML declaration). Charset resolution order:
        /// a byte-order mark, then the <paramref name="charsetName"/> argument, then a declaration found by a
        /// first parse using the default charset.
        /// </remarks>
        /// <param name="byteData">buffered input bytes; rewound and decoded a second time if a different charset is declared</param>
        /// <param name="charsetName">character set of the input, or <see langword="null"/> to detect it from the document</param>
        /// <param name="baseUri">base URI of the document, passed through to the parser</param>
        /// <param name="parser">parser used to build the Document</param>
        /// <returns>the parsed Document</returns>
        // todo - this is getting gnarly. needs a rewrite.
        internal static Document ParseByteData(ByteBuffer byteData, String charsetName, String baseUri, iText.StyledXmlParser.Jsoup.Parser.Parser
                                               parser)
        {
            String   docData;
            Document doc = null;

            // look for BOM - overrides any other header or input
            charsetName = DetectCharsetFromBom(byteData, charsetName);
            if (charsetName == null)
            {
                // determine from meta. safe first parse with the default charset
                // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
                docData = EncodingUtil.GetEncoding(defaultCharset).Decode(byteData).ToString();
                doc     = parser.ParseInput(docData, baseUri);
                iText.StyledXmlParser.Jsoup.Nodes.Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]"
                                                                            ).First();
                String foundCharset = null;
                // if not found, will keep the default charset as best attempt
                if (meta != null)
                {
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                    }
                    if (foundCharset == null && meta.HasAttr("charset"))
                    {
                        foundCharset = meta.Attr("charset");
                    }
                }
                // look for <?xml encoding='ISO-8859-1'?>
                // Guard on the child count first: a document without any child nodes (e.g. empty input
                // handled by an XML parser) must fall through to the default charset instead of failing
                // on the index lookup.
                if (foundCharset == null && doc.ChildNodeSize() > 0 && doc.ChildNode(0) is XmlDeclaration)
                {
                    XmlDeclaration prolog = (XmlDeclaration)doc.ChildNode(0);
                    if (prolog.Name().Equals("xml"))
                    {
                        foundCharset = prolog.Attr("encoding");
                    }
                }
                foundCharset = ValidateCharset(foundCharset);
                if (foundCharset != null && !foundCharset.Equals(defaultCharset))
                {
                    // need to re-decode: strip whitespace and quotes from the declared name, then decode
                    // the same bytes again from the start with that charset
                    foundCharset = iText.IO.Util.StringUtil.ReplaceAll(foundCharset.Trim(), "[\"']", "");
                    charsetName  = foundCharset;
                    byteData.Rewind();
                    docData = EncodingUtil.GetEncoding(foundCharset).Decode(byteData).ToString();
                    // discard the first-pass document so it is re-parsed below
                    doc     = null;
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"
                                  );
                docData = EncodingUtil.GetEncoding(charsetName).Decode(byteData).ToString();
            }
            if (doc == null)
            {
                // either the charset was known up front, or the first pass found a different one
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings().Charset(charsetName);
            }
            return(doc);
        }
예제 #3
0
 /// <summary>Parses an HTML string into a Document with the supplied Parser.</summary>
 /// <remarks>
 /// The work is delegated entirely to the given parser, so an alternate parser, such as a simple XML
 /// (non-HTML) parser, can be supplied.
 /// </remarks>
 /// <param name="html">HTML to parse</param>
 /// <param name="baseUri">
 /// The URL the HTML was retrieved from. Used to resolve relative URLs to absolute URLs that occur
 /// before the HTML declares a
 /// <c>&lt;base href&gt;</c>
 /// tag.
 /// </param>
 /// <param name="parser">
 /// alternate
 /// <see cref="iText.StyledXmlParser.Jsoup.Parser.Parser.XmlParser()">parser</see>
 /// to use.
 /// </param>
 /// <returns>the Document produced by the parser</returns>
 public static Document Parse(String html, String baseUri,
                              iText.StyledXmlParser.Jsoup.Parser.Parser parser)
 {
     Document parsed = parser.ParseInput(html, baseUri);
     return parsed;
 }
예제 #4
0
 /// <summary>Reads an input stream and parses its contents into a Document.</summary>
 /// <remarks>
 /// Delegates to <c>DataUtil.Load</c>. An alternate parser, such as a simple XML (non-HTML) parser,
 /// can be supplied.
 /// </remarks>
 /// <param name="in">input stream to read. Make sure to close it after parsing.</param>
 /// <param name="charsetName">
 /// (optional) character set of the stream contents. Set to
 /// <see langword="null"/>
 /// to determine it from an
 /// <c>http-equiv</c>
 /// meta tag, if present, or fall back to
 /// <c>UTF-8</c>
 /// (which is often safe to do).
 /// </param>
 /// <param name="baseUri">The URL the HTML was retrieved from, to resolve relative links against.</param>
 /// <param name="parser">
 /// alternate
 /// <see cref="iText.StyledXmlParser.Jsoup.Parser.Parser.XmlParser()">parser</see>
 /// to use.
 /// </param>
 /// <returns>the Document produced from the stream</returns>
 public static Document Parse(Stream @in, String charsetName, String baseUri,
                              iText.StyledXmlParser.Jsoup.Parser.Parser parser)
 {
     Document loaded = DataUtil.Load(@in, charsetName, baseUri, parser);
     return loaded;
 }