Example #1
0
 /// <summary>
 /// Parses an HTML string into a <see cref="Document"/> by handing it to the supplied parser.
 /// </summary>
 /// <remarks>
 /// The parsing itself is delegated entirely to <paramref name="parser"/>, so an alternate
 /// parser, such as a simple XML (non-HTML) parser, can be supplied.
 /// </remarks>
 /// <param name="html">HTML to parse</param>
 /// <param name="baseUri">
 /// The URL where the HTML was retrieved from. It is passed through to the parser and is used to
 /// resolve relative URLs to absolute URLs that occur before the HTML declares a
 /// <c>&lt;base href&gt;</c>
 /// tag.
 /// </param>
 /// <param name="parser">
 /// alternate
 /// <see cref="Parser.XmlParser">parser</see>
 /// to use.
 /// </param>
 /// <returns>the Document produced by the parser for the given input</returns>
 public static Document Parse(string html, string baseUri, Parser parser)
 {
     Document parsed = parser.ParseInput(html, baseUri);
     return parsed;
 }
Example #2
0
        // Decodes a fully-buffered document and parses it with the supplied parser.
        //
        // The bytes are buffered first and decoded afterwards so that the charset can be switched
        // once a meta http-equiv / meta charset tag reveals the real encoding.
        //
        // byteData    - raw bytes of the document.
        // charsetName - charset to decode with, or null to detect it from a meta tag
        //               (falling back to defaultCharset). Must not be empty when non-null.
        // baseUri     - passed through to the parser.
        // parser      - parser used to build the Document.
        //
        // Returns the parsed Document. Whenever the document is parsed after the charset has been
        // settled, its OutputSettings.Charset is set to the encoding that was used.
        // Encoding.GetEncoding throws ArgumentException when an explicitly supplied charsetName
        // is not a known encoding.
        // todo - this is getting gnarly. needs a rewrite.

        internal static Document ParseByteData(byte[] byteData, string charsetName, string baseUri, Parser parser)
        {
            string docData;
            Document doc = null;
            if (charsetName == null)
            {
                // determine from meta. safe parse as UTF-8
                // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
                docData = defaultCharset.GetString(byteData);
                doc = parser.ParseInput(docData, baseUri);
                Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").First;
                if (meta != null)
                {
                    // if not found, will keep the default charset as best attempt
                    string foundCharset;
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                        if (foundCharset == null && meta.HasAttr("charset"))
                        {
                            foundCharset = meta.Attr("charset");
                        }
                    }
                    else
                    {
                        foundCharset = meta.Attr("charset");
                    }
                    if (!string.IsNullOrEmpty(foundCharset))
                    {
                        // strip whitespace, quotes and brackets that sometimes wrap the declared name
                        var trimmed = foundCharset
                            .Trim()
                            .Where(c => c != '[' && c != '\"' && c != '\'' && c != ']')
                            .ToArray();
                        string declaredCharset = new string(trimmed);
                        // Compare names, not objects: a string is never Equal to an Encoding instance,
                        // so the previous check always forced a needless re-decode and re-parse.
                        bool isDefault = declaredCharset.Equals(defaultCharset.WebName, StringComparison.OrdinalIgnoreCase);
                        if (declaredCharset.Length > 0 && !isDefault)
                        {
                            Encoding supportedEncoding = null;
                            try
                            {
                                supportedEncoding = Encoding.GetEncoding(declaredCharset);
                            }
                            catch (ArgumentException)
                            {
                                // unknown charset name: supportedEncoding stays null and the
                                // document already parsed with the default charset is kept
                            }
                            if (supportedEncoding != null)
                            {
                                // need to re-decode; only now is the declared name known to be usable
                                charsetName = declaredCharset;
                                docData = supportedEncoding.GetString(byteData);
                                doc = null;
                            }
                        }
                    }
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
                docData = Encoding.GetEncoding(charsetName).GetString(byteData);
            }
            // UTF-8 BOM indicator. takes precedence over everything else. rarely used.
            // Detected on the raw bytes (EF BB BF): inspecting the decoded text misses the BOM when
            // the bytes were decoded with another charset, and mistakes a UTF-16 BOM for a UTF-8 one.
            bool hasUtf8Bom = byteData.Length >= 3
                && byteData[0] == 0xEF
                && byteData[1] == 0xBB
                && byteData[2] == 0xBF;
            if (hasUtf8Bom)
            {
                // re-decode in case the above decoded incorrectly, then drop the BOM character
                docData = defaultCharset.GetString(byteData);
                if (docData.Length > 0 && docData[0] == '\uFEFF')
                {
                    docData = docData.Substring(1);
                }
                charsetName = defaultCharset.WebName;
                doc = null;
            }
            else if (doc == null && docData.Length > 0 && docData[0] == '\uFEFF')
            {
                // a BOM decoded by a non-UTF-8 charset (e.g. UTF-16): keep the decoding, drop the marker
                docData = docData.Substring(1);
            }
            if (doc == null)
            {
                // charsetName is always a resolvable name here: either supplied by the caller,
                // resolved from the meta tag above, or the default charset's WebName
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings.Charset = Encoding.GetEncoding(charsetName);
            }
            return doc;
        }
Example #3
0
 /// <summary>
 /// Reads an input stream and parses its contents into a <see cref="Document"/>.
 /// </summary>
 /// <remarks>
 /// The work is delegated to <c>DataUtil.Load</c>. An alternate parser, such as a simple XML
 /// (non-HTML) parser, can be supplied.
 /// </remarks>
 /// <param name="in">input stream to read. Make sure to close it after parsing.</param>
 /// <param name="charsetName">
 /// (optional) character set of file contents. Set to
 /// <c>null</c>
 /// to determine from
 /// <c>http-equiv</c>
 /// meta tag, if present, or fall back to
 /// <c>UTF-8</c>
 /// (which is often safe to do).
 /// </param>
 /// <param name="baseUri">
 /// The URL where the HTML was retrieved from, to resolve relative links against.
 /// </param>
 /// <param name="parser">
 /// alternate
 /// <see cref="Parser.XmlParser">parser</see>
 /// to use.
 /// </param>
 /// <returns>the Document built from the stream contents</returns>
 /// <exception cref="System.IO.IOException">
 /// if the file could not be found, or read, or if the charsetName is invalid.
 /// </exception>
 public static Document Parse(Stream @in, string charsetName, string baseUri, Parser parser)
 {
     Document loaded = DataUtil.Load(@in, charsetName, baseUri, parser);
     return loaded;
 }
Example #4
0
 /// <summary>
 /// Parses a Document from an input stream, using the provided Parser.
 /// </summary>
 /// <remarks>
 /// The stream is first read into a byte buffer via <c>ReadToByteBuffer</c>; the buffered bytes
 /// are then decoded and parsed by <c>ParseByteData</c>.
 /// </remarks>
 /// <param name="in">input stream to parse. You will need to close it.</param>
 /// <param name="charsetName">character set of input</param>
 /// <param name="baseUri">base URI of document, to resolve relative links against</param>
 /// <param name="parser">
 /// alternate
 /// <see cref="Parser.XmlParser()">parser</see>
 /// to use.
 /// </param>
 /// <returns>Document</returns>
 /// <exception cref="System.IO.IOException">on IO error</exception>
 public static Document Load(Stream @in, string charsetName, string baseUri, Parser parser)
 {
     return ParseByteData(ReadToByteBuffer(@in), charsetName, baseUri, parser);
 }