Пример #1
0
        // Reads the bytes into a buffer first, then decodes them with the appropriate charset. Done this way to support
        // switching the charset midstream when a meta http-equiv tag defines the charset.
        // todo - this is getting gnarly. needs a rewrite.

        /// <summary>
        /// Decodes raw document bytes into text and parses the text into a <see cref="Document"/>.
        /// </summary>
        /// <remarks>
        /// When <paramref name="charsetName"/> is <c>null</c> the bytes are first decoded with the default
        /// charset and parsed; if that parse exposes a <c>&lt;meta&gt;</c> charset declaration naming a
        /// different, supported encoding, the bytes are decoded and parsed again with that encoding.
        /// A leading byte order mark (U+FEFF) overrides every other choice and forces the default charset.
        /// </remarks>
        /// <param name="byteData">Raw bytes of the document.</param>
        /// <param name="charsetName">Name of the encoding to decode with, or <c>null</c> to detect it from the markup.</param>
        /// <param name="baseUri">Base URI handed to the parser unchanged.</param>
        /// <param name="parser">Parser used to turn the decoded text into a document.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="ArgumentException">
        /// A non-null <paramref name="charsetName"/> is not a valid encoding name (thrown by <c>Encoding.GetEncoding</c>).
        /// </exception>
        internal static Document ParseByteData(byte[] byteData, string charsetName, string baseUri, Parser parser)
        {
            const char ByteOrderMark = '\uFEFF';

            string docData;
            Document doc = null;
            if (charsetName == null)
            {
                // Charset unknown: decode with the default charset as a best attempt, then look for
                // <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">.
                docData = defaultCharset.GetString(byteData);
                doc = parser.ParseInput(docData, baseUri);
                Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").First;
                if (meta != null)
                {
                    // If nothing usable is found, the default-charset parse above is kept.
                    string foundCharset;
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                        if (foundCharset == null && meta.HasAttr("charset"))
                        {
                            foundCharset = meta.Attr("charset");
                        }
                    }
                    else
                    {
                        foundCharset = meta.Attr("charset");
                    }

                    if (!string.IsNullOrEmpty(foundCharset))
                    {
                        // Strip whitespace, brackets and quotes that sometimes wrap the declared name.
                        var cleanedName = new string(foundCharset
                            .Trim()
                            .Where(c => c != '[' && c != '\"' && c != '\'' && c != ']')
                            .ToArray());

                        Encoding supportedEncoding = null;
                        try
                        {
                            supportedEncoding = Encoding.GetEncoding(cleanedName);
                        }
                        catch (ArgumentException)
                        {
                            // Unknown or malformed name: supportedEncoding stays null and the default decoding is kept.
                        }

                        // Re-decode only when the declared encoding really differs from the default one.
                        // (Comparing the declared name string to the Encoding object itself is never equal,
                        // which forced a redundant second parse for documents declaring the default charset.)
                        if (supportedEncoding != null
                            && !string.Equals(supportedEncoding.WebName, defaultCharset.WebName, StringComparison.OrdinalIgnoreCase))
                        {
                            charsetName = cleanedName;
                            docData = supportedEncoding.GetString(byteData);
                            doc = null; // force a fresh parse of the re-decoded text below
                        }
                    }
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
                docData = Encoding.GetEncoding(charsetName).GetString(byteData);
            }

            // BOM indicator. Takes precedence over everything else; rarely used. Re-decodes in case the text
            // above was decoded with a different encoding.
            if (docData.Length > 0 && docData[0] == ByteOrderMark)
            {
                docData = defaultCharset.GetString(byteData);
                docData = docData.Substring(1); // drop the BOM character itself
                charsetName = defaultCharset.WebName;
                doc = null;
            }

            if (doc == null)
            {
                // Reached only after a re-decode or an explicit charset, so charsetName is non-null here.
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings.Charset = Encoding.GetEncoding(charsetName);
            }
            return doc;
        }
Пример #2
0
 /// <summary>
 /// Builds a <see cref="Document"/> from markup text by delegating to the supplied parser.
 /// </summary>
 /// <remarks>
 /// The supplied parser decides how the input is interpreted, so an alternate parser,
 /// such as a simple XML (non-HTML) parser, can be passed in.
 /// </remarks>
 /// <param name="html">Markup text to parse.</param>
 /// <param name="baseUri">
 /// The URL the markup was retrieved from; handed to the parser unchanged. Used to resolve relative URLs
 /// to absolute URLs when they occur before the markup declares a
 /// <c>&lt;base href&gt;</c>
 /// tag.
 /// </param>
 /// <param name="parser">
 /// The parser to run, for example the
 /// <see cref="Parser.XmlParser">XML parser</see>.
 /// </param>
 /// <returns>The document returned by <c>parser.ParseInput</c>.</returns>
 public static Document Parse(string html, string baseUri, Parser parser)
 {
     Document parsed = parser.ParseInput(html, baseUri);
     return parsed;
 }
Пример #3
0
        // Reads the bytes into a buffer first, then decodes them with the appropriate charset. Done this way to support
        // switching the charset midstream when a meta http-equiv tag defines the charset.
        // todo - this is getting gnarly. needs a rewrite.

        /// <summary>
        /// Decodes raw document bytes into text and parses the text into a <see cref="Document"/>.
        /// </summary>
        /// <remarks>
        /// When <paramref name="charsetName"/> is <c>null</c> the bytes are first decoded with the default
        /// charset and parsed; if that parse exposes a <c>&lt;meta&gt;</c> charset declaration naming a
        /// different, supported encoding, the bytes are decoded and parsed again with that encoding.
        /// A leading byte order mark (U+FEFF) overrides every other choice and forces the default charset.
        /// </remarks>
        /// <param name="byteData">Raw bytes of the document.</param>
        /// <param name="charsetName">Name of the encoding to decode with, or <c>null</c> to detect it from the markup.</param>
        /// <param name="baseUri">Base URI handed to the parser unchanged.</param>
        /// <param name="parser">Parser used to turn the decoded text into a document.</param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="ArgumentException">
        /// A non-null <paramref name="charsetName"/> is not a valid encoding name (thrown by <c>Encoding.GetEncoding</c>).
        /// </exception>
        internal static Document ParseByteData(byte[] byteData, string charsetName, string baseUri, Parser parser)
        {
            const char ByteOrderMark = '\uFEFF';

            string   docData;
            Document doc = null;

            if (charsetName == null)
            {
                // Charset unknown: decode with the default charset as a best attempt, then look for
                // <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">.
                docData = defaultCharset.GetString(byteData);
                doc     = parser.ParseInput(docData, baseUri);
                Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").First;
                if (meta != null)
                {
                    // If nothing usable is found, the default-charset parse above is kept.
                    string foundCharset;
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                        if (foundCharset == null && meta.HasAttr("charset"))
                        {
                            foundCharset = meta.Attr("charset");
                        }
                    }
                    else
                    {
                        foundCharset = meta.Attr("charset");
                    }

                    if (!string.IsNullOrEmpty(foundCharset))
                    {
                        // Strip whitespace, brackets and quotes that sometimes wrap the declared name.
                        var trimmed = foundCharset
                                      .Trim()
                                      .Where(c => c != '[' && c != '\"' && c != '\'' && c != ']')
                                      .ToArray();
                        string declaredName = new string(trimmed);

                        Encoding supportedEncoding = null;
                        try
                        {
                            supportedEncoding = Encoding.GetEncoding(declaredName);
                        }
                        catch (ArgumentException)
                        {
                            // Unknown or malformed name: supportedEncoding stays null and the default decoding is kept.
                        }

                        // A string never equals an Encoding instance, so comparing the declared name with
                        // defaultCharset directly always reported "different" and re-parsed needlessly.
                        // Compare the resolved encoding names instead.
                        bool differsFromDefault = supportedEncoding != null
                                                  && !string.Equals(supportedEncoding.WebName, defaultCharset.WebName, StringComparison.OrdinalIgnoreCase);
                        if (differsFromDefault)
                        {
                            charsetName = declaredName;
                            docData     = supportedEncoding.GetString(byteData);
                            doc         = null; // force a fresh parse of the re-decoded text below
                        }
                    }
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
                docData = Encoding.GetEncoding(charsetName).GetString(byteData);
            }

            // BOM indicator. Takes precedence over everything else; rarely used. Re-decodes in case the text
            // above was decoded with a different encoding.
            if (docData.Length > 0 && docData[0] == ByteOrderMark)
            {
                docData     = defaultCharset.GetString(byteData);
                docData     = docData.Substring(1); // drop the BOM character itself
                charsetName = defaultCharset.WebName;
                doc         = null;
            }

            if (doc == null)
            {
                // Reached only after a re-decode or an explicit charset, so charsetName is non-null here.
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings.Charset = Encoding.GetEncoding(charsetName);
            }
            return doc;
        }