C# (CSharp) Supremes.Parsers Parser Examples

Programming Language: C# (CSharp)

Namespace/Package Name: Supremes.Parsers

Class/Type: Parser

Examples at hotexamples.com: 4

C# (CSharp) Supremes.Parsers Parser - 4 examples found. These are the top-rated real-world C# (CSharp) examples of Supremes.Parsers.Parser extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

ParseInput(1)

Example #1

Show file

File: Dcsoup.cs Project: bkzhn/dcsoup

 /// <summary>
 /// Parses an HTML string into a Document by delegating to the supplied Parser.
 /// </summary>
 /// <remarks>
 /// Supplying a different parser, such as a simple XML (non-HTML) parser,
 /// changes how the input is interpreted.
 /// </remarks>
 /// <param name="html">HTML to parse</param>
 /// <param name="baseUri">
 /// The URL the HTML was retrieved from. Used to resolve relative URLs to absolute URLs when they occur
 /// before the HTML declares a
 /// <c>&lt;base href&gt;</c>
 /// tag.
 /// </param>
 /// <param name="parser">
 /// the
 /// <see cref="Parser.XmlParser">parser</see>
 /// that performs the parse.
 /// </param>
 /// <returns>the Document returned by <c>parser.ParseInput</c></returns>
 public static Document Parse(string html, string baseUri, Parser parser)
 {
     Document parsed = parser.ParseInput(html, baseUri);
     return parsed;
 }

Example #2

Show file

File: DataUtil.cs Project: bkzhn/dcsoup

        // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
        // switching the charset midstream when a meta http-equiv tag defines the charset.
        // todo - this is getting gnarly. needs a rewrite.

        /// <summary>
        /// Decodes raw bytes to text and parses the text into a Document using the provided Parser.
        /// </summary>
        /// <remarks>
        /// When <paramref name="charsetName"/> is <c>null</c>, the bytes are first decoded with the default
        /// charset and parsed; if a <c>meta</c> tag declares a different, supported charset the bytes are
        /// decoded again with that charset and re-parsed. A leading U+FEFF character in the decoded text
        /// forces a re-decode with the default charset and takes precedence over everything else.
        /// </remarks>
        /// <param name="byteData">raw document bytes</param>
        /// <param name="charsetName">
        /// character set of the bytes, or <c>null</c> to detect it from the document's <c>meta</c> tags
        /// </param>
        /// <param name="baseUri">base URI handed to the parser</param>
        /// <param name="parser">parser used to build the Document</param>
        /// <returns>the parsed Document</returns>
        /// <exception cref="System.ArgumentException">
        /// if a non-null <paramref name="charsetName"/> is not accepted by <c>Encoding.GetEncoding</c>.
        /// </exception>
        internal static Document ParseByteData(byte[] byteData, string charsetName, string baseUri, Parser parser)
        {
            string docData;
            Document doc = null;
            if (charsetName == null)
            {
                // determine from meta. safe parse as UTF-8
                // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
                docData = defaultCharset.GetString(byteData);
                doc = parser.ParseInput(docData, baseUri);
                Element meta = doc.Select("meta[http-equiv=content-type], meta[charset]").First;
                if (meta != null)
                {
                    // if not found, will keep utf-8 as best attempt
                    string foundCharset;
                    if (meta.HasAttr("http-equiv"))
                    {
                        foundCharset = GetCharsetFromContentType(meta.Attr("content"));
                        if (foundCharset == null && meta.HasAttr("charset"))
                        {
                            foundCharset = meta.Attr("charset");
                        }
                    }
                    else
                    {
                        foundCharset = meta.Attr("charset");
                    }
                    // Compare against the default charset's *name*: comparing the string with the Encoding
                    // object itself is never equal, which forced a pointless re-decode and re-parse even
                    // when the document declared the default charset.
                    bool declaresDefaultCharset = string.Equals(
                        foundCharset, defaultCharset.WebName, StringComparison.OrdinalIgnoreCase);
                    if (!string.IsNullOrEmpty(foundCharset) && !declaresDefaultCharset)
                    {
                        // need to re-decode
                        // strip surrounding whitespace plus any brackets/quotes from the declared name
                        var trimmed = foundCharset
                            .Trim()
                            .Where(c => c != '[' && c != '\"' && c != '\'' && c != ']')
                            .ToArray();
                        charsetName = new string(trimmed);
                        Encoding supportedEncoding = null;
                        try
                        {
                            supportedEncoding = Encoding.GetEncoding(charsetName);
                        }
                        catch(ArgumentException)
                        {
                            // supportedEncoding is null. fallback to default encoding
                        }
                        if (supportedEncoding != null)
                        {
                            // removed when converting
                            // byteData.Rewind();
                            docData = supportedEncoding.GetString(byteData);
                            // discard the first parse so the re-decoded text is parsed below
                            doc = null;
                        }
                    }
                }
            }
            else
            {
                // specified by content type header (or by user on file load)
                Validate.NotEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
                docData = Encoding.GetEncoding(charsetName).GetString(byteData);
            }
            // UTF-8 BOM indicator. takes precedence over everything else. rarely used. re-decodes incase above decoded incorrectly
            if (docData.Length > 0 && docData[0] == '\uFEFF')
            {
                // removed when converting
                // byteData.Rewind();
                docData = defaultCharset.GetString(byteData);
                docData = docData.Substring(1); // drop the leading BOM character
                charsetName = defaultCharset.WebName;
                doc = null;
            }
            if (doc == null)
            {
                // every path that clears doc has also assigned a non-null charsetName
                doc = parser.ParseInput(docData, baseUri);
                doc.OutputSettings.Charset = Encoding.GetEncoding(charsetName);
            }
            return doc;
        }

Example #3

Show file

File: Dcsoup.cs Project: bkzhn/dcsoup

 /// <summary>
 /// Reads an input stream and parses its contents into a Document.
 /// </summary>
 /// <remarks>
 /// The work is delegated to <c>DataUtil.Load</c>. Supplying a different parser,
 /// such as a simple XML (non-HTML) parser, changes how the input is interpreted.
 /// </remarks>
 /// <param name="in">input stream to read. Make sure to close it after parsing.</param>
 /// <param name="charsetName">
 /// (optional) character set of the stream contents. Set to
 /// <c>null</c>
 /// to determine it from a
 /// <c>http-equiv</c>
 /// meta tag, if present, or fall back to
 /// <c>UTF-8</c>
 /// (which is often safe to do).
 /// </param>
 /// <param name="baseUri">
 /// The URL where the HTML was retrieved from, to resolve relative links against.
 /// </param>
 /// <param name="parser">
 /// the
 /// <see cref="Parser.XmlParser">parser</see>
 /// that performs the parse.
 /// </param>
 /// <returns>the Document returned by <c>DataUtil.Load</c></returns>
 /// <exception cref="System.IO.IOException">
 /// if the stream could not be read.
 /// </exception>
 public static Document Parse(Stream @in, string charsetName, string baseUri, Parser parser)
 {
     Document loaded = DataUtil.Load(@in, charsetName, baseUri, parser);
     return loaded;
 }

Example #4

Show file

File: DataUtil.cs Project: bkzhn/dcsoup

 /// <summary>
 /// Parses a Document from an input stream, using the provided Parser.
 /// </summary>
 /// <remarks>
 /// The stream is first read into a byte buffer, which is then handed to
 /// <c>ParseByteData</c> for decoding and parsing.
 /// </remarks>
 /// <param name="in">input stream to parse. You will need to close it.</param>
 /// <param name="charsetName">character set of input</param>
 /// <param name="baseUri">base URI of document, to resolve relative links against</param>
 /// <param name="parser">
 /// the
 /// <see cref="Parser.XmlParser()">parser</see>
 /// that performs the parse.
 /// </param>
 /// <returns>the Document returned by <c>ParseByteData</c></returns>
 /// <exception cref="System.IO.IOException">on IO error</exception>
 public static Document Load(Stream @in, string charsetName, string baseUri, Parser parser)
 {
     return ParseByteData(ReadToByteBuffer(@in), charsetName, baseUri, parser);
 }