コード例 #1
0
ファイル: Parser.cs プロジェクト: wushian/dcsoup
        /// <summary>
        /// Parses HTML into a <see cref="Document"/> using this parser's tree builder.
        /// </summary>
        /// <remarks>
        /// The error list is replaced on every call: a tracking list limited by
        /// <c>maxErrors</c> when <c>CanTrackErrors</c> is true, otherwise a non-tracking list.
        /// </remarks>
        /// <param name="html">HTML content to parse.</param>
        /// <param name="baseUri">Base URI passed to the tree builder together with the input.</param>
        /// <returns>The document returned by the tree builder.</returns>
        public Document ParseInput(string html, string baseUri)
        {
            if (CanTrackErrors)
            {
                errors = ParseErrorList.Tracking(maxErrors);
            }
            else
            {
                errors = ParseErrorList.NoTracking();
            }

            return treeBuilder.Parse(html, baseUri, errors);
        }
コード例 #2
0
ファイル: XmlTreeBuilder.cs プロジェクト: wushian/dcsoup
 /// <summary>
 /// Prepares this builder for an XML parse: runs the base initialisation, switches the
 /// document's output syntax to XML and adds the document to the element stack.
 /// </summary>
 /// <param name="input">Markup to parse; forwarded to the base initialisation.</param>
 /// <param name="baseUri">Base URI; forwarded to the base initialisation.</param>
 /// <param name="errors">Error list; forwarded to the base initialisation.</param>
 internal override void InitialiseParse(string input, string baseUri, ParseErrorList errors)
 {
     base.InitialiseParse(input, baseUri, errors);

     doc.OutputSettings.Syntax = DocumentSyntax.Xml;

     // Unlike HtmlTreeBuilder (where it is not on the stack), the document itself
     // is placed onto the stack here.
     stack.AddLast(doc);
 }
コード例 #3
0
 // current doc we are building into
 // the stack of open elements
 // current base uri, for creating new elements
 // currentToken is used only for error tracking.
 // null when not tracking errors
 /// <summary>
 /// Resets the builder's state for a new parse: validates the arguments, then creates a
 /// fresh document, character reader, tokeniser and element stack, and records the base
 /// URI and error list.
 /// </summary>
 /// <param name="input">Markup to parse; validated as non-null.</param>
 /// <param name="baseUri">Base URI for the new document; validated as non-null.</param>
 /// <param name="errors">Error list stored on the builder and shared with the tokeniser.</param>
 internal virtual void InitialiseParse(string input, string baseUri, ParseErrorList errors)
 {
     Validate.NotNull(input, "String input must not be null");
     Validate.NotNull(baseUri, "BaseURI must not be null");

     this.baseUri = baseUri;
     this.errors = errors;

     doc = new Document(baseUri);
     stack = new DescendableLinkedList<Element>();

     // The tokeniser is constructed over the reader, so the reader is created first.
     reader = new CharacterReader(input);
     tokeniser = new Tokeniser(reader, errors);
 }
コード例 #4
0
ファイル: Tokeniser.cs プロジェクト: wushian/dcsoup
 /// <summary>
 /// Creates a tokeniser over the given character reader.
 /// </summary>
 /// <param name="reader">Reader supplying the HTML input.</param>
 /// <param name="errors">List that receives errors found while tokenising.</param>
 internal Tokeniser(CharacterReader reader, ParseErrorList errors)
 {
     // NOTE(review): the original body carried comments describing further tokeniser state
     // (null replacement character, current state, pending token, character and data
     // buffers, pending tag/doctype/comment, last start tag). Those presumably belong to
     // field declarations that are not visible here; only the two assignments below are
     // performed by this constructor.
     this.errors = errors;
     this.reader = reader;
 }
コード例 #5
0
ファイル: HtmlTreeBuilder.cs プロジェクト: wushian/dcsoup
        /// <summary>
        /// Parses an HTML fragment, optionally in the context of an existing element.
        /// </summary>
        /// <param name="inputFragment">Fragment markup to parse.</param>
        /// <param name="context">Element the fragment is parsed for; may be null.</param>
        /// <param name="baseUri">Base URI used for the parse and for the synthetic root element.</param>
        /// <param name="errors">Error list passed through to the parse initialisation.</param>
        /// <returns>
        /// The child nodes of the synthetic <c>html</c> root when a context element is given,
        /// otherwise the child nodes of the working document.
        /// </returns>
        internal IReadOnlyList<Node> ParseFragment(string inputFragment, Element context, string baseUri, ParseErrorList errors)
        {
            state = HtmlTreeBuilderState.Initial;
            InitialiseParse(inputFragment, baseUri, errors);
            contextElement = context;
            fragmentParsing = true;

            // Without a context element no extra setup is needed: parse straight into the document.
            if (context == null)
            {
                RunParser();
                return doc.ChildNodes;
            }

            // Quirks setup: inherit the quirks mode of the context's owning document, if any.
            var ownerDocument = context.OwnerDocument;
            if (ownerDocument != null)
            {
                doc.QuirksMode = ownerDocument.QuirksMode;
            }

            // Initialise the tokeniser state from the context element's tag name.
            string contextTag = context.TagName;
            if (StringUtil.In(contextTag, "title", "textarea"))
            {
                tokeniser.Transition(TokeniserState.Rcdata);
            }
            else if (StringUtil.In(contextTag, "iframe", "noembed", "noframes", "style", "xmp"))
            {
                tokeniser.Transition(TokeniserState.Rawtext);
            }
            else if (contextTag.Equals("script"))
            {
                tokeniser.Transition(TokeniserState.ScriptData);
            }
            else
            {
                // Default. "noscript" (rawtext if scripting were enabled) and "plaintext"
                // also start in the data state, exactly like every other tag.
                tokeniser.Transition(TokeniserState.Data);
            }

            // Parse into a synthetic <html> root that sits on the stack of open elements.
            Element root = new Element(Tag.ValueOf("html"), baseUri);
            doc.AppendChild(root);
            stack.Push(root);
            ResetInsertionMode();

            // Set the form element to the nearest form on the context's ancestor chain
            // (the context itself included), so form controls are associated correctly.
            Elements contextChain = context.Parents;
            contextChain.Insert(0, context);
            foreach (Element candidate in contextChain)
            {
                if (!(candidate is FormElement))
                {
                    continue;
                }

                formElement = (FormElement)candidate;
                break;
            }

            RunParser();
            return root.ChildNodes;
        }
コード例 #6
0
ファイル: HtmlTreeBuilder.cs プロジェクト: wushian/dcsoup
 // tag searches
 //private static final String[] TagsScriptStyle = new String[]{"script", "style"};
 // the current state
 // original / marked state
 // the current head element
 // the current form element
 // fragment parse context -- could be null even if fragment parsing
 // active (open) formatting elements
 // chars in table to be shifted out
 // if ok to go into frameset
 // if next inserts should be fostered
 // if parsing a fragment of html
 /// <summary>
 /// Parses a complete HTML input after resetting the builder to its initial state and
 /// clearing the <c>baseUriSetFromDoc</c> flag.
 /// </summary>
 /// <param name="input">Markup to parse.</param>
 /// <param name="baseUri">Base URI forwarded to the base parse.</param>
 /// <param name="errors">Error list forwarded to the base parse.</param>
 /// <returns>The document returned by the base implementation.</returns>
 internal override Document Parse(string input, string baseUri, ParseErrorList errors)
 {
     baseUriSetFromDoc = false;
     state = HtmlTreeBuilderState.Initial;

     Document parsed = base.Parse(input, baseUri, errors);
     return parsed;
 }
コード例 #7
0
ファイル: XmlTreeBuilder.cs プロジェクト: wushian/dcsoup
 /// <summary>
 /// Parses an XML fragment and returns the nodes that end up directly under the document.
 /// </summary>
 /// <param name="inputFragment">Fragment markup to parse.</param>
 /// <param name="baseUri">Base URI forwarded to the parse initialisation.</param>
 /// <param name="errors">Error list forwarded to the parse initialisation.</param>
 /// <returns>The child nodes of the document after the parser has run.</returns>
 internal IReadOnlyList<Node> ParseFragment(string inputFragment, string baseUri, ParseErrorList errors)
 {
     InitialiseParse(inputFragment, baseUri, errors);
     RunParser();

     IReadOnlyList<Node> parsedNodes = doc.ChildNodes;
     return parsedNodes;
 }
コード例 #8
0
ファイル: Parser.cs プロジェクト: wushian/dcsoup
        /// <summary>
        /// Utility method that unescapes HTML entities in a string.
        /// </summary>
        /// <remarks>
        /// A throwaway tokeniser is created over the input with error tracking disabled.
        /// </remarks>
        /// <param name="string">HTML-escaped string.</param>
        /// <param name="inAttribute">
        /// Forwarded to the tokeniser; selects strict handling, as used for attribute values.
        /// </param>
        /// <returns>The unescaped string.</returns>
        public static string UnescapeEntities(string @string, bool inAttribute)
        {
            CharacterReader input = new CharacterReader(@string);
            ParseErrorList ignoredErrors = ParseErrorList.NoTracking();

            return new Tokeniser(input, ignoredErrors).UnescapeEntities(inAttribute);
        }
コード例 #9
0
ファイル: Parser.cs プロジェクト: wushian/dcsoup
        /// <summary>
        /// Parses a fragment of XML into a list of nodes, without tracking parse errors.
        /// </summary>
        /// <param name="fragmentXml">The XML fragment to parse.</param>
        /// <param name="baseUri">
        /// Base URI of the document (i.e. original fetch location), for resolving relative URLs.
        /// </param>
        /// <returns>The nodes parsed from the input XML.</returns>
        public static IReadOnlyList<Node> ParseXmlFragment(string fragmentXml, string baseUri)
        {
            XmlTreeBuilder xmlBuilder = new XmlTreeBuilder();
            ParseErrorList ignoredErrors = ParseErrorList.NoTracking();

            return xmlBuilder.ParseFragment(fragmentXml, baseUri, ignoredErrors);
        }
コード例 #10
0
ファイル: Parser.cs プロジェクト: wushian/dcsoup
        /// <summary>
        /// Parses a fragment of HTML into a list of nodes, without tracking parse errors.
        /// </summary>
        /// <remarks>
        /// When a context element is supplied it provides the parsing context.
        /// </remarks>
        /// <param name="fragmentHtml">The HTML fragment to parse.</param>
        /// <param name="context">
        /// (Optional) the element this fragment is being parsed for (i.e. for inner HTML). It
        /// provides stack context (for implicit element creation).
        /// </param>
        /// <param name="baseUri">
        /// Base URI of the document (i.e. original fetch location), for resolving relative URLs.
        /// </param>
        /// <returns>
        /// The nodes parsed from the input HTML. The context element, if supplied, is not modified.
        /// </returns>
        public static IReadOnlyList<Node> ParseFragment(string fragmentHtml, Element context, string baseUri)
        {
            HtmlTreeBuilder htmlBuilder = new HtmlTreeBuilder();
            ParseErrorList ignoredErrors = ParseErrorList.NoTracking();

            return htmlBuilder.ParseFragment(fragmentHtml, context, baseUri, ignoredErrors);
        }
コード例 #11
0
ファイル: Parser.cs プロジェクト: wushian/dcsoup
        // utility methods

        /// <summary>
        /// Parses HTML into a <see cref="Document"/> with a new HTML tree builder and no error tracking.
        /// </summary>
        /// <param name="html">HTML to parse.</param>
        /// <param name="baseUri">
        /// Base URI of the document (i.e. original fetch location), for resolving relative URLs.
        /// </param>
        /// <returns>The parsed document.</returns>
        public static Document Parse(string html, string baseUri)
        {
            TreeBuilder htmlBuilder = new HtmlTreeBuilder();
            ParseErrorList ignoredErrors = ParseErrorList.NoTracking();

            Document parsed = htmlBuilder.Parse(html, baseUri, ignoredErrors);
            return parsed;
        }
コード例 #12
0
 /// <summary>
 /// Runs a full parse: initialises the builder for the given input, runs the parser and
 /// returns the builder's document.
 /// </summary>
 /// <param name="input">Markup to parse.</param>
 /// <param name="baseUri">Base URI forwarded to the parse initialisation.</param>
 /// <param name="errors">Error list forwarded to the parse initialisation.</param>
 /// <returns>The builder's <c>doc</c> after the parser has run.</returns>
 internal virtual Document Parse(string input, string baseUri, ParseErrorList errors)
 {
     // Set up the per-parse state first, then drive the parser over the input.
     InitialiseParse(input, baseUri, errors);
     RunParser();

     return doc;
 }
コード例 #13
0
 /// <summary>
 /// Convenience overload that parses the input without tracking parse errors.
 /// </summary>
 /// <param name="input">Markup to parse.</param>
 /// <param name="baseUri">Base URI forwarded to the three-argument overload.</param>
 /// <returns>The document returned by the three-argument overload.</returns>
 internal Document Parse(string input, string baseUri)
 {
     ParseErrorList ignoredErrors = ParseErrorList.NoTracking();

     return Parse(input, baseUri, ignoredErrors);
 }