/// <summary>
/// Parses an HTML string into a <see cref="Document"/> using this parser's tree builder.
/// </summary>
/// <remarks>
/// The parser's error list is replaced on every call: it is created with
/// <c>ParseErrorList.Tracking(maxErrors)</c> when <c>CanTrackErrors</c> is set,
/// and with <c>ParseErrorList.NoTracking()</c> otherwise.
/// </remarks>
/// <param name="html">HTML to parse.</param>
/// <param name="baseUri">Base URI passed to the tree builder along with the input.</param>
/// <returns>The document returned by the tree builder.</returns>
public Document ParseInput(string html, string baseUri)
{
    if (CanTrackErrors)
    {
        errors = ParseErrorList.Tracking(maxErrors);
    }
    else
    {
        errors = ParseErrorList.NoTracking();
    }

    return treeBuilder.Parse(html, baseUri, errors);
}
/// <summary>
/// Runs the base initialisation, then places the document on the stack and
/// switches the document's output syntax to XML.
/// </summary>
/// <param name="input">Input text to parse; forwarded to the base initialisation.</param>
/// <param name="baseUri">Base URI; forwarded to the base initialisation.</param>
/// <param name="errors">Error list; forwarded to the base initialisation.</param>
internal override void InitialiseParse(string input, string baseUri, ParseErrorList errors)
{
    base.InitialiseParse(input, baseUri, errors);

    // Place the document onto the stack. This differs from HtmlTreeBuilder,
    // where the document is not on the stack.
    stack.AddLast(doc);

    doc.OutputSettings.Syntax = DocumentSyntax.Xml;
}
/// <summary>
/// Resets the builder's per-parse state: validates the arguments, then creates a
/// fresh document, character reader, tokeniser and element stack.
/// </summary>
/// <param name="input">Text to parse; must not be null.</param>
/// <param name="baseUri">Base URI for the new document; must not be null.</param>
/// <param name="errors">Error list shared with the tokeniser.</param>
internal virtual void InitialiseParse(string input, string baseUri, ParseErrorList errors)
{
    Validate.NotNull(input, "String input must not be null");
    Validate.NotNull(baseUri, "BaseURI must not be null");

    // Plain assignments first.
    this.baseUri = baseUri; // current base uri, for creating new elements
    this.errors = errors;   // null when not tracking errors (per the original field note)

    // Fresh per-parse objects; the tokeniser reads from the reader created just before it.
    doc = new Document(baseUri);                 // current doc we are building into
    reader = new CharacterReader(input);
    tokeniser = new Tokeniser(reader, errors);
    stack = new DescendableLinkedList<Element>(); // the stack of open elements

    // Note: currentToken (used only for error tracking) is not assigned here.
}
/// <summary>
/// Creates a tokeniser over the given reader, reporting into the given error list.
/// </summary>
/// <param name="reader">Source of the HTML input.</param>
/// <param name="errors">List that receives errors found while tokenising.</param>
internal Tokeniser(CharacterReader reader, ParseErrorList errors)
{
    this.errors = errors; // errors found while tokenising
    this.reader = reader; // html input

    // The remaining tokeniser fields described by the original notes are not
    // assigned in this constructor: the null-replacement character, the current
    // tokenisation state, the token about to be emitted on next read, the
    // character buffer (output as one token), the data buffer (looking for
    // </script>), the tag / doctype / comment being built up, and the last
    // start tag emitted (used to test for an appropriate end tag).
}
/// <summary>
/// Parses an HTML fragment, optionally in the context of an existing element.
/// </summary>
/// <remarks>
/// With a null <paramref name="context"/> the fragment is parsed directly and the
/// document's child nodes are returned. With a context element, the tokeniser's
/// initial state is chosen from the context's tag name, a synthetic <c>html</c>
/// root is appended to the document and pushed on the stack, and the root's
/// child nodes are returned.
/// </remarks>
/// <param name="inputFragment">Fragment text to parse.</param>
/// <param name="context">Element the fragment is parsed for; may be null.</param>
/// <param name="baseUri">Base URI for the parse and for the synthetic root element.</param>
/// <param name="errors">Error list for this parse.</param>
/// <returns>Child nodes of the synthetic root (context given) or of the document (no context).</returns>
internal IReadOnlyList<Node> ParseFragment(string inputFragment, Element context, string baseUri, ParseErrorList errors)
{
    state = HtmlTreeBuilderState.Initial;
    InitialiseParse(inputFragment, baseUri, errors);
    contextElement = context; // may be null even though fragment parsing is on
    fragmentParsing = true;

    // No context: nothing to set up, parse and return the document's children.
    if (context == null)
    {
        RunParser();
        return doc.ChildNodes;
    }

    // Quirks setup: copy the mode from the context's owner document, when it has one.
    if (context.OwnerDocument != null)
    {
        doc.QuirksMode = context.OwnerDocument.QuirksMode;
    }

    // Initialise the tokeniser state from the context element's tag.
    string contextTag = context.TagName;
    if (StringUtil.In(contextTag, "title", "textarea"))
    {
        tokeniser.Transition(TokeniserState.Rcdata);
    }
    else if (StringUtil.In(contextTag, "iframe", "noembed", "noframes", "style", "xmp"))
    {
        tokeniser.Transition(TokeniserState.Rawtext);
    }
    else if (contextTag.Equals("script"))
    {
        tokeniser.Transition(TokeniserState.ScriptData);
    }
    else
    {
        // Default. "noscript" (would be rawtext if scripting were enabled) and
        // "plaintext" also start in the data state, same as every other tag.
        tokeniser.Transition(TokeniserState.Data);
    }

    // Synthetic root that receives the parsed nodes.
    Element root = new Element(Tag.ValueOf("html"), baseUri);
    doc.AppendChild(root);
    stack.Push(root);
    ResetInsertionMode();

    // Set the form element to the nearest form on the context (walking up the
    // ancestor chain, context first) so that form controls are associated with
    // the form correctly.
    Elements contextChain = context.Parents;
    contextChain.Insert(0, context);
    foreach (Element ancestor in contextChain)
    {
        FormElement form = ancestor as FormElement;
        if (form != null)
        {
            formElement = form;
            break;
        }
    }

    RunParser();
    return root.ChildNodes;
}
// Builder state described by the original field notes: the current state, the
// original / marked state, the current head element, the current form element,
// the fragment parse context (may be null even when fragment parsing), the
// active (open) formatting elements, chars in table to be shifted out, whether
// it is ok to go into frameset, whether next inserts should be fostered, and
// whether a fragment of html is being parsed.

/// <summary>
/// Parses a complete HTML document after resetting the builder to its initial
/// state and clearing the "base URI set from document" flag.
/// </summary>
/// <param name="input">HTML to parse.</param>
/// <param name="baseUri">Base URI forwarded to the base implementation.</param>
/// <param name="errors">Error list forwarded to the base implementation.</param>
/// <returns>The document returned by the base implementation.</returns>
internal override Document Parse(string input, string baseUri, ParseErrorList errors)
{
    baseUriSetFromDoc = false;
    state = HtmlTreeBuilderState.Initial;

    return base.Parse(input, baseUri, errors);
}
/// <summary>
/// Parses a fragment by initialising the builder, running the parser, and
/// returning the child nodes of the resulting document.
/// </summary>
/// <param name="inputFragment">Fragment text to parse.</param>
/// <param name="baseUri">Base URI for the parse.</param>
/// <param name="errors">Error list for this parse.</param>
/// <returns>The document's child nodes after parsing.</returns>
internal IReadOnlyList<Node> ParseFragment(string inputFragment, string baseUri, ParseErrorList errors)
{
    InitialiseParse(inputFragment, baseUri, errors);
    RunParser();

    IReadOnlyList<Node> parsedNodes = doc.ChildNodes;
    return parsedNodes;
}
/// <summary>
/// Unescapes the HTML entities in a string.
/// </summary>
/// <remarks>
/// A throwaway tokeniser with error tracking disabled is built over the input
/// and its <c>UnescapeEntities</c> method does the work.
/// </remarks>
/// <param name="string">HTML-escaped string.</param>
/// <param name="inAttribute">
/// Whether the string is to be handled in strict mode (as attributes are);
/// passed through to the tokeniser unchanged.
/// </param>
/// <returns>The unescaped string.</returns>
public static string UnescapeEntities(string @string, bool inAttribute)
{
    CharacterReader reader = new CharacterReader(@string);
    Tokeniser tokeniser = new Tokeniser(reader, ParseErrorList.NoTracking());

    return tokeniser.UnescapeEntities(inAttribute);
}
/// <summary>
/// Parses an XML fragment into a list of nodes, with error tracking disabled.
/// </summary>
/// <param name="fragmentXml">The fragment of XML to parse.</param>
/// <param name="baseUri">
/// Base URI of the document (i.e. original fetch location), for resolving relative URLs.
/// </param>
/// <returns>The list of nodes parsed from the input XML.</returns>
public static IReadOnlyList<Node> ParseXmlFragment(string fragmentXml, string baseUri)
{
    XmlTreeBuilder xmlBuilder = new XmlTreeBuilder();
    ParseErrorList untracked = ParseErrorList.NoTracking();

    return xmlBuilder.ParseFragment(fragmentXml, baseUri, untracked);
}
/// <summary>
/// Parses an HTML fragment into a list of nodes, with error tracking disabled.
/// </summary>
/// <remarks>
/// The context element, if supplied, supplies parsing context.
/// </remarks>
/// <param name="fragmentHtml">The fragment of HTML to parse.</param>
/// <param name="context">
/// (Optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
/// This provides stack context (for implicit element creation).
/// </param>
/// <param name="baseUri">
/// Base URI of the document (i.e. original fetch location), for resolving relative URLs.
/// </param>
/// <returns>
/// The list of nodes parsed from the input HTML. Note that the context element,
/// if supplied, is not modified.
/// </returns>
public static IReadOnlyList<Node> ParseFragment(string fragmentHtml, Element context, string baseUri)
{
    HtmlTreeBuilder htmlBuilder = new HtmlTreeBuilder();
    ParseErrorList untracked = ParseErrorList.NoTracking();

    return htmlBuilder.ParseFragment(fragmentHtml, context, baseUri, untracked);
}
// utility methods

/// <summary>
/// Parses HTML into a <see cref="Document"/> using a new HTML tree builder,
/// with error tracking disabled.
/// </summary>
/// <param name="html">HTML to parse.</param>
/// <param name="baseUri">
/// Base URI of the document (i.e. original fetch location), for resolving relative URLs.
/// </param>
/// <returns>The parsed document.</returns>
public static Document Parse(string html, string baseUri)
{
    TreeBuilder builder = new HtmlTreeBuilder();
    ParseErrorList untracked = ParseErrorList.NoTracking();

    return builder.Parse(html, baseUri, untracked);
}
/// <summary>
/// Parses the input into a document: initialises the builder, runs the parser,
/// and returns the document that was built.
/// </summary>
/// <param name="input">Text to parse.</param>
/// <param name="baseUri">Base URI for the parse.</param>
/// <param name="errors">Error list for this parse.</param>
/// <returns>The builder's document after the parser has run.</returns>
internal virtual Document Parse(string input, string baseUri, ParseErrorList errors)
{
    InitialiseParse(input, baseUri, errors);
    RunParser();

    return doc;
}
/// <summary>
/// Parses the input into a document with error tracking disabled.
/// </summary>
/// <param name="input">Text to parse.</param>
/// <param name="baseUri">Base URI for the parse.</param>
/// <returns>The document returned by the three-argument overload.</returns>
internal Document Parse(string input, string baseUri)
{
    ParseErrorList untracked = ParseErrorList.NoTracking();
    return Parse(input, baseUri, untracked);
}