/// <summary>Parses an HTML fragment and returns the nodes it produced.</summary>
/// <remarks>
/// A new <c>HtmlTreeBuilder</c> is created for every call and parse errors are not tracked.
/// The context element, if supplied, supplies parsing context.
/// </remarks>
/// <param name="fragmentHtml">the fragment of HTML to parse</param>
/// <param name="context">
/// (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
/// provides stack context (for implicit element creation).
/// </param>
/// <param name="baseUri">base URI of document (i.e. original fetch location), for resolving relative URLs.</param>
/// <returns>
/// list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
/// </returns>
public static IList<iText.StyledXmlParser.Jsoup.Nodes.Node> ParseFragment(
    String fragmentHtml,
    iText.StyledXmlParser.Jsoup.Nodes.Element context,
    String baseUri) {
    HtmlTreeBuilder htmlBuilder = new HtmlTreeBuilder();
    ParseErrorList untracked = ParseErrorList.NoTracking();
    return htmlBuilder.ParseFragment(fragmentHtml, context, baseUri, untracked);
}
/// <summary>Parses an XML fragment and returns the nodes it produced.</summary>
/// <remarks>A new <c>XmlTreeBuilder</c> is created for every call and parse errors are not tracked.</remarks>
/// <param name="fragmentXml">the fragment of XML to parse</param>
/// <param name="baseUri">base URI of document (i.e. original fetch location), for resolving relative URLs.</param>
/// <returns>list of nodes parsed from the input XML.</returns>
public static IList<iText.StyledXmlParser.Jsoup.Nodes.Node> ParseXmlFragment(String fragmentXml, String baseUri) {
    XmlTreeBuilder xmlBuilder = new XmlTreeBuilder();
    ParseErrorList untracked = ParseErrorList.NoTracking();
    return xmlBuilder.ParseFragment(fragmentXml, baseUri, untracked);
}
/// <summary>Runs the base initialisation, then applies the XML-specific setup.</summary>
/// <param name="input">the text to parse</param>
/// <param name="baseUri">base URI handed to the base initialisation</param>
/// <param name="errors">the error list handed to the base initialisation</param>
internal override void InitialiseParse(String input, String baseUri, ParseErrorList errors) {
    base.InitialiseParse(input, baseUri, errors);

    // Place the document onto the stack. This differs from HtmlTreeBuilder, where it is not on the stack.
    stack.Add(doc);

    // Switch the document's output syntax to XML.
    var outputSettings = doc.OutputSettings();
    outputSettings.Syntax(iText.StyledXmlParser.Jsoup.Nodes.Syntax.xml);
}
// Field notes carried over from the declarations that precede this method:
//   - current doc we are building into
//   - the stack of open elements
//   - current base uri, for creating new elements
//   - currentToken is used only for error tracking.
//   - null when not tracking errors
//   - start tag to process

/// <summary>Resets this builder's state so that a new parse of <paramref name="input"/> can start.</summary>
/// <remarks>
/// Both <paramref name="input"/> and <paramref name="baseUri"/> are checked with <c>Validate.NotNull</c>
/// before any state is replaced.
/// </remarks>
/// <param name="input">the text to parse; must not be null</param>
/// <param name="baseUri">base URI for the new document; must not be null</param>
/// <param name="errors">the error list stored on this builder and passed to the new tokeniser</param>
internal virtual void InitialiseParse(String input, String baseUri, ParseErrorList errors) {
    Validate.NotNull(input, "String input must not be null");
    Validate.NotNull(baseUri, "BaseURI must not be null");

    // Fresh document, reader and tokeniser for this parse.
    this.doc = new Document(baseUri);
    this.reader = new CharacterReader(input);
    this.errors = errors;
    this.tokeniser = new Tokeniser(this.reader, errors);

    // Start with an empty element stack (initial capacity 32).
    this.stack = new List<iText.StyledXmlParser.Jsoup.Nodes.Element>(32);
    this.baseUri = baseUri;
}
/// <summary>Creates a tokeniser that keeps the supplied reader and error list.</summary>
/// <param name="reader">the character reader stored on this tokeniser</param>
/// <param name="errors">the error list stored on this tokeniser</param>
internal Tokeniser(CharacterReader reader, ParseErrorList errors) {
    // Field notes carried over from the class's declarations:
    //   - html input
    //   - errors found while tokenising
    //   - current tokenisation state
    //   - the token we are about to emit on next read
    //   - characters pending an emit. Will fall to charsBuilder if more than one
    //   - buffers characters to output as one token, if more than one emit per read
    //   - buffers data looking for </script>
    //   - tag we are building up
    //   - doctype building up
    //   - comment building up
    //   - the last start tag emitted, to test appropriate end tag
    this.errors = errors;
    this.reader = reader;
}
/// <summary>Initialises this builder, runs the parser and returns the document that was built.</summary>
/// <param name="input">the text to parse</param>
/// <param name="baseUri">base URI passed to the initialisation step</param>
/// <param name="errors">the error list passed to the initialisation step</param>
/// <returns>the builder's document after the parser has run</returns>
internal virtual Document Parse(String input, String baseUri, ParseErrorList errors) {
    this.InitialiseParse(input, baseUri, errors);
    this.RunParser();
    return this.doc;
}
/// <summary>Parses the input without tracking parse errors.</summary>
/// <param name="input">the text to parse</param>
/// <param name="baseUri">base URI forwarded to the three-argument overload</param>
/// <returns>the document returned by the three-argument overload</returns>
internal virtual Document Parse(String input, String baseUri) {
    ParseErrorList untracked = ParseErrorList.NoTracking();
    return Parse(input, baseUri, untracked);
}
/// <summary>Parses the given input with this parser's tree builder.</summary>
/// <remarks>
/// The <c>errors</c> field is replaced on every call: a tracking list limited by <c>maxErrors</c> when
/// <c>IsTrackErrors()</c> is true, otherwise a non-tracking list.
/// </remarks>
/// <param name="html">the text to parse</param>
/// <param name="baseUri">base URI forwarded to the tree builder</param>
/// <returns>the document returned by the tree builder</returns>
public virtual Document ParseInput(String html, String baseUri) {
    if (IsTrackErrors()) {
        errors = ParseErrorList.Tracking(maxErrors);
    }
    else {
        errors = ParseErrorList.NoTracking();
    }
    return treeBuilder.Parse(html, baseUri, errors);
}
/// <summary>Utility method to unescape HTML entities from a string.</summary>
/// <remarks>
/// Builds a throw-away <c>Tokeniser</c> over the string (errors are not tracked) and delegates to its
/// <c>UnescapeEntities</c> method.
/// </remarks>
/// <param name="string">HTML escaped string</param>
/// <param name="inAttribute">if the string is to be escaped in strict mode (as attributes are)</param>
/// <returns>an unescaped string</returns>
public static String UnescapeEntities(String @string, bool inAttribute) {
    CharacterReader source = new CharacterReader(@string);
    Tokeniser unescaper = new Tokeniser(source, ParseErrorList.NoTracking());
    return unescaper.UnescapeEntities(inAttribute);
}
/// <summary>Parses XML into a Document.</summary>
/// <remarks>A new <c>XmlTreeBuilder</c> is created for every call and parse errors are not tracked.</remarks>
/// <param name="xml">XML to parse</param>
/// <param name="baseUri">base URI of document (i.e. original fetch location), for resolving relative URLs.</param>
/// <returns>parsed Document</returns>
public static Document ParseXml(String xml, String baseUri) {
    // Keep the static type as TreeBuilder so the same Parse overload is selected as before.
    TreeBuilder xmlBuilder = new XmlTreeBuilder();
    ParseErrorList untracked = ParseErrorList.NoTracking();
    return xmlBuilder.Parse(xml, baseUri, untracked);
}
/// <summary>Initialises this builder, runs the parser and returns the document's child nodes.</summary>
/// <param name="inputFragment">the fragment text to parse</param>
/// <param name="baseUri">base URI passed to the initialisation step</param>
/// <param name="errors">the error list passed to the initialisation step</param>
/// <returns>the child nodes of the builder's document after the parser has run</returns>
internal virtual IList<iText.StyledXmlParser.Jsoup.Nodes.Node> ParseFragment(
    String inputFragment,
    String baseUri,
    ParseErrorList errors) {
    this.InitialiseParse(inputFragment, baseUri, errors);
    this.RunParser();
    return this.doc.ChildNodes();
}
/// <summary>Creates a tokeniser that keeps the supplied reader and error list.</summary>
/// <param name="reader">the character reader stored on this tokeniser</param>
/// <param name="errors">the error list stored on this tokeniser</param>
internal Tokeniser(CharacterReader reader, ParseErrorList errors) {
    this.errors = errors;
    this.reader = reader;
}
/// <summary>Parses an HTML fragment, optionally in the context of an existing element.</summary>
/// <remarks>
/// Without a context element the fragment is parsed straight into the document and the document's
/// child nodes are returned. With a context element, the tokeniser's start state is chosen from the
/// context's tag name, a synthetic <c>html</c> root is appended to the document and pushed on the stack,
/// the insertion mode is reset, and the nearest <c>FormElement</c> on the context's ancestor chain
/// (the context itself included) becomes the current form element. The root's child nodes are returned.
/// </remarks>
/// <param name="inputFragment">the fragment text to parse</param>
/// <param name="context">the element the fragment is parsed for; may be null</param>
/// <param name="baseUri">base URI for the parse and for the synthetic root element</param>
/// <param name="errors">the error list passed to the initialisation step</param>
/// <returns>the child nodes of the synthetic root (context given) or of the document (no context)</returns>
internal virtual IList<iText.StyledXmlParser.Jsoup.Nodes.Node> ParseFragment(
    String inputFragment,
    iText.StyledXmlParser.Jsoup.Nodes.Element context,
    String baseUri,
    ParseErrorList errors) {
    state = HtmlTreeBuilderState.Initial;
    InitialiseParse(inputFragment, baseUri, errors);
    contextElement = context;
    fragmentParsing = true;

    // No context: nothing to set up, parse directly into the document.
    if (context == null) {
        RunParser();
        return doc.ChildNodes();
    }

    // Quirks setup: inherit the quirks mode of the context's owner document, when there is one.
    if (context.OwnerDocument() != null) {
        doc.QuirksMode(context.OwnerDocument().QuirksMode());
    }

    // Initialise the tokeniser state from the context tag.
    String contextTag = context.TagName();
    if (iText.StyledXmlParser.Jsoup.Helper.StringUtil.In(contextTag, "title", "textarea")) {
        tokeniser.Transition(TokeniserState.Rcdata);
    }
    else if (iText.StyledXmlParser.Jsoup.Helper.StringUtil.In(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) {
        tokeniser.Transition(TokeniserState.Rawtext);
    }
    else if (contextTag.Equals("script")) {
        tokeniser.Transition(TokeniserState.ScriptData);
    }
    else {
        // "noscript" (rawtext only if scripting were enabled), "plaintext" and every other tag
        // all start in the Data state, exactly as the previous explicit branches did.
        // NOTE(review): the HTML fragment parsing algorithm switches to the PLAINTEXT state for a
        // "plaintext" context - confirm whether staying in Data here is intentional.
        tokeniser.Transition(TokeniserState.Data);
    }

    // Synthetic <html> root that receives the parsed nodes.
    iText.StyledXmlParser.Jsoup.Nodes.Element root = new iText.StyledXmlParser.Jsoup.Nodes.Element(
        iText.StyledXmlParser.Jsoup.Parser.Tag.ValueOf("html"), baseUri);
    doc.AppendChild(root);
    stack.Add(root);
    ResetInsertionMode();

    // Set the form element to the nearest form on the context's ancestor chain (context first),
    // so that form controls are associated with the form correctly.
    Elements contextChain = context.Parents();
    contextChain.Add(0, context);
    foreach (iText.StyledXmlParser.Jsoup.Nodes.Element candidate in contextChain) {
        FormElement nearestForm = candidate as FormElement;
        if (nearestForm != null) {
            formElement = nearestForm;
            break;
        }
    }

    RunParser();
    return root.ChildNodes();
}
// Field notes carried over from the declarations that precede this method:
//   - tag searches
//   - the current state
//   - original / marked state
//   - the current head element
//   - the current form element
//   - fragment parse context -- could be null even if fragment parsing
//   - active (open) formatting elements
//   - chars in table to be shifted out
//   - reused empty end tag
//   - if ok to go into frameset
//   - if next inserts should be fostered
//   - if parsing a fragment of html

/// <summary>Resets the HTML-specific parse state, then delegates to the base implementation.</summary>
/// <param name="input">the text to parse</param>
/// <param name="baseUri">base URI forwarded to the base implementation</param>
/// <param name="errors">the error list forwarded to the base implementation</param>
/// <returns>the document returned by the base implementation</returns>
internal override Document Parse(String input, String baseUri, ParseErrorList errors) {
    // Start from the initial tree-builder state with no base URI taken from the document yet.
    this.state = HtmlTreeBuilderState.Initial;
    this.baseUriSetFromDoc = false;

    Document parsed = base.Parse(input, baseUri, errors);
    return parsed;
}