/// <summary>
/// Parses the HTML supplied by <paramref name="reader"/> as a fragment, building the
/// resulting nodes through an <c>XmlDomBuilder</c> rooted at <paramref name="node"/>.
/// </summary>
/// <param name="node">Node handed to the DOM builder as its root.</param>
/// <param name="reader">Tokenizing reader that supplies the HTML.</param>
/// <param name="options">Parser options, passed through to the stream parser unchanged.</param>
public override void ParseFragment(XmlNode node, HtmlTextReader reader, ParserOptions options)
{
    var builder = new XmlDomBuilder(node);
    var streamParser = new HtmlStreamParser<XmlNode>();

    // Fragments start directly in the 'InBody' insertion mode.
    streamParser.Parse(builder, reader, options, mode: InsersionMode.InBody);
}
/// <summary>
/// Parses the HTML supplied by <paramref name="reader"/> as a whole document, building the
/// resulting nodes through an <c>XmlDomBuilder</c> rooted at <paramref name="doc"/>.
/// </summary>
/// <param name="doc">Document handed to the DOM builder as its root.</param>
/// <param name="reader">Tokenizing reader that supplies the HTML.</param>
/// <param name="options">Parser options, passed through to the stream parser unchanged.</param>
public override void Parse(XmlDocument doc, HtmlTextReader reader, ParserOptions options)
{
    var builder = new XmlDomBuilder(doc);
    var streamParser = new HtmlStreamParser<XmlNode>();

    // No insertion mode is given, so the stream parser uses its default starting mode.
    streamParser.Parse(builder, reader, options);
}
/// <summary>
/// Parses the HTML read from <paramref name="reader"/> as a fragment, building the
/// resulting nodes through an <c>XDomBuilder</c> rooted at <paramref name="node"/>.
/// </summary>
/// <param name="node">Node handed to the DOM builder as its root.</param>
/// <param name="reader">Text source, wrapped in an <c>HtmlTextReader</c> for tokenizing.</param>
/// <param name="options">Parser options, passed through to the stream parser unchanged.</param>
public static void LoadHtmlFragment(XNode node, TextReader reader, ParserOptions options)
{
    var builder = new XDomBuilder(node);
    var streamParser = new HtmlStreamParser<XNode>();

    // Fragments start directly in the 'InBody' insertion mode.
    streamParser.Parse(builder, new HtmlTextReader(reader), options, mode: InsersionMode.InBody);
}
/// <summary>
/// Parses the HTML read from <paramref name="reader"/> as a whole document, building the
/// resulting nodes through an <c>XDomBuilder</c> rooted at <paramref name="doc"/>.
/// </summary>
/// <param name="doc">Document handed to the DOM builder as its root.</param>
/// <param name="reader">Text source, wrapped in an <c>HtmlTextReader</c> for tokenizing.</param>
/// <param name="options">Parser options, passed through to the stream parser unchanged.</param>
internal static void LoadHtml(XDocument doc, TextReader reader, ParserOptions options)
{
    var builder = new XDomBuilder(doc);
    var streamParser = new HtmlStreamParser<XNode>();

    // No insertion mode is given, so the stream parser uses its default starting mode.
    streamParser.Parse(builder, new HtmlTextReader(reader), options);
}
/// <summary>
/// Fetches the page at <paramref name="url"/> asynchronously and parses it into
/// <paramref name="doc"/>.
/// </summary>
/// <param name="doc">Document handed to the DOM builder as its root.</param>
/// <param name="url">Address of the page; also used as the base URL when none is configured.</param>
/// <param name="options">
/// Loader options; a new <c>LoaderOptions</c> is used when this is null. Note that the
/// <c>ParserOptions.BaseUrl</c> of the options object in use is filled in with
/// <paramref name="url"/> when it is null or empty.
/// </param>
/// <returns>A task that completes once the page has been parsed.</returns>
internal static async Task LoadWebPageAsync(XmlDocument doc, string url, LoaderOptions options)
{
    LoaderOptions effectiveOptions = options ?? new LoaderOptions();

    // Default the base URL to the page address unless the caller supplied one.
    if (string.IsNullOrEmpty(effectiveOptions.ParserOptions.BaseUrl))
    {
        effectiveOptions.ParserOptions.BaseUrl = url;
    }

    var builder = new XmlDomBuilder(doc);
    var streamParser = new HtmlStreamParser<XmlNode>();

    // Get the HTML asynchronously and parse it into the XML document;
    // the reader is disposed as soon as parsing finishes.
    using (HtmlTextReader pageReader = await HtmlClient.GetHtmlTextReaderAsync(url, effectiveOptions))
    {
        streamParser.Parse(builder, pageReader, effectiveOptions.ParserOptions);
    }
}
/// <summary>
/// Parses HTML supplied by <paramref name="htmlTextReader"/> as a fragment into
/// <paramref name="node"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the contract is inferred from the signature and from the override that
/// parses with an initial 'InBody' insertion mode - confirm against all implementations.
/// </remarks>
/// <param name="node">Target node for the parsed content.</param>
/// <param name="htmlTextReader">Reader that supplies the HTML.</param>
/// <param name="options">Parser options; presumably may be null - TODO confirm.</param>
public abstract void ParseFragment(XmlNode node, HtmlTextReader htmlTextReader, ParserOptions options);
/// <summary>
/// Parses HTML supplied by <paramref name="htmlTextReader"/> as a whole document into
/// <paramref name="doc"/>.
/// </summary>
/// <remarks>
/// NOTE(review): the contract is inferred from the signature and from the override that
/// parses with the stream parser's default insertion mode - confirm against all implementations.
/// </remarks>
/// <param name="doc">Target document for the parsed content.</param>
/// <param name="htmlTextReader">Reader that supplies the HTML.</param>
/// <param name="options">Parser options; presumably may be null - TODO confirm.</param>
public abstract void Parse(XmlDocument doc, HtmlTextReader htmlTextReader, ParserOptions options);
/// <summary>
/// Reads every pending attribute from <paramref name="reader"/> and adds it to
/// <paramref name="tag"/>.
/// </summary>
/// <remarks>
/// Attribute names are XML-encoded and values are HTML-decoded before being added.
/// When <paramref name="originatingUrl"/> is supplied, values of known URL attributes that
/// do not contain "://" are resolved against it. A base URL or attribute value that cannot
/// be parsed as a URI no longer aborts the parse: the decoded value is added as-is.
/// </remarks>
/// <param name="dom">Builder used to add the attributes.</param>
/// <param name="tag">Node that receives the attributes.</param>
/// <param name="tagName">Name of <paramref name="tag"/>, used to look up its URL attributes.</param>
/// <param name="reader">Reader positioned on the tag's attributes.</param>
/// <param name="originatingUrl">Base URL for fully-qualifying URL attributes, or null/empty to skip that step.</param>
private static void AddAttributes(DomBuilder<DomNode> dom, DomNode tag, string tagName, HtmlTextReader reader, string originatingUrl = null)
{
    // The base Uri is built at most once per call, on the first URL attribute that needs it.
    Uri baseUri = null;
    bool baseUriResolved = false;

    while (reader.ParseState == ParseState.AttributeName)
    {
        // Get the attribute name and value; a name without a value yields an empty string
        string attrName = reader.ReadAttributeName();
        string attrValue = reader.ParseState == ParseState.AttributeValue ? reader.ReadAttributeValue() : string.Empty;

        // Make sure we have a value for the attribute name
        if (string.IsNullOrEmpty(attrName))
        {
            continue;
        }

        // Make sure the attribute name is a valid XML name...
        attrName = XmlConvert.EncodeLocalName(attrName);

        // Values can have html encodings - we want them decoded
        attrValue = HtmlDecode(attrValue);

        // Fully-qualify URL attributes if a base URL was supplied
        if (!string.IsNullOrEmpty(originatingUrl))
        {
            // See if the given attribute is a url attribute inside the given tag
            Dictionary<string, bool> relevantTags;
            if (_urlAttributes.TryGetValue(attrName, out relevantTags)
                && relevantTags != null
                && relevantTags.ContainsKey(tagName)
                && !attrValue.Contains("://"))
            {
                if (!baseUriResolved)
                {
                    baseUriResolved = true;

                    // Leaves baseUri null when originatingUrl is not a valid absolute URI
                    Uri.TryCreate(originatingUrl, UriKind.Absolute, out baseUri);
                }

                if (baseUri != null)
                {
                    // The constructor is kept (rather than Uri.TryCreate) so that values which
                    // do resolve are combined exactly as before; only the failure is tolerated.
                    try
                    {
                        attrValue = new Uri(baseUri, attrValue).ToString();
                    }
                    catch (UriFormatException)
                    {
                        // Malformed URL in the markup - keep the decoded value unchanged
                    }
                }
            }
        }

        // Add the attribute, if it does not already exist
        dom.AddAttribute(tag, attrName, attrValue);
    }
}
/// <summary>
/// Reads tokens from <paramref name="reader"/> until its parse state is <c>Done</c> and
/// builds the corresponding nodes through <paramref name="dom"/>.
/// </summary>
/// <remarks>
/// The root node is cleared first. Depending on the insertion mode, html/head/body elements
/// are created as needed before a token is added. If a meta tag yields an encoding that
/// differs from the reader's current one (while the reader's confidence is tentative and it
/// can rewind), the reader is rewound with the new encoding and parsing starts over.
/// When <c>IncludeMetaData</c> is set and a head element was created, meta elements describing
/// the parse (duration, originating URL, encodings, HTTP headers) are appended to the head.
/// </remarks>
/// <param name="dom">Builder that receives the parsed nodes.</param>
/// <param name="reader">Tokenizing reader that supplies the HTML.</param>
/// <param name="options">Parser options; a new <c>ParserOptions</c> is used when null.</param>
/// <param name="mode">
/// Initial insertion mode: 'InBody' when parsing a fragment, otherwise 'BeforeHtml'.
/// </param>
public void Parse(DomBuilder<DomNode> dom, HtmlTextReader reader, ParserOptions options, InsersionMode mode = InsersionMode.BeforeHtml)
{
    ParserOptions parserOptions = options == null ? new ParserOptions() : options;

    // UTC is used so the measured duration is not skewed by DST or time-zone changes.
    // Taken before the restart label, so the duration includes any restarts.
    DateTime startTime = DateTime.UtcNow;

Beginning:
    // Make sure root node is clear
    dom.RemoveAll(dom.RootNode);

    // DOM Node pointers
    DomNode currNode = dom.RootNode;
    DomNode htmlNode = default(DomNode);
    DomNode headNode = default(DomNode);
    DomNode bodyNode = default(DomNode);

    // If we are parsing a fragment, the initial insertion mode will
    // be: 'InBody', otherwise it will be: 'BeforeHtml'
    InsersionMode insertionMode = mode;

    while (reader.ParseState != ParseState.Done)
    {
        // Capture the state before reading: it describes the token about to be read
        ParseState state = reader.ParseState;

        // Read the next token. Ignore empty tokens
        string tok = reader.ReadNext();
        if (IsNullOrWhiteSpace(tok))
        {
            continue;
        }

        // Find the insertion point for the token based on our mode
        switch (insertionMode)
        {
            case InsersionMode.BeforeHtml:
                // Comment is valid at the top
                if (state == ParseState.Comment)
                {
                    break;
                }

                // <html> is valid at the top - switch to BeforeHead
                if (state == ParseState.OpenTag && tok == "html")
                {
                    htmlNode = dom.AddElement(dom.RootNode, "html");
                    currNode = htmlNode;
                    insertionMode = InsersionMode.BeforeHead;
                    break;
                }

                // Got a tag that should be in the head. Create html/head node structure...
                if (state == ParseState.OpenTag && IsHeadTag(tok))
                {
                    htmlNode = dom.AddElement(dom.RootNode, "html");
                    headNode = dom.AddElement(htmlNode, "head");
                    currNode = headNode;
                    insertionMode = InsersionMode.InHead;
                    break;
                }

                // Got anything else - put it in the body
                htmlNode = dom.AddElement(dom.RootNode, "html");
                headNode = dom.AddElement(htmlNode, "head");
                bodyNode = dom.AddElement(htmlNode, "body");
                currNode = bodyNode;
                insertionMode = InsersionMode.InBody;
                break;

            case InsersionMode.BeforeHead:
                // Comment is valid
                if (state == ParseState.Comment)
                {
                    break;
                }

                // <head> is valid here - switch to InHead
                if (state == ParseState.OpenTag && tok == "head")
                {
                    headNode = dom.AddElement(htmlNode, "head");
                    currNode = headNode;
                    insertionMode = InsersionMode.InHead;
                    break;
                }

                // Got a tag that 'should' be in the head. Create head...
                if (state == ParseState.OpenTag && IsHeadTag(tok))
                {
                    headNode = dom.AddElement(htmlNode, "head");
                    currNode = headNode;
                    insertionMode = InsersionMode.InHead;
                    break;
                }

                // Got anything else, including <body> - put it in the body
                headNode = dom.AddElement(htmlNode, "head");
                bodyNode = dom.AddElement(htmlNode, "body");
                currNode = bodyNode;
                insertionMode = InsersionMode.InBody;
                break;

            case InsersionMode.InHead:
                // Comment is valid here
                if (state == ParseState.Comment)
                {
                    break;
                }

                // Head tags are valid here
                if (state == ParseState.OpenTag && IsHeadTag(tok))
                {
                    break;
                }

                // Anything else must go into the body
                bodyNode = dom.AddElement(htmlNode, "body");
                currNode = bodyNode;
                insertionMode = InsersionMode.InBody;
                break;
        }

        // Add the token to the DOM
        switch (state)
        {
            case ParseState.Comment:
                // Xml Comments cannot have '--', and they cannot end in '-'
                string commentText = tok.Replace("-", "#");
                dom.AddComment(currNode, commentText);
                break;

            case ParseState.Text:
                // Decode the text, to convert all encoded values (eg: '&gt;' to '>')
                string textContent = HtmlDecode(tok);
                dom.AddText(currNode, textContent);
                break;

            case ParseState.OpenTag:
                // Look up the properties of the tag
                TagProperties properties = TagProperties.None;
                _tagProperties.TryGetValue(tok, out properties);

                // For top-level <html> & <body> tags, all we do is add attributes to
                // the already created nodes. <head> tags get ignored
                if ((properties & TagProperties.TopLevel) > 0)
                {
                    if (tok == "html" && htmlNode != null)
                    {
                        AddAttributes(dom, htmlNode, tok, reader);
                        break;
                    }
                    if (tok == "body" && bodyNode != null)
                    {
                        AddAttributes(dom, bodyNode, tok, reader);
                        break;
                    }
                    break;
                }

                // Create the new tag, add attributes, and append to DOM
                string tagName = XmlConvert.EncodeLocalName(tok);
                DomNode tag = dom.AddElement(currNode, tagName);
                AddAttributes(dom, tag, tagName, reader, (parserOptions.FullyQualifyUrls ? parserOptions.BaseUrl : null));

                // If this is a meta tag, and our underlying stream
                // lets us rewind, and we are tentative about the encoding, then check for a new charset
                if (((properties & TagProperties.IsHeadTag) > 0) && (reader.CurrentEncodingConfidence == EncodingConfidence.Tentative) && reader.CanRewind && (tok == "meta"))
                {
                    Encoding encoding = CheckForNewEncoding(tag, dom);

                    // If we found a new encoding, we will need to start over!
                    string newEncodingName = encoding != null ? encoding.WebName : String.Empty;
                    string currentEncodingName = reader.CurrentEncoding != null ? reader.CurrentEncoding.WebName : String.Empty;
                    if (encoding != null && (newEncodingName != currentEncodingName))
                    {
                        // Start over from the beginning!
                        reader.Rewind(encoding); // Rewind underlying stream, set new encoding
                        goto Beginning;
                    }
                }

                // If this is a self closing tag, we are done. Don't move pointer.
                if ((properties & TagProperties.SelfClosing) > 0)
                {
                    break;
                }

                // If this is an RCData tag, get the text value for it and add it.
                if ((properties & TagProperties.RCData) > 0)
                {
                    tok = reader.ReadRCData(tok);
                    dom.AddText(tag, tok);
                    break;
                }

                // Set current tag pointer to the newly added tag
                currNode = tag;
                break;

            case ParseState.CloseTag:
                // Look up our ancestor chain for the corresponding open tag;
                // stay where we are when no matching ancestor is found
                DomNode parent = dom.FindAncestor(currNode, tok);
                currNode = parent != null ? parent : currNode;

                // If we moved up beyond the body (for example if there were tags or text
                // after the </html> tag) - set the pointer to the <body> tag.
                if (Equals(currNode, htmlNode) || Equals(currNode, dom.RootNode))
                {
                    if (bodyNode != null) // possible we don't have a body node
                    {
                        currNode = bodyNode;
                    }
                }
                break;
        }
    }

    DateTime endTime = DateTime.UtcNow;

    // See if the user wants to include metadata in <head>
    if (parserOptions.IncludeMetaData && headNode != null)
    {
        // Parse Time - formatted with the invariant culture so the attribute value
        // does not change with the machine's regional settings
        DomNode totalMillisecondsNode = dom.AddElement(headNode, "meta");
        dom.AddAttribute(totalMillisecondsNode, "source", "HtmlStreamParser");
        dom.AddAttribute(totalMillisecondsNode, "totalMilliseconds", endTime.Subtract(startTime).TotalMilliseconds.ToString(System.Globalization.CultureInfo.InvariantCulture));

        // Originating URL
        DomNode originatingUrlNode = dom.AddElement(headNode, "meta");
        dom.AddAttribute(originatingUrlNode, "source", "HtmlStreamParser");
        dom.AddAttribute(originatingUrlNode, "originatingUrl", reader.OriginatingUrl);

        // Detected Encodings. The encodings are null-checked, matching the check done on
        // reader.CurrentEncoding in the token loop; a missing encoding is written as an empty name.
        string finalEncodingName = reader.CurrentEncoding != null ? reader.CurrentEncoding.WebName : String.Empty;
        string initialEncodingName = reader.InitialEncoding != null ? reader.InitialEncoding.WebName : String.Empty;

        DomNode encodingNode = dom.AddElement(headNode, "meta");
        dom.AddAttribute(encodingNode, "source", "HtmlStreamParser");
        dom.AddAttribute(encodingNode, "encoding", finalEncodingName);
        dom.AddAttribute(encodingNode, "confidence", reader.CurrentEncodingConfidence.ToString());

        // NOTE(review): "initalEncoding" is misspelled, but it is emitted output that
        // consumers may rely on, so it is kept as-is.
        DomNode encodingNode2 = dom.AddElement(headNode, "meta");
        dom.AddAttribute(encodingNode2, "source", "HtmlStreamParser");
        dom.AddAttribute(encodingNode2, "initalEncoding", initialEncodingName);
        dom.AddAttribute(encodingNode2, "confidence", reader.InitialEncodingConfidence.ToString());

        // Add headers metadata
        foreach (var header in reader.OriginatingHttpHeaders)
        {
            DomNode headerNode = dom.AddElement(headNode, "meta");
            dom.AddAttribute(headerNode, "source", "HttpHeaders");
            dom.AddAttribute(headerNode, "name", header.Key);
            dom.AddAttribute(headerNode, "content", header.Value);
        }
    }
}