/// <summary>
/// Constructs a DOM (<see cref="System.Xml.Linq.XDocument"/>) from HTML markup.
/// </summary>
/// <param name="htmlContent">HTML markup from which the DOM is to be constructed.</param>
/// <returns>
/// A <see cref="System.Xml.Linq.XDocument"/> built from the provided markup, or an empty
/// document when <paramref name="htmlContent"/> is empty or consists only of whitespace.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">
/// Loading failed with a message that does not contain "EndOfFile", or loading failed again
/// after the script tags had been stripped.
/// </exception>
public XDocument BuildDocument(string htmlContent)
{
    if (htmlContent == null)
    {
        throw new ArgumentNullException("htmlContent");
    }

    if (htmlContent.Trim().Length == 0)
    {
        return new XDocument();
    }

    // "Trim end": cut everything that follows the last closing html tag
    // (codinghorror.com puts some scripts after the </html> - sic!).
    // The search is ordinal and case-insensitive so that </HTML> is recognized as well.
    const string htmlEnd = "</html";
    int indexOfHtmlEnd = htmlContent.LastIndexOf(htmlEnd, StringComparison.OrdinalIgnoreCase);

    if (indexOfHtmlEnd != -1)
    {
        // Keep the closing tag itself, up to and including its '>'.
        int indexOfHtmlEndBracket = htmlContent.IndexOf('>', indexOfHtmlEnd);

        if (indexOfHtmlEndBracket != -1)
        {
            htmlContent = htmlContent.Substring(0, indexOfHtmlEndBracket + 1);
        }
    }

    XDocument document;

    try
    {
        document = LoadDocument(htmlContent);
    }
    catch (InvalidOperationException exc)
    {
        // Sometimes SgmlReader doesn't handle <script> tags well and XDocument.Load() throws,
        // so we can retry with the <script> tags stripped off. Any other failure is rethrown.
        if (!exc.Message.Contains("EndOfFile"))
        {
            throw;
        }

        htmlContent = HtmlUtils.RemoveScriptTags(htmlContent);
        document = LoadDocument(htmlContent);
    }

    return document;
}
/// <summary>
/// Constructs a DOM (<see cref="System.Xml.Linq.XDocument"/>) from HTML markup.
/// </summary>
/// <param name="htmlContent">HTML markup from which the DOM is to be constructed.</param>
/// <returns>
/// A <see cref="System.Xml.Linq.XDocument"/> built from the provided markup, or an empty
/// document when <paramref name="htmlContent"/> is empty or consists only of whitespace.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">
/// Loading failed with a message that does not contain "EndOfFile", or loading failed again
/// after both the script and the style tags had been stripped.
/// </exception>
public XDocument BuildDocument(string htmlContent)
{
    if (htmlContent == null)
    {
        throw new ArgumentNullException("htmlContent");
    }

    if (htmlContent.Trim().Length == 0)
    {
        return new XDocument();
    }

    // Remove all conditional comments (SgmlDomBuilder doesn't understand them correctly).
    // 1) complete "<!--[if ...]> ... <![endif]-->" blocks, possibly spanning several lines
    htmlContent = Regex.Replace(htmlContent, @"<!--\s*\[if .*?\]\s*>.*?<!\s*\[endif\]\s*-->", string.Empty, RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // 2) leftover opening markers
    htmlContent = Regex.Replace(htmlContent, @"<!--\s*\[if .*?\]\s*>\s*(-->)?", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // 3) leftover closing markers
    htmlContent = Regex.Replace(htmlContent, @"(<!--\s*)?<!\s*\[endif\]\s*-->", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // 4) malformed conditionals
    htmlContent = Regex.Replace(htmlContent, @"<!--\s*\[if .*?>\s*-->", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // "Trim end": cut everything that follows the last closing html tag
    // (codinghorror.com puts some scripts after the </html> - sic!).
    // The search is ordinal and case-insensitive so that </HTML> is recognized as well.
    const string htmlEnd = "</html";
    int indexOfHtmlEnd = htmlContent.LastIndexOf(htmlEnd, StringComparison.OrdinalIgnoreCase);

    if (indexOfHtmlEnd != -1)
    {
        // Keep the closing tag itself, up to and including its '>'.
        int indexOfHtmlEndBracket = htmlContent.IndexOf('>', indexOfHtmlEnd);

        if (indexOfHtmlEndBracket != -1)
        {
            htmlContent = htmlContent.Substring(0, indexOfHtmlEndBracket + 1);
        }
    }

    // "Trim start": drop everything that precedes the first opening html tag
    // (some sites put scripts before the <html>..).
    const string htmlStart = "<html";
    int indexOfHtmlStart = htmlContent.IndexOf(htmlStart, StringComparison.OrdinalIgnoreCase);

    if (indexOfHtmlStart != -1)
    {
        htmlContent = htmlContent.Substring(indexOfHtmlStart);
    }

    XDocument document;

    try
    {
        document = LoadDocument(htmlContent);
    }
    catch (InvalidOperationException exc)
    {
        // Sometimes SgmlReader doesn't handle <script> tags well and XDocument.Load() throws,
        // so we can retry with the <script> tags stripped off. Any other failure is rethrown.
        if (!exc.Message.Contains("EndOfFile"))
        {
            throw;
        }

        htmlContent = HtmlUtils.RemoveScriptTags(htmlContent);

        try
        {
            document = LoadDocument(htmlContent);
        }
        catch (InvalidOperationException exc2)
        {
            // If removing the script tags wasn't enough, retry once more with the
            // <style> tags also stripped off.
            if (!exc2.Message.Contains("EndOfFile"))
            {
                throw;
            }

            htmlContent = HtmlUtils.RemoveStyleTags(htmlContent);
            document = LoadDocument(htmlContent);
        }
    }

    // Remove any *extra* <html> or <body> elements, keeping their content in place.
    Action<XDocument, string> removeExtraElements = (doc, tagName) =>
    {
        var matches = doc.GetElementsByTagName(tagName);

        if (matches == null)
        {
            return;
        }

        // Skip the first (top-most) match and snapshot the rest in reverse order, so the
        // remaining elements are processed bottom-up and the tree is never modified while
        // the query is still being enumerated.
        var extras = matches.Skip(1).Reverse().ToList();

        foreach (var extra in extras)
        {
            // Push the direct child nodes (elements, text, comments) up a level, then delete
            // the wrapper. Nodes() is used rather than Descendants(): the latter returns every
            // nested element and no text nodes, which duplicated nested content and lost text.
            var childNodes = extra.Nodes().ToArray();
            extra.AddAfterSelf(childNodes);
            extra.Remove();
        }
    };

    removeExtraElements(document, "html");
    removeExtraElements(document, "body");

    return document;
}