/// <summary>
/// Parses the supplied text into an <see cref="HtmlDocument"/> and HTML-encodes
/// the contents of every <c>code</c> element found in it.
/// </summary>
/// <param name="source">The markup to parse; <c>null</c> is treated as an empty string.</param>
/// <returns>The parsed document, with its <c>code</c> elements stripped of extra attributes and re-encoded.</returns>
private static HtmlDocument GetHtml(string source)
{
    var document = new HtmlDocument
    {
        OptionFixNestedTags = true,
        OptionAutoCloseOnEnd = true,
        OptionDefaultStreamEncoding = Encoding.UTF8
    };
    document.LoadHtml(source ?? "");

    // Code blocks are encoded on their own so that their contents are not
    // stripped out completely by the final cleanup pass.
    // The sequence is consumed lazily on purpose: each code element is
    // rewritten while the traversal is still in progress.
    foreach (var codeNode in document.DocumentNode.DescendantsAndSelf().Where(node => node.Name == "code"))
    {
        // Code tag attribute vulnerability fix 28-9-12 (thanks to Natd):
        // only "style" and "class" may remain on a code element.
        // The snapshot array lets us remove attributes without mutating
        // the collection that is being enumerated.
        HtmlAttribute[] disallowed = codeNode.Attributes
            .Where(attribute => attribute.Name != "style" && attribute.Name != "class")
            .ToArray();
        foreach (HtmlAttribute attribute in disallowed)
        {
            attribute.Remove();
        }

        // Decode first so that already-encoded content is not encoded twice.
        string decoded = System.Net.WebUtility.HtmlDecode(codeNode.InnerHtml);
        codeNode.InnerHtml = System.Net.WebUtility.HtmlEncode(decoded);
    }

    return document;
}
/// <summary>
/// Downloads an internet resource asynchronously and parses it into an <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="uri">Url to the html document</param>
/// <param name="encoding">
/// The encoding to use while reading the downloaded document. When <c>null</c>,
/// the response content is read as a string without an explicit encoding.
/// </param>
/// <param name="credentials">
/// The credentials to use for authenticating the web request. When <c>null</c>,
/// the handler is configured to use the default credentials.
/// </param>
/// <returns>The document loaded from the downloaded HTML.</returns>
/// <exception cref="Exception">The response status code was not <see cref="HttpStatusCode.OK"/>.</exception>
public async Task<HtmlDocument> LoadFromWebAsync(Uri uri, Encoding encoding, NetworkCredential credentials)
{
    var clientHandler = new HttpClientHandler();
    if (credentials == null)
    {
        clientHandler.UseDefaultCredentials = true;
    }
    else
    {
        clientHandler.Credentials = credentials;
    }

    // Disposing the client also disposes the handler passed to its constructor,
    // so neither the connection nor the handler outlives this call.
    using (var client = new HttpClient(clientHandler))
    using (var response = await client.GetAsync(uri))
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            throw new Exception("Error downloading html");
        }

        string html;
        if (encoding != null)
        {
            // Decode the raw stream with the caller's encoding; the read is
            // awaited so the calling thread is not blocked while the body arrives.
            using (var reader = new StreamReader(await response.Content.ReadAsStreamAsync(), encoding))
            {
                html = await reader.ReadToEndAsync();
            }
        }
        else
        {
            html = await response.Content.ReadAsStringAsync();
        }

        var doc = new HtmlDocument();

        // The pre-handler runs on the empty document, before any HTML is loaded into it.
        if (PreHandleDocument != null)
        {
            PreHandleDocument(doc);
        }

        doc.LoadHtml(html);
        return doc;
    }
}
/// <summary>
/// Builds an HTML node from a string of literal HTML.
/// </summary>
/// <param name="html">The HTML text to parse.</param>
/// <returns>The first child of the document node produced by parsing <paramref name="html"/>.</returns>
public static HtmlNode CreateNode(string html)
{
    // REVIEW: this is *not* optimum - a whole throwaway document is parsed
    // just to obtain a single node.
    var scratchDocument = new HtmlDocument();
    scratchDocument.LoadHtml(html);

    HtmlNode root = scratchDocument.DocumentNode;
    return root.FirstChild;
}