/// <summary>
/// Builds a node of the requested type, attaches it to its owning document and
/// records the position at which it starts.
/// </summary>
/// <param name="type">Kind of node to create.</param>
/// <param name="ownerdocument">Document the node belongs to.</param>
/// <param name="index">Outer start index of the node; -1 marks a node that comes from the public API.</param>
public HtmlNode(HtmlNodeType type, HtmlDocument ownerdocument, int index)
{
    _nodetype = type;
    _ownerdocument = ownerdocument;
    _outerstartindex = index;

    // Comment, document and text nodes receive a fixed name and are their own end node.
    if (type == HtmlNodeType.Comment)
    {
        Name = HtmlNodeTypeNameComment;
        _endnode = this;
    }
    else if (type == HtmlNodeType.Document)
    {
        Name = HtmlNodeTypeNameDocument;
        _endnode = this;
    }
    else if (type == HtmlNodeType.Text)
    {
        Name = HtmlNodeTypeNameText;
        _endnode = this;
    }

    bool hasIndex = index != -1;

    // Nodes that are still open are registered with the document, keyed by their index.
    if (_ownerdocument._openednodes != null && !Closed && hasIndex)
    {
        _ownerdocument._openednodes.Add(index, this);
    }

    bool isCommentOrText = type == HtmlNodeType.Comment || type == HtmlNodeType.Text;
    if (!hasIndex && !isCommentOrText)
    {
        // innerhtml and outerhtml must be calculated
        _outerchanged = true;
        _innerchanged = true;
    }
}
/// <summary>
/// Parses a fragment of literal HTML and hands back the first node it produced.
/// </summary>
/// <param name="html">The HTML text.</param>
/// <returns>The first child of the document node created from <paramref name="html"/>.</returns>
public static HtmlNode CreateNode(string html)
{
    // REVIEW: this is *not* optimum... a whole throwaway document is built just to parse the fragment.
    HtmlDocument scratch = new HtmlDocument();
    scratch.LoadHtml(html);

    HtmlNode root = scratch.DocumentNode;
    return root.FirstChild;
}
/// <summary>
/// Creates a navigator over <paramref name="doc"/> positioned on <paramref name="currentNode"/>.
/// </summary>
/// <param name="doc">Document to navigate.</param>
/// <param name="currentNode">Starting node; must be owned by <paramref name="doc"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="currentNode"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="currentNode"/> belongs to another document.</exception>
internal HtmlNodeNavigator(HtmlDocument doc, HtmlNode currentNode)
{
    if (currentNode == null)
        throw new ArgumentNullException("currentNode");

    HtmlDocument owner = currentNode.OwnerDocument;
    if (owner != doc)
        throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);

    InternalTrace(null);

    _doc = doc;
    Reset();

    // Assigned after Reset(), exactly as before, so the requested node is the final position.
    _currentnode = currentNode;
}
/// <summary>
/// Copy constructor: creates a navigator sharing the document, position,
/// attribute index and name table of <paramref name="nav"/>.
/// </summary>
/// <param name="nav">Navigator to copy from.</param>
/// <exception cref="ArgumentNullException"><paramref name="nav"/> is null.</exception>
private HtmlNodeNavigator(HtmlNodeNavigator nav)
{
    if (nav == null)
        throw new ArgumentNullException("nav");

    InternalTrace(null);

    _doc = nav._doc;
    _currentnode = nav._currentnode;
    _attindex = nav._attindex;

    // REVIEW: should we do this? The name table instance is shared, not cloned.
    _nametable = nav._nametable;
}
/// <summary>
/// Extracts topic URLs from a forum listing page.
/// </summary>
/// <remarks>
/// Looks at the last <c>table.forumline</c> that contains topic titles and a "Replies" column.
/// If that table has the expected section header cell, rows are only collected after the
/// non-sticky "Topics" header row; otherwise every <c>a.topictitle</c> on the page that is not
/// marked as an announcement or sticky is collected. Relative links are prefixed with BaseUrl.
/// </remarks>
/// <param name="html">Markup of the listing page.</param>
/// <returns>The topic URLs found; an empty array when the input is blank, nothing matches, or an error occurs (errors are logged).</returns>
public override string[] GetTopicUrls(string html)
{
    if (html == null || html.Trim().Length == 0)
    {
        return new string[0];
    }

    var urls = new List<string>();
    var doc = new HtmlDocument();
    bool allowAdd = false;

    try
    {
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so every result is checked before it is used.
        HtmlNodeCollection forumTables = doc.DocumentNode.SelectNodes("//table[@class='forumline']");
        if (forumTables == null)
        {
            return new string[0];
        }

        var tables = (from n in forumTables
                      where (n.InnerHtml.Contains("class=\"topictitle\"") || n.InnerHtml.Contains("class='topictitle'"))
                            && n.InnerHtml.Contains("Replies")
                      select n).ToArray();
        if (tables.Length == 0)
        {
            return new string[0];
        }

        HtmlNode topicTable = tables[tables.Length - 1];

        if (topicTable.InnerHtml.Contains("<td class=\"row3\" colspan=\"6\" height=\"21\">"))
        {
            HtmlNodeCollection rows = topicTable.SelectNodes(".//tr");
            if (rows != null)
            {
                foreach (HtmlNode row in rows)
                {
                    if (!allowAdd)
                    {
                        // Skip everything until the plain "Topics" header row is reached.
                        HtmlNodeCollection headers = row.SelectNodes(".//td[@class='row3']");
                        if (headers != null
                            && headers.Count > 0
                            && headers[0].InnerHtml.Contains("Topics")
                            && !headers[0].InnerHtml.Contains("Sticky")
                            && headers[0].GetAttributeValue("colspan", "") == "6")
                        {
                            allowAdd = true;
                        }

                        continue;
                    }

                    HtmlNodeCollection titleLinks = row.SelectNodes(".//a[@class='topictitle']");
                    if (titleLinks == null)
                    {
                        // Rows without a topic link used to abort the whole extraction.
                        continue;
                    }

                    var anchors = (from a in titleLinks
                                   where !a.ParentNode.InnerHtml.Contains("Announcement:</b>")
                                         && !a.ParentNode.InnerHtml.Contains("Sticky:</b>")
                                   select HttpUtility.HtmlDecode(a.GetAttributeValue("href", "")).Trim()).ToArray();

                    if (anchors.Length > 0 && anchors[0].Length > 0)
                    {
                        string href = anchors[0];
                        bool isAbsolute = href.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                                          || href.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
                        urls.Add(isAbsolute ? href : this.BaseUrl + "/" + href);
                    }
                }
            }
        }
        else
        {
            HtmlNodeCollection titleLinks = doc.DocumentNode.SelectNodes("//a[@class='topictitle']");
            if (titleLinks != null)
            {
                foreach (HtmlNode link in titleLinks)
                {
                    string parentHtml = link.ParentNode.InnerHtml;
                    if (parentHtml.Contains("Announcement:</b>") || parentHtml.Contains("Sticky:</b>"))
                    {
                        continue;
                    }

                    string href = HttpUtility.HtmlDecode(link.GetAttributeValue("href", "")).Trim();
                    if (href.Length == 0)
                    {
                        continue;
                    }

                    // https links are absolute too; they must not be prefixed with BaseUrl.
                    bool isAbsolute = href.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                                      || href.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
                    urls.Add(isAbsolute ? href : this.BaseUrl + "/" + href);
                }
            }
        }

        return urls.ToArray();
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return new string[0];
    }
}
/// <summary>
/// Fetches <paramref name="url"/> through <c>Get</c> and parses the returned data as HTML.
/// </summary>
/// <param name="url">Address to fetch.</param>
/// <returns>A document loaded from the fetched data.</returns>
public static HtmlDocument GetDoc(string url)
{
    HtmlDocument parsed = new HtmlDocument();
    var markup = Get(url).Data;
    parsed.LoadHtml(markup);
    return parsed;
}
/// <summary>
/// Reads a topic by opening its "reply directly to this post" page and taking
/// the title and quoted post text from the reply form.
/// </summary>
/// <param name="url">Address of the topic page.</param>
/// <returns>The topic, or null when the user is not logged in or any step fails (failures are logged).</returns>
public override SiteTopic GetTopic(string url)
{
    if (!this.User.IsLoggedIn)
    {
        return null;
    }

    HtmlDocument page = new HtmlDocument();
    HttpWebRequest request = Http.Prepare(url);
    request.Method = "GET";
    request.Referer = url;

    try
    {
        // Sends a prepared request, following redirects only when the site allows it.
        Func<HttpWebRequest, HttpResult> send = r =>
            this.AllowRedirects ? Http.HandleRedirects(Http.Request(r), false) : Http.Request(r);

        // Step 1: topic page -> link to the quote/reply form.
        HttpResult response = send(request);
        page.LoadHtml(response.Data);
        ErrorLog.LogException(response.Error);

        HtmlNodeCollection replyAnchors = page.DocumentNode.SelectNodes("//a[@title='Reply directly to this post']");
        string replyLink = HttpUtility.HtmlDecode(replyAnchors[0].GetAttributeValue("href", String.Empty));

        // Step 2: reply form -> title and quoted content.
        request = Http.Prepare(replyLink);
        request.Method = "GET";
        request.Referer = url;

        response = send(request);
        page.LoadHtml(response.Data);
        ErrorLog.LogException(response.Error);

        string[] titles = page.DocumentNode
            .SelectNodes("//h2[@class='maintitle']")
            .Where(h => h.InnerText.Trim().Contains("Replying to "))
            .Select(h => h.InnerText.Replace("Replying to ", ""))
            .ToArray();
        string title = titles[0];

        HtmlNodeCollection postAreas = page.DocumentNode.SelectNodes("//textarea[@name='Post']");
        string body = postAreas[0].InnerText;

        // Drop the opening quote tag (everything up to the first ']') ...
        int firstClose = body.IndexOf(']');
        body = HttpUtility.HtmlDecode(body.Substring(firstClose + 1)).Trim();

        // ... and the trailing "[/quote]".
        int keep = body.Length - "[/quote]".Length;
        body = body.Substring(0, keep);

        // Fix IPB3 quotes
        string pattern = @"(?i)\[quote [\w\d " + '"' + @"'-=]+\]";
        string replace = "[quote]";
        body = Regex.Replace(body, pattern, replace);

        string cleanTitle = HttpUtility.HtmlDecode(title).Trim();
        return new SiteTopic(cleanTitle, body.Trim(), 0, 0, url);
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return null;
    }
}
/// <summary>
/// Downloads <paramref name="uri"/> into a new document and stores the resulting
/// status code in <c>_statusCode</c>.
/// </summary>
/// <param name="uri">Resource to fetch.</param>
/// <param name="method">HTTP method passed to <c>Get</c>.</param>
/// <param name="proxy">Proxy passed to <c>Get</c>.</param>
/// <param name="creds">Credentials passed to <c>Get</c>.</param>
/// <returns>The document; loaded from the cache file when the status is NotModified.</returns>
private HtmlDocument LoadUrl(Uri uri, string method, WebProxy proxy, NetworkCredential creds)
{
    HtmlDocument result = new HtmlDocument
    {
        OptionAutoCloseOnEnd = false,
        OptionFixNestedTags = true
    };

    _statusCode = Get(uri, method, null, result, proxy, creds);

    if (_statusCode != HttpStatusCode.NotModified)
    {
        return result;
    }

    // read cached encoding
    string cachedFile = GetCachePath(uri);
    result.DetectEncodingAndLoad(cachedFile);
    return result;
}
/// <summary>
/// Extracts topic URLs from a forum listing page: every anchor whose id starts with
/// <c>thread_title_</c> and whose parent markup does not contain "Sticky:".
/// </summary>
/// <param name="html">Markup of the listing page.</param>
/// <returns>The topic URLs (relative links are prefixed with BaseUrl); an empty array when the input is blank, nothing matches, or an error occurs (errors are logged).</returns>
public override string[] GetTopicUrls(string html)
{
    if (html == null || html.Trim().Length == 0)
    {
        return new string[0];
    }

    var urls = new List<string>();
    var doc = new HtmlDocument();

    try
    {
        doc.LoadHtml(html);

        // SelectNodes returns null when the page has no anchors at all; that is
        // an empty result, not an error worth logging.
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a");
        if (nodes == null)
        {
            return new string[0];
        }

        foreach (HtmlNode n in nodes)
        {
            if (!n.GetAttributeValue("id", "").StartsWith("thread_title_", StringComparison.Ordinal))
            {
                continue;
            }

            if (n.ParentNode.InnerHtml.Contains("Sticky:"))
            {
                continue;
            }

            string href = HttpUtility.HtmlDecode(n.GetAttributeValue("href", "")).Trim();
            if (href.Length == 0)
            {
                continue;
            }

            // https links are absolute too; they must not be prefixed with BaseUrl.
            bool isAbsolute = href.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                              || href.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
            urls.Add(isAbsolute ? href : this.BaseUrl + "/" + href);
        }

        return urls.ToArray();
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return new string[0];
    }
}
/// <summary>
/// Extracts topic URLs from a forum listing page.
/// </summary>
/// <remarks>
/// Uses the last table that contains topic entries. If that table has a
/// "<b>Forum Topics</b>" header cell, only rows after that header are collected;
/// otherwise all rows are. For each row the first absolute http/https link whose id
/// starts with <c>tid-link-</c> is taken.
/// </remarks>
/// <param name="html">Markup of the listing page.</param>
/// <returns>The topic URLs found; an empty array when the input is blank, nothing matches, or an error occurs (errors are logged).</returns>
public override string[] GetTopicUrls(string html)
{
    if (html == null || html.Trim().Length == 0)
    {
        return new string[0];
    }

    var allowAdd = true;
    var urls = new List<string>();
    var doc = new HtmlDocument();
    Uri uri;

    try
    {
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches,
        // so every result is checked before it is used.
        HtmlNodeCollection allTables = doc.DocumentNode.SelectNodes("//table");
        if (allTables == null)
        {
            return new string[0];
        }

        var table = (from n in allTables
                     where n.InnerHtml.Contains("id=\"tid-link-")
                           && n.InnerHtml.Contains("topic_toggle_folder")
                           && n.InnerHtml.Contains("<!-- Begin Topic Entry ")
                     select n).ToArray();
        if (table.Length == 0)
        {
            return new string[0];
        }

        HtmlNode topicTable = table[table.Length - 1];

        // When the table has a "Forum Topics" header, hold back until that row is passed.
        HtmlNodeCollection firstCells = topicTable.SelectNodes(".//td[1]");
        if (firstCells != null)
        {
            foreach (var cell in firstCells)
            {
                if (cell.InnerHtml.Contains("<b>Forum Topics</b>"))
                {
                    allowAdd = false;
                    break;
                }
            }
        }

        HtmlNodeCollection rows = topicTable.SelectNodes(".//tr");
        if (rows == null)
        {
            return new string[0];
        }

        foreach (var row in rows)
        {
            if (!allowAdd)
            {
                HtmlNodeCollection cells = row.SelectNodes(".//td[1]");
                if (cells != null && cells.Count > 0 && cells[0].InnerHtml.Contains("<b>Forum Topics</b>"))
                {
                    allowAdd = true;
                }

                continue;
            }

            HtmlNodeCollection anchors = row.SelectNodes(".//a");
            if (anchors == null)
            {
                // Rows without any link used to abort the whole extraction.
                continue;
            }

            var links = (from link in anchors
                         let href = link.GetAttributeValue("href", "")
                         where link.GetAttributeValue("id", "").StartsWith("tid-link-", StringComparison.Ordinal)
                               && (href.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                                   || href.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
                         select HttpUtility.HtmlDecode(href).Trim()).ToArray();

            if (links.Length > 0 && Uri.TryCreate(links[0], UriKind.Absolute, out uri))
            {
                urls.Add(links[0]);
            }
        }

        return urls.ToArray();
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return new string[0];
    }
}
/// <summary>
/// Reads a topic by following its "Reply With Quote" link and taking the quoted
/// post text from the reply form; the title comes from the topic page itself.
/// </summary>
/// <param name="url">Address of the topic page.</param>
/// <returns>The topic, or null when the user is not logged in or any step fails (failures are logged).</returns>
public override SiteTopic GetTopic(string url)
{
    if (!this.User.IsLoggedIn)
    {
        return null;
    }

    HtmlDocument doc = new HtmlDocument();
    HttpWebRequest req;
    HttpResult result;

    req = Http.Prepare(url);
    req.Method = "GET";
    req.Referer = url;

    try
    {
        result = this.AllowRedirects ? Http.HandleRedirects(Http.Request(req), false) : Http.Request(req);
        doc.LoadHtml(result.Data);
        ErrorLog.LogException(result.Error);

        // The quote button is an image; its parent anchor carries the link.
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//img[@alt='Reply With Quote']");
        string link = HttpUtility.HtmlDecode(nodes[0].ParentNode.GetAttributeValue("href", String.Empty));

        nodes = doc.DocumentNode.SelectNodes("//span[@class='threadtitle']");
        string title = HttpUtility.HtmlDecode(nodes[0].InnerText).Trim();

        // https links are absolute too; prefixing them with BaseUrl produced a broken address.
        bool isAbsolute = link.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                          || link.StartsWith("https:", StringComparison.OrdinalIgnoreCase);

        req = Http.Prepare(isAbsolute ? link : this.BaseUrl + "/" + link);
        req.Method = "GET";
        req.Referer = url;

        result = this.AllowRedirects ? Http.HandleRedirects(Http.Request(req), false) : Http.Request(req);
        doc.LoadHtml(result.Data);
        ErrorLog.LogException(result.Error);

        string content = doc.DocumentNode.SelectNodes("//textarea[@name='message']")[0].InnerText;

        // Drop the opening quote tag (everything up to the first ']') and the trailing "[/quote]".
        content = HttpUtility.HtmlDecode(content.Substring(content.IndexOf(']') + 1)).Trim();
        content = content.Substring(0, content.Length - "[/quote]".Length);

        return new SiteTopic(
            title.Trim(),
            content.Trim(),
            0, 0, url
        );
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return null;
    }
}
/// <summary>
/// Extracts topic URLs from a forum listing page: every anchor whose class contains
/// <c>topictitle</c>, except those whose surrounding markup indicates an announcement
/// or sticky topic.
/// </summary>
/// <remarks>
/// When the anchor's grandparent is a table row or cell, the images inside it are checked
/// for "announce"/"sticky" in their <c>src</c>; otherwise the markup of the great-grandparent
/// is checked for "announce_"/"sticky_".
/// </remarks>
/// <param name="html">Markup of the listing page.</param>
/// <returns>The topic URLs (relative links are prefixed with BaseUrl); an empty array when the input is blank, nothing matches, or an error occurs (errors are logged).</returns>
public override string[] GetTopicUrls(string html)
{
    if (html == null || html.Trim().Length == 0)
    {
        return new string[0];
    }

    var urls = new List<string>();
    var doc = new HtmlDocument();

    try
    {
        doc.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection allAnchors = doc.DocumentNode.SelectNodes("//a");
        if (allAnchors == null)
        {
            return new string[0];
        }

        foreach (HtmlNode a in allAnchors)
        {
            if (!a.GetAttributeValue("class", "").Contains("topictitle"))
            {
                continue;
            }

            HtmlNode p = a.ParentNode == null ? null : a.ParentNode.ParentNode;
            if (p == null)
            {
                // Not nested the way a listing row is, so it cannot be classified;
                // skip it instead of failing the whole extraction.
                continue;
            }

            bool allowAdd;
            bool isRowOrCell = String.Equals(p.Name, "tr", StringComparison.OrdinalIgnoreCase)
                               || String.Equals(p.Name, "td", StringComparison.OrdinalIgnoreCase);

            if (isRowOrCell)
            {
                allowAdd = true;

                // A row without any image is a regular topic (SelectNodes yields null here).
                HtmlNodeCollection images = p.SelectNodes(".//img");
                if (images != null)
                {
                    foreach (HtmlNode img in images)
                    {
                        string src = img.GetAttributeValue("src", "");
                        if (src.Contains("announce") || src.Contains("sticky"))
                        {
                            allowAdd = false;
                            break;
                        }
                    }
                }
            }
            else
            {
                HtmlNode container = p.ParentNode;
                if (container == null)
                {
                    continue;
                }

                string containerHtml = container.InnerHtml;
                allowAdd = !containerHtml.Contains("announce_") && !containerHtml.Contains("sticky_");
            }

            if (!allowAdd)
            {
                continue;
            }

            // Strips any leading '.' and '/' characters (e.g. "./viewtopic.php").
            string url = HttpUtility.HtmlDecode(a.GetAttributeValue("href", "")).TrimStart("./".ToCharArray());

            // https links are absolute too; they must not be prefixed with BaseUrl.
            bool isAbsolute = url.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                              || url.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
            urls.Add(isAbsolute ? url : this.BaseUrl + "/" + url);
        }

        return urls.ToArray();
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return new string[0];
    }
}
/// <summary>
/// Reads a topic by following the first <c>posting.php?mode=quote</c> link on the topic
/// page and taking the subject and quoted post text from the reply form.
/// </summary>
/// <param name="url">Address of the topic page.</param>
/// <returns>The topic, or null when the user is not logged in or any step fails (failures are logged).</returns>
public override SiteTopic GetTopic(string url)
{
    if (!this.User.IsLoggedIn)
    {
        return null;
    }

    HtmlDocument doc = new HtmlDocument();
    HttpWebRequest req;
    HttpResult result;

    req = Http.Prepare(url);
    req.Method = "GET";
    req.Referer = url;

    try
    {
        result = this.AllowRedirects ? Http.HandleRedirects(Http.Request(req), false) : Http.Request(req);
        doc.LoadHtml(result.Data);
        ErrorLog.LogException(result.Error);

        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a");
        var links = (from n in nodes
                     where HttpUtility.HtmlDecode(n.GetAttributeValue("href", "")).Contains("posting.php?mode=quote")
                     select HttpUtility.HtmlDecode(n.GetAttributeValue("href", ""))).ToArray();

        // Strips any leading '.' and '/' characters (e.g. "./posting.php").
        string link = links[0].TrimStart("./".ToCharArray());

        // https links are absolute too; prefixing them with BaseUrl produced a broken address.
        bool isAbsolute = link.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                          || link.StartsWith("https:", StringComparison.OrdinalIgnoreCase);

        req = Http.Prepare(isAbsolute ? link : this.BaseUrl + "/" + link);
        req.Method = "GET";
        req.Referer = url;

        result = this.AllowRedirects ? Http.HandleRedirects(Http.Request(req), false) : Http.Request(req);
        doc.LoadHtml(result.Data);
        ErrorLog.LogException(result.Error);

        string title = doc.DocumentNode.SelectNodes("//input[@name='subject']")[0].GetAttributeValue("value", String.Empty);
        string content = doc.DocumentNode.SelectNodes("//textarea[@name='message']")[0].InnerText;

        // The subject keeps only what follows its first ':'.
        title = HttpUtility.HtmlDecode(title);
        title = title.Substring(title.IndexOf(':') + 1).Trim();

        // Drop the opening quote tag (everything up to the first ']') and the trailing "[/quote]".
        content = HttpUtility.HtmlDecode(content.Substring(content.IndexOf(']') + 1)).Trim();
        content = content.Substring(0, content.Length - "[/quote]".Length);

        return new SiteTopic(
            title.Trim(),
            content.Trim(),
            0, 0, url
        );
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return null;
    }
}
/// <summary>
/// Loads an HTML document from an Internet resource.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <param name="proxy">Proxy to use with this request</param>
/// <param name="credentials">Credentials to use when authenticating</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The URI scheme is not http, https or file.</exception>
public HtmlDocument Load(string url, string method, WebProxy proxy, NetworkCredential credentials)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;

    if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method, proxy, credentials);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        doc = new HtmlDocument();

        // Same parser options as the http/https path (LoadUrl). The original assigned
        // OptionAutoCloseOnEnd twice (false, then true), which left OptionFixNestedTags
        // at its default and overrode the first assignment.
        doc.OptionAutoCloseOnEnd = false;
        doc.OptionFixNestedTags = true;

        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // Give subscribers a chance to adjust the document before it is returned.
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }

    return doc;
}
/// <summary>
/// Extracts topic URLs from a forum listing page: every <c>a.topic_title</c> whose id starts
/// with <c>tid-link-</c>, whose parent markup has no <c>topic_prefix</c> element, and whose
/// href is an absolute http/https address.
/// </summary>
/// <param name="html">Markup of the listing page.</param>
/// <returns>The topic URLs found; an empty array when the input is blank, nothing matches, or an error occurs (errors are logged).</returns>
public override string[] GetTopicUrls(string html)
{
    if (html == null || html.Trim().Length == 0)
    {
        return new string[0];
    }

    var urls = new List<string>();
    var doc = new HtmlDocument();
    Uri uri;

    try
    {
        doc.LoadHtml(html);

        // SelectNodes returns null when nothing matches; that is an empty result,
        // not an error worth logging.
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@class='topic_title']");
        if (nodes == null)
        {
            return new string[0];
        }

        foreach (HtmlNode n in nodes)
        {
            if (n.ParentNode.InnerHtml.Contains("class=\"topic_prefix\""))
            {
                continue;
            }

            if (!n.GetAttributeValue("id", "").StartsWith("tid-link-", StringComparison.Ordinal))
            {
                continue;
            }

            string rawHref = n.GetAttributeValue("href", "");

            // Accept https as well as http; secure links were silently dropped before.
            bool isHttp = rawHref.StartsWith("http:", StringComparison.OrdinalIgnoreCase)
                          || rawHref.StartsWith("https:", StringComparison.OrdinalIgnoreCase);
            if (!isHttp)
            {
                continue;
            }

            string href = HttpUtility.HtmlDecode(rawHref).Trim();
            if (Uri.TryCreate(href, UriKind.Absolute, out uri))
            {
                urls.Add(href);
            }
        }

        return urls.ToArray();
    }
    catch (Exception error)
    {
        ErrorLog.LogException(error);
        return new string[0];
    }
}
/// <summary>
/// Performs the HTTP request for <paramref name="uri"/> and delivers the result to the
/// cache, to <paramref name="path"/> and/or to <paramref name="doc"/>.
/// </summary>
/// <remarks>
/// Updates <c>_fromCache</c>, <c>_requestDuration</c> and <c>_responseUri</c>.
/// The response is now always closed before returning, including on the NotModified path,
/// when there is no response stream, and when saving or parsing throws.
/// </remarks>
/// <param name="uri">Resource to request.</param>
/// <param name="method">HTTP method to use.</param>
/// <param name="path">Optional file that receives a copy of the cached content; may be null.</param>
/// <param name="doc">Optional document to load in memory when the cache is not used and the content is HTML; may be null.</param>
/// <param name="proxy">Optional proxy; when given, credentials are applied to both the proxy and the request.</param>
/// <param name="creds">Credentials to use with the proxy; default credentials are used when null.</param>
/// <returns>
/// The response status code; NotModified when served from the cache; ResetContent when the
/// PreRequest handler cancels the request.
/// </returns>
/// <exception cref="HtmlWebException">Cache-only mode without a cached file, or NotModified received while the cache is disabled.</exception>
private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy, ICredentials creds)
{
    string cachePath = null;
    HttpWebRequest req;
    bool oldFile = false;

    req = WebRequest.Create(uri) as HttpWebRequest;
    req.Method = method;

    if (proxy != null)
    {
        if (creds != null)
        {
            proxy.Credentials = creds;
            req.Credentials = creds;
        }
        else
        {
            proxy.Credentials = CredentialCache.DefaultCredentials;
            req.Credentials = CredentialCache.DefaultCredentials;
        }
        req.Proxy = proxy;
    }

    _fromCache = false;
    _requestDuration = 0;
    int tc = Environment.TickCount;

    if (UsingCache)
    {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
            // NOTE(review): uses the last *access* time while the cache file is stamped with
            // the server's LastModified below - confirm this is the intended timestamp.
            req.IfModifiedSince = File.GetLastAccessTime(cachePath);
            oldFile = true;
        }
    }

    if (_cacheOnly)
    {
        if (!File.Exists(cachePath))
        {
            throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }

        if (path != null)
        {
            IOLibrary.CopyAlways(cachePath, path);
            // touch the file
            File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
        }
        _fromCache = true;
        return HttpStatusCode.NotModified;
    }

    if (_useCookies)
    {
        req.CookieContainer = new CookieContainer();
    }

    if (PreRequest != null)
    {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
            return HttpStatusCode.ResetContent;
        }
    }

    HttpWebResponse resp;

    try
    {
        resp = req.GetResponse() as HttpWebResponse;
    }
    catch (WebException we)
    {
        _requestDuration = Environment.TickCount - tc;
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
            // No response at all: fall back to the cached copy when there is one.
            if (oldFile)
            {
                if (path != null)
                {
                    IOLibrary.CopyAlways(cachePath, path);
                    // touch the file
                    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }
                return HttpStatusCode.NotModified;
            }
            throw;
        }
    }
    catch (Exception)
    {
        _requestDuration = Environment.TickCount - tc;
        throw;
    }

    try
    {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
            PostResponse(req, resp);
        }

        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;

        bool html = IsHtmlContent(resp.ContentType);

        Encoding respenc;
        if ((resp.ContentEncoding != null) && (resp.ContentEncoding.Length > 0))
        {
            respenc = Encoding.GetEncoding(resp.ContentEncoding);
        }
        else
        {
            respenc = null;
        }

        // Captured up front so nothing is read from the response after it has been closed.
        HttpStatusCode status = resp.StatusCode;

        if (status == HttpStatusCode.NotModified)
        {
            if (UsingCache)
            {
                _fromCache = true;
                if (path != null)
                {
                    IOLibrary.CopyAlways(cachePath, path);
                    // touch the file
                    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }
                return status;
            }
            else
            {
                // this should *never* happen...
                throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
            }
        }

        Stream s = resp.GetResponseStream();
        if (s != null)
        {
            if (UsingCache)
            {
                // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                // save headers
                SaveCacheHeaders(req.RequestUri, resp);

                if (path != null)
                {
                    // copy and touch the file
                    IOLibrary.CopyAlways(cachePath, path);
                    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }
            }
            else
            {
                // try to work in-memory
                if ((doc != null) && (html))
                {
                    if (respenc != null)
                    {
                        doc.Load(s, respenc);
                    }
                    else
                    {
                        doc.Load(s, true);
                    }
                }
            }
        }

        return status;
    }
    finally
    {
        // Release the connection on every exit path; previously only the
        // "stream available, no exception" path closed the response.
        if (resp != null)
        {
            resp.Close();
        }
    }
}
/// <summary>
/// Asynchronously fetches the new-topic page of a section and collects the hidden form
/// fields (<c>auth_key</c>, <c>attach_post_key</c>, <c>s</c>) needed as template replacements.
/// </summary>
/// <remarks>
/// Does nothing when the user is not logged in. Otherwise <c>OnReadyChanged</c> is raised
/// once the attempt has finished, whether or not all three fields were found.
/// </remarks>
/// <param name="sectionId">Identifier substituted for <c>{0}</c> in NewTopicPath.</param>
public override void MakeReady(int sectionId)
{
    if (!this.User.IsLoggedIn)
    {
        return;
    }

    AsyncHelper.Run(() =>
    {
        string url = this.BaseUrl + this.NewTopicPath.Replace("{0}", sectionId.ToString());
        var doc = new HtmlDocument();
        var replacements = new Dictionary<string, string>();

        HttpWebRequest req = Http.Prepare(url);
        req.Method = "GET";

        HttpResult result = this.AllowRedirects ? Http.HandleRedirects(Http.Request(req), false) : Http.Request(req);

        // Did the request fail?
        if (result.HasError || result.Data == null || result.Data.Trim().Length == 0)
        {
            ErrorLog.LogException(result.Error);
            this.OnReadyChanged(this, new EventArgs());
            return;
        }

        doc.LoadHtml(result.Data);

        // Extract required data. SelectNodes returns null when the page has no hidden
        // inputs; an exception here would escape the async callback and OnReadyChanged
        // would never be raised.
        HtmlNodeCollection hiddenInputs = doc.DocumentNode.SelectNodes("//input[@type='hidden']");
        if (hiddenInputs != null)
        {
            foreach (HtmlNode n in hiddenInputs)
            {
                var nameAttr = n.Attributes["name"];
                var valueAttr = n.Attributes["value"];
                if (nameAttr == null || valueAttr == null)
                {
                    // Hidden inputs without a name or value cannot supply a replacement.
                    continue;
                }

                string key;
                switch (nameAttr.Value)
                {
                    case "auth_key":
                        key = "[auth_key]";
                        break;
                    case "attach_post_key":
                        key = "[attach_post_key]";
                        break;
                    case "s":
                        key = "[s]";
                        break;
                    default:
                        key = null;
                        break;
                }

                // The same field can appear in more than one form on the page;
                // keep the first occurrence instead of throwing on the duplicate key.
                if (key != null && !replacements.ContainsKey(key))
                {
                    replacements.Add(key, valueAttr.Value);
                }
            }
        }

        // Check if we got the needed info
        if (replacements.Count != 3)
        {
            replacements.Clear();
        }

        // Done
        this.TemplateReplacements = replacements;
        this.OnReadyChanged(this, new EventArgs());

        if (!this.IsReady)
        {
            var error = new Exception(String.Format(
                "MakeReady({0}) failed for site '{1}'.\r\nUrl used: {2}.",
                sectionId,
                this.BaseUrl,
                url
            ));
            ErrorLog.LogException(error);
        }
    });
}
/// <summary>
/// Creates an attribute that belongs to the given document.
/// </summary>
/// <param name="ownerdocument">Document that owns the attribute.</param>
internal HtmlAttribute(HtmlDocument ownerdocument)
{
    this._ownerdocument = ownerdocument;
}
/// <summary>
/// Asynchronously logs the user in: reads the <c>auth_key</c> from the login form, posts the
/// credentials, then verifies the session through the <c>member_id</c> and <c>pass_hash</c> cookies.
/// </summary>
/// <remarks>
/// <c>User</c> is updated and <c>OnLogin</c> is raised whether or not the login succeeded;
/// <c>IsLoggedIn</c> on the details tells the outcome. Any current session is logged out first.
/// </remarks>
/// <param name="username">Account name.</param>
/// <param name="password">Account password.</param>
public override void LoginUser(string username, string password)
{
    AsyncHelper.Run(() =>
    {
        // The section parameter had been corrupted into "global§ion=login"
        // ("&sect" decoded as an HTML entity); restored to "&section=login".
        var loginPath = "/index.php?app=core&module=global&section=login";
        var html = Http.Get(this.BaseUrl + loginPath).Data;
        var doc = new HtmlDocument();
        doc.LoadHtml(html ?? String.Empty);

        // SelectSingleNode returns null when the form has no auth_key field. Continue with an
        // empty key so the failure is reported through the normal login check below instead
        // of crashing the async callback.
        var auth = doc.DocumentNode.SelectSingleNode("//input[@name='auth_key']");
        string authKey = (auth == null) ? String.Empty : auth.GetAttributeValue("value", "");

        string url = this.BaseUrl + this.LoginPath;
        var details = new SiteLoginDetails(false, username, password);
        var data = String.Format(
            Res.IPBoard_314_Login,
            details.GetUrlSafeUsername(this.SiteEncoding),
            details.GetUrlSafePassword(this.SiteEncoding),
            authKey,
            HttpUtility.UrlEncode(this.BaseUrl + loginPath, this.SiteEncoding)
        );
        byte[] rawData = this.SiteEncoding.GetBytes(data);
        int check = 0;
        int parse = -1;

        this.LogoutUser();

        HttpWebRequest req = Http.Prepare(url);
        req.Method = "POST";
        req.Referer = url;
        req.ContentType = Res.FormContentType;
        req.ContentLength = rawData.Length;

        // 'using' closes the request stream even when the write fails.
        using (Stream stream = req.GetRequestStream())
        {
            stream.Write(rawData, 0, rawData.Length);
        }

        HttpResult result = this.AllowRedirects ? Http.HandleRedirects(Http.Request(req), true) : Http.Request(req);

        // Did the request fail?
        if (result.HasError || Http.SessionCookies.Count < 2)
        {
            ErrorLog.LogException(result.Error);
            this.User = details;
            this.OnLogin(this, new LoginEventArgs(details));
            return;
        }

        if (result.HasResponse)
        {
            // Adopt the charset reported by the server; keep the current encoding when it
            // is missing or not recognised rather than failing the whole login.
            string charset = result.Response.CharacterSet;
            if (!String.IsNullOrEmpty(charset))
            {
                try
                {
                    this.SiteEncoding = Encoding.GetEncoding(charset);
                }
                catch (ArgumentException error)
                {
                    ErrorLog.LogException(error);
                }
            }
        }

        // Check if we did login: both a positive member id and a password hash cookie are required.
        foreach (Cookie c in Http.GetDomainCookies(req.RequestUri))
        {
            if (c.Name.EndsWith("member_id"))
            {
                if (c.Value.Length > 0 && int.TryParse(c.Value, out parse) && parse > 0)
                {
                    check++;
                }
            }
            else if (c.Name.EndsWith("pass_hash"))
            {
                if (c.Value.Length > 1)
                {
                    check++;
                }
            }
        }

        if (check > 1)
        {
            details.IsLoggedIn = true;
            foreach (var c in Http.GetDomainCookies(req.RequestUri))
            {
                details.Cookies.Add(c);
            }
        }
        else
        {
            var error = new Exception(String.Format(
                "Login check failed for '{0}'.\r\nCheck count: {1}.",
                this.BaseUrl,
                check
            ));
            ErrorLog.LogException(error);
        }

        this.User = details;
        this.OnLogin(this, new LoginEventArgs(details));
    });
}