/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>
/// Returns thread information that can be gleaned from that page.
/// The author is always empty, and the page count falls back to 1 when
/// no parsable page total is present.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // vBulletin doesn't show the thread author
    string author = string.Empty;
    int pages = 1;

    // Find the page title. Each step is null-tolerant so that a page without
    // <html>, <head> or <title> yields a null title instead of throwing.
    HtmlNode doc = page.DocumentNode.Element("html");
    string title = PostText.CleanupWebString(doc?.Element("head")?.Element("title")?.InnerText);

    // Find the number of pages: thread-view-tab > div.pagenav-controls > span.pagetotal
    var threadViewTab = page.GetElementbyId("thread-view-tab");
    var pageNavControls = threadViewTab?.GetDescendantWithClass("div", "pagenav-controls");
    var pageTotalSpan = pageNavControls?.GetDescendantWithClass("span", "pagetotal");

    // TryParse keeps the single-page default when the span holds non-numeric text.
    int parsedPages;
    if (pageTotalSpan != null && int.TryParse(pageTotalSpan.InnerText, out parsedPages))
    {
        pages = parsedPages;
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="page"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// Thrown if the page content cannot be located, or if the content has no
/// title bar (i.e. the page is not a forum thread).
/// </exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    HtmlNode doc = page.DocumentNode;

    // Find the page title. Null-tolerant all the way down, so a document
    // without an <html> root gives a null title instead of throwing.
    string title = PostText.CleanupWebString(
        doc.Element("html")?.Element("head")?.Element("title")?.InnerText);

    // Find a common parent for other data
    HtmlNode pageContent = GetPageContent(page, PageType.Thread);

    if (pageContent == null)
    {
        throw new InvalidOperationException("Cannot find content on page.");
    }

    // Non-thread pages (such as threadmark pages) won't have a title bar.
    HtmlNode titleBar = pageContent.GetDescendantWithClass("titleBar");

    if (titleBar == null)
    {
        throw new InvalidOperationException("Not a valid forum thread.");
    }

    // Find the thread author
    var pageDesc = page.GetElementbyId("pageDescription");
    var authorNode = pageDesc?.GetChildWithClass("username");
    string author = PostText.CleanupWebString(authorNode?.InnerText);

    // Find the number of pages in the thread
    var pageNavLinkGroup = pageContent.GetDescendantWithClass("div", "pageNavLinkGroup");
    var pageNav = pageNavLinkGroup?.GetChildWithClass("PageNav");
    string lastPage = pageNav?.GetAttributeValue("data-last", "");

    // A missing, empty or malformed data-last attribute means a single page.
    int pages;
    if (!int.TryParse(lastPage, out pages))
    {
        pages = 1;
    }

    // Create a ThreadInfo object to hold the acquired information.
    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get a completed post from the provided HTML list item node.
/// </summary>
/// <param name="li">List item node that contains the post.</param>
/// <returns>
/// Returns a post object with required information, or null if the node's id
/// is too short to contain a post id or the post object cannot be constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="li"/> is null.</exception>
private PostComponents GetPost(HtmlNode li)
{
    if (li == null)
    {
        throw new ArgumentNullException(nameof(li));
    }

    const string idPrefix = "post_";

    string author = "";
    string text = "";
    int number = 0;

    // ID: the element id is the prefix followed by the post id. An id that is
    // missing or no longer than the prefix cannot yield a post id.
    string rawId = li.Id;

    if (string.IsNullOrEmpty(rawId) || rawId.Length <= idPrefix.Length)
    {
        return null;
    }

    string id = rawId.Substring(idPrefix.Length);

    // Number: stored in the name attribute of the postcount element.
    // A missing or non-numeric value leaves the number at 0.
    var postCount = li.OwnerDocument.GetElementbyId($"postcount{id}");

    if (postCount != null)
    {
        int parsedNumber;
        if (int.TryParse(postCount.GetAttributeValue("name", "0"), out parsedNumber))
        {
            number = parsedNumber;
        }
    }

    HtmlNode postDetails = li.Elements("div").FirstOrDefault(n => n.GetAttributeValue("class", "") == "postdetails");

    if (postDetails != null)
    {
        // Author
        HtmlNode userinfo = postDetails.GetChildWithClass("div", "userinfo");
        HtmlNode username = userinfo?.GetChildWithClass("a", "username");
        author = PostText.CleanupWebString(username?.InnerText);

        // Text
        string postMessageId = "post_message_" + id;
        var message = li.OwnerDocument.GetElementbyId(postMessageId)?.Element("blockquote");

        // Predicate filtering out elements that we don't want to include
        var exclusion = PostText.GetClassExclusionPredicate("bbcode_quote");

        // Get the full post text.
        text = PostText.ExtractPostText(message, exclusion, Host);
    }

    // Best effort: if the post object cannot be constructed from the
    // gathered values, report that no post was found.
    PostComponents post;

    try
    {
        post = new PostComponents(author, id, text, number);
    }
    catch
    {
        post = null;
    }

    return post;
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>
/// Returns thread information that can be gleaned from that page.
/// The author is always empty, and the page count falls back to 1 when
/// no parsable pagination is present.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    string author = string.Empty;
    int pages = 1;
    int parsedPages;

    // Find the page title. Null-tolerant so a page without <html>, <head>
    // or <title> yields a null title instead of throwing.
    HtmlNode doc = page.DocumentNode.Element("html");
    string title = PostText.CleanupWebString(doc?.Element("head")?.Element("title")?.InnerText);

    // Find the number of pages
    var pagebody = page.GetElementbyId("page-body");

    if (pagebody != null)
    {
        // Different versions of the forum have different methods of showing page numbers
        var topicactions = pagebody.GetChildWithClass("topic-actions");

        if (topicactions != null)
        {
            // Variant 1: a "Page X of Y" label inside topic-actions > pagination.
            string paginationText = topicactions.GetChildWithClass("pagination")?.InnerText;

            if (paginationText != null)
            {
                Match m = Regex.Match(paginationText, @"Page\s*\d+\s*of\s*(?<pages>\d+)");

                if (m.Success && int.TryParse(m.Groups["pages"].Value, out parsedPages))
                {
                    pages = parsedPages;
                }
            }
        }
        else
        {
            // Variant 2: a list of page links inside action-bar > pagination;
            // the last item that isn't the "next" button holds the final page number.
            var actionbar = pagebody.GetChildWithClass("action-bar");
            var pagination = actionbar?.GetChildWithClass("pagination");
            var ul = pagination?.Element("ul");
            var lastPageLink = ul?.Elements("li")?.LastOrDefault(n => !n.GetAttributeValue("class", "").Split(' ').Contains("next"));

            // The item text may not be a number; keep the default in that case.
            if (lastPageLink != null && int.TryParse(lastPageLink.InnerText, out parsedPages))
            {
                pages = parsedPages;
            }
        }
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get a completed post from the provided HTML div node.
/// </summary>
/// <param name="div">Div node that contains the post.</param>
/// <param name="quest">Quest context supplied by the caller; not used by this method.</param>
/// <returns>
/// Returns a post object with required information, or null if the node has
/// no usable id, has no post body, or the post object cannot be constructed.
/// The post number is always 0.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="div"/> is null.</exception>
private PostComponents GetPost(HtmlNode div, IQuest quest)
{
    if (div == null)
    {
        throw new ArgumentNullException(nameof(div));
    }

    string author = "";
    int number = 0;

    // id="p12345": drop the one-character prefix. An id with nothing after
    // the prefix cannot identify a post.
    string rawId = div.Id;

    if (string.IsNullOrEmpty(rawId) || rawId.Length <= 1)
    {
        return null;
    }

    string id = rawId.Substring(1);

    var inner = div.GetChildWithClass("div", "inner");
    var postbody = inner?.GetChildWithClass("div", "postbody");

    // Without a post body there is neither an author nor any text to read.
    if (postbody == null)
    {
        return null;
    }

    // Author: p.author > first <strong> > <a>. Any of these may be absent,
    // in which case the author stays empty.
    var authorNode = postbody.GetChildWithClass("p", "author");
    var authorStrong = authorNode?.Descendants("strong").FirstOrDefault();
    var authorAnchor = authorStrong?.Element("a");

    if (authorAnchor != null)
    {
        author = PostText.CleanupWebString(authorAnchor.InnerText);
    }

    // No way to get the post number??

    // Get the full post text. Two different layout variants.
    var content = postbody.GetChildWithClass("div", "content")
        ?? postbody.Elements("div").FirstOrDefault(n => n.Id.StartsWith("post_content", StringComparison.Ordinal));

    string text = PostText.ExtractPostText(content, n => false, Host);

    // Best effort: if the post object cannot be constructed from the
    // gathered values, report that no post was found.
    PostComponents post;

    try
    {
        post = new PostComponents(author, id, text, number);
    }
    catch
    {
        post = null;
    }

    return post;
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>
/// Returns thread information that can be gleaned from that page.
/// The author is always empty, and the page count falls back to 1 when
/// no parsable pagination is present.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // vBulletin doesn't show thread authors
    string author = string.Empty;
    int pages = 1;

    HtmlNode doc = page.DocumentNode;

    // Find the page title. Null-tolerant so a page without <html>, <head>
    // or <title> yields a null title instead of throwing.
    string title = PostText.CleanupWebString(
        doc.Element("html")?.Element("head")?.Element("title")?.InnerText);

    // Get the number of pages from the navigation elements.
    // If the pagination element or its form is missing, that means there's
    // only one page in the thread.
    var paginationTop = page.GetElementbyId("pagination_top");
    var paginationForm = paginationTop?.Element("form");
    var pagesText = paginationForm?.Element("span")?.Element("a")?.InnerText;

    if (pagesText != null)
    {
        Match m = Regex.Match(pagesText, @"Page \d+ of (?<pages>\d+)");

        int parsedPages;
        if (m.Success && int.TryParse(m.Groups["pages"].Value, out parsedPages))
        {
            pages = parsedPages;
        }
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>
/// Returns thread information that can be gleaned from that page.
/// The author is always empty, and the page count falls back to 1 when
/// no parsable page navigation is present.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // vBulletin doesn't show thread authors
    string author = string.Empty;
    int pages = 1;

    // The <html> element may be absent; everything below tolerates a null root.
    HtmlNode doc = page.DocumentNode.Element("html");

    // Find the page title
    string title = PostText.CleanupWebString(doc?.Element("head")?.Element("title")?.InnerText);

    // If there's no pagenav div, that means there's no navigation to alternate pages,
    // which means there's only one page in the thread.
    var pageNavDiv = doc?.GetDescendantWithClass("div", "pagenav");
    var vbMenuControl = pageNavDiv?.GetDescendantWithClass("td", "vbmenu_control");

    if (vbMenuControl != null)
    {
        Match m = Regex.Match(vbMenuControl.InnerText, @"Page \d+ of (?<pages>\d+)");

        int parsedPages;
        if (m.Success && int.TryParse(m.Groups["pages"].Value, out parsedPages))
        {
            pages = parsedPages;
        }
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>
/// Returns thread information that can be gleaned from that page.
/// The author is always empty, and the page count falls back to 1 when
/// no parsable pagination is present.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    string author = string.Empty;
    int pages = 1;

    // Find the page title. Null-tolerant so a page without <html>, <head>
    // or <title> yields a null title instead of throwing.
    HtmlNode doc = page.DocumentNode.Element("html");
    string title = PostText.CleanupWebString(doc?.Element("head")?.Element("title")?.InnerText);

    // Find the number of pages: #content > div.pagination-container > ul > li.page (last) > a[data-page].
    // Any missing link in that chain leaves the single-page default in place.
    var main = page.GetElementbyId("content");
    var paginationContainer = main?.GetDescendantWithClass("div", "pagination-container");

    var lastPage = paginationContainer?.Element("ul")?.Elements("li")
        .LastOrDefault(n => n.GetAttributeValue("class", "").Split(' ').Contains("page"));

    string lastPageNumber = lastPage?.Element("a")?.GetAttributeValue("data-page", "1");

    // TryParse rejects both a null value and a non-numeric attribute.
    int parsedPages;
    if (int.TryParse(lastPageNumber, out parsedPages))
    {
        pages = parsedPages;
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get a completed post from the provided HTML list item node.
/// </summary>
/// <param name="li">List item node that contains the post.</param>
/// <returns>
/// Returns a post object with required information, or null if the list item
/// does not have the expected structure (unusable id, missing primary content,
/// missing or unparsable post number) or the post object cannot be constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="li"/> is null.</exception>
private PostComponents GetPost(HtmlNode li)
{
    if (li == null)
    {
        throw new ArgumentNullException(nameof(li));
    }

    const string idPrefix = "post-";

    // Author and ID are in the basic list item attributes
    string author = PostText.CleanupWebString(li.GetAttributeValue("data-author", ""));

    // An id that is missing or no longer than the prefix cannot yield a post id.
    string rawId = li.Id;

    if (string.IsNullOrEmpty(rawId) || rawId.Length <= idPrefix.Length)
    {
        return null;
    }

    string id = rawId.Substring(idPrefix.Length);

    if (AdvancedOptions.Instance.DebugMode)
    {
        author = $"{author}_{id}";
    }

    // Get the primary content of the list item
    HtmlNode primaryContent = li.GetChildWithClass("primaryContent");

    // HTML parsing of the post was corrupted somehow.
    if (primaryContent == null)
    {
        return null;
    }

    // On one branch, we can get the post text.
    // The chain is null-tolerant; a missing element leaves postBlock null.
    HtmlNode messageContent = primaryContent.GetChildWithClass("messageContent");
    HtmlNode postBlock = messageContent?.Element("article")?.Element("blockquote");

    // Predicate filtering out elements that we don't want to include
    List<string> excludedClasses = new List<string>
    {
        "bbCodeQuote",
        "messageTextEndMarker",
        "advbbcodebar_encadre",
        "advbbcodebar_article",
        "adv_tabs_wrapper",
        "adv_slider_wrapper"
    };

    if (AdvancedOptions.Instance.IgnoreSpoilers)
    {
        excludedClasses.Add("bbCodeSpoilerContainer");
    }

    var exclusions = PostText.GetClassesExclusionPredicate(excludedClasses);

    // Get the full post text.
    string text = PostText.ExtractPostText(postBlock, exclusions, Host);

    // On another branch of the primary content, we can get the post number.
    HtmlNode messageMeta = primaryContent.GetChildWithClass("messageMeta");
    HtmlNode publicControls = messageMeta?.GetChildWithClass("publicControls");
    HtmlNode postNumber = publicControls?.GetChildWithClass("postNumber");

    // HTML parsing of the post was corrupted somehow.
    if (postNumber == null)
    {
        return null;
    }

    string postNumberText = postNumber.InnerText.Trim();

    // Skip the leading # character.
    if (postNumberText.StartsWith("#", StringComparison.Ordinal))
    {
        postNumberText = postNumberText.Substring(1);
    }

    // A post number that isn't numeric is treated like any other corrupted markup.
    int number;
    if (!int.TryParse(postNumberText, out number))
    {
        return null;
    }

    PostComponents post;

    try
    {
        post = new PostComponents(author, id, text, number);
    }
    catch (Exception e)
    {
        ErrorLog.Log(e);
        post = null;
    }

    return post;
}
/// <summary>
/// Get a completed post from the provided HTML list item.
/// </summary>
/// <param name="li">List item that contains the post.</param>
/// <returns>
/// Returns a post object with required information, or null if the list item
/// has no data-node-id, has no content area, or the post object cannot be
/// constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown if <paramref name="li"/> is null.</exception>
private PostComponents GetPost(HtmlNode li)
{
    if (li == null)
    {
        throw new ArgumentNullException(nameof(li));
    }

    string author = "";
    int number = 0;

    // ID
    string id = li.GetAttributeValue("data-node-id", "");

    if (string.IsNullOrEmpty(id))
    {
        return null;
    }

    // Author
    var postAuthorNode = li.Descendants("div").FirstOrDefault(a => a.GetAttributeValue("itemprop", "") == "author");
    var authorNode = postAuthorNode?.GetDescendantWithClass("div", "author");

    if (authorNode != null)
    {
        author = PostText.CleanupWebString(authorNode.InnerText);
    }

    // Both the post number and the post text live under the content area;
    // without it there is nothing further to read.
    var contentArea = li.GetDescendantWithClass("div", "b-post__content");

    if (contentArea == null)
    {
        return null;
    }

    // Number: anchor text of the form "#123". Surrounding whitespace is
    // trimmed first; a non-numeric value leaves the number at 0.
    var postCountAnchor = contentArea.GetDescendantWithClass("a", "b-post__count");

    if (postCountAnchor != null)
    {
        string postNumText = postCountAnchor.InnerText.Trim();

        if (postNumText.StartsWith("#", StringComparison.Ordinal))
        {
            postNumText = postNumText.Substring(1);
        }

        int parsedNumber;
        if (int.TryParse(postNumText, out parsedNumber))
        {
            number = parsedNumber;
        }
    }

    // Text
    var postTextNode = contentArea.Descendants("div").FirstOrDefault(a => a.GetAttributeValue("itemprop", "") == "text");

    // Predicate filtering out elements that we don't want to include
    var exclusion = PostText.GetClassExclusionPredicate("bbcode_quote");

    // Get the full post text.
    string text = PostText.ExtractPostText(postTextNode, exclusion, Host);

    // Best effort: if the post object cannot be constructed from the
    // gathered values, report that no post was found.
    PostComponents post;

    try
    {
        post = new PostComponents(author, id, text, number);
    }
    catch
    {
        post = null;
    }

    return post;
}