/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // vBulletin doesn't show the thread author
    string author = string.Empty;
    int pages = 1;

    // Find the page title. Any of html/head/title may be absent, so every hop is null-safe.
    string title = page.DocumentNode.Element("html")?.Element("head")?.Element("title")?.InnerText ?? "";
    title = ForumPostTextConverter.CleanupWebString(title);

    // The page total lives in: #thread-view-tab > div.pagenav-controls > span.pagetotal
    var threadViewTab = page.GetElementbyId("thread-view-tab");
    var pageNavControls = threadViewTab?.GetDescendantWithClass("div", "pagenav-controls");
    var pageTotalSpan = pageNavControls?.GetDescendantWithClass("span", "pagetotal");

    // Fall back to a single page if the span is missing or does not hold a positive integer.
    if (pageTotalSpan != null
        && int.TryParse(
            pageTotalSpan.InnerText,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int pageTotal)
        && pageTotal > 0)
    {
        pages = pageTotal;
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Read the thread author's name from the page header.
/// </summary>
/// <param name="headerNode">Node whose "p-description" div child is searched for a "username" anchor.</param>
/// <returns>The cleaned-up anchor text, or the cleaned-up empty string when no anchor is found.</returns>
private string GetPageAuthor(HtmlNode headerNode)
{
    var usernameAnchor = headerNode
        .GetChildWithClass("div", "p-description")
        ?.GetDescendantWithClass("a", "username");

    string rawName = usernameAnchor?.InnerText.Trim() ?? "";

    return ForumPostTextConverter.CleanupWebString(rawName);
}
/// <summary>
/// Determine the number of pages in the thread from the page navigation control.
/// </summary>
/// <param name="bodyNode">Node that is expected to have a "p-body-main" div child.</param>
/// <returns>
/// The number parsed from the last "pageNav-page" list item, or 1 when there is no
/// navigation, the item has no link, or the text is not a number.
/// </returns>
/// <exception cref="InvalidOperationException">Thrown when the "p-body-main" div cannot be found.</exception>
private int GetMaxPageNumberOfThread(HtmlNode bodyNode)
{
    var mainNode = bodyNode.GetChildWithClass("div", "p-body-main")
        ?? throw new InvalidOperationException("Unable to find p-body-main.");

    // Walk nav.pageNavWrapper > ul.pageNav-main > li.pageNav-page and take the last item.
    // LastOrDefault enumerates the list once and yields null when nothing matches.
    var lastPageItem = mainNode
        .GetDescendantWithClass("nav", "pageNavWrapper")
        ?.GetDescendantWithClass("ul", "pageNav-main")
        ?.Elements("li")
        .LastOrDefault(n => n.HasClass("pageNav-page"));

    // The item may lack an anchor; treat that as "no page information".
    string? lastPageText = lastPageItem?.Element("a")?.InnerText.Trim();

    if (lastPageText == null)
    {
        return 1;
    }

    string lastItem = ForumPostTextConverter.CleanupWebString(lastPageText);

    // Page numbers may be rendered with thousands separators.
    if (int.TryParse(lastItem, NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out int pages))
    {
        return pages == 0 ? 1 : pages;
    }

    return 1;
}
/// <summary>
/// Get a completed post from the provided HTML list item node.
/// </summary>
/// <param name="li">List item node that contains the post. Its id is expected to start with "post_".</param>
/// <param name="quest">The quest being processed. Not used by this implementation.</param>
/// <returns>
/// Returns a post object with required information, or null when the node id does not
/// have the expected prefix or the post object could not be constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="li"/> is null.</exception>
private Post? GetPost(HtmlNode li, IQuest quest)
{
    if (li == null)
    {
        throw new ArgumentNullException(nameof(li));
    }

    const string idPrefix = "post_";

    string author = "";
    string text = "";
    int number = 0;

    // ID: strip the prefix. A node without the prefix is not a post we can identify.
    string nodeId = li.Id;

    if (!nodeId.StartsWith(idPrefix, StringComparison.Ordinal))
    {
        return null;
    }

    string id = nodeId.Substring(idPrefix.Length);

    // Number: stored in the "name" attribute of the postcount element.
    // Leave it at 0 if the attribute is not a valid integer.
    var postCount = li.OwnerDocument.GetElementbyId($"postcount{id}");

    if (postCount != null
        && int.TryParse(
            postCount.GetAttributeValue("name", "0"),
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int parsedNumber))
    {
        number = parsedNumber;
    }

    HtmlNode? postDetails = li.Elements("div").FirstOrDefault(n => n.GetAttributeValue("class", "") == "postdetails");

    if (postDetails != null)
    {
        // Author
        HtmlNode? userinfo = postDetails.GetChildWithClass("div", "userinfo");
        HtmlNode? username = userinfo?.GetChildWithClass("a", "username");
        author = ForumPostTextConverter.CleanupWebString(username?.InnerText);

        // Text
        string postMessageId = "post_message_" + id;
        var message = li.OwnerDocument.GetElementbyId(postMessageId)?.Element("blockquote");

        // Predicate filtering out elements that we don't want to include
        var exclusion = ForumPostTextConverter.GetClassExclusionPredicate("bbcode_quote");

        // Get the full post text.
        text = ForumPostTextConverter.ExtractPostText(message, exclusion, Host);
    }

    try
    {
        Origin origin = new Origin(author, id, number, Site, GetPermalinkForId(id));
        return new Post(origin, text);
    }
    catch
    {
        // Deliberate best-effort: a post that cannot be constructed is reported as null.
        return null;
    }
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the page content cannot be found, or the page has no title bar.
/// </exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // Find the page title. Every hop is null-safe so a page without <html>/<head> does not crash.
    string title = ForumPostTextConverter.CleanupWebString(
        page.DocumentNode.Element("html")?.Element("head")?.Element("title")?.InnerText);

    // Find a common parent for other data
    HtmlNode? pageContent = GetPageContent(page, PageType.Thread);

    if (pageContent == null)
    {
        throw new InvalidOperationException("Cannot find content on page.");
    }

    // Non-thread pages (such as threadmark pages) won't have a title bar.
    if (pageContent.GetDescendantWithClass("titleBar") == null)
    {
        throw new InvalidOperationException("Not a valid forum thread.");
    }

    // Find the thread author
    HtmlNode? pageDesc = page.GetElementbyId("pageDescription");
    HtmlNode? authorNode = pageDesc?.GetChildWithClass("username");
    string author = ForumPostTextConverter.CleanupWebString(authorNode?.InnerText ?? "");

    // Find the number of pages in the thread from the "data-last" attribute of the page navigation.
    var pageNavLinkGroup = pageContent.GetDescendantWithClass("div", "pageNavLinkGroup");
    var pageNav = pageNavLinkGroup?.GetChildWithClass("PageNav");
    string lastPage = pageNav?.GetAttributeValue("data-last", "") ?? "";

    // A missing, empty, malformed or non-positive value means a single page.
    if (!int.TryParse(
            lastPage,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int pages)
        || pages < 1)
    {
        pages = 1;
    }

    // Create a ThreadInfo object to hold the acquired information.
    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // This method never looks up an author; it is always reported as empty.
    string author = string.Empty;
    int pages = 1;

    // Find the page title. Every hop is null-safe so a page without <html>/<head> does not crash.
    string title = ForumPostTextConverter.CleanupWebString(
        page.DocumentNode.Element("html")?.Element("head")?.Element("title")?.InnerText);

    // Find the number of pages
    var pagebody = page.GetElementbyId("page-body");

    if (pagebody != null)
    {
        string? pageCountText = null;

        // Different versions of the forum have different methods of showing page numbers
        var topicactions = pagebody.GetChildWithClass("topic-actions");

        if (topicactions != null)
        {
            // Variant 1: free text of the form "Page X of Y".
            string? paginationText = topicactions.GetChildWithClass("pagination")?.InnerText;

            if (paginationText != null)
            {
                // The static Regex.Match reuses the framework's pattern cache instead of
                // building a new Regex instance on every call.
                Match m = Regex.Match(paginationText, @"Page\s*\d+\s*of\s*(?<pages>\d+)");

                if (m.Success)
                {
                    pageCountText = m.Groups["pages"].Value;
                }
            }
        }
        else
        {
            // Variant 2: a list of page links; the last item that is not the "next" link
            // carries the final page number.
            var actionbar = pagebody.GetChildWithClass("action-bar");
            var pagination = actionbar?.GetChildWithClass("pagination");
            var ul = pagination?.Element("ul");
            var lastPageLink = ul?.Elements("li")?.LastOrDefault(n => !n.GetAttributeValue("class", "").Split(' ').Contains("next"));

            pageCountText = lastPageLink?.InnerText;
        }

        // Keep the default of one page unless a positive integer was found.
        if (int.TryParse(
                pageCountText,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out int parsedPages)
            && parsedPages > 0)
        {
            pages = parsedPages;
        }
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get the cleaned-up text of the page's title element.
/// </summary>
/// <param name="page">The page to read the title from.</param>
/// <returns>
/// The result of cleaning up the html/head/title text. A null is passed to the cleanup
/// routine when any of those elements is missing.
/// </returns>
private string GetPageTitle(HtmlDocument page)
{
    // Every hop is null-conditional, including the one after <html>, so a document
    // without an <html> root no longer throws a NullReferenceException.
    string? rawTitle = page.DocumentNode
        .Element("html")
        ?.Element("head")
        ?.Element("title")
        ?.InnerText;

    return ForumPostTextConverter.CleanupWebString(rawTitle);
}
/// <summary>
/// Get the author of the post held in the given div.
/// </summary>
/// <param name="div">Post node; the author is looked up under div.inner &gt; div.postbody &gt; p.author.</param>
/// <returns>
/// The cleaned-up text of the anchor inside the first "strong" descendant of the author
/// paragraph. A null is passed to the cleanup routine when any step of the path is missing.
/// </returns>
private string GetPostAuthor(HtmlNode div)
{
    HtmlNode? authorParagraph = div
        .GetChildWithClass("div", "inner")
        ?.GetChildWithClass("div", "postbody")
        ?.GetChildWithClass("p", "author");

    HtmlNode? nameLink = authorParagraph
        ?.Descendants("strong")
        .FirstOrDefault()
        ?.Element("a");

    return ForumPostTextConverter.CleanupWebString(nameLink?.InnerText);
}
/// <summary>
/// Get a completed post from the provided HTML div node.
/// </summary>
/// <param name="div">Div node that contains the post. Its id is expected to look like "p12345".</param>
/// <param name="quest">The quest being processed. Not used by this implementation.</param>
/// <returns>
/// Returns a post object with required information, or null when the node has no id
/// or the post object could not be constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="div"/> is null.</exception>
private Post? GetPost(HtmlNode div, IQuest quest)
{
    if (div == null)
    {
        throw new ArgumentNullException(nameof(div));
    }

    // id="p12345": drop the single leading character. A node without an id cannot be a
    // post, and Substring(1) on an empty string would throw.
    string nodeId = div.Id;

    if (string.IsNullOrEmpty(nodeId))
    {
        return null;
    }

    string id = nodeId.Substring(1);

    // Author: div.inner > div.postbody > p.author > first <strong> > <a>
    HtmlNode? inner = div.GetChildWithClass("div", "inner");
    HtmlNode? postbody = inner?.GetChildWithClass("div", "postbody");
    HtmlNode? authorNode = postbody?.GetChildWithClass("p", "author");
    HtmlNode? authorStrong = authorNode?.Descendants("strong").FirstOrDefault();
    HtmlNode? authorAnchor = authorStrong?.Element("a");
    string author = ForumPostTextConverter.CleanupWebString(authorAnchor?.InnerText);

    // No way to get the post number??
    int number = 0;

    // Get the full post text. Two different layout variants: a "content" div, or a div
    // whose id starts with "post_content".
    var content = postbody?.GetChildWithClass("div", "content")
        ?? postbody?.Elements("div").FirstOrDefault(n => n.Id.StartsWith("post_content", StringComparison.Ordinal));

    string text = ForumPostTextConverter.ExtractPostText(content, n => false, Host);

    try
    {
        Origin origin = new Origin(author, id, number, Site, GetPermalinkForId(id));
        return new Post(origin, text);
    }
    catch
    {
        // Deliberate best-effort: a post that cannot be constructed is reported as null.
        return null;
    }
}
/// <summary>
/// Get the author of the post held in the given list item.
/// </summary>
/// <param name="li">Post node to search.</param>
/// <returns>
/// The cleaned-up text of the "author" div found beneath the first div descendant whose
/// itemprop attribute is "author", or an empty string when no such node exists.
/// </returns>
private string GetPostAuthor(HtmlNode li)
{
    var nameDiv = li.Descendants("div")
        .FirstOrDefault(d => d.GetAttributeValue("itemprop", "") == "author")
        ?.GetDescendantWithClass("div", "author");

    if (nameDiv == null)
    {
        return "";
    }

    return ForumPostTextConverter.CleanupWebString(nameDiv.InnerText);
}
/// <summary>
/// Get the cleaned-up text of the page's title element.
/// </summary>
/// <param name="page">The page to read the title from.</param>
/// <param name="headerNode">Header node of the page. Currently not consulted; the title is taken from the document head.</param>
/// <returns>
/// The result of cleaning up the html/head/title text. A null is passed to the cleanup
/// routine when any of those elements is missing.
/// </returns>
private string GetPageTitle(HtmlDocument page, HtmlNode headerNode)
{
    // Every hop is null-conditional, including the one after <html>, so a document
    // without an <html> root no longer throws a NullReferenceException.
    string? rawTitle = page.DocumentNode
        .Element("html")
        ?.Element("head")
        ?.Element("title")
        ?.InnerText;

    return ForumPostTextConverter.CleanupWebString(rawTitle);
}
/// <summary>
/// Get the author of the post held in the given list item.
/// </summary>
/// <param name="li">Post node; searched for a direct div child whose class attribute is exactly "postdetails".</param>
/// <returns>
/// The cleaned-up text of the "username" anchor under the "userinfo" div, or an empty
/// string when the list item has no "postdetails" div.
/// </returns>
private string GetPostAuthor(HtmlNode li)
{
    var details = li.Elements("div")
        .FirstOrDefault(child => child.GetAttributeValue("class", "") == "postdetails");

    if (details == null)
    {
        return "";
    }

    // Author
    HtmlNode? nameAnchor = details
        .GetChildWithClass("div", "userinfo")
        ?.GetChildWithClass("a", "username");

    return ForumPostTextConverter.CleanupWebString(nameAnchor?.InnerText);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // vBulletin doesn't show thread authors
    string author = string.Empty;
    int pages = 1;

    // Find the page title. Every hop is null-safe so a page without <html>/<head> does not crash.
    string title = page.DocumentNode.Element("html")?.Element("head")?.Element("title")?.InnerText ?? "";
    title = ForumPostTextConverter.CleanupWebString(title);

    // Get the number of pages from the navigation elements.
    // If the pagination element or its form is missing, that means there's only one page in the thread.
    var paginationForm = page.GetElementbyId("pagination_top")?.Element("form");

    // The page count is in the text of the anchor inside the form's first span.
    var pagesText = paginationForm?.Element("span")?.Element("a")?.InnerText;

    if (pagesText != null)
    {
        Match m = Regex.Match(pagesText, @"Page \d+ of (?<pages>\d+)");

        // TryParse guards against digit runs that do not fit in an int.
        if (m.Success
            && int.TryParse(
                m.Groups["pages"].Value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out int parsedPages)
            && parsedPages > 0)
        {
            pages = parsedPages;
        }
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get the thread author's name from the page.
/// </summary>
/// <param name="page">The page to inspect.</param>
/// <returns>
/// The cleaned-up text of the "username" child of the "pageDescription" element, or the
/// cleaned-up empty string when that node is missing.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the page content cannot be found, or the content has no title bar.
/// </exception>
private string GetPageAuthor(HtmlDocument page)
{
    // Locate the shared parent node for the thread data.
    HtmlNode? content = GetPageContent(page, PageType.Thread);

    if (content == null)
    {
        throw new InvalidOperationException("Cannot find content on page.");
    }

    // Non-thread pages (such as threadmark pages) won't have a title bar.
    if (content.GetDescendantWithClass("titleBar") == null)
    {
        throw new InvalidOperationException("Not a valid forum thread.");
    }

    // The author is the "username" child of the page description.
    HtmlNode? description = page.GetElementbyId("pageDescription");
    string rawAuthor = description?.GetChildWithClass("username")?.InnerText ?? "";

    return ForumPostTextConverter.CleanupWebString(rawAuthor);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // vBulletin doesn't show thread authors
    string author = string.Empty;
    int pages = 1;

    // May be null for a document without an <html> root; all uses below are null-safe.
    HtmlNode? doc = page.DocumentNode.Element("html");

    // Find the page title
    string title = ForumPostTextConverter.CleanupWebString(doc?.Element("head")?.Element("title")?.InnerText);

    // If there's no pagenav div, that means there's no navigation to alternate pages,
    // which means there's only one page in the thread.
    var vbMenuControl = doc
        ?.GetDescendantWithClass("div", "pagenav")
        ?.GetDescendantWithClass("td", "vbmenu_control");

    if (vbMenuControl != null)
    {
        Match m = Regex.Match(vbMenuControl.InnerText, @"Page \d+ of (?<pages>\d+)");

        // TryParse guards against digit runs that do not fit in an int.
        if (m.Success
            && int.TryParse(
                m.Groups["pages"].Value,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out int parsedPages)
            && parsedPages > 0)
        {
            pages = parsedPages;
        }
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary> /// Get a completed post from the provided HTML list item node. /// </summary> /// <param name="article">List item node that contains the post.</param> /// <returns>Returns a post object with required information.</returns> private Post?GetPost(HtmlNode article, IQuest quest) { if (article == null) { throw new ArgumentNullException(nameof(article)); } string author; string id; string text; int number; // Author and ID are in the basic list item attributes author = ForumPostTextConverter.CleanupWebString(article.GetAttributeValue("data-author", "")); id = ForumPostTextConverter.CleanupWebString(article.GetAttributeValue("data-content", "post-").Substring("post-".Length)); if (AdvancedOptions.Instance.DebugMode) { author = $"{author}_{id}"; } var attribution = article.GetDescendantWithClass("header", "message-attribution"); if (attribution == null) { return(null); } string postNum = attribution.Descendants("a").LastOrDefault(c => c.ChildNodes.Count == 1)?.InnerText.Trim() ?? ""; if (string.IsNullOrEmpty(postNum)) { return(null); } if (postNum[0] == '#') { var numSpan = postNum.AsSpan()[1..];
/// <summary>
/// Get the author of the post with the given id.
/// </summary>
/// <param name="page">The page that contains the post.</param>
/// <param name="id">The post id; the author is looked up in the element with id "postmenu_{id}".</param>
/// <returns>
/// The cleaned-up author name, or the cleaned-up empty string when the postmenu element
/// or its anchor is missing.
/// </returns>
private string GetPostAuthor(HtmlDocument page, string id)
{
    // The postmenu element may be absent; treat that the same as a missing anchor
    // instead of throwing a NullReferenceException.
    var authorAnchor = page.GetElementbyId($"postmenu_{id}")?.Element("a");

    // When the anchor has a span child, the name is read from the span;
    // otherwise the anchor's own text is used.
    var nameNode = authorAnchor?.Element("span") ?? authorAnchor;

    string author = nameNode?.InnerText ?? "";

    return ForumPostTextConverter.CleanupWebString(author);
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // This method never looks up an author; it is always reported as empty.
    string author = string.Empty;
    int pages = 1;

    // Find the page title. Every hop is null-safe so a page without <html>/<head> does not crash.
    string title = ForumPostTextConverter.CleanupWebString(
        page.DocumentNode.Element("html")?.Element("head")?.Element("title")?.InnerText);

    // Find the number of pages: #content > div.pagination-container > ul > last li with class "page".
    // Any missing step leaves the default of one page.
    var lastPage = page.GetElementbyId("content")
        ?.GetDescendantWithClass("div", "pagination-container")
        ?.Element("ul")
        ?.Elements("li")
        .LastOrDefault(n => n.GetAttributeValue("class", "").Split(' ').Contains("page"));

    // The page number is carried in the anchor's "data-page" attribute.
    string? lastPageNumber = lastPage?.Element("a")?.GetAttributeValue("data-page", "1");

    if (int.TryParse(
            lastPageNumber,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out int parsedPages)
        && parsedPages > 0)
    {
        pages = parsedPages;
    }

    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get the author of the post held in the given list item.
/// </summary>
/// <param name="li">Post node whose "data-author" attribute holds the author name.</param>
/// <returns>The cleaned-up attribute value; the empty string is used when the attribute is absent.</returns>
private string GetPostAuthor(HtmlNode li)
{
    string rawAuthor = li.GetAttributeValue("data-author", "");

    return ForumPostTextConverter.CleanupWebString(rawAuthor);
}
/// <summary>
/// Get a completed post from the provided HTML list item node.
/// </summary>
/// <param name="li">List item node that contains the post. Its id is expected to start with "post-".</param>
/// <param name="quest">The quest being processed; controls whether spoiler containers are excluded from the text.</param>
/// <returns>
/// Returns a post object with required information, or null when the node does not look
/// like a well-formed post (unexpected id, missing metadata, unparseable post number) or
/// the post object could not be constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="li"/> is null.</exception>
private Post? GetPost(HtmlNode li, IQuest quest)
{
    if (li == null)
    {
        throw new ArgumentNullException(nameof(li));
    }

    const string idPrefix = "post-";

    // Author and ID are in the basic list item attributes
    string author = ForumPostTextConverter.CleanupWebString(li.GetAttributeValue("data-author", ""));

    // A node whose id lacks the prefix is not a post; Substring would throw or mangle it.
    string nodeId = li.Id;

    if (!nodeId.StartsWith(idPrefix, StringComparison.Ordinal))
    {
        return null;
    }

    string id = nodeId.Substring(idPrefix.Length);

    if (AdvancedOptions.Instance.DebugMode)
    {
        author = $"{author}_{id}";
    }

    // Get the primary content of the list item
    HtmlNode? primaryContent = li.GetChildWithClass("primaryContent");

    // On one branch, we can get the post text
    HtmlNode? messageContent = primaryContent?.GetChildWithClass("messageContent");
    HtmlNode? postBlock = messageContent?.Element("article")?.Element("blockquote");

    // Predicate filtering out elements that we don't want to include
    List<string> excludedClasses = new List<string>
    {
        "bbCodeQuote",
        "messageTextEndMarker",
        "advbbcodebar_encadre",
        "advbbcodebar_article",
        "adv_tabs_wrapper",
        "adv_slider_wrapper"
    };

    if (quest.IgnoreSpoilers)
    {
        excludedClasses.Add("bbCodeSpoilerContainer");
    }

    var exclusions = ForumPostTextConverter.GetClassesExclusionPredicate(excludedClasses);

    // Get the full post text.
    string text = ForumPostTextConverter.ExtractPostText(postBlock, exclusions, Host);

    // On another branch of the primary content, we can get the post number.
    HtmlNode? messageMeta = primaryContent?.GetChildWithClass("messageMeta");

    // HTML parsing of the post was corrupted somehow.
    if (messageMeta == null)
    {
        return null;
    }

    HtmlNode? publicControls = messageMeta.GetChildWithClass("publicControls");
    HtmlNode? postNumber = publicControls?.GetChildWithClass("postNumber");

    if (postNumber == null)
    {
        return null;
    }

    // Trim first so surrounding whitespace does not hide the leading '#'.
    string postNumberText = postNumber.InnerText.Trim();

    // Skip the leading # character.
    if (postNumberText.StartsWith("#", StringComparison.Ordinal))
    {
        postNumberText = postNumberText.Substring(1);
    }

    // Accept thousands separators, and treat an unparseable number like the other
    // corrupted-markup cases above rather than throwing.
    if (!int.TryParse(
            postNumberText,
            System.Globalization.NumberStyles.Integer | System.Globalization.NumberStyles.AllowThousands,
            System.Globalization.CultureInfo.InvariantCulture,
            out int number))
    {
        return null;
    }

    try
    {
        Origin origin = new Origin(author, id, number, Site, GetPermalinkForId(id));
        return new Post(origin, text);
    }
    catch (Exception e)
    {
        Logger2.LogError(e, $"Attempt to create new post failed. (Author:{author}, ID:{id}, Number:{number}, Quest:{quest.DisplayName})");
        return null;
    }
}
/// <summary>
/// Get a completed post from the provided HTML list item.
/// </summary>
/// <param name="li">List item that contains the post. Its "data-node-id" attribute supplies the post id.</param>
/// <param name="quest">The quest being processed. Not used by this implementation.</param>
/// <returns>
/// Returns a post object with required information, or null when the item has no
/// node id or the post object could not be constructed.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="li"/> is null.</exception>
private Post? GetPost(HtmlNode li, IQuest quest)
{
    if (li == null)
    {
        throw new ArgumentNullException(nameof(li));
    }

    string author = "";
    int number = 0;

    // ID
    string id = li.GetAttributeValue("data-node-id", "");

    if (string.IsNullOrEmpty(id))
    {
        return null;
    }

    // Author
    var postAuthorNode = li.Descendants("div").FirstOrDefault(a => a.GetAttributeValue("itemprop", "") == "author");
    var authorNode = postAuthorNode?.GetDescendantWithClass("div", "author");

    if (authorNode != null)
    {
        author = ForumPostTextConverter.CleanupWebString(authorNode.InnerText);
    }

    HtmlNode? contentArea = li.GetDescendantWithClass("div", "b-post__content");

    // Number
    HtmlNode? postCountAnchor = contentArea?.GetDescendantWithClass("a", "b-post__count");

    if (postCountAnchor != null)
    {
        // Trim first so surrounding whitespace does not hide the leading '#'.
        string postNumText = postCountAnchor.InnerText.Trim();

        if (postNumText.StartsWith("#", StringComparison.Ordinal))
        {
            postNumText = postNumText.Substring(1);
        }

        // Leave the number at 0 when the text is not a valid integer instead of throwing.
        if (int.TryParse(
                postNumText,
                System.Globalization.NumberStyles.Integer | System.Globalization.NumberStyles.AllowThousands,
                System.Globalization.CultureInfo.InvariantCulture,
                out int parsedNumber))
        {
            number = parsedNumber;
        }
    }

    // Text
    var postTextNode = contentArea?.Descendants("div").FirstOrDefault(a => a.GetAttributeValue("itemprop", "") == "text");

    // Predicate filtering out elements that we don't want to include
    var exclusion = ForumPostTextConverter.GetClassExclusionPredicate("bbcode_quote");

    // Get the full post text.
    string text = ForumPostTextConverter.ExtractPostText(postTextNode, exclusion, Host);

    try
    {
        Origin origin = new Origin(author, id, number, Site, GetPermalinkForId(id));
        return new Post(origin, text);
    }
    catch
    {
        // Deliberate best-effort: a post that cannot be constructed is reported as null.
        return null;
    }
}
/// <summary>
/// Get thread info from the provided page.
/// </summary>
/// <param name="page">A web page from a forum that this adapter can handle.</param>
/// <returns>Returns thread information that can be gleaned from that page.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// Thrown when the "top" element, or the p-body, p-body-header or p-body-main div cannot be found.
/// </exception>
public ThreadInfo GetThreadInfo(HtmlDocument page)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    int pages = 0;

    // Start at the top of the structure. A missing element is reported the same way as
    // the other missing structural nodes below instead of as a NullReferenceException.
    var topNode = page.GetElementbyId("top")
        ?? throw new InvalidOperationException("Unable to find top.");

    var bodyNode = topNode.GetChildWithClass("div", "p-body")
        ?? topNode.GetDescendantWithClass("div", "p-body")
        ?? throw new InvalidOperationException("Unable to find p-body.");

    // Some layouts wrap the body content in an extra p-body-inner div; descend into it when present.
    if (bodyNode.Elements("div").Any(n => n.HasClass("p-body-inner")))
    {
        bodyNode = bodyNode.GetChildWithClass("p-body-inner")!;
    }

    var headerNode = bodyNode.GetChildWithClass("div", "p-body-header")
        ?? throw new InvalidOperationException("Unable to find p-body-header.");

    // Title: the h1 inside the p-title div.
    var titleNode = headerNode.GetChildWithClass("div", "p-title");
    string title = ForumPostTextConverter.CleanupWebString(titleNode?.Element("h1")?.InnerText.Trim());

    // Author: the username anchor inside the p-description div.
    var descripNode = headerNode.GetChildWithClass("div", "p-description");
    var authorNode = descripNode?.GetDescendantWithClass("a", "username");
    string author = ForumPostTextConverter.CleanupWebString(authorNode?.InnerText.Trim() ?? "");

    var mainNode = bodyNode.GetChildWithClass("div", "p-body-main")
        ?? throw new InvalidOperationException("Unable to find p-body-main.");

    // Page count: last li.pageNav-page under nav.pageNavWrapper > ul.pageNav-main.
    // LastOrDefault enumerates the list once and yields null when nothing matches.
    var lastPageItem = mainNode
        .GetDescendantWithClass("nav", "pageNavWrapper")
        ?.GetDescendantWithClass("ul", "pageNav-main")
        ?.Elements("li")
        .LastOrDefault(n => n.HasClass("pageNav-page"));

    // The item may lack an anchor; in that case the page count stays at its default.
    string? lastPageText = lastPageItem?.Element("a")?.InnerText.Trim();

    if (lastPageText != null)
    {
        var lastItem = ForumPostTextConverter.CleanupWebString(lastPageText);

        // TryParse leaves pages at 0 on failure, which is normalised below.
        _ = int.TryParse(lastItem, NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out pages);
    }

    if (pages == 0)
    {
        pages = 1;
    }

    // Create a ThreadInfo object to hold the acquired information.
    return new ThreadInfo(title, author, pages);
}
/// <summary>
/// Get the id of the post held in the given article node.
/// </summary>
/// <param name="article">Post node whose "data-content" attribute holds a value of the form "post-{id}".</param>
/// <returns>
/// The cleaned-up attribute value with its first five characters (the length of "post-")
/// removed. A missing attribute is treated as "post-", giving an empty id before cleanup.
/// </returns>
private string GetPostId(HtmlNode article)
{
    string dataContent = article.GetAttributeValue("data-content", "post-");
    string rawId = dataContent.Substring("post-".Length);

    return ForumPostTextConverter.CleanupWebString(rawId);
}