/// <summary>
/// Builds a <c>ForumPageMetadata</c> from a parsed forum page document.
/// </summary>
/// <param name="doc">The parsed HTML document of the forum page.</param>
/// <returns>
/// A new metadata object. <c>ForumID</c> is only assigned when a form with id
/// "ac_timemachine" is found; <c>PageNumber</c> is -1 when no span with class
/// "curpage" exists or its text is not an integer. Page count and threads are
/// filled in by <c>HandleMaxPages</c> and <c>HandleThreads</c>.
/// </returns>
public static ForumPageMetadata ParseForumPage(HtmlDocument doc)
{
    var root = doc.DocumentNode;
    var result = new ForumPageMetadata();

    // Forum id: taken from the action attribute of the "ac_timemachine" form.
    var timeMachineForm = root.Descendants("form")
        .FirstOrDefault(form => form.GetAttributeValue("id", "").Equals("ac_timemachine"));

    if (timeMachineForm != null)
    {
        // Drop the script prefix, then keep whatever follows the last '='.
        result.ForumID = timeMachineForm.GetAttributeValue("action", "")
            .Replace("/forumdisplay.php?", "")
            .Split('=')
            .Last();
    }

    // Page number: text of the span marked as the current page.
    var currentPageSpan = root.Descendants("span")
        .FirstOrDefault(span => span.GetAttributeValue("class", "").Equals("curpage"));

    int parsedNumber;
    bool hasNumber = currentPageSpan != null
        && int.TryParse(currentPageSpan.InnerText, out parsedNumber);

    // -1 signals "unknown" both when the span is missing and when parsing fails.
    result.PageNumber = hasNumber ? parsedNumber : -1;

    HandleMaxPages(result, root);
    HandleThreads(result, root);

    return result;
}
// TODO: Remember to sort thread data by new posts
/// <summary>
/// Converts each thread row node into a <c>ThreadMetadata</c> via
/// <c>ThreadParser.ParseThread</c>, preserving the input order.
/// </summary>
/// <param name="page">The page being populated; not read by this method.</param>
/// <param name="threadsInfo">The nodes to parse, one per thread.</param>
/// <returns>A list holding one parsed entry per input node.</returns>
private static IList<ThreadMetadata> GenerateThreadData(ForumPageMetadata page, IEnumerable<HtmlNode> threadsInfo)
{
    var parsedThreads = new List<ThreadMetadata>();

    foreach (HtmlNode row in threadsInfo)
    {
        parsedThreads.Add(ThreadParser.ParseThread(row));
    }

    return parsedThreads;
}
/// <summary>
/// Fills <c>page.Threads</c> from the rows of the first table whose id is "forum".
/// </summary>
/// <remarks>
/// When that table is absent, or it contains no <c>tbody</c>, <c>page.Threads</c>
/// is set to an empty list instead of throwing. Only the first <c>tbody</c> of
/// the table is scanned for <c>tr</c> rows.
/// </remarks>
/// <param name="page">The page metadata whose <c>Threads</c> property is assigned.</param>
/// <param name="node">The node under which the thread table is searched.</param>
private static void HandleThreads(ForumPageMetadata page, HtmlNode node)
{
    var forumThreadsTable = node.Descendants("table")
        .FirstOrDefault(n => n.Id.Equals("forum"));

    // A table without a tbody is treated the same as a missing table:
    // there is nothing to parse, so avoid First() which would throw.
    HtmlNode threadList = forumThreadsTable == null
        ? null
        : forumThreadsTable.Descendants("tbody").FirstOrDefault();

    if (threadList == null)
    {
        page.Threads = new List<ThreadMetadata>();
        return;
    }

    var threadsInfo = threadList.Descendants("tr");
    page.Threads = GenerateThreadData(page, threadsInfo);
}
/// <summary>
/// Sets <c>page.PageCount</c> from the first <c>div</c> whose class attribute
/// contains "pages".
/// </summary>
/// <remarks>
/// When no such div is found the page count defaults to 1; otherwise the value
/// comes from <c>ExtractMaxForumPages</c>.
/// </remarks>
/// <param name="page">The page metadata whose <c>PageCount</c> is assigned.</param>
/// <param name="node">The node under which the pager div is searched.</param>
private static void HandleMaxPages(ForumPageMetadata page, HtmlNode node)
{
    var pagerDiv = node.Descendants("div")
        .FirstOrDefault(div => div.GetAttributeValue("class", "").Contains("pages"));

    // No pager on the page: fall back to a single page.
    if (pagerDiv == null)
    {
        page.PageCount = 1;
        return;
    }

    page.PageCount = ExtractMaxForumPages(pagerDiv);
}
/// <summary>
/// Populates <c>page.Filters</c> from the anchors inside the first <c>div</c>
/// whose class attribute is exactly "thread_tags".
/// </summary>
/// <remarks>
/// When no such div exists, <c>page.Filters</c> is left untouched. Otherwise the
/// list starts with <c>FilterTagMetadata.NoFilter</c>, followed by one entry per
/// anchor: the anchor's <c>href</c> becomes <c>FilterUri</c>, and the <c>title</c>
/// and <c>src</c> attributes of the anchor's first child become <c>Title</c> and
/// <c>TagUri</c>. All three values are HTML-decoded. An anchor without any child
/// yields empty <c>Title</c> and <c>TagUri</c> rather than throwing.
/// </remarks>
/// <param name="page">The page metadata whose <c>Filters</c> property is assigned.</param>
/// <param name="top">The node under which the tag list is searched.</param>
private static void HandleFilters(ForumPageMetadata page, HtmlNode top)
{
    var tagsListNode = top.Descendants("div")
        .FirstOrDefault(node => node.GetAttributeValue("class", "").Equals("thread_tags"));

    if (tagsListNode == null)
    {
        return;
    }

    var filterNodes = tagsListNode.Descendants("a").ToList();

    // One slot per anchor plus one for the leading NoFilter entry.
    var filters = new List<FilterTagMetadata>(filterNodes.Count + 1) { FilterTagMetadata.NoFilter };
    page.Filters = filters;

    foreach (var filterNode in filterNodes)
    {
        string href = filterNode.GetAttributeValue("href", string.Empty);

        // An empty anchor has no first child; fall back to empty values
        // instead of dereferencing null.
        HtmlNode tagNode = filterNode.FirstChild;
        string title = tagNode == null
            ? string.Empty
            : tagNode.GetAttributeValue("title", string.Empty);
        string src = tagNode == null
            ? string.Empty
            : tagNode.GetAttributeValue("src", string.Empty);

        filters.Add(new FilterTagMetadata()
        {
            FilterUri = WebUtility.HtmlDecode(href),
            Title = WebUtility.HtmlDecode(title),
            TagUri = WebUtility.HtmlDecode(src)
        });
    }
}
/// <summary>
/// Fills <c>page.Threads</c> from the rows of the first table whose id is "forum".
/// </summary>
/// <remarks>
/// When that table is absent, or it contains no <c>tbody</c>, <c>page.Threads</c>
/// is set to an empty list instead of throwing. Only the first <c>tbody</c> of
/// the table is scanned for <c>tr</c> rows.
/// NOTE(review): another <c>HandleThreads</c> with the same signature is visible
/// earlier in this file; if both live in the same class one of them must be
/// removed — confirm which is intended.
/// </remarks>
/// <param name="page">The page metadata whose <c>Threads</c> property is assigned.</param>
/// <param name="node">The node under which the thread table is searched.</param>
private static void HandleThreads(ForumPageMetadata page, HtmlNode node)
{
    // FirstOrDefault instead of First: a page without a thread table is a
    // valid input and should produce an empty thread list, not an exception.
    var forumThreadsTable = node.Descendants("table")
        .FirstOrDefault(n => n.Id.Equals("forum"));

    if (forumThreadsTable == null)
    {
        page.Threads = new List<ThreadMetadata>();
        return;
    }

    var threadList = forumThreadsTable.Descendants("tbody").FirstOrDefault();

    if (threadList == null)
    {
        page.Threads = new List<ThreadMetadata>();
        return;
    }

    var threadsInfo = threadList.Descendants("tr");
    page.Threads = GenerateThreadData(page, threadsInfo);
}