/// <summary>
/// Reads the page links found under <c>/html/body/div[2]/table/tr/td</c> and returns the
/// number shown by the last numeric link.
/// </summary>
/// <param name="node">Node the absolute XPath is evaluated against.</param>
/// <param name="task">Task being processed; not used by this method.</param>
/// <returns>
/// The parsed text of the last link, or of the link before it when the last link contains
/// "All". Returns 0 when the cell holds no links, or holds only an "All" link.
/// </returns>
/// <exception cref="NullReferenceException">
/// The expected div/table/tr/td chain is not present in the document.
/// </exception>
/// <exception cref="FormatException">The selected link text is not an integer.</exception>
private int GetRealTotalPages(HtmlNode node, AnnTaskModel task)
{
    var container = node.SelectSingleNode("/html[1]/body[1]/div[2]");
    var table = container.Element("table");

    // The rows may or may not be wrapped in a <tbody>; step into it when it exists.
    if (table.ChildNodes.Any(f => f.Name == "tbody"))
    {
        table = table.Element("tbody");
    }

    var cell = table.Element("tr").Element("td");

    // Materialize once: Elements() is lazy and the links are inspected several times.
    var links = cell.Elements("a").ToList();
    if (links.Count == 0)
    {
        return 0;
    }

    // A trailing "All" link is not a page number, so fall back to the link before it.
    int lastIndex = links.Count - 1;
    if (links[lastIndex].InnerText.Contains("All"))
    {
        lastIndex--;
    }

    // "All" was the only link: there is no numeric page link to read.
    if (lastIndex < 0)
    {
        return 0;
    }

    return int.Parse(links[lastIndex].InnerText, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Builds an <see cref="AnnTaskModel"/> from the cells of one row. Cells at index 0 and 1
/// are ignored; index 2 supplies the title and URL, 3 the author, 4 the reply count and
/// 5 the view count. Any further cells are ignored.
/// </summary>
/// <param name="nodes">The row's cells, in document order.</param>
/// <returns>
/// A new model with the fields filled from whichever of cells 2-5 are present.
/// <c>PostUrl</c> is left null when the title anchor carries no <c>href</c> attribute.
/// </returns>
/// <exception cref="NullReferenceException">
/// A present cell lacks the expected child element (span/a, a, or text node).
/// </exception>
/// <exception cref="FormatException">A reply or view cell does not contain an integer.</exception>
private AnnTaskModel HandleRow(IEnumerable<HtmlNode> nodes)
{
    // Materialize once so the sequence is not re-enumerated for every index lookup.
    var cells = nodes.ToList();
    var model = new AnnTaskModel();

    for (int i = 2; i < cells.Count && i <= 5; i++)
    {
        var cell = cells[i];
        switch (i)
        {
            case 2: // title and url
            {
                var link = cell.Element("span").Element("a");
                model.PostTitle = link.InnerText.Replace("�", "");
                model.PostUrl = link.Attributes.FirstOrDefault(f => f.Name == "href")?.Value;
                break;
            }
            case 3: // profile
                model.Author = cell.Element("a").InnerText.Replace("�", "");
                break;
            case 4:
                model.Replies = ParseCountCell(cell);
                break;
            case 5:
                model.Views = ParseCountCell(cell);
                break;
        }
    }

    return model;
}

/// <summary>
/// Parses the integer held in a cell's text node after stripping tabs and line breaks.
/// </summary>
private static int ParseCountCell(HtmlNode cell)
{
    var text = Regex.Replace(cell.Element("#text").InnerText, @"\t|\n|\r", "");
    return int.Parse(text, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Extracts one post from a row and saves it through a new <see cref="MariaContext"/>.
/// Rows containing the advertisement disclaimer text, and rows whose first cell has no
/// children, are skipped without saving anything.
/// </summary>
/// <param name="node">The row to parse.</param>
/// <param name="doc">Document passed to <c>IsPossibleScam</c>.</param>
/// <param name="postNum">Number stored on the post as <c>PostNumber</c>.</param>
/// <param name="task">Task the post belongs to; passed to the <see cref="PostModel"/> constructor.</param>
/// <exception cref="InvalidOperationException">The row has no <c>td</c> child.</exception>
/// <exception cref="NullReferenceException">
/// The nested table/tr/td structure expected inside the cell is not present.
/// </exception>
public void ParsePost(HtmlNode node, HtmlDocument doc, int postNum, AnnTaskModel task)
{
    // Rows carrying this disclaimer are advertisements, not posts.
    if (node.InnerText.Contains("They may be unsafe, untrustworthy, or illegal in your jurisdiction."))
    {
        return;
    }

    var model = new PostModel(task);
    model.PostNumber = postNum;
    model.TopicTitle = model.TopicTitle.RemoveEmojis();

    var td = node.Element("td");
    if (td == null)
    {
        throw new InvalidOperationException("Could not find any tds as children");
    }

    if (!td.HasChildNodes)
    {
        return;
    }

    // The post lives in a table nested inside the first cell of an outer table.
    var outerTable = UnwrapTbody(td.Element("table"));
    var innerTable = UnwrapTbody(outerTable.Element("tr").Element("td").Element("table"));
    var details = innerTable.Element("tr");

    GetDetails(details, model);
    GetPost(details, model);
    model.IsScamHeaderPresent = IsPossibleScam(doc);

    // using guarantees the context is disposed even when Add/SaveChanges throws.
    using (var context = new MariaContext())
    {
        context.Posts.Add(model);
        context.SaveChanges();
    }
}

/// <summary>
/// Returns the table's <c>tbody</c> child when it has one, otherwise the table itself.
/// </summary>
private static HtmlNode UnwrapTbody(HtmlNode table)
{
    if (table.ChildNodes.Any(f => f.Name == "tbody"))
    {
        return table.Element("tbody");
    }

    return table;
}
/// <summary>
/// Scrapes every page of a task: parses each row of the post container on the current
/// page via <c>ParsePost</c>, loads the next page until the page count reported by
/// <c>GetRealTotalPages</c> is reached, then marks the task complete and decrements
/// <c>Inprogress</c>.
/// </summary>
/// <param name="doc">
/// Already-loaded first page. It is also the document handed to every <c>ParsePost</c> call.
/// </param>
/// <param name="task">Task being scraped; its <c>PostUrl</c> is used to build page URLs.</param>
/// <exception cref="InvalidOperationException">
/// The post container (<c>PostXpaths.BaseSelector</c>) cannot be found on a page, including
/// when the forum is still rate limiting after all load attempts.
/// </exception>
private void Parse(HtmlDocument doc, AnnTaskModel task)
{
    // Upper bound on requests for a single page while the forum answers with its
    // rate-limit notice; each failed attempt is followed by a one-minute pause.
    const int maxLoadAttempts = 5;

    var random = new Random();
    var web = new HtmlWeb();

    Log.Information($"Starting on task id:{task.Id}");
    int pageNumber = 1;
    int postNumber = 0;
    int totalPages = GetRealTotalPages(doc.DocumentNode, task);
    Log.Information($"task {task.Id} has {totalPages} pages, which is PROJECTED to be {totalPages * 20} total posts");

    var baseCol = doc.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);
    while (true)
    {
        if (baseCol == null)
        {
            // Fail with a descriptive error instead of a NullReferenceException below.
            Log.Error("BaseCol is nullllll");
            throw new InvalidOperationException(
                $"Could not locate the post container on page {pageNumber} of task {task.Id}");
        }

        foreach (var row in baseCol.Elements("tr"))
        {
            postNumber++;
            Log.Information($"task {task.Id} scraping post number {postNumber} on page {pageNumber}");
            ParsePost(row, doc, postNumber, task);
        }

        if (totalPages <= pageNumber)
        {
            Log.Information($"Finished scraping {task.Id} with {postNumber} posts.");
            break;
        }

        pageNumber++;

        HtmlDocument page = null;
        for (int attempt = 1; attempt <= maxLoadAttempts; attempt++)
        {
            // Wait for the CanGo flag before each request, then claim it.
            // NOTE(review): presumably the Timer sets CanGo back to true - confirm.
            while (CanGo == false)
            {
                Thread.Sleep(random.Next(4000));
            }

            CanGo = false;

            page = web.Load(MakeUrl(task.PostUrl, pageNumber));
            if (!page.DocumentNode.InnerText.Contains("you are accessing the forum too quickly"))
            {
                break;
            }

            // The rate-limit page contains no posts, so pause and request the page again
            // rather than parsing the notice itself.
            Log.Error("uh oh, we are rate limited!!! Stoping the timer for 1 minute");
            Timer.Stop();
            Thread.Sleep(60000);
            Log.Information("Resuming operations");
            Timer.Start();
        }

        baseCol = page.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);
    }

    using (var context = new MariaContext())
    {
        context.UpdateTaskStatusToComplete(task);
    }

    Inprogress--;
}
/// <summary>
/// Sets the task's status to <c>AnnStatus.Complete</c>, marks the entity as modified and
/// saves the change. The work is done synchronously; the method returns after
/// <c>SaveChanges</c> has finished.
/// </summary>
/// <param name="task">Task to mark as complete.</param>
/// <remarks>
/// Declared as a plain <c>void</c> method: the body never awaits, and as <c>async void</c>
/// any exception thrown by <c>SaveChanges</c> would not propagate to the caller.
/// </remarks>
public void UpdateTaskStatusToComplete(AnnTaskModel task)
{
    task.Status = AnnStatus.Complete;
    this.Entry(task).State = EntityState.Modified;
    this.SaveChanges();
}