/// <summary>
/// Timer callback for the profile scraper: asks the database for the next profile to
/// process, downloads and parses its page, and records the outcome.
/// </summary>
/// <param name="sender">The timer that raised the event (unused).</param>
/// <param name="e">Event data for the elapsed tick (unused).</param>
private void Timer_Elapsed(object sender, ElapsedEventArgs e)
{
    Log.Information("timer elapsed");

    // A fresh context is created per tick (the original author's note: re-instantiating
    // like this is what keeps it thread safe). The using statement guarantees the
    // context is disposed even when loading or parsing the page throws.
    using (var context = new MariaContext())
    {
        var status = context.NextProfile();
        Log.Information("NextProfile finished successfully");
        if (status == null)
        {
            // Nothing to work on for this tick.
            return;
        }

        if (status.Id < End)
        {
            isWorking = true;

            HtmlWeb web = new HtmlWeb();
            var doc = web.Load(MakeUrl(status.Id));
            Log.Information("Loaded page successfully");

            var result = this.Parse(status.Id, doc, status);
            Log.Information("parsed successfully");

            if (result != null)
            {
                status.Status = core.models.ProfileStatus.Complete;
                context.SetStatusForId(status);
                context.Users.Add(result);
                context.SaveChanges();
            }
            else
            {
                // Parse returned null: the profile page does not exist.
                // NOTE(review): no SaveChanges() here - presumably SetStatusForId persists
                // the status itself; confirm against MariaContext.
                status.Status = core.models.ProfileStatus.ProfileNotPresent;
                context.SetStatusForId(status);
            }
        }
        else
        {
            // The id handed out is at or beyond the configured end: stop scraping.
            Log.Information("oh no, in a bad place! <- could be the root cause!");
            timer.Stop();
            isRunning = false;
        }
    }
}
/// <summary>
/// Parses a single post row of a topic page into a <c>PostModel</c> and saves it.
/// Advertisement rows and rows whose first cell has no children are skipped.
/// </summary>
/// <param name="node">The table row node that holds the post.</param>
/// <param name="doc">The document the row belongs to; used for the scam-header check.</param>
/// <param name="postNum">Running number of the post within the topic.</param>
/// <param name="task">The task (topic) the post belongs to.</param>
/// <exception cref="Exception">Thrown when the row has no <c>td</c> child.</exception>
public void ParsePost(HtmlNode node, HtmlDocument doc, int postNum, AnnTaskModel task)
{
    // Rows carrying this disclaimer are advertisements, not posts.
    if (node.InnerText.Contains("They may be unsafe, untrustworthy, or illegal in your jurisdiction."))
    {
        return;
    }

    var model = new PostModel(task);
    model.PostNumber = postNum;
    model.TopicTitle = model.TopicTitle.RemoveEmojis();

    var td = node.Element("td");
    if (td == null)
    {
        throw new Exception("Could not find any tds as children");
    }

    if (!td.HasChildNodes)
    {
        return;
    }

    // Walk td > table > [tbody] > tr > td > table > [tbody] > tr. The tbody levels are
    // optional, so they are stepped into only when present.
    // NOTE(review): any missing element along this chain surfaces as a
    // NullReferenceException, unchanged from the original behaviour.
    var table = td.Element("table");
    if (table.ChildNodes.Any(f => f.Name == "tbody"))
    {
        table = table.Element("tbody");
    }

    table = table.Element("tr").Element("td").Element("table");
    if (table.ChildNodes.Any(f => f.Name == "tbody"))
    {
        table = table.Element("tbody");
    }

    var details = table.Element("tr");
    GetDetails(details, model);
    GetPost(details, model);
    model.IsScamHeaderPresent = IsPossibleScam(doc);

    // using guarantees the context is disposed even when Add/SaveChanges throws.
    using (var context = new MariaContext())
    {
        context.Posts.Add(model);
        context.SaveChanges();
    }
}
/// <summary>
/// Extracts the user profile fields from a downloaded profile page.
/// </summary>
/// <param name="id">Id of the profile being parsed.</param>
/// <param name="doc">The downloaded profile page.</param>
/// <param name="userProfileStatus">Scraping status record; set to Error when a 403 is detected.</param>
/// <returns>The populated model, or <c>null</c> when the page reports that the profile does not exist.</returns>
/// <exception cref="Exception">
/// Thrown when the page text contains "403" (treated as rate limiting) or when the base
/// profile node cannot be found.
/// </exception>
private UserPageModel Parse(int id, HtmlDocument doc, UserProfileScrapingStatus userProfileStatus)
{
    if (doc.DocumentNode.InnerHtml.Contains("An Error Has Occurred!"))
    {
        Log.Information("parsing - Profile doesn't exist");
        return null;
    }

    // NOTE(review): this matches any occurrence of "403" in the page text, not only an
    // HTTP 403 response - a profile that legitimately shows that number would also trip it.
    if (doc.DocumentNode.InnerText.Contains("403"))
    {
        Log.Information("rate limited!!!! <- could be the root cause!");

        // using guarantees the context is disposed even when SetStatusForId throws.
        using (var context = new MariaContext())
        {
            userProfileStatus.Status = ProfileStatus.Error;
            context.SetStatusForId(userProfileStatus);
        }

        throw new Exception("Error! getting 403 response. Quitting so we don't get locked out for longer!");
    }

    var item = new UserPageModel(id);
    item.Name = handleItem(doc.DocumentNode.SelectNodes(XpathSelectors.NameSelector));

    var baseCol = doc.DocumentNode.SelectSingleNode(XpathSelectors.baseSelector);
    if (baseCol == null)
    {
        throw new Exception("Error, should never be null!");
    }

    // Each field is read from the second cell (td[2]) of the row that DynamicXpath
    // resolves for the given label.
    item.Merit = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Merit")}/td[2]"));
    item.Position = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Position")}/td[2]"));
    item.Posts = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Posts")}/td[2]"));
    item.Activity = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Activity")}/td[2]"));
    item.DateRegistered = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Date Registered")}/td[2]"));
    item.LastActive = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Last Active")}/td[2]"));
    item.Gender = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Gender")}/td[2]"));
    item.Age = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Age")}/td[2]"));
    item.Location = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Location")}/td[2]"));
    item.LocalTime = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Local Time")}/td[2]"));

    Log.Information("Finished successfully");
    return item;
}
/// <summary>
/// Processes one table row: rows whose markup contains "ann" (case-insensitively) are
/// converted with <c>HandleRow</c>; all other rows are ignored.
/// </summary>
/// <param name="node">The row node to inspect.</param>
/// <exception cref="Exception">Thrown when the row's <c>td</c> collection is null.</exception>
private async void ParseRowAsync(HtmlNode node)
{
    bool mentionsAnn = node.InnerHtml.ToLower().Contains("ann");
    if (mentionsAnn)
    {
        var cells = node.Elements("td");
        if (cells == null)
        {
            throw new Exception("Could not find any tds as children!");
        }

        var model = HandleRow(cells);

        // Persisting the model is currently switched off; the context is still created
        // and disposed exactly as before.
        var context = new MariaContext();
        // await context.AnnTasks.AddAsync(model);
        // await context.SaveChangesAsync();
        context.Dispose();
    }

    // Rows without "ann" fall through: they are not announcements.
}
/// <summary>
/// Timer callback for the post scraper: re-opens the request gate, and - unless too many
/// tasks are already in progress - fetches the next task and starts parsing its first page.
/// </summary>
/// <param name="sender">The timer that raised the event (unused).</param>
/// <param name="e">Event data for the elapsed tick (unused).</param>
private async void Timer_Elapsed(object sender, System.Timers.ElapsedEventArgs e)
{
    // Every tick re-opens the gate that Parse waits on before requesting another page.
    if (CanGo == false)
    {
        CanGo = true;
    }

    // Skip this tick when more than 7 tasks are already in progress.
    if (Inprogress > 7)
    {
        return;
    }

    Inprogress++;

    AnnTaskModel task;
    // using guarantees the context is disposed even when NextTask throws.
    using (var c = new MariaContext())
    {
        task = await c.NextTask();
    }

    if (task == null)
    {
        // No work was handed out, so give the slot back. Without this every empty poll
        // permanently consumed a slot and the Inprogress guard above eventually blocked
        // all further work.
        Inprogress--;
        return;
    }

    // Parse is expected to decrement Inprogress itself once the task is finished.
    HtmlWeb web = new HtmlWeb();
    Parse(web.Load(MakeUrl(task.PostUrl, 0)), task);
}
/// <summary>
/// Scrapes every post of a task (topic): parses the posts on the supplied first page,
/// then loads and parses each following page, and finally marks the task as complete.
/// </summary>
/// <param name="doc">The already downloaded first page of the topic.</param>
/// <param name="task">The task describing the topic to scrape.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when a page does not contain the node selected by <c>PostXpaths.BaseSelector</c>.
/// </exception>
private void Parse(HtmlDocument doc, AnnTaskModel task)
{
    try
    {
        var random = new Random();
        Log.Information($"Starting on task id:{task.Id}");

        int pageNumber = 1;
        int postNumber = 0;
        int totalPages = GetRealTotalPages(doc.DocumentNode, task);
        Log.Information($"task {task.Id} has {totalPages} pages, which is PROJECTED to be {totalPages * 20} total posts");

        HtmlWeb web = new HtmlWeb();
        var baseCol = doc.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);

        while (true)
        {
            if (baseCol == null)
            {
                // Fail fast with a clear message instead of dereferencing null below.
                Log.Error($"BaseCol is nullllll");
                throw new InvalidOperationException(
                    $"Could not locate the post container on page {pageNumber} of task {task.Id}.");
            }

            foreach (var row in baseCol.Elements("tr"))
            {
                postNumber++;
                Log.Information($"task {task.Id} scraping post number {postNumber} on page {pageNumber}");
                ParsePost(row, doc, postNumber, task);
            }

            if (totalPages <= pageNumber)
            {
                Log.Information($"Finished scraping {task.Id} with {postNumber} posts.");
                break;
            }

            pageNumber++;

            HtmlDocument page;
            while (true)
            {
                // Wait (with random jitter) until the gate is open, then claim it so
                // only one request is issued per opening.
                while (CanGo == false)
                {
                    Thread.Sleep(random.Next(4000));
                }

                CanGo = false;
                page = web.Load(MakeUrl(task.PostUrl, pageNumber));

                if (!page.DocumentNode.InnerText.Contains("you are accessing the forum too quickly"))
                {
                    break;
                }

                // Rate limited: pause the timer for a minute, then request the same page
                // again rather than trying to parse the throttle notice as posts.
                Log.Error("uh oh, we are rate limited!!! Stoping the timer for 1 minute");
                Timer.Stop();
                Thread.Sleep(60000);
                Log.Information("Resuming operations");
                Timer.Start();
            }

            baseCol = page.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);
        }

        // using guarantees the context is disposed once the status update is done.
        using (var c = new MariaContext())
        {
            c.UpdateTaskStatusToComplete(task);
        }
    }
    finally
    {
        // Release the in-progress slot whether the task finished or failed, so a failing
        // task cannot permanently consume a slot.
        Inprogress--;
    }
}