Beispiel #1
0
        /// <summary>
        /// Timer callback: asks the database for the next profile to scrape, downloads and
        /// parses that profile page, and records the outcome. Stops the timer once the
        /// profile id reaches <c>End</c>.
        /// </summary>
        /// <param name="sender">The timer raising the event (unused).</param>
        /// <param name="e">Elapsed event data (unused).</param>
        private void Timer_Elapsed(object sender, ElapsedEventArgs e)
        {
            Log.Information("timer elapsed");

            // A fresh context is created for every tick rather than shared between ticks.
            // The using block guarantees disposal on every exit path, including when the
            // page load, the parser, or SaveChanges throws.
            using (var context = new MariaContext())
            {
                var status = context.NextProfile();

                Log.Information("NextProfile finished successfully");
                if (status == null)
                {
                    // Nothing left to process on this tick.
                    return;
                }

                if (status.Id < End)
                {
                    isWorking = true;
                    var web = new HtmlWeb();

                    var doc = web.Load(MakeUrl(status.Id));
                    Log.Information("Loaded page successfully");
                    var result = this.Parse(status.Id, doc, status);
                    Log.Information("parsed successfully");

                    if (result != null)
                    {
                        status.Status = core.models.ProfileStatus.Complete;
                        context.SetStatusForId(status);
                        context.Users.Add(result);
                        context.SaveChanges();
                    }
                    else
                    {
                        // Parse returned null: record that the profile page does not exist.
                        status.Status = core.models.ProfileStatus.ProfileNotPresent;
                        context.SetStatusForId(status);
                    }
                }
                else
                {
                    Log.Information("oh no, in a bad place! <- could be the root cause!");

                    // Past the configured end of the id range: stop scheduling further ticks.
                    timer.Stop();
                    isRunning = false;
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Parses a single post row and stores the resulting <c>PostModel</c> in the database.
        /// Rows containing the forum's advertisement disclaimer, and rows whose first cell is
        /// empty, are skipped.
        /// </summary>
        /// <param name="node">The row node that holds the post.</param>
        /// <param name="doc">The document used for the scam-header check.</param>
        /// <param name="postNum">Running number assigned to the post.</param>
        /// <param name="task">The task the post belongs to.</param>
        /// <exception cref="Exception">The row has no <c>td</c> child.</exception>
        /// <exception cref="InvalidOperationException">
        /// The nested table layout expected inside the cell is missing.
        /// </exception>
        public void ParsePost(HtmlNode node, HtmlDocument doc, int postNum, AnnTaskModel task)
        {
            if (node.InnerText.Contains("They may be unsafe, untrustworthy, or illegal in your jurisdiction.")) //ITS AN AD!
            {
                return;
            }
            var model = new PostModel(task);

            model.PostNumber = postNum;
            model.TopicTitle = model.TopicTitle.RemoveEmojis();
            var td = node.Element("td");

            if (td == null)
            {
                throw new Exception("Could not find any tds as children");
            }
            if (!td.HasChildNodes)
            {
                return;
            }

            var outerTable = td.Element("table");
            if (outerTable == null)
            {
                // Previously surfaced as a bare NullReferenceException with no context.
                throw new InvalidOperationException($"Post {postNum}: could not find a table inside the post cell");
            }

            // Some pages wrap the rows in a tbody; step into it when it is present.
            outerTable = outerTable.Element("tbody") ?? outerTable;

            var innerTable = outerTable.Element("tr")?.Element("td")?.Element("table");
            if (innerTable == null)
            {
                throw new InvalidOperationException($"Post {postNum}: could not find the nested table (tr/td/table) inside the post cell");
            }
            innerTable = innerTable.Element("tbody") ?? innerTable;

            var details = innerTable.Element("tr");

            GetDetails(details, model);
            GetPost(details, model);
            model.IsScamHeaderPresent = IsPossibleScam(doc);

            // Dispose the context even when Add or SaveChanges throws.
            using (var context = new MariaContext())
            {
                context.Posts.Add(model);
                context.SaveChanges();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Extracts the profile fields from a downloaded profile page.
        /// </summary>
        /// <param name="id">Profile id used to construct the resulting model.</param>
        /// <param name="doc">The downloaded profile page.</param>
        /// <param name="userProfileStatus">
        /// Status record for this profile; set to <c>ProfileStatus.Error</c> and persisted when
        /// the page looks like a 403 response.
        /// </param>
        /// <returns>
        /// The populated model, or <c>null</c> when the page contains the forum's
        /// "An Error Has Occurred!" marker.
        /// </returns>
        /// <exception cref="Exception">
        /// The page text contains "403", or the base node selected by
        /// <c>XpathSelectors.baseSelector</c> is missing.
        /// </exception>
        private UserPageModel Parse(int id, HtmlDocument doc, UserProfileScrapingStatus userProfileStatus)
        {
            if (doc.DocumentNode.InnerHtml.Contains("An Error Has Occurred!"))
            {
                Log.Information("parsing - Profile doesn't exist");

                return null;
            }

            // NOTE(review): this matches any page whose visible text contains "403" anywhere
            // (for example a numeric profile field) - confirm that this cannot produce false
            // rate-limit detections.
            if (doc.DocumentNode.InnerText.Contains("403"))
            {
                Log.Information("rate limited!!!! <- could be the root cause!");

                // Persist the error status; the using block releases the context even if
                // the status update itself fails.
                using (var context = new MariaContext())
                {
                    userProfileStatus.Status = ProfileStatus.Error;
                    context.SetStatusForId(userProfileStatus);
                }
                throw new Exception("Error! getting 403 response. Quitting so we don't get locked out for longer!");
            }

            var item = new UserPageModel(id);

            item.Name = handleItem(doc.DocumentNode.SelectNodes(XpathSelectors.NameSelector));
            var baseCol = doc.DocumentNode.SelectSingleNode(XpathSelectors.baseSelector);

            if (baseCol == null)
            {
                throw new Exception("Error, should never be null!");
            }

            // Each field is read from the second cell (td[2]) of the row that DynamicXpath
            // resolves for the given label.
            item.Merit          = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol,"Merit")}/td[2]"));
            item.Position       = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Position")}/td[2]"));
            item.Posts          = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Posts")}/td[2]"));
            item.Activity       = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Activity")}/td[2]"));
            item.DateRegistered = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Date Registered")}/td[2]"));
            item.LastActive     = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Last Active")}/td[2]"));
            item.Gender         = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Gender")}/td[2]"));
            item.Age            = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Age")}/td[2]"));
            item.Location       = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Location")}/td[2]"));
            item.LocalTime      = handleItem(baseCol.SelectNodes($"{DynamicXpath(baseCol, "Local Time")}/td[2]"));
            Log.Information("Finished successfully");

            return item;
        }
        /// <summary>
        /// Handles one table row: rows whose lower-cased HTML contains "ann" are converted
        /// with <c>HandleRow</c>; all other rows are ignored. Saving the converted row is
        /// currently commented out, so the context is opened and closed without being used.
        /// </summary>
        /// <param name="node">The row node to inspect.</param>
        private async void ParseRowAsync(HtmlNode node)
        {
            var loweredHtml = node.InnerHtml.ToLower();

            if (loweredHtml.Contains("ann"))
            {
                var cells = node.Elements("td");

                if (cells == null)
                {
                    throw new Exception("Could not find any tds as children!");
                }

                var model = HandleRow(cells);

                using (var context = new MariaContext())
                {
                    // Persistence is disabled for now:
                    // await context.AnnTasks.AddAsync(model);
                    // await context.SaveChangesAsync();
                }
            }

            // Anything else is not an ann and is skipped.
        }
Beispiel #5
0
        /// <summary>
        /// Timer callback: re-opens the <c>CanGo</c> gate, and - unless more than 7 tasks are
        /// already in progress - claims a slot, fetches the next task, loads its first page
        /// and hands it to <c>Parse</c>.
        /// </summary>
        /// <remarks>
        /// The <c>Inprogress</c> slot claimed here is released again when there is no task to
        /// run or when anything fails before <c>Parse</c> is called. Previously those paths
        /// kept the slot forever, so the counter could only grow until every tick was
        /// rejected. Once <c>Parse</c> has been called, releasing the slot is its job.
        /// NOTE(review): <c>Inprogress</c> is updated without synchronization; if ticks can
        /// overlap on different threads the counter may still drift - confirm.
        /// </remarks>
        /// <param name="sender">The timer raising the event (unused).</param>
        /// <param name="e">Elapsed event data (unused).</param>
        private async void Timer_Elapsed(object sender, System.Timers.ElapsedEventArgs e)
        {
            if (CanGo == false)
            {
                CanGo = true;
            }
            if (Inprogress > 7)
            {
                return;
            }
            Inprogress++;

            AnnTaskModel task;
            HtmlDocument firstPage;

            try
            {
                // Keep the context only for the lookup; dispose it even if NextTask throws.
                using (var c = new MariaContext())
                {
                    task = await c.NextTask();
                }

                if (task == null)
                {
                    // No work available: give the slot back before leaving.
                    Inprogress--;
                    return;
                }

                HtmlWeb web = new HtmlWeb();
                firstPage = web.Load(MakeUrl(task.PostUrl, 0));
            }
            catch
            {
                // Failed before Parse took ownership of the slot: release it and rethrow
                // so the failure stays visible exactly as before.
                Inprogress--;
                throw;
            }

            Parse(firstPage, task);
        }
Beispiel #6
0
        /// <summary>
        /// Scrapes every page of a task: passes each row of the base node to
        /// <c>ParsePost</c>, loads the following pages one by one, and finally marks the task
        /// as complete. The <c>Inprogress</c> counter is decremented when the method exits,
        /// whether it succeeded or threw.
        /// </summary>
        /// <param name="doc">
        /// The already loaded first page. It is also the document passed to
        /// <c>ParsePost</c> for every post, including posts found on later pages.
        /// </param>
        /// <param name="task">The task being scraped.</param>
        /// <exception cref="InvalidOperationException">
        /// A page does not contain the node selected by <c>PostXpaths.BaseSelector</c>.
        /// </exception>
        private void Parse(HtmlDocument doc, AnnTaskModel task)
        {
            try
            {
                var random = new Random();

                Log.Information($"Starting on task id:{task.Id}");
                int pageNumber = 1;
                int postNumber = 0;

                int totalPages = GetRealTotalPages(doc.DocumentNode, task);

                Log.Information($"task {task.Id} has {totalPages} pages, which is PROJECTED to be {totalPages * 20} total posts");
                var baseCol = doc.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);

                while (true)
                {
                    if (baseCol == null)
                    {
                        // Previously this was only logged and the next loop iteration then
                        // failed with a NullReferenceException; fail with a clear message.
                        Log.Error($"BaseCol is nullllll");
                        throw new InvalidOperationException($"Task {task.Id}: page {pageNumber} does not contain the expected post container");
                    }

                    foreach (var row in baseCol.Elements("tr"))
                    {
                        postNumber++;
                        Log.Information($"task {task.Id} scraping post number {postNumber} on page {pageNumber}");
                        ParsePost(row, doc, postNumber, task);
                    }

                    if (totalPages <= pageNumber)
                    {
                        Log.Information($"Finished scraping {task.Id} with {postNumber} posts.");
                        break;
                    }
                    pageNumber++;
                    HtmlWeb web = new HtmlWeb();

                    // Wait until the CanGo gate is open, polling at random intervals of
                    // up to four seconds, then close it again for this request.
                    while (CanGo == false)
                    {
                        Thread.Sleep(random.Next(4000));
                    }
                    CanGo = false;

                    var pageUrl = MakeUrl(task.PostUrl, pageNumber);
                    var page = web.Load(pageUrl);

                    // When throttled, pause for a minute and then load the page again.
                    // Previously the throttled response itself was scraped after the pause.
                    while (page.DocumentNode.InnerText.Contains("you are accessing the forum too quickly"))
                    {
                        Log.Error("uh oh, we are rate limited!!! Stoping the timer for 1 minute");

                        Timer.Stop();

                        Thread.Sleep(60000);
                        Log.Information("Resuming operations");
                        Timer.Start();

                        page = web.Load(pageUrl);
                    }

                    baseCol = page.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);
                }

                // NOTE(review): this context is never disposed. If
                // UpdateTaskStatusToComplete is synchronous it should be wrapped in a
                // using block; if it returns an un-awaited Task, disposing here would cut
                // the update short - confirm before changing.
                var c = new MariaContext();

                c.UpdateTaskStatusToComplete(task);
            }
            finally
            {
                // Always release the slot, otherwise a failed task would count as
                // "in progress" forever.
                Inprogress--;
            }
        }