Exemple #1
0
        /// <summary>
        /// Reads the page count from the navigation links found in the first cell of the
        /// table under <c>/html[1]/body[1]/div[2]</c>.
        /// </summary>
        /// <param name="node">Node used as the context for the XPath lookup.</param>
        /// <param name="task">Not used by this method.</param>
        /// <returns>
        /// The number parsed from the last link, skipping a trailing link whose text contains
        /// "All"; 0 when the cell holds no link that can supply a number.
        /// </returns>
        private int GetRealTotalPages(HtmlNode node, AnnTaskModel task)
        {
            var container = node.SelectSingleNode("/html[1]/body[1]/div[2]");
            var table     = container.Element("table");

            // The markup may or may not wrap the rows in a <tbody>; step into it when present.
            if (table.ChildNodes.Any(f => f.Name == "tbody"))
            {
                table = table.Element("tbody");
            }

            var cell = table.Element("tr").Element("td");

            // Snapshot the links once so counting and indexing do not re-enumerate the child nodes.
            var links = cell.Elements("a").ToList();

            if (links.Count == 0)
            {
                return(0);
            }

            int lastIndex = links.Count - 1;

            if (links[lastIndex].InnerText.Contains("All"))
            {
                // The trailing "All" link is not a page number; use the link before it.
                lastIndex--;
            }

            if (lastIndex < 0)
            {
                // "All" was the only link, so there is no numbered link to read.
                return(0);
            }

            return(int.Parse(links[lastIndex].InnerText));
        }
        /// <summary>
        /// Builds an <see cref="AnnTaskModel"/> from the cells of one row.
        /// </summary>
        /// <param name="nodes">
        /// Row cells in order. Index 2 supplies the title and URL, index 3 the author,
        /// index 4 the reply count and index 5 the view count; all other cells are ignored.
        /// </param>
        /// <returns>A new model; properties whose cell is absent are left unassigned.</returns>
        private AnnTaskModel HandleRow(IEnumerable <HtmlNode> nodes)
        {
            var model = new AnnTaskModel();

            // Snapshot once: Count()/ElementAt() on the raw sequence would walk it again on every iteration.
            var cells = nodes.ToList();

            // Only cells 2..5 carry data, so there is no need to visit anything past index 5.
            for (int i = 2; i < cells.Count && i <= 5; i++)
            {
                var cell = cells[i];
                switch (i)
                {
                case (2):     // title and url
                    var link = cell.Element("span").Element("a");
                    model.PostTitle = link.InnerText.Replace("�", "");
                    // As before, an anchor without an href attribute fails here rather than yielding a null URL.
                    model.PostUrl   = link.Attributes.FirstOrDefault(f => f.Name == "href").Value;
                    break;

                case (3):     // profile
                    model.Author = cell.Element("a").InnerText.Replace("�", "");
                    break;

                case (4):     // replies: strip tabs and line breaks before parsing
                    model.Replies = int.Parse(Regex.Replace(cell.Element("#text").InnerText, @"\t|\n|\r", ""));
                    break;

                case (5):     // views: strip tabs and line breaks before parsing
                    model.Views = int.Parse(Regex.Replace(cell.Element("#text").InnerText, @"\t|\n|\r", ""));
                    break;
                }
            }

            return(model);
        }
Exemple #3
0
        /// <summary>
        /// Extracts one post from a table row and saves it through a new <c>MariaContext</c>.
        /// Rows containing the advertisement disclaimer text, and rows whose cell is empty,
        /// are skipped without saving anything.
        /// </summary>
        /// <param name="node">Row node expected to have a <c>td</c> child holding the post tables.</param>
        /// <param name="doc">Document passed to <c>IsPossibleScam</c>.</param>
        /// <param name="postNum">Value stored in the model's <c>PostNumber</c>.</param>
        /// <param name="task">Task passed to the <c>PostModel</c> constructor.</param>
        /// <exception cref="InvalidOperationException">The row has no <c>td</c> child.</exception>
        public void ParsePost(HtmlNode node, HtmlDocument doc, int postNum, AnnTaskModel task)
        {
            if (node.InnerText.Contains("They may be unsafe, untrustworthy, or illegal in your jurisdiction.")) //ITS AN AD!
            {
                return;
            }

            var model = new PostModel(task);

            model.PostNumber = postNum;
            model.TopicTitle = model.TopicTitle.RemoveEmojis();

            var td = node.Element("td");

            if (td == null)
            {
                throw new InvalidOperationException("Could not find any tds as children");
            }
            if (!td.HasChildNodes)
            {
                return;
            }

            // Outer table: step into <tbody> when the markup has one.
            var table = td.Element("table");

            if (table.ChildNodes.Any(f => f.Name == "tbody"))
            {
                table = table.Element("tbody");
            }

            // Inner table nested in the first cell of the outer table's first row.
            table = table.Element("tr").Element("td").Element("table");

            if (table.ChildNodes.Any(f => f.Name == "tbody"))
            {
                table = table.Element("tbody");
            }

            var details = table.Element("tr");

            GetDetails(details, model);
            GetPost(details, model);
            model.IsScamHeaderPresent = IsPossibleScam(doc);

            // 'using' disposes the context even when Add/SaveChanges throws.
            using (var context = new MariaContext())
            {
                context.Posts.Add(model);
                context.SaveChanges();
            }
        }
Exemple #4
0
        /// <summary>
        /// Scrapes every page of a topic: each <c>tr</c> under the node selected by
        /// <c>PostXpaths.BaseSelector</c> is handed to <see cref="ParsePost"/>, following
        /// pages are loaded with <c>MakeUrl</c>, and the task is finally marked complete.
        /// </summary>
        /// <param name="doc">Already-loaded first page of the topic.</param>
        /// <param name="task">Task being scraped; its <c>PostUrl</c> is used to build later page URLs.</param>
        /// <exception cref="InvalidOperationException">
        /// A page lacks the node selected by <c>PostXpaths.BaseSelector</c>, or the forum keeps
        /// answering with its rate-limit page after several one-minute pauses.
        /// </exception>
        private void Parse(HtmlDocument doc, AnnTaskModel task)
        {
            // Upper bound on consecutive rate-limit pauses for a single page before giving up.
            const int maxRateLimitPauses = 5;

            var random = new Random();
            var web    = new HtmlWeb();

            Log.Information($"Starting on task id:{task.Id}");
            int pageNumber = 1;
            int postNumber = 0;

            int totalPages = GetRealTotalPages(doc.DocumentNode, task);

            Log.Information($"task {task.Id} has {totalPages} pages, which is PROJECTED to be {totalPages * 20} total posts");
            var baseCol = doc.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);

            while (true)
            {
                if (baseCol == null)
                {
                    // Fail with a clear message instead of dereferencing null below.
                    Log.Error("BaseCol is nullllll");
                    throw new InvalidOperationException($"task {task.Id}: page {pageNumber} does not contain the post table");
                }

                foreach (var row in baseCol.Elements("tr"))
                {
                    postNumber++;
                    Log.Information($"task {task.Id} scraping post number {postNumber} on page {pageNumber}");
                    ParsePost(row, doc, postNumber, task);
                }

                if (totalPages <= pageNumber)
                {
                    Log.Information($"Finished scraping {task.Id} with {postNumber} posts.");
                    break;
                }
                pageNumber++;

                HtmlDocument page;
                int          rateLimitPauses = 0;

                while (true)
                {
                    // Wait for the shared CanGo gate; presumably it is re-opened by the Timer callback (not visible here).
                    while (CanGo == false)
                    {
                        Thread.Sleep(random.Next(4000));
                    }
                    CanGo = false;

                    page = web.Load(MakeUrl(task.PostUrl, pageNumber));
                    if (!page.DocumentNode.InnerText.Contains("you are accessing the forum too quickly"))
                    {
                        break;
                    }

                    rateLimitPauses++;
                    if (rateLimitPauses > maxRateLimitPauses)
                    {
                        throw new InvalidOperationException($"task {task.Id}: still rate limited on page {pageNumber} after {maxRateLimitPauses} pauses");
                    }

                    Log.Error("uh oh, we are rate limited!!! Stoping the timer for 1 minute");

                    Timer.Stop();

                    Thread.Sleep(60000);
                    Log.Information("Resuming operations");
                    Timer.Start();
                    // Loop again: the rate-limit page holds no posts, so the same page must be requested once more.
                }

                baseCol = page.DocumentNode.SelectSingleNode(PostXpaths.BaseSelector);
            }

            // UpdateTaskStatusToComplete contains no await, so it has finished before the context is disposed.
            using (var c = new MariaContext())
            {
                c.UpdateTaskStatusToComplete(task);
            }

            // NOTE(review): Inprogress is not decremented when an exception escapes this method — confirm that is intended.
            Inprogress--;
        }
 /// <summary>
 /// Sets the task's status to <c>AnnStatus.Complete</c>, marks the entity as modified
 /// and saves the change.
 /// </summary>
 /// <param name="task">Task to mark as complete.</param>
 /// <remarks>
 /// Deliberately not <c>async</c>: the body awaits nothing, and as a plain method any
 /// exception from <c>SaveChanges</c> reaches the caller.
 /// </remarks>
 public void UpdateTaskStatusToComplete(AnnTaskModel task)
 {
     task.Status            = AnnStatus.Complete;
     this.Entry(task).State = EntityState.Modified;
     this.SaveChanges();
 }