Exemple #1
0
 /// <summary>
 /// Downloads the page at <paramref name="l"/>, adds it to <c>LinksPool</c> when the
 /// configured product-page selector matches something on it, and then walks every
 /// anchor on the page, adding each validated link that passes the
 /// <c>IsLinkExist</c> / <c>IsNotExcluded</c> checks to both <c>LinksPool</c> and
 /// <c>TurnableLinksList</c>.
 /// </summary>
 /// <param name="l">
 /// URL of the page to scan. Nothing is downloaded unless
 /// <c>ValidationService.Validate</c> accepts it.
 /// </param>
 protected override void LinksScrapperThread(string l)
 {
     _l.info("Processor thread");

     if (!ValidationService.Validate(l))
     {
         // A null message is deliberately not logged; only an explicit
         // validation message is surfaced as a warning.
         var rejectionMessage = ValidationService.GetExceptMessage();
         if (!ReferenceEquals(rejectionMessage, null))
         {
             _l.warn(rejectionMessage);
         }
         return;
     }

     ScrapingBrowser browser = new ScrapingBrowser();
     try
     {
         var htmlNode = browser.NavigateToPage(new Uri(l)).Html;
         if (ReferenceEquals(htmlNode, null))
         {
             _l.warn($"Nothing to scrap from url {l}");
             return;
         }

         var links = htmlNode.CssSelect("a");
         if (ReferenceEquals(links, null))
         {
             _l.warn($"Any links on the page {l}");
             return;
         }

         try
         {
             // Same null/empty guard that LinksProcessor applies before using the selector.
             var selector = requestScrappingSite.SiteProductPageIndicationSelector;

             // Any() instead of First(): an empty selection is the normal outcome for a
             // non-product page and must not be signalled through an exception. Such
             // pages stay silent, exactly as they effectively did before.
             if (!string.IsNullOrEmpty(selector) && htmlNode.CssSelect(selector).Any())
             {
                 // NOTE(review): added without an IsLinkExist check, unlike the anchor loop
                 // below - confirm LinksPool tolerates duplicates.
                 var pageLink = PrepareLink(l);
                 _l.info($"Link {pageLink} is a valid link, adding it to collection!");
                 LinksPool.Add(pageLink);
             }
         }
         catch (Exception e)
         {
             // Still best-effort (the anchor scan below must run regardless), but a
             // failure here is now unexpected, so report it instead of hiding it.
             _l.warn($"Product page check failed for {l}: {e.Message}");
         }

         foreach (var anchor in links)
         {
             var href = anchor.GetAttributeValue("href", "").Trim();
             if (!ValidationService.Validate(href))
             {
                 continue;
             }

             var candidate = PrepareLink(href);
             if (!IsLinkExist(candidate) && IsNotExcluded(candidate))
             {
                 LinksPool.Add(candidate);
                 TurnableLinksList.Add(candidate);
             }
         }
     }
     catch (AggregateException e)
     {
         // The wrapper's own Message can be a generic "One or more errors occurred";
         // the base exception carries the actual failure reason.
         _l.error(String.Concat(e.GetBaseException().Message, " -> ", l));
     }
 }
Exemple #2
0
        /// <summary>
        /// Processes every link in <c>TurnableLinksList</c> until the list is empty:
        /// each page is downloaded, scraped as a product when the configured selector
        /// yields a non-blank name, and then handed to <c>LinksScrapperThread</c>.
        /// Requests are throttled by sleeping for a random interval after a random
        /// number of operations. When no links remain, the
        /// <c>InstanceShuttedDown</c> status is raised.
        /// </summary>
        private void LinksProcessor()
        {
            var random = new Random();

            // Iterative drain: the previous version called itself while links remained,
            // growing the call stack by one frame per pass.
            do
            {
                if (TurnableLinksList.Count > 0)
                {
                    var processedLinks = new List<string>();

                    int operationsLimit = random.Next(requestScrappingSite.SiteBaseRequestsIntervalMin, requestScrappingSite.SiteBaseRequestsIntervalMax);
                    int operationsDone  = 0;

                    // Index-based on purpose: LinksScrapperThread is called inside the loop,
                    // and the list is re-read each iteration so entries appended meanwhile
                    // are processed in the same pass.
                    for (var i = 0; i < TurnableLinksList.Count; i++)
                    {
                        string link = TurnableLinksList.ElementAt(i);
                        _l.info($"Current link: {link}");
                        ScrapingBrowser browser = new ScrapingBrowser();
                        try
                        {
                            if (operationsDone >= operationsLimit)
                            {
                                // Pause for a random number of milliseconds, then draw a new limit.
                                Thread.Sleep(random.Next(requestScrappingSite.SiteBaseRequestsIntervalMin * 1000, requestScrappingSite.SiteBaseRequestsIntervalMax * 1000));
                                operationsLimit = random.Next(requestScrappingSite.SiteBaseRequestsIntervalMin, requestScrappingSite.SiteBaseRequestsIntervalMax);
                                operationsDone  = 0;
                            }
                            operationsDone++;

                            var htmlNode = browser.NavigateToPage(new Uri(link)).Html;
                            if (!ReferenceEquals(htmlNode, null))
                            {
                                try
                                {
                                    var selector = requestScrappingSite.SiteProductPageIndicationSelector;
                                    if (!string.IsNullOrEmpty(selector))
                                    {
                                        var productNameNode = SelectNode(selector, htmlNode);

                                        // Value-based blank check; the former ReferenceEquals against
                                        // String.Empty depended on string instance identity and threw
                                        // on a null name, which also skipped the link scan below.
                                        if (!ReferenceEquals(productNameNode, null) &&
                                            !string.IsNullOrWhiteSpace(productNameNode.InnerText))
                                        {
                                            IndexedLinks.Add(link);
                                            _l.info($"Adding {link} to products list collection");
                                            ScrapProductFromUrl(link, htmlNode, new WebScrapperBaseProxyEntity());
                                        }
                                    }

                                    // NOTE(review): this call receives only the URL, so the page is
                                    // presumably downloaded a second time - confirm whether htmlNode
                                    // could be reused.
                                    LinksScrapperThread(link);
                                }
                                catch (Exception e)
                                {
                                    _l.error($"Product name node selector error : {e.Message}");
                                }
                            }
                        }
                        catch (Exception e)
                        {
                            _l.error($"Internal scrapping browser error: {e.Message}");
                        }

                        // A link is dropped from the queue whether or not it could be scraped.
                        processedLinks.Add(link);
                    }

                    foreach (var item in processedLinks)
                    {
                        TurnableLinksList.Remove(item);
                    }
                }

                _l.info($"Task {Thread.CurrentThread.Name} : {TurnableLinksList.Count} left in scrapping");
            }
            while (TurnableLinksList.Count > 0);

            InvokeOnInstanceStatusUpdating(WebScrapperBaseStatuses.InstanceShuttedDown);
            _l.info($"Task {Thread.CurrentThread.Name} finished!");
        }