/// <summary>
/// Loads the page at <paramref name="l"/>, adds the link to <c>LinksPool</c> when the page matches
/// the site's product-page selector, and queues every link found on the page that passes
/// validation, is not already known and is not excluded.
/// </summary>
/// <param name="l">URL of the page to scan; it is passed to <c>new Uri(l)</c>.</param>
protected override void LinksScrapperThread(string l)
{
    _l.info("Processor thread");

    if (!ValidationService.Validate(l))
    {
        // A rejection that carries no message is not logged.
        var exceptMessage = ValidationService.GetExceptMessage();
        if (exceptMessage != null)
        {
            _l.warn(exceptMessage);
        }

        return;
    }

    ScrapingBrowser browser = new ScrapingBrowser();
    try
    {
        var htmlNode = browser.NavigateToPage(new Uri(l)).Html;
        if (htmlNode == null)
        {
            _l.warn($"Nothing to scrap from url {l}");
            return;
        }

        var links = htmlNode.CssSelect("a");
        if (links == null)
        {
            _l.warn($"Any links on the page {l}");
            return;
        }

        // Product-page detection is best effort: a failure here must not stop link harvesting below.
        try
        {
            var productSelector = requestScrappingSite.SiteProductPageIndicationSelector;

            // Same guard LinksProcessor applies: without a selector there is nothing to detect.
            if (!string.IsNullOrEmpty(productSelector))
            {
                var productNameNodes = htmlNode.CssSelect(productSelector);
                if (productNameNodes == null)
                {
                    _l.warn("Node selection error: not a product name");
                }
                else if (productNameNodes.FirstOrDefault() != null)
                {
                    // FirstOrDefault instead of First: a page without a match is simply not a
                    // product page, so it must not raise (and then swallow) an exception.
                    var productLink = PrepareLink(l);
                    _l.info($"Link {productLink} is a valid link, adding it to collection!");
                    LinksPool.Add(productLink);
                }
            }
        }
        catch (Exception e)
        {
            // Previously swallowed silently; now only genuine failures reach this handler.
            _l.warn($"Product page detection failed for {l}: {e.Message}");
        }

        foreach (var link in links)
        {
            var linkValue = link.GetAttributeValue("href", "").Trim();
            if (!ValidationService.Validate(linkValue))
            {
                // Not a link this crawl is interested in.
                continue;
            }

            var preparedLink = PrepareLink(linkValue);
            if (!IsLinkExist(preparedLink) && IsNotExcluded(preparedLink))
            {
                LinksPool.Add(preparedLink);
                TurnableLinksList.Add(preparedLink);
            }
        }
    }
    catch (AggregateException e)
    {
        // The wrapper's own message can be generic; log the root cause instead.
        _l.error(String.Concat(e.GetBaseException().Message, " -> ", l));
    }
}
/// <summary>
/// Works through <c>TurnableLinksList</c> until it is empty. For each link the page is downloaded;
/// when the product-page selector yields a node with non-blank text the link is added to
/// <c>IndexedLinks</c> and handed to <c>ScrapProductFromUrl</c>; the link is then passed to
/// <c>LinksScrapperThread</c>. Visited links are removed from the list after each pass, and
/// <c>InstanceShuttedDown</c> is reported once nothing is left.
/// </summary>
private void LinksProcessor()
{
    // A loop replaces the former self-call so that extra passes do not deepen the call stack.
    do
    {
        if (TurnableLinksList.Count > 0)
        {
            var processedLinks = new List<string>();
            var random = new Random();

            // Number of requests allowed before the next pause; redrawn after every pause.
            int operationsLimitPerMoment = random.Next(
                requestScrappingSite.SiteBaseRequestsIntervalMin,
                requestScrappingSite.SiteBaseRequestsIntervalMax);
            int currentOperationNumber = 0;

            // Index-based on purpose: LinksScrapperThread appends to TurnableLinksList while this
            // loop runs, and re-reading Count each iteration picks those links up in the same pass.
            for (var i = 0; i < TurnableLinksList.Count; i++)
            {
                string link = TurnableLinksList.ElementAt(i);
                _l.info($"Current link: {link}");

                ScrapingBrowser browser = new ScrapingBrowser();
                try
                {
                    if (currentOperationNumber >= operationsLimitPerMoment)
                    {
                        // Request budget used up: sleep for a random number of milliseconds.
                        Thread.Sleep(random.Next(
                            requestScrappingSite.SiteBaseRequestsIntervalMin * 1000,
                            requestScrappingSite.SiteBaseRequestsIntervalMax * 1000));
                        operationsLimitPerMoment = random.Next(
                            requestScrappingSite.SiteBaseRequestsIntervalMin,
                            requestScrappingSite.SiteBaseRequestsIntervalMax);
                        currentOperationNumber = 0;
                    }

                    currentOperationNumber++;

                    var htmlNode = browser.NavigateToPage(new Uri(link)).Html;
                    if (htmlNode != null)
                    {
                        try
                        {
                            var productSelector = requestScrappingSite.SiteProductPageIndicationSelector;
                            if (!string.IsNullOrEmpty(productSelector))
                            {
                                var productNameNode = SelectNode(productSelector, htmlNode);

                                // Value comparison: the previous ReferenceEquals(Trim(), String.Empty)
                                // test relied on Trim() returning the String.Empty instance.
                                if (productNameNode != null
                                    && !string.IsNullOrWhiteSpace(productNameNode.InnerText))
                                {
                                    IndexedLinks.Add(link);
                                    _l.info($"Adding {link} to products list collection");
                                    ScrapProductFromUrl(link, htmlNode, new WebScrapperBaseProxyEntity { });
                                }
                            }

                            LinksScrapperThread(link);
                        }
                        catch (Exception e)
                        {
                            _l.error($"Product name node selector error : {e.Message}");
                        }
                    }
                }
                catch (Exception e)
                {
                    _l.error($"Internal scrapping browser error: {e.Message}");
                }

                // Success or failure, the link counts as handled and is dropped after the pass.
                processedLinks.Add(link);
            }

            foreach (var processedLink in processedLinks)
            {
                TurnableLinksList.Remove(processedLink);
            }
        }

        _l.info($"Task {Thread.CurrentThread.Name} : {TurnableLinksList.Count} left in scrapping");
    }
    while (TurnableLinksList.Count > 0);

    InvokeOnInstanceStatusUpdating(WebScrapperBaseStatuses.InstanceShuttedDown);
    _l.info($"Task {Thread.CurrentThread.Name} finished!");
}