/// <summary>
/// Scans a parsed page for anchor links and queues every link that has not been seen before.
/// </summary>
/// <remarks>
/// Nothing is extracted when <paramref name="job"/> is deeper than <c>_config.MaxDeep</c>, or when
/// <c>_visitedCrc</c> already holds more than <c>_config.MaxLinksFindNew</c> entries. Both limits are
/// checked once on entry, not per link.
/// </remarks>
/// <param name="doc">Parsed page whose <c>a[@href]</c> nodes are scanned.</param>
/// <param name="job">
/// Job that produced the page. Queued links get <c>Deep = job.Deep + 1</c> and <c>ParentId = job.Id</c>.
/// </param>
private void Extraction(HtmlDocument doc, JobFindNew job)
{
    if (job.Deep > _config.MaxDeep)
    {
        _log.Info("Over dee. Not extraction");
        return;
    }

    if (_visitedCrc.Count > _config.MaxLinksFindNew)
    {
        _log.Info("Over max link crc. Not extraction");
        return;
    }

    var countLinkAdds = 0;
    var countLinks = 0;

    // SelectNodes yields null (not an empty collection) when nothing matches, hence the null check.
    var nodeLinks = doc.DocumentNode.SelectNodes("//a[@href]");
    if (nodeLinks != null)
    {
        foreach (var nodelink in nodeLinks)
        {
            countLinks++;

            var absoluteUrl = Common.GetAbsoluteUrl(nodelink.Attributes["href"].Value, _rootUri);
            var link = System.Web.HttpUtility.HtmlDecode(absoluteUrl).Trim();

            // Company-specific rule: cut the link one character before the first "sid".
            // NOTE(review): presumably strips a session-id parameter together with its separator - confirm.
            if (_companyId == 480254425312154563)
            {
                var sidIndex = link.IndexOf("sid", StringComparison.Ordinal);

                // Only truncate when "sid" is not at the very start: an index of 0 would ask
                // Substring for a length of -1 and throw ArgumentOutOfRangeException.
                if (sidIndex > 0)
                {
                    link = link.Substring(0, sidIndex - 1);
                }
            }

            if (link.Length >= MaxLengthUrl)
            {
                continue;
            }

            var crcNewLink = Common.GetIDProduct(link);
            var isUnseen = !_visitedCrc.Contains(crcNewLink)
                           && !_crcProductOldGroup.Contains(crcNewLink)
                           && !_hsDuplicateProduct.Contains(crcNewLink);

            // The regex filter is evaluated last, and only for links that are not already known.
            if (isUnseen && Common.CheckRegex(link, _config.VisitUrlsRegex, _config.NoVisitUrlRegex, false))
            {
                countLinkAdds++;
                _visitedCrc.Add(crcNewLink);
                _linkQueue.Enqueue(new JobFindNew()
                {
                    Url = link,
                    Deep = job.Deep + 1,
                    ParentId = job.Id,
                    Id = Common.CrcProductID(link)
                });
                _log.Debug("Add link to queue:" + link);
            }
        }
    }

    _log.Info(GetPrefixLog() + string.Format("NumberLinkAdded {0}/{1}", countLinkAdds, countLinks));
}
/// <summary>
/// Parses the downloaded HTML of a crawl job, analyses it when the job URL is a detail URL,
/// and then extracts further links from the page.
/// </summary>
/// <param name="jobCrawl">Crawl job the HTML belongs to; its <c>Url</c> decides whether analysis runs.</param>
/// <param name="html">Raw HTML text loaded into a new <c>HtmlDocument</c>.</param>
private void ProcessLink(JobFindNew jobCrawl, string html)
{
    var parsedPage = new HtmlDocument();
    parsedPage.LoadHtml(html);

    var isDetailPage = IsDetailUrl(jobCrawl.Url);
    if (isDetailPage)
    {
        Analysic(jobCrawl, parsedPage);
    }

    // Link extraction runs for every page, detail page or not.
    Extraction(parsedPage, jobCrawl);
}
/// <summary>
/// Parses product data out of a detail page and either registers it as a new product
/// or reports it as a duplicate of an already known one.
/// </summary>
/// <param name="jobCrawl">Crawl job whose <c>Url</c> is passed to the parser.</param>
/// <param name="doc">Parsed page to analyse.</param>
private void Analysic(JobFindNew jobCrawl, HtmlDocument doc)
{
    if (_company.Status == Common.CompanyStatus.TIN)
    {
        // For this company status the page is only run through Product.Analytics;
        // the resulting object is not used any further in this method.
        var tinProduct = new Product();
        tinProduct.Analytics(doc, jobCrawl.Url, _config, false, _company.Domain);
        return;
    }

    var entity = new ProductEntity();
    var parser = new ProductParse();
    parser.Analytics(entity, doc, jobCrawl.Url, _config, _company.Domain);

    if (!entity.IsSuccessData(_config.CheckPrice))
    {
        return;
    }

    // Valid is cleared for every successfully parsed product, before the existence check.
    entity.Valid = false;

    if (IsExistsProduct(entity.ID))
    {
        return;
    }

    if (_dicDuplicate.ContainsKey(entity.GetHashDuplicate()))
    {
        // Same duplicate hash as a known product: publish a duplicate notice instead of registering it.
        var duplicate = new ProductDuplicate()
        {
            CId = _companyId,
            Id = entity.ID,
            Hash = entity.GetHashDuplicate(),
            IdDup = _dicDuplicate[entity.GetHashDuplicate()],
            Url = entity.DetailUrl
        };
        _producerDuplicateProduct.PublishString(Newtonsoft.Json.JsonConvert.SerializeObject(duplicate), true);
        return;
    }

    // First product with this hash: flag it as new, push it, and remember both its hash and its id.
    entity.StatusChange.IsNew = true;
    PushChangeProduct(entity);
    _dicDuplicate.Add(entity.GetHashDuplicate(), entity.ID);
    _crcProductOldGroup.Add(entity.ID);
    _countNewProduct++;
}