예제 #1
0
        /// <summary>
        /// Scans every <c>a[@href]</c> node of <paramref name="doc"/> and enqueues a follow-up
        /// <c>JobFindNew</c> for each link that is not yet known and is accepted by
        /// <c>Common.CheckRegex</c> with the configured <c>VisitUrlsRegex</c> / <c>NoVisitUrlRegex</c>.
        /// </summary>
        /// <remarks>
        /// Returns without extracting anything when <c>job.Deep</c> exceeds <c>_config.MaxDeep</c> or
        /// when <c>_visitedCrc</c> already holds more than <c>_config.MaxLinksFindNew</c> entries.
        /// Otherwise a summary line with the number of queued links versus scanned links is logged.
        /// </remarks>
        /// <param name="doc">Parsed page whose anchors are scanned.</param>
        /// <param name="job">Job that produced the page; its depth (+1) and id are copied to queued jobs.</param>
        private void Extraction(HtmlDocument doc, JobFindNew job)
        {
            var countLinkAdds = 0;
            var countLinks    = 0;

            if (job.Deep > _config.MaxDeep)
            {
                _log.Info("Over dee. Not extraction");
                return;
            }
            else if (_visitedCrc.Count > _config.MaxLinksFindNew)
            {
                _log.Info("Over max link crc. Not extraction");
                return;
            }

            // The node collection is null-checked below; when it is null only the summary is logged.
            var nodeLinks = doc.DocumentNode.SelectNodes("//a[@href]");

            if (nodeLinks != null)
            {
                foreach (var nodelink in nodeLinks)
                {
                    countLinks++;
                    var link =
                        System.Web.HttpUtility.HtmlDecode(Common.GetAbsoluteUrl(nodelink.Attributes["href"].Value,
                                                                                _rootUri)).Trim();

                    if (_companyId == 480254425312154563)
                    {
                        // Company-specific rule: drop "sid" and everything after it, together with the
                        // single character that precedes it.
                        // A match at index 0 has no preceding character and would make
                        // Substring(0, -1) throw ArgumentOutOfRangeException, so that link is kept as-is.
                        var sidIndex = link.IndexOf("sid", StringComparison.Ordinal);
                        if (sidIndex > 0)
                        {
                            link = link.Substring(0, sidIndex - 1);
                        }
                    }

                    if (link.Length < MaxLengthUrl)
                    {
                        var crcNewLink = Common.GetIDProduct(link);

                        // Skip links already queued, already known as products, or flagged as duplicates.
                        if (!_visitedCrc.Contains(crcNewLink) &&
                            !_crcProductOldGroup.Contains(crcNewLink) &&
                            !_hsDuplicateProduct.Contains(crcNewLink) &&
                            Common.CheckRegex(link, _config.VisitUrlsRegex, _config.NoVisitUrlRegex, false))
                        {
                            countLinkAdds++;
                            _visitedCrc.Add(crcNewLink);
                            _linkQueue.Enqueue(new JobFindNew()
                            {
                                Url      = link,
                                Deep     = job.Deep + 1,
                                ParentId = job.Id,
                                Id       = Common.CrcProductID(link)
                            });
                            _log.Debug("Add link to queue:" + link);
                        }
                    }
                }
            }
            _log.Info(GetPrefixLog() + string.Format("NumberLinkAdded {0}/{1}", countLinkAdds, countLinks));
        }
예제 #2
0
        /// <summary>
        /// Parses <paramref name="html"/> into an <c>HtmlDocument</c>, runs <c>Analysic</c> on it when
        /// <c>IsDetailUrl</c> accepts the job's URL, and then always runs <c>Extraction</c> on it.
        /// </summary>
        /// <param name="jobCrawl">Job describing the page that was downloaded.</param>
        /// <param name="html">Raw HTML of the downloaded page.</param>
        private void ProcessLink(JobFindNew jobCrawl, string html)
        {
            var parsedPage = new HtmlDocument();
            parsedPage.LoadHtml(html);

            // Only URLs accepted by IsDetailUrl are analysed; every page is scanned for links.
            var isDetailPage = IsDetailUrl(jobCrawl.Url);
            if (isDetailPage)
            {
                Analysic(jobCrawl, parsedPage);
            }

            Extraction(parsedPage, jobCrawl);
        }
예제 #3
0
        /// <summary>
        /// Parses the page as a product. For companies whose status is not
        /// <c>Common.CompanyStatus.TIN</c>, a successfully parsed product that does not exist yet is
        /// either registered as new or published as a duplicate of an already registered one.
        /// </summary>
        /// <param name="jobCrawl">Job whose <c>Url</c> identifies the page being analysed.</param>
        /// <param name="doc">Parsed HTML of that page.</param>
        private void Analysic(JobFindNew jobCrawl, HtmlDocument doc)
        {
            if (_company.Status == Common.CompanyStatus.TIN)
            {
                // NOTE(review): the Product instance is not used after Analytics returns — confirm
                // that Analytics itself performs whatever this branch is expected to do.
                var tinProduct = new Product();
                tinProduct.Analytics(doc, jobCrawl.Url, _config, false, _company.Domain);
                return;
            }

            var product      = new ProductEntity();
            var productParse = new ProductParse();
            productParse.Analytics(product, doc, jobCrawl.Url, _config, _company.Domain);

            if (!product.IsSuccessData(_config.CheckPrice))
            {
                return;
            }

            product.Valid = false;
            if (IsExistsProduct(product.ID))
            {
                return;
            }

            // Compute the duplicate hash once so the key that is checked, stored and reported is
            // guaranteed to be the same value (assumes GetHashDuplicate is deterministic — confirm).
            var duplicateHash = product.GetHashDuplicate();

            if (!_dicDuplicate.ContainsKey(duplicateHash))
            {
                // First product seen with this hash: push it as new and remember its hash and id.
                product.StatusChange.IsNew = true;
                PushChangeProduct(product);
                _dicDuplicate.Add(duplicateHash, product.ID);
                _crcProductOldGroup.Add(product.ID);
                _countNewProduct++;
                return;
            }

            // Hash already registered: publish a record that points at the id stored for this hash.
            _producerDuplicateProduct.PublishString(
                Newtonsoft.Json.JsonConvert.SerializeObject(new ProductDuplicate()
                {
                    CId   = _companyId,
                    Id    = product.ID,
                    Hash  = duplicateHash,
                    IdDup = _dicDuplicate[duplicateHash],
                    Url   = product.DetailUrl
                }), true);
        }