/// <summary>
/// Integration test: downloads a product page from dienmaytantien.vn, parses it with
/// <c>ProductParse.Analytics</c> using the configuration loaded for the company id,
/// and asserts the parsed price and the result of <c>IsSuccessData(false)</c>.
/// Requires network access and the configured databases.
/// </summary>
public void AnalyticsDienmayTantienTest()
{
    // Point the static Server settings at the configured connection strings.
    Server.ConnectionString = ConfigCrawler.ConnectProduct;
    Server.LogConnectionString = ConfigCrawler.ConnectLog;
    Server.ConnectionStringCrawler = ConfigCrawler.ConnectionCrawler;

    string url = @"http://dienmaytantien.vn/detail.asp?parent_id=336&id=2204";
    long companyId = 8223820966383374348;
    ProductParse productParse = new ProductParse();
    ProductEntity productEntity = new ProductEntity();

    // Download the page (arguments 45 and 2 are passed through to the downloader unchanged)
    // and HTML-decode it before handing it to the parser.
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2);
    html = System.Web.HttpUtility.HtmlDecode(html);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    var config = new Configuration(companyId);
    var company = new Company(companyId);
    productParse.Analytics(productEntity, doc, url, config, company.Domain);
    bool bok = productEntity.IsSuccessData(false);

    // Expected value goes first so that a failing assertion reports expected/actual correctly.
    // NOTE(review): the same expected price (10590000) is asserted by the other Analytics*Test
    // methods for different product pages - confirm it is the right value for this page.
    Assert.AreEqual(10590000, productEntity.Price);
    Assert.AreEqual(true, bok);
}
/// <summary>
/// Walks over every company id returned by <c>GetAllCompanyIdCrawler</c>, downloads the
/// company's configured test link, parses it, and inserts a row into
/// <c>Company_FailConfig</c> when the parsed product is not valid or parsing throws.
/// Companies whose test link returns no HTML are skipped without any record or log line.
/// </summary>
public void Start()
{
    ProductAdapter adapter = new ProductAdapter(new SqlDb(ConfigCrawler.ConnectProduct));
    QT.Entities.Server.ConnectionString = ConfigCrawler.ConnectProduct;
    List<long> ids = adapter.GetAllCompanyIdCrawler();

    for (int index = 0; index < ids.Count; index++)
    {
        long companyId = ids[index];
        Company company = new Company(companyId);
        Configuration configuration = new Configuration(companyId);
        ProductParse parser = new ProductParse();
        ProductEntity entity = new ProductEntity();
        HtmlDocument page = new HtmlDocument();
        string testLink = configuration.LinkTest;
        string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(testLink, 45, 2);

        // Nothing downloaded: move on to the next company (no progress log in this case).
        if (string.IsNullOrEmpty(html))
        {
            continue;
        }

        page.LoadHtml(html);
        try
        {
            parser.Analytics(entity, page, configuration.LinkTest, configuration, company.Domain);
            bool parsedOk = entity.IsSuccessData(configuration.CheckPrice);
            if (!parsedOk)
            {
                // Parsed without error but the product data is not valid: record the company id only.
                var db = adapter.GetSqlDb();
                SqlParameter[] failParams =
                {
                    SqlDb.CreateParamteterSQL("@CompanyId", companyId, SqlDbType.BigInt)
                };
                db.RunQuery(
                    "insert into Company_FailConfig (CompanyId) Values (@CompanyId)",
                    CommandType.Text,
                    failParams);
            }
        }
        catch (Exception ex)
        {
            // Parsing threw: record the company id together with the message and stack trace.
            var db = adapter.GetSqlDb();
            SqlParameter[] errorParams =
            {
                SqlDb.CreateParamteterSQL("@CompanyId", companyId, SqlDbType.BigInt),
                SqlDb.CreateParamteterSQL("@Error", ex.Message + "\n" + ex.StackTrace, SqlDbType.NVarChar)
            };
            db.RunQuery(
                "insert into Company_FailConfig (CompanyId, Error) Values (@CompanyId, @Error)",
                CommandType.Text,
                errorParams);
        }

        _log.Info(string.Format("Run data {0}/ {1}", index, ids.Count));
    }
}
/// <summary>
/// Crawls the configured "hot product" listing pages: for each link in
/// <c>HotProduct_Link</c> (separated by ',', newline or ';') it downloads the page,
/// selects the product anchors with <c>HotProduct_Xpath</c>, downloads and parses each
/// product page, and upserts the products for which <c>IsSuccessData(true)</c> holds.
/// Errors for a single product link are logged and do not stop the run.
/// </summary>
public void StartRun()
{
    WebExceptionStatus outS = WebExceptionStatus.UnknownError;
    char[] separators = { ',', '\n', ';' };

    foreach (var linkExtraction in this.configurationHotProduct.HotProduct_Link.Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        this.LogData("Get html of cat page");
        string html = downloadHtml.GetHTML(linkExtraction, 45, 2, out outS, this.configurationHotProduct.HotProduct_UseSelenium);
        if (string.IsNullOrEmpty(html))
        {
            continue;
        }

        HtmlDocument htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(html);
        var nodeLinks = htmlDocument.DocumentNode.SelectNodes(this.configurationHotProduct.HotProduct_Xpath);
        if (nodeLinks == null)
        {
            // SelectNodes yields null when the XPath matches nothing.
            continue;
        }

        foreach (var linkNode in nodeLinks)
        {
            try
            {
                string shortLink = linkNode.GetAttributeValue("href", "");
                string fullLink = Common.GetAbsoluteUrl(shortLink, new Uri(this.company.Website));
                this.LogData(string.Format("Process link product {0}", fullLink));

                string htmlLinkProduct = this.downloadHtml.GetHTML(fullLink, 45, 2, out outS);

                // Check the download result BEFORE loading it: HtmlDocument.LoadHtml throws
                // ArgumentNullException on null, which previously surfaced as a generic error log.
                if (string.IsNullOrEmpty(htmlLinkProduct))
                {
                    this.LogData(string.Format("Skip link product {0}: empty html (status {1})", fullLink, outS));
                    continue;
                }

                HtmlDocument productDocument = new HtmlDocument();
                productDocument.LoadHtml(htmlLinkProduct);

                ProductEntity productEntity = new ProductEntity();
                this.producerParser.Analytics(productEntity, productDocument, fullLink, this.configuration, this.company.Domain);
                if (productEntity.IsSuccessData(true))
                {
                    this.productAdapter.UpsertProductHot(productEntity);
                    this.LogData(string.Format("Saved a product to database. {0} {1} {2}", productEntity.ID, productEntity.Name, productEntity.Price));
                }
            }
            catch (Exception ex)
            {
                // Best effort per link: log and continue with the next product link.
                LogData(string.Format("Error: {0} {1}", ex.Message, ex.StackTrace));
            }
        }
    }
}
/// <summary>
/// Parses the downloaded page of a crawl job. For companies whose status is
/// <c>Common.CompanyStatus.TIN</c> the page is run through <c>Product.Analytics</c> only.
/// Otherwise the page is parsed into a <c>ProductEntity</c>; a valid product that does not
/// exist yet is either pushed as a new product (unknown duplicate hash) or published to
/// the duplicate-product producer (hash already registered).
/// </summary>
/// <param name="jobCrawl">Crawl job; only its <c>Url</c> is read here.</param>
/// <param name="doc">Already loaded HTML document of the job's URL.</param>
private void Analysic(JobFindNew jobCrawl, HtmlDocument doc)
{
    if (_company.Status == Common.CompanyStatus.TIN)
    {
        // The result of this analysis is not used further in this method.
        var tinProduct = new Product();
        tinProduct.Analytics(doc, jobCrawl.Url, _config, false, _company.Domain);
        return;
    }

    var parsed = new ProductEntity();
    var parser = new ProductParse();
    parser.Analytics(parsed, doc, jobCrawl.Url, _config, _company.Domain);

    if (!parsed.IsSuccessData(_config.CheckPrice))
    {
        return;
    }

    parsed.Valid = false;
    if (IsExistsProduct(parsed.ID))
    {
        return;
    }

    if (_dicDuplicate.ContainsKey(parsed.GetHashDuplicate()))
    {
        // Hash already registered for another product id: report the pair as a duplicate.
        var duplicate = new ProductDuplicate()
        {
            CId = _companyId,
            Id = parsed.ID,
            Hash = parsed.GetHashDuplicate(),
            IdDup = _dicDuplicate[parsed.GetHashDuplicate()],
            Url = parsed.DetailUrl
        };
        string payload = Newtonsoft.Json.JsonConvert.SerializeObject(duplicate);
        _producerDuplicateProduct.PublishString(payload, true);
        return;
    }

    // First time this hash is seen: flag as new, push the change and register hash and id.
    parsed.StatusChange.IsNew = true;
    PushChangeProduct(parsed);
    _dicDuplicate.Add(parsed.GetHashDuplicate(), parsed.ID);
    _crcProductOldGroup.Add(parsed.ID);
    _countNewProduct++;
}
/// <summary>
/// Integration test: downloads a product page from thegioicayxanh.vn, parses it with
/// <c>ProductParse.Analytics</c> using the configuration loaded for the hard-coded company id,
/// and asserts the parsed price and the result of <c>IsSuccessData(false)</c>.
/// Requires network access and the configured databases.
/// </summary>
public void AnalyticsTHEGIOICAYXANHTest()
{
    // Point the static Server settings at the configured connection strings.
    Server.ConnectionString = ConfigCrawler.ConnectProduct;
    Server.LogConnectionString = ConfigCrawler.ConnectLog;
    Server.ConnectionStringCrawler = ConfigCrawler.ConnectionCrawler;

    string url = @"http://thegioicayxanh.vn/cay-van-phong/cay-chan-ret.html";
    ProductParse productParse = new ProductParse();
    ProductEntity productEntity = new ProductEntity();

    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    var config = new Configuration(8153388634833285394);
    productParse.Analytics(productEntity, doc, url, config, "thegioicayxanh.vn");
    bool bok = productEntity.IsSuccessData(false);

    // Expected value goes first so that a failing assertion reports expected/actual correctly.
    // NOTE(review): the same expected price (10590000) is asserted by the other Analytics*Test
    // methods for different product pages - confirm it is the right value for this page.
    Assert.AreEqual(10590000, productEntity.Price);
    Assert.AreEqual(true, bok);
}
/// <summary>
/// Integration test: downloads a product page from giadungchinhhang.vn, parses it with
/// <c>ProductParse.Analytics</c> using the configuration loaded for the hard-coded company id,
/// and asserts the parsed price and the result of <c>IsSuccessData(false)</c>.
/// Requires network access and the configured databases.
/// </summary>
public void AnalyticsTest()
{
    // Point the static Server settings at the configured connection strings.
    Server.ConnectionString = ConfigCrawler.ConnectProduct;
    Server.LogConnectionString = ConfigCrawler.ConnectLog;
    Server.ConnectionStringCrawler = ConfigCrawler.ConnectionCrawler;

    string url = @"http://giadungchinhhang.vn/san-pham-chi-tiet/am-sieu-toc-philips-hd-4646--15-l-15.aspx";
    ProductParse productParse = new ProductParse();
    ProductEntity productEntity = new ProductEntity();

    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    var config = new Configuration(1793534743671200240);
    productParse.Analytics(productEntity, doc, url, config, "giadungchinhhang.vn");
    bool bok = productEntity.IsSuccessData(false);

    // Expected value goes first so that a failing assertion reports expected/actual correctly.
    // NOTE(review): the same expected price (10590000) is asserted by the other Analytics*Test
    // methods for different product pages - confirm it is the right value for this page.
    Assert.AreEqual(10590000, productEntity.Price);
    Assert.AreEqual(true, bok);
}
/// <summary>
/// Integration test: downloads a product page from dongho12h.vn, parses it with
/// <c>ProductParse.Analytics</c> using the configuration loaded for the company id,
/// and asserts the parsed price and the result of <c>IsSuccessData(false)</c>.
/// Requires network access and the configured databases.
/// </summary>
public void AnalyticsDongho12hTest()
{
    // Point the static Server settings at the configured connection strings.
    Server.ConnectionString = ConfigCrawler.ConnectProduct;
    Server.LogConnectionString = ConfigCrawler.ConnectLog;
    Server.ConnectionStringCrawler = ConfigCrawler.ConnectionCrawler;

    string url = @"http://dongho12h.vn/product/fune2006w0.html";
    long companyId = 297705792783058114;
    ProductParse productParse = new ProductParse();
    ProductEntity productEntity = new ProductEntity();

    // Download the page and HTML-decode it before handing it to the parser.
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2);
    html = System.Web.HttpUtility.HtmlDecode(html);
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    var config = new Configuration(companyId);
    var company = new Company(companyId);
    productParse.Analytics(productEntity, doc, url, config, company.Domain);
    bool bok = productEntity.IsSuccessData(false);

    // Expected value goes first so that a failing assertion reports expected/actual correctly.
    // NOTE(review): the same expected price (10590000) is asserted by the other Analytics*Test
    // methods for different product pages - confirm it is the right value for this page.
    Assert.AreEqual(10590000, productEntity.Price);
    Assert.AreEqual(true, bok);
}
/// <summary>
/// Downloads the page of a job, parses it into <paramref name="product"/> and runs the
/// change checks. A timeout or connect failure is only logged. A product that does not
/// pass <c>IsSuccessData</c> goes to <c>CheckDelete</c>; a valid product goes through the
/// duplicate check and, when it is not a duplicate, through the individual change checks.
/// </summary>
/// <param name="job">Job whose <c>url</c> is downloaded and parsed.</param>
/// <param name="product">Entity that receives the parsed data.</param>
private void ParseProduct(Job job, ProductEntity product)
{
    DelayTime();

    WebExceptionStatus status;
    var htm = GetHtmlCode(job.url, out status);
    _countVisited++;

    bool downloadFailed = status == WebExceptionStatus.Timeout
                          || status == WebExceptionStatus.ConnectFailure;
    if (downloadFailed)
    {
        _log.Info(string.Format("Fail download link: {0}", status));
        return;
    }

    // NOTE(review): htm is not checked for null here; other failure statuses may need handling - confirm.
    var doc = new HtmlDocument();
    doc.LoadHtml(htm);
    _productParse.Analytics(product, doc, job.url, _config, _company.Domain);

    if (!product.IsSuccessData(_config.CheckPrice))
    {
        CheckDelete(product);
        return;
    }

    CheckDuplicate(product);
    if (product.StatusChange.IsDuplicate)
    {
        return;
    }

    CheckDeleteProduct(product);
    CheckChangeBasic(product);
    CheckChangeDesc(product);
    CheckChangePrice(product);
    CheckChangeImg(product);
}
/// <summary>
/// Fills <paramref name="pt"/> with the data extracted from <paramref name="doc"/> using the
/// XPath/regex settings in <paramref name="conf"/>, then stores the result of
/// <c>IsSuccessData(conf.CheckPrice)</c> in <c>pt.Valid</c>.
/// </summary>
/// <param name="pt">Entity to populate. Its ID is derived from <paramref name="detailUrl"/> only when it is 0.</param>
/// <param name="doc">Loaded HTML document of the product page.</param>
/// <param name="detailUrl">URL of the product page; must be a valid URI string.</param>
/// <param name="conf">Parsing configuration of the company.</param>
/// <param name="domain">Domain stored on the entity and used for the name hash and categories.</param>
/// <exception cref="Exception">
/// Any failure while parsing is rethrown as an <see cref="Exception"/> whose message contains the
/// company id, product id, URL, and the original message and stack trace; the original exception
/// is available as <c>InnerException</c>.
/// </exception>
public void Analytics(ProductEntity pt, HtmlDocument doc, String detailUrl, Configuration conf, string domain)
{
    try
    {
        if (pt.ID == 0)
        {
            pt.ID = Common.GetIDProduct(detailUrl);
        }

        pt.CompanyId = conf.CompanyID;
        pt.Domain = domain;
        pt.DetailUrl = detailUrl;
        pt.Name = ParseName(doc, conf.ProductNameXPath);
        pt.HashName = Common.GetHashNameProduct(domain, pt.Name);
        pt.ShortDescription = ParseShortDescription(doc, conf.ShortDescriptionXPath);
        pt.Categories = ParseCategories(doc, conf.CategoryXPath, domain, conf.RemoveLastItemClassification, detailUrl);

        // Both price parsers take the page URI; build it once.
        Uri detailUri = new Uri(detailUrl);
        pt.OriginPrice = ParsePrice(doc, conf.OriginPriceXPath, conf.CheckPrice, detailUri, null);
        pt.Price = ParsePrice(doc, conf.PriceXPath, conf.CheckPrice, detailUri,
            conf.RegexPrice.Split(Common.arSplitToList, StringSplitOptions.RemoveEmptyEntries).ToList());

        // Start from the configured VAT status; the VAT text on the page overrides it.
        pt.VATStatus = conf.VATStatus;
        pt.VatInfo = ParseVatInfo(doc, conf.VATInfoXPath);
        if (!string.IsNullOrEmpty(pt.VatInfo))
        {
            string vatInfoLower = pt.VatInfo.ToLower();
            if (vatInfoLower.Contains("chưa"))
            {
                pt.VATStatus = 0;
            }
            // Checked second, so "đã" wins when both words occur.
            if (vatInfoLower.Contains("đã"))
            {
                pt.VATStatus = 1;
            }
        }

        pt.PromotionInfo = ParsePromotionInfo(doc, conf.PromotionInfoXPath);
        pt.StartDeal = ParseStartDeal(doc, conf.StartDealXPath);
        pt.EndDeal = ParseEndDeal(doc, conf.EndDealXPath);
        pt.Warranty = ParseWarranty(doc, conf.WarrantyXPath);
        pt.Status = ParseStatus(doc, conf.StatusXPath);
        pt.Instock = Common.GetProductInstockFormStatus(pt.Status);
        pt.ImageUrls = ParseImages(doc, conf.ImageXPath, conf.AutoFixLinkImage, detailUrl);
        pt.Manufacture = ParseManufacture(doc, conf.ManufactureXPath);
        pt.Origin = ParseOrigin(doc, conf.OriginXPath);
        pt.Promotion = ParsePromotion(doc, conf.PromotionXPath);
        pt.Summary = ParseSummary(doc, conf.SummaryXPath);
        pt.ProductContent = ParseContent(doc, conf.ContentXPath);

        pt.VideoDescHtml = GetDesc(doc, conf.VideoXpath.Split(Common.arSplitToList, StringSplitOptions.RemoveEmptyEntries).ToList());
        pt.SpecsDescHtml = GetDesc(doc, conf.SpecsXPath.Split(Common.arSplitToList, StringSplitOptions.RemoveEmptyEntries).ToList());
        pt.FullDescHtml = GetDesc(doc, conf.FullDescXPath.Split(Common.arSplitToList, StringSplitOptions.RemoveEmptyEntries).ToList());
        pt.ShortDescHtml = GetDesc(doc, conf.ShortDescriptionXPath);

        pt.Valid = pt.IsSuccessData(conf.CheckPrice);
    }
    catch (Exception ex)
    {
        // Same exception type and message as before, but the original exception is now kept
        // as InnerException so its type and full chain are not lost.
        throw new Exception(
            string.Format("Company: {0} Product: {1} Url: {2}: {3}", pt.CompanyId, pt.ID, pt.DetailUrl, ex.Message + ex.StackTrace),
            ex);
    }
}