/// <summary>
/// Debug/preview helper: runs product analytics against the company's configured
/// test link and displays the preview form.
/// </summary>
/// <param name="CompanyId">Identifier of the company whose test link is analyzed.</param>
public static void ShowProduct(long CompanyId)
{
    // NOTE(review): 'company' is never read below; the constructor is kept only in case
    // Entities.Company loads state as a side effect — confirm and delete if it does not.
    Entities.Company company = new Entities.Company(CompanyId);
    Configuration config = new Configuration(CompanyId);
    ProductParse pp = new ProductParse();
    ProductEntity product = new ProductEntity();
    GABIZ.Base.HtmlAgilityPack.HtmlDocument document = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();

    // Populate 'product' from the configured test URL.
    // (Removed dead locals: 'detailUrl' and the 'strDataShow' string were built but never used.)
    pp.Analytics(product, document, config.LinkTest, config, config.Domain);

    frmShow.Visible = true;
    frmShow.Show();
}
/// <summary>
/// Service entry point: starts <c>workerCount</c> RabbitMQ consumers that parse
/// crawled product pages and persist valid products to SQL.
/// </summary>
/// <param name="args">Service start arguments (unused).</param>
protected override void OnStart(string[] args)
{
    log.Info("Start service");
    try
    {
        InitializeComponent();
        cancelTokenSource = new CancellationTokenSource();
        string rabbitMQServerName = ConfigurationManager.AppSettings["rabbitMQServerName"];
        workers = new Worker[workerCount];
        rabbitMQServer = RabbitMQManager.GetRabbitMQServer(rabbitMQServerName);

        // SECURITY(review): database credentials are hard-coded in source. Move these
        // connection strings (and their passwords) into protected configuration and
        // rotate the exposed passwords.
        string connectToSQL = @"Data Source=172.22.30.86,1455;Initial Catalog=QT_2;Persist Security Info=True;User ID=qt_vn;Password=@F4sJ=l9/ryJt9MT;connection timeout=200";
        string connectToConnection = @"Data Source=42.112.28.93;Initial Catalog=QT_2;Persist Security Info=True;User ID=wss_price;Password=HzlRt4$$axzG-*UlpuL2gYDu;connection timeout=200";

        // NOTE(review): crawlerProductAdapter is never used in this method — kept in case
        // the constructor has side effects; confirm and delete if it does not.
        CrawlerProductAdapter crawlerProductAdapter = new CrawlerProductAdapter(new SqlDb(connectToSQL));
        ProductAdapter productAdapter = new ProductAdapter(new SqlDb(connectToConnection));

        for (int i = 0; i < workerCount; i++)
        {
            // BUG FIX: was InfoFormat("Start worker {i}", ...), which logs the literal
            // text "{i}" — composite format strings use numbered placeholders.
            log.InfoFormat("Start worker {0}", i);
            var worker = new Worker(AddProductToSqlJobName, false, rabbitMQServer);
            workers[i] = worker;
            // Capture the token in a loop-local so each task observes cancellation correctly.
            var token = this.cancelTokenSource.Token;
            Task workerTask = new Task(() =>
            {
                worker.JobHandler = (downloadImageJob) =>
                {
                    try
                    {
                        token.ThrowIfCancellationRequested();
                        QT.Entities.CrawlerProduct.RabbitMQ.MsSaveProduct Mss = QT.Entities.CrawlerProduct.RabbitMQ.MsSaveProduct.GetDataFromMessage(downloadImageJob.Data);
                        string Url = Mss.Url;
                        string Domain = QT.Entities.Common.GetDomainFromUrl(Url);
                        long CompanyID = QT.Entities.Common.GetIDCompany(Domain);
                        QT.Entities.Configuration config = new QT.Entities.Configuration(CompanyID);
                        if (_company.Status == Common.CompanyStatus.WEB_CRAWLERDOMAIN)
                        {
                            // Crawler-domain companies: fetch and parse the page only.
                            // (Removed dead locals 'ls' and 'obj' — they were never used.)
                            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(Url.Trim(), 15, 1);
                            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                            // Neutralize <form> tags before parsing (same pre-processing as below).
                            html = html.Replace("<form", "<div");
                            html = html.Replace("</form", "</div");
                            doc.LoadHtml(html);
                        }
                        else
                        {
                            int numberItemSaved = 0;
                            string[] arLink = Url.Trim().Split(SqlDb.arSplit, StringSplitOptions.RemoveEmptyEntries);
                            foreach (var item in arLink)
                            {
                                QT.Entities.Product _product = new Product();
                                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(item, 45, 2);

                                // Optionally narrow the HTML to the configured start/end markers
                                // before tidying, so analytics only sees the relevant fragment.
                                if (config.ContentAnanyticXPath.Count >= 1)
                                {
                                    int i1 = html.IndexOf(config.ContentAnanyticXPath[0]);
                                    if (i1 >= 0)
                                    {
                                        html = html.Substring(i1);
                                        if (config.ContentAnanyticXPath.Count >= 2)
                                        {
                                            int i2 = html.IndexOf(config.ContentAnanyticXPath[1]);
                                            if (i2 >= 0)
                                            {
                                                html = html.Substring(0, i2 + config.ContentAnanyticXPath[1].Length);
                                            }
                                        }
                                    }
                                    html = html.Replace("<form", "<div");
                                    html = html.Replace("</form", "</div");
                                    html = Common.TidyCleanR(html);
                                }
                                _htmlSource = html;
                                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                                html = html.Replace("<form", "<div");
                                html = html.Replace("</form", "</div");
                                doc.LoadHtml(html);
                                List<Product> lstUpdateProduct = new List<Product>();
                                List<Product> lstInsertProduct = new List<Product>();
                                _product.Analytics(doc, item, config, true, _company.Domain);
                                // '_product != null' was redundant — it is constructed just above.
                                if (_product.IsSuccessData(config.CheckPrice))
                                {
                                    numberItemSaved++;
                                    if (productAdapter.CheckExistInDb(_product.ID))
                                    {
                                        lstUpdateProduct.Add(_product);
                                    }
                                    else
                                    {
                                        lstInsertProduct.Add(_product);
                                    }
                                    productAdapter.UpdateProductsChangeToDb(lstUpdateProduct);
                                    productAdapter.InsertListProduct(lstInsertProduct);
                                    productAdapter.PushQueueIndexCompany(config.CompanyID);
                                    productAdapter.PushQueueChangeChangeImage(new MQChangeImage() { ProductID = _product.ID, Type = 1 });
                                    log.InfoFormat("Saved {0} item product!", _product.Name);
                                }
                            }
                        }
                        return (true);
                    }
                    catch (OperationCanceledException) // exception variable 'opc' was unused
                    {
                        log.Info("End worker");
                        return (false);
                    }
                };
                worker.Start();
            }, token);
            workerTask.Start();
            log.InfoFormat("Worker {0} started", i);
        }
    }
    catch (Exception ex)
    {
        log.Error("Start error", ex);
        throw;
    }
}
/// <summary>
/// Initializes RabbitMQ producers, Redis caches and per-company crawler state.
/// </summary>
/// <returns>
/// true when initialization succeeds; false after logging and (best-effort) reporting
/// the failure over RabbitMQ.
/// </returns>
public bool Init()
{
    try
    {
        var rabbitMqCrawler = RabbitMQManager.GetRabbitMQServer(ConfigCrawler.KeyRabbitMqCrawler);
        _producerReportSessionRunning = new ProducerBasic(rabbitMqCrawler, ConfigCrawler.ExchangeSessionRunning, ConfigCrawler.RoutingkeySessionRunning);
        _producerReportError = new ProducerBasic(rabbitMqCrawler, ConfigCrawler.ExchangeErorrCrawler, ConfigCrawler.RoutingKeyErrorCrawler);
        _producerProductChange = new ProducerBasic(rabbitMqCrawler, ConfigCrawler.ExchangeChangeProduct, ConfigCrawler.RoutingkeyChangeProduct);
        // NOTE(review): the exchange constant is passed as the routing key too — every other
        // producer here pairs an Exchange* with a distinct Routingkey* constant. Looks like a
        // copy-paste slip; confirm the intended routing key before changing.
        _producerDuplicateProduct = new ProducerBasic(rabbitMqCrawler, ConfigCrawler.ExchangeDuplicateProductToCache, ConfigCrawler.ExchangeDuplicateProductToCache);
        _producerEndCrawler = new ProducerBasic(rabbitMqCrawler, ConfigCrawler.ExchangeEndSession, ConfigCrawler.RoutingEndSession);
        _producerVisitedLinkFindNew = new ProducerBasic(rabbitMqCrawler, ConfigCrawler.ExchangeVisitedLinkFindNew, ConfigCrawler.RoutingKeyVisitedLinkFindNew);

        // Company/config were constructed twice in the original; once is enough.
        _company = new Company(_companyId);
        _config = new Configuration(_companyId);
        if (_config.LimitProductValid == 0)
        {
            // 0 means "no limit configured" — fall back to an effectively unbounded cap.
            this._limitProductValid = 1000000;
        }

        _cacheCrcVisited = RedisCrcVisitedFindNew.Instance();
        _cacheWaitCrawler = RedisCompanyWaitCrawler.Instance();
        _cacheLastUpdateProduct = RedisLastUpdateProduct.Instance();
        _cacheProductHash = CacheProductHash.Instance();
        _cacheCacheCompanyCrawler = RedisCacheCompanyCrawler.Instance();
        _cacheDuplicateProduct = CacheDuplicateProduct.Instance();

        _visitedCrc = new HashSet<long>();
        _linkQueue = new Queue<JobFindNew>();
        _crcProductOldGroup = new HashSet<long>();
        _dicDuplicate = new Dictionary<long, long>();
        _countVisited = 0;
        _countNewProduct = 0;
        _tokenCrawler.ThrowIfCancellationRequested();
        _visitRegexs = _config.VisitUrlsRegex;
        _detailLinkRegexs = _config.ProductUrlsRegex;
        _noCrawlerRegexs = _config.NoVisitUrlRegex ?? new List<string>();
        _noCrawlerRegexs.AddRange(UtilCrawlerProduct.NoCrawlerRegexDefault);
        _timeStart = DateTime.Now;
        // The original also assigned '_rootUri = new Uri(_company.Website)' earlier and then
        // overwrote it here without ever reading it; the redundant assignment was removed.
        _rootUri = Common.GetUriFromUrl(_company.Website);
        _hsDuplicateProduct = _cacheDuplicateProduct.GetHashDuplicate(_companyId);
        ClearOldCache();
        LoadCrcOldProduct();
        LoadOldQueue();
        return (true);
    }
    catch (Exception ex)
    {
        _log.Error(string.Format("Company:{0} {1} {2}", _companyId, ex.Message, ex.StackTrace));
        string mss = Newtonsoft.Json.JsonConvert.SerializeObject(new ErrorCrawler()
        {
            CompanyId = _companyId,
            ProductId = 0,
            TimeError = DateTime.Now,
            Message = "Init" + ex.Message + ex.StackTrace
        });
        // BUG FIX: guard against failures that occur before the producers are constructed
        // (e.g. GetRabbitMQServer throwing) — the original dereferenced _producerReportError
        // unconditionally and would throw NullReferenceException inside this catch.
        if (_producerReportError != null)
        {
            _producerReportError.PublishString(mss, true);
        }
        if (_producerEndCrawler != null)
        {
            _producerEndCrawler.PublishString(new CrawlerSessionLog()
            {
                CompanyId = _companyId,
                CountChange = 0,
                CountProduct = 0,
                CountVisited = 0,
                Domain = "",
                EndAt = DateTime.Now,
                Ip = Dns.GetHostName(),
                NumberDuplicateProduct = 0,
                Session = this._session,
                StartAt = this._timeStart,
                TotalProduct = 0,
                TypeCrawler = 0,
                TypeEnd = "Error Init",
                TypeRun = "Auto"
            }.ToJson());
        }
        return (false);
    }
}