/// <summary>
/// Downloads the page at <paramref name="url"/>, parses it and returns the text
/// that <c>QT.Entities.Common.GetTextInNode</c> extracts for <paramref name="xpath"/>.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="xpath">XPath expression forwarded unchanged to <c>GetTextInNode</c>.</param>
/// <returns>
/// The list produced by <c>GetTextInNode</c> for the parsed document, or an empty
/// list when the download yielded no HTML.
/// </returns>
public List<string> GetListTag(string url, string xpath)
{
    // Arguments 45, 2, true are passed through exactly as before; their meaning is
    // defined by HTMLTransmitter.getHTML (presumably timeout/retry settings - confirm there).
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2, true);

    // DoCrawler guards the same downloader with IsNullOrEmpty before parsing;
    // apply the same guard here instead of handing a failed download to LoadHtml.
    if (string.IsNullOrEmpty(html))
    {
        return new List<string>();
    }

    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    return QT.Entities.Common.GetTextInNode(doc, xpath);
}
/// <summary>
/// Crawler thread body. Repeats forever: seeds a queue with the configured root
/// links, walks the queue breadth-first (download, discover links, extract and store
/// products), then appends a "wait" message to <c>richTextBox1</c> and sleeps 10 seconds
/// before starting the next pass.
/// </summary>
/// <remarks>
/// The method only returns when a <see cref="ThreadAbortException"/> is raised on the
/// thread. Any other exception is not handled here.
/// Setting <c>Pause</c> ends the current pass early; the queue and visited set are
/// local to a pass, so the next pass starts again from the root links.
/// </remarks>
private void DoCrawler()
{
    // Lookup tables are loaded once and reused by every pass.
    Dictionary<long, int[]> dicMapClassAndCategori =
        this.raovatSqlAdapter.GetDicMapClassificationAndCategories(this.websiteRaoVat.id);
    Dictionary<int, string[]> dicMapCity = this.raovatSqlAdapter.GetDicCityAndRegex();

    while (true)
    {
        try
        {
            // Number of product pages whose detail extraction failed in this pass.
            int igone = 0;

            // Initialise the per-pass state.
            Queue<JobCrawlerSale> queueUrl = new Queue<JobCrawlerSale>();
            HashSet<long> visited = new HashSet<long>();
            foreach (var item in this.RunnerCrawler.root_link)
            {
                queueUrl.Enqueue(new JobCrawlerSale() { deep = 0, url = item });
            }
            this.ShowQueue(queueUrl.Count);

            while (!this.Pause && queueUrl.Count > 0)
            {
                JobCrawlerSale job = queueUrl.Dequeue();
                ShowUrlCurrent(job.url);
                ShowQueue(queueUrl.Count);

                if (configXPath.TimeDelay > 0)
                {
                    Thread.Sleep(configXPath.TimeDelay);
                }

                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(job.url, 45, 2);
                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(html);

                // Extraction: queue the links found on this page.
                this.EnqueueLinksFromDocument(doc, job, queueUrl, visited);

                // Analysis: only pages whose URL passes the product filter are parsed.
                bool isProductPage = QT.Entities.Common.CheckRegex(
                    QT.Entities.Common.CompactUrl(job.url),
                    configXPath.ProductUrlsRegex,
                    configXPath.NoProductUrlRegex,
                    false);
                if (!isProductPage)
                {
                    continue;
                }

                if (!this.StoreProductFromDocument(job, doc, dicMapClassAndCategori, dicMapCity))
                {
                    // Pre-increment so the reported figure includes the page just skipped
                    // (the former post-increment reported 0 after the first skip).
                    ShowIgone(++igone);
                }
            }

            this.Invoke(new Action(() => { richTextBox1.AppendText("\n\rWait to next run!"); }));
            Thread.Sleep(10000);
        }
        catch (ThreadAbortException)
        {
            // The thread is being aborted: leave the crawl loop.
            return;
        }
    }
}

/// <summary>
/// Resolves every <c>a[@href]</c> in <paramref name="doc"/> and enqueues the links not
/// seen before in this pass that pass the visit or product URL filters.
/// </summary>
/// <param name="doc">Parsed page the links are read from.</param>
/// <param name="job">Job that produced <paramref name="doc"/>; supplies the current depth.</param>
/// <param name="queueUrl">Work queue that receives the new jobs.</param>
/// <param name="visited">CRC keys of the compacted URLs already seen in this pass.</param>
private void EnqueueLinksFromDocument(
    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc,
    JobCrawlerSale job,
    Queue<JobCrawlerSale> queueUrl,
    HashSet<long> visited)
{
    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return;
    }

    foreach (var anchor in anchors)
    {
        string absoluteUrl = QT.Entities.Common.GetAbsoluteUrl(anchor.Attributes["href"].Value, this.websiteRaoVat.base_link);
        string compactLink = QT.Entities.Common.CompactUrl(absoluteUrl);

        // The raw CRC is used as the key. Wrapping it in Math.Abs threw OverflowException
        // for long.MinValue and merged x with -x; the key never leaves this pass.
        long crc = GABIZ.Base.Tools.getCRC64(compactLink);

        // HashSet.Add returns false when the key is already present.
        if (!visited.Add(crc))
        {
            continue;
        }
        ShowVisited(visited.Count);

        bool matchesProduct = QT.Entities.Common.CheckRegex(compactLink, configXPath.ProductUrlsRegex, configXPath.NoProductUrlRegex, false);
        bool matchesVisit = QT.Entities.Common.CheckRegex(compactLink, configXPath.VisitUrlsRegex, configXPath.NoVisitUrlRegex, false);

        // Links matching the visit filter are followed only while below max_deep;
        // links matching only the product filter are queued regardless of depth.
        bool shouldEnqueue = matchesVisit
            ? job.deep + 1 < this.RunnerCrawler.max_deep
            : matchesProduct;

        if (shouldEnqueue)
        {
            queueUrl.Enqueue(new JobCrawlerSale() { url = absoluteUrl, deep = job.deep + 1 });
            ShowQueue(queueUrl.Count);
        }
    }
}

/// <summary>
/// Runs the product analysis on <paramref name="doc"/> and, when it succeeds, saves the
/// classification, inserts or updates the product in MongoDB and displays it.
/// </summary>
/// <param name="job">Job whose URL is passed to the analyser.</param>
/// <param name="doc">Parsed page to analyse.</param>
/// <param name="dicMapClassAndCategori">Classification/category map forwarded to the analyser.</param>
/// <param name="dicMapCity">City map forwarded to the analyser.</param>
/// <returns><c>true</c> when <c>IsDetailSucess</c> was set by the analyser; otherwise <c>false</c>.</returns>
private bool StoreProductFromDocument(
    JobCrawlerSale job,
    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc,
    Dictionary<long, int[]> dicMapClassAndCategori,
    Dictionary<int, string[]> dicMapCity)
{
    QT.Entities.RaoVat.HandlerContentOfHtml handlerContentHtml = new Entities.RaoVat.HandlerContentOfHtml();
    ProductSaleNew productSaleNew = new ProductSaleNew();
    handlerContentHtml.AnalyticsProductSaleNew(websiteRaoVat.domain, job.url, doc, configXPath, productSaleNew, dicMapClassAndCategori, dicMapCity);

    if (!productSaleNew.IsDetailSucess)
    {
        return false;
    }

    // Saving the classification stays best-effort: a failure must not prevent the
    // product from being stored, but it is traced instead of vanishing silently.
    try
    {
        this.raovatSqlAdapter.SaveClassification(productSaleNew.website_id, productSaleNew.web_category);
    }
    catch (Exception ex01)
    {
        System.Diagnostics.Trace.TraceWarning("SaveClassification failed: " + ex01);
    }

    if (!this.mongoDbAdapter.CheckExistsProductSalenew(productSaleNew.id))
    {
        this.mongoDbAdapter.InsertProduct(productSaleNew);
    }
    else
    {
        this.mongoDbAdapter.UpdateProduct(productSaleNew);
    }

    ShowProduct(productSaleNew);
    return true;
}