/// <summary>
/// Downloads a page and evaluates an XPath expression against it.
/// </summary>
/// <param name="DetailUrl">Address of the page to download.</param>
/// <param name="configXpath">XPath expression to evaluate.</param>
/// <returns>
/// The inner text of the last node matched by <paramref name="configXpath"/>;
/// an empty string when nothing matches or no HTML could be downloaded;
/// the fixed message "Không phân tích được!" when no XPath is supplied.
/// </returns>
public string TestXpath(string DetailUrl, string configXpath)
{
    // Nothing to evaluate without an XPath, so do not spend a download on it.
    if (string.IsNullOrEmpty(configXpath))
    {
        return "Không phân tích được!";
    }

    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(DetailUrl, 45, 2);
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    // <form> tags are rewritten to <div> before parsing.
    // NOTE(review): presumably a workaround for how the parser treats forms - confirm.
    html = html.Replace("<form", "<div").Replace("</form", "</div");

    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    var matches = doc.DocumentNode.SelectNodes(configXpath);
    if (matches == null || matches.Count == 0)
    {
        return "";
    }

    // When several nodes match, the last one wins.
    return matches[matches.Count - 1].InnerText;
}
/// <summary>
/// Parses a product from a detail page and, when it is valid, new and not a
/// duplicate, stores it and publishes the related cache/queue updates.
/// </summary>
/// <param name="urlCurrent">Address of the page that was downloaded.</param>
/// <param name="doc">Parsed HTML of that page.</param>
private void AnalysicProduct(string urlCurrent, GABIZ.Base.HtmlAgilityPack.HtmlDocument doc)
{
    if (!IsDetailUrl(urlCurrent, _detailLinkRegexs))
    {
        return;
    }

    if (_company.Status == Common.CompanyStatus.TIN)
    {
        // For this status the product is parsed but nothing is stored.
        Product parsedOnly = new Product();
        parsedOnly.Analytics(doc, urlCurrent, _config, false, _company.Domain);
        return;
    }

    var pt = new Product();
    pt.Analytics(doc, urlCurrent, _config, false, _company.Domain);
    if (!pt.IsSuccessData(this._config.CheckPrice))
    {
        return;
    }

    pt.Valid = false;
    if (IsExistsProduct(pt.ID))
    {
        return;
    }

    if (this._hsHashDuplicate.ContainsKey(pt.GetHashCheckDuplicate()))
    {
        _log.Info("Duplicate data");
        return;
    }

    _totalProductBefore++;
    _productAdapter.InsertProduct(pt);
    _redisLastUpdateProduct.UpdateBathLastUpdateProduct(this._companyId, new List<long> { pt.ID }, DateTime.Now);

    var hashEntry = new ProductHash()
    {
        HashChange = pt.GetHashChange(),
        HashDuplicate = pt.GetHashCheckDuplicate(),
        Id = pt.ID,
        Price = pt.Price,
        url = pt.DetailUrl,
        HashImage = pt.GetHashImage(),
    };
    _cacheProductHash.SetCacheProductHash(
        _companyId,
        new List<QT.Entities.CrawlerProduct.Cache.ProductHash> { hashEntry });

    // Disabled in the original source:
    //_productAdapter.PushMQChangeImage(new List<long> { pt.ID });

    // A new product is reported as a price change from 0 to the current price.
    _mqLogChangePrice.PushQueueChangePriceLog(
        new JobRabbitChangePrice()
        {
            Name = pt.Name,
            OldPrice = 0,
            NewPrice = pt.Price,
            ProductID = pt.ID,
            CompanyID = pt.IDCongTy
        });

    AddToDuplicate(pt.GetHashCheckDuplicate(), pt.ID);
}
/// <summary>
/// Click handler: walks the model menu of bantoyota.com.vn, opens each model
/// page accepted by the adapter and saves the keywords listed on it.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnDuyetTang_Click(object sender, EventArgs e)
{
    ProductSaleNewDataAdapter productAdapter = new ProductSaleNewDataAdapter(new QT.Entities.Data.SqlDb(QT.Entities.Server.ConnectionStringCrawler));
    try
    {
        string root = "http://bantoyota.com.vn";
        Uri uriroot = new Uri(root);
        string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(root, 45, 2);
        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        // List of car model lines taken from the site menu.
        var nodeMakers = doc.DocumentNode.SelectNodes(@"//ul[@class='sub-menu']//li//a");
        if (nodeMakers != null)
        {
            // Visit each car model node.
            foreach (var nodeModelCar in nodeMakers)
            {
                string strModelCar = nodeModelCar.InnerText.Trim();
                if (productAdapter.CheckExitFullLink("toyota->" + strModelCar) < 0)
                {
                    continue;
                }

                // Resolve the link against the site root so that both relative and
                // absolute hrefs work; skip anchors without a usable href.
                var modelHref = nodeModelCar.Attributes["href"];
                Uri modelUri;
                if (modelHref == null || !Uri.TryCreate(uriroot, modelHref.Value, out modelUri))
                {
                    continue;
                }

                GABIZ.Base.HtmlAgilityPack.HtmlDocument docModel = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                docModel.LoadHtml(System.Web.HttpUtility.HtmlDecode(GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(modelUri.AbsoluteUri, 45, 2)));

                string xPathKeyWord = "//div[@class='rightsearch classhot']//div[@class='item']//a";
                var nodesKeyWords = docModel.DocumentNode.SelectNodes(xPathKeyWord);
                if (nodesKeyWords == null)
                {
                    continue;
                }

                foreach (var nodeKeyWord in nodesKeyWords)
                {
                    var titleAttribute = nodeKeyWord.Attributes["title"];
                    if (titleAttribute == null)
                    {
                        continue;
                    }

                    string keyWord = titleAttribute.Value.Trim().ToLower().Replace(" ", "");

                    // Description extraction is disabled, so an empty description is saved.
                    // The keyword detail page is therefore not downloaded any more: its
                    // result was never used.
                    string description = "";

                    // Save the data.
                    // NOTE(review): keyWord has already had its spaces removed, so
                    // Replace("bán xe", "") cannot match anything - confirm the intended text.
                    try
                    {
                        productAdapter.SaveKeyWord("toyota", strModelCar, keyWord.Replace("bán xe", ""), keyWord, description);
                    }
                    catch (Exception ex)
                    {
                        MessageBox.Show(ex.Message);
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }

    MessageBox.Show("Hoàn tất!");
}
/// <summary>
/// For every row of Category_Lazada, downloads the category page, reads the
/// breadcrumb text and stores it in the CategoryName column.
/// Progress is written to the console; the method then waits for a line of input.
/// </summary>
private void AddCategoryName()
{
    const string xpathName = "//li[@class='last-child']//span[@class='header-breadcrumb__element']";

    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    DataTable tblCategory = sqlDb.GetTblData("select * from Category_Lazada");
    foreach (DataRow RowInfo in tblCategory.Rows)
    {
        long ID = QT.Entities.Common.Obj2Int64(RowInfo["ID"]);
        string url = QT.Entities.Common.Obj2String(RowInfo["Url"]);

        doc.LoadHtml(this.GetHtml(url));
        var nodes = doc.DocumentNode.SelectNodes(xpathName);
        if (nodes != null && nodes.Count > 0)
        {
            // Every match would overwrite the same row, so only the last match
            // is written: one UPDATE per category instead of one per node.
            string name = nodes[nodes.Count - 1].InnerText;
            sqlDb.RunQuery(
                "Update Category_Lazada set CategoryName = @CategoryName where ID = @ID",
                CommandType.Text,
                new System.Data.SqlClient.SqlParameter[]
                {
                    sqlDb.CreateParamteter("@CategoryName", name, SqlDbType.NVarChar),
                    sqlDb.CreateParamteter("@ID", ID, SqlDbType.BigInt)
                });
        }

        Console.WriteLine("Success: " + ID);
    }

    Console.WriteLine("Done!");
    Console.ReadLine();
}
/// <summary>
/// Collects every anchor href of the page and queues the links that are short
/// enough, not yet seen and accepted by the visit rules.
/// </summary>
/// <param name="doc">Parsed HTML of the page being crawled.</param>
private void ExtractionLink(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc)
{
    var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        return;
    }

    // First pass: take a snapshot of the raw href values.
    List<string> rawHrefs = new List<string>();
    foreach (var anchor in anchors)
    {
        rawHrefs.Add(anchor.Attributes["href"].Value.ToString());
    }

    // Second pass: normalise, filter and enqueue.
    foreach (string rawHref in rawHrefs)
    {
        string absoluteLink = System.Web.HttpUtility.HtmlDecode(Common.GetAbsoluteUrl(rawHref, _rootUri)).Trim();
        if (absoluteLink.Length >= MaxLengthUrl)
        {
            continue;
        }

        long linkCrc = Common.GetIDProduct(absoluteLink);
        bool accepted = !_visitedCrc.ContainsKey(linkCrc)
                        && !IsNoVisitUrl(absoluteLink)
                        && IsVisitLink(absoluteLink);
        if (!accepted)
        {
            continue;
        }

        _visitedCrc.Add(linkCrc, true);
        _linkQueue.Enqueue(absoluteLink);
    }
}
/// <summary>
/// Click handler: downloads the test URL, parses it with the parser resolved
/// for the entered domain and shows the result as JSON.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnTest_Click(object sender, EventArgs e)
{
    string urlTest = linkTestTextBox.Text;
    IDownloadHtml dowloadHtml = new DownloadHtmlCrawler();
    WebExceptionStatus webExceptionStatus = WebExceptionStatus.Success;
    string html = System.Web.HttpUtility.HtmlDecode(dowloadHtml.GetHtml(urlTest, 45, 2, out webExceptionStatus));
    if (string.IsNullOrEmpty(html))
    {
        MessageBox.Show("Can't download html");
        return;
    }

    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);

    // The kernel is disposable; release it (and whatever it created) once the
    // parse result has been rendered.
    using (var kerner = new StandardKernel(new DomainModule()))
    {
        IParser parseNormal = kerner.Get<IParser>();
        parseNormal.Init(domainTextBox.Text);
        var propertiesData = parseNormal.ParseData(htmlDocument);
        if (propertiesData != null)
        {
            txtProductTest.Text = propertiesData.GetJSonDisplay();
        }
    }
}
/// <summary>
/// Downloads a page and returns the text of the nodes selected by an XPath.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="xpath">XPath expression handed to <c>Common.GetTextInNode</c>.</param>
/// <returns>The list produced by <c>QT.Entities.Common.GetTextInNode</c>.</returns>
public List<string> GetListTag(string url, string xpath)
{
    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2, true));
    return QT.Entities.Common.GetTextInNode(doc, xpath);
}
/// <summary>
/// Breadth-first crawl starting at http://www.lazada.vn. Links accepted by the
/// visit regexes are queued once; pages whose URL matches the product regexes
/// are inserted into Category_Lazada.
/// </summary>
private void StartCrawler()
{
    var pendingJobs = new Queue<QT.Moduls.Crawler.Job>();
    pendingJobs.Enqueue(new QT.Moduls.Crawler.Job()
    {
        url = "http://www.lazada.vn",
        ProductId = QT.Entities.Common.CrcProductID("http://www.lazada.vn")
    });

    // The queue starts with the seed job, so a plain while loop visits the
    // same jobs as a do/while would.
    while (pendingJobs.Count > 0)
    {
        QT.Moduls.Crawler.Job jobData = pendingJobs.Dequeue();
        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(this.GetHtml(jobData.url));

        // Link extraction. Pages without anchors are skipped entirely,
        // including the product check below.
        var anchorNodes = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchorNodes == null)
        {
            continue;
        }

        List<string> hrefs = new List<string>();
        foreach (var anchorNode in anchorNodes)
        {
            hrefs.Add(anchorNode.Attributes["href"].Value.ToString());
        }

        foreach (string aUrl in hrefs)
        {
            if (!QT.Entities.Common.CheckRegex(aUrl, this.lstRegexOK, this.lstRegexIgone, false))
            {
                continue;
            }

            long linkId = QT.Entities.Common.CrcProductID(aUrl);
            bool alreadyQueued = false;
            addedQueue.TryGetValue(linkId, out alreadyQueued);
            if (alreadyQueued)
            {
                continue;
            }

            this.addedQueue.Add(linkId, true);
            pendingJobs.Enqueue(new QT.Moduls.Crawler.Job()
            {
                ConfigID = 0,
                deep = jobData.deep + 1,
                ProductId = linkId,
                url = aUrl
            });
        }

        // Product analysis: record the current page when its URL matches.
        if (QT.Entities.Common.CheckRegex(jobData.url, this.lstRegexProduct, null, true))
        {
            this.sqlDb.RunQuery(
                "Insert into Category_Lazada (ID, Url) Values (@ID, @Url)",
                System.Data.CommandType.Text,
                new System.Data.SqlClient.SqlParameter[]
                {
                    SqlDb.CreateParamteterSQL("@ID", jobData.ProductId, System.Data.SqlDbType.BigInt),
                    SqlDb.CreateParamteterSQL("@Url", jobData.url, System.Data.SqlDbType.NVarChar)
                },
                true,
                10);
        }
    }
}
/// <summary>
/// Click handler: analyses the test URL with the configuration currently shown
/// on the form and displays the resulting product in a dialog from which it can
/// be saved.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnTestProduct_Click(object sender, EventArgs e)
{
    string urlTest = urlTestTextBox.Text;
    if (string.IsNullOrWhiteSpace(urlTest))
    {
        MessageBox.Show("Not url test");
        return;
    }

    // Download the page once; this copy is what the save handler stores.
    WebExceptionStatus status = WebExceptionStatus.Success;
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(urlTest, 45, 2, out status);

    ConfigXPaths config = this.raovatSqlAdapter.GetConfigByID((int)this.configXPathIDSpinEdit.Value);
    if (config == null)
    {
        config = new ConfigXPaths() { ID = -1 };
    }

    if (!this.LoadFormToConfig(ref config))
    {
        return;
    }

    var product = new ProductSaleNew();
    this.hanlerContentOfHtml.AnalyticsProductSaleNew(
        config.domain,
        urlTest,
        config,
        product,
        this.raovatSqlAdapter.GetDicMapClassificationAndCategories(config.website_id),
        this.raovatSqlAdapter.GetDicCityAndRegex());

    // A form shown with ShowDialog is not disposed when it closes, so dispose it here.
    using (FrmDataShow frmDataShow = new FrmDataShow(product.ToString()))
    {
        frmDataShow.btnSave.Click += new EventHandler(delegate(object obj, EventArgs eventArg)
        {
            if (MessageBox.Show("Save to Cassandra?", "Warning", MessageBoxButtons.YesNo, MessageBoxIcon.Warning) != System.Windows.Forms.DialogResult.Yes)
            {
                return;
            }

            bool bExits = this.mongoDbAdapter.CheckExistsProductSalenew(product.id);
            if (bExits)
            {
                mongoDbAdapter.UpdateProduct(product);
            }
            else
            {
                mongoDbAdapter.InsertProduct(product);
            }

            // The raw HTML is stored in both cases.
            mongoDbAdapter.SaveHtml(product.id, html, bExits);
        });
        frmDataShow.ShowDialog();
    }
}
/// <summary>
/// Extracts the anchors of a page and queues the links that pass the length,
/// duplicate and regex checks, unless the job is too deep or the visited set
/// is already over the configured limit.
/// </summary>
/// <param name="doc">Parsed HTML of the page.</param>
/// <param name="job">Job that produced the page; supplies depth and parent id.</param>
private void Extraction(HtmlDocument doc, JobFindNew job)
{
    var countLinkAdds = 0;
    var countLinks = 0;

    if (job.Deep > _config.MaxDeep)
    {
        _log.Info("Over dee. Not extraction");
        return;
    }

    if (_visitedCrc.Count > _config.MaxLinksFindNew)
    {
        _log.Info("Over max link crc. Not extraction");
        return;
    }

    var nodeLinks = doc.DocumentNode.SelectNodes("//a[@href]");
    if (nodeLinks != null)
    {
        foreach (var nodelink in nodeLinks)
        {
            countLinks++;
            var link = System.Web.HttpUtility.HtmlDecode(Common.GetAbsoluteUrl(nodelink.Attributes["href"].Value, _rootUri)).Trim();

            // Company-specific rule: cut the link just before the character that
            // precedes "sid".
            if (_companyId == 480254425312154563 && link.Contains("sid"))
            {
                int sidIndex = link.IndexOf("sid", StringComparison.Ordinal);

                // Guard: with "sid" at position 0 the length would be -1 and
                // Substring would throw, aborting extraction for the whole page.
                if (sidIndex > 0)
                {
                    link = link.Substring(0, sidIndex - 1);
                }
            }

            if (link.Length >= MaxLengthUrl)
            {
                continue;
            }

            var crcNewLink = Common.GetIDProduct(link);
            if (_visitedCrc.Contains(crcNewLink)
                || _crcProductOldGroup.Contains(crcNewLink)
                || _hsDuplicateProduct.Contains(crcNewLink)
                || !Common.CheckRegex(link, _config.VisitUrlsRegex, _config.NoVisitUrlRegex, false))
            {
                continue;
            }

            countLinkAdds++;
            _visitedCrc.Add(crcNewLink);
            _linkQueue.Enqueue(new JobFindNew()
            {
                Url = link,
                Deep = job.Deep + 1,
                ParentId = job.Id,
                Id = Common.CrcProductID(link)
            });
            _log.Debug("Add link to queue:" + link);
        }
    }

    _log.Info(GetPrefixLog() + string.Format("NumberLinkAdded {0}/{1}", countLinkAdds, countLinks));
}
/// <summary>
/// Parses the downloaded HTML, analyses it as a product when the job URL is a
/// detail URL, and then extracts further links from it.
/// </summary>
/// <param name="jobCrawl">Job describing the page.</param>
/// <param name="html">Downloaded HTML of the page.</param>
private void ProcessLink(JobFindNew jobCrawl, string html)
{
    HtmlDocument parsedPage = new HtmlDocument();
    parsedPage.LoadHtml(html);

    bool isDetailPage = IsDetailUrl(jobCrawl.Url);
    if (isDetailPage)
    {
        Analysic(jobCrawl, parsedPage);
    }

    Extraction(parsedPage, jobCrawl);
}
/// <summary>
/// Reads name/value pairs from the table rows of an HTML fragment. Every pair
/// is placed in the fixed group "Thông số chung".
/// </summary>
/// <param name="html">HTML containing the property table.</param>
/// <returns>
/// One entry per usable row, numbered from 1; an empty list when the input is
/// empty or contains no rows. The method does not throw on malformed input.
/// </returns>
private static List<PropertyEntyties> GetListPropertiesFyi(string html)
{
    List<PropertyEntyties> rlist = new List<PropertyEntyties>();
    if (string.IsNullOrEmpty(html))
    {
        return rlist;
    }

    try
    {
        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);
        var nodes = doc.DocumentNode.SelectNodes("//tr");
        if (nodes == null)
        {
            return rlist;
        }

        const string tenNhom = "Thông số chung";
        int stt = 1;
        foreach (var row in nodes)
        {
            // The name is read from child node 1 and the value from child node 3.
            // Rows with fewer child nodes are skipped instead of ending the parse.
            var cells = row.ChildNodes;
            if (cells == null || cells.Count < 4)
            {
                continue;
            }

            string tenthuoctinh = cells[1].InnerText.Trim();
            string giatri = cells[3].InnerText.Trim();

            PropertyEntyties item = new PropertyEntyties();
            item.ID = Common.GetID_Properties(tenthuoctinh + tenNhom);
            item.IDType = Common.GetID_Properties(tenNhom);
            item.IDValue = Common.GetID_Properties(giatri);
            item.Name = tenthuoctinh;
            item.NameType = tenNhom;
            item.Value = giatri;
            item.STT = stt;
            stt++;
            rlist.Add(item);
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep what was parsed so far, but leave a trace of the failure.
        System.Diagnostics.Trace.TraceWarning("GetListPropertiesFyi: parsing stopped early: " + ex.Message);
    }

    return rlist;
}
/// <summary>
/// Downloads <paramref name="url"/> and fills <paramref name="product"/> from it.
/// </summary>
/// <param name="product">Product instance to populate; its Valid flag is set from the data check.</param>
/// <param name="url">Address of the product page.</param>
/// <returns>The status reported by the download call.</returns>
private WebExceptionStatus Analysic(Product product, string url)
{
    var outException = new WebExceptionStatus();
    var html = this.GetHtmlCode(url, _config.UseClearHtml, out outException);

    // Treat a null result like an empty one so that LoadHtml is never given null.
    if (!string.IsNullOrEmpty(html))
    {
        var doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);
        product.Analytics(doc, url, _config, false, _company.Domain);
        product.Valid = product.IsSuccessData(_config.CheckPrice);
    }

    return outException;
}
/// <summary>
/// Main crawl loop: processes queued URLs until the queue is empty, the time
/// budget is used up or the visit limit is reached. Cancellation is propagated;
/// any other error is logged and the loop moves on.
/// </summary>
private void Crawl()
{
    InitSession();

    while (_linkQueue.Count > 0
           && (DateTime.Now - startCrawler).TotalHours < _config.MaxHourFindNew
           && _countVisited < _config.MaxLinksFindNew)
    {
        try
        {
            this._tokenCrawler.ThrowIfCancellationRequested();
            DelayCrawler();
            _urlCurrent = _linkQueue.Dequeue();
            SetRunningCompany();

            string progressLine = string.Format(
                "THR: {4} Cmp: {5} Q: {0} cVs: {1} cNP: {2} TTP: {6} cC: {7} Url: {3}",
                _linkQueue.Count,
                _countVisited,
                _totalProductBefore,
                _urlCurrent,
                _indexThread,
                _company.Domain.PadRight(50, ' '),
                _totalProduct,
                _countCompany);
            LogData(progressLine);

            if (IsNoVisitUrl(_urlCurrent))
            {
                continue;
            }

            _countVisited++;
            var html = GetHtmlCode(_urlCurrent, _config.UseClearHtml);
            PushLogVisited(_urlCurrent, false);
            if (html == "")
            {
                continue;
            }

            GABIZ.Base.HtmlAgilityPack.HtmlDocument page = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            page.LoadHtml(html);
            AnalysicProduct(_urlCurrent, page);
            ExtractionLink(page);
        }
        catch (OperationCanceledException)
        {
            // Cancellation must reach the caller.
            throw;
        }
        catch (Exception ex01)
        {
            _log.Error(ex01);
        }
    }

    CheckWarningOverMax();
    EndSession();
}
/// <summary>
/// Parses the test link configured for a company and shows the result form.
/// </summary>
/// <param name="CompanyId">Identifier of the company whose configuration is used.</param>
public static void ShowProduct(long CompanyId)
{
    Configuration config = new Configuration(CompanyId);
    string detailUrl = config.LinkTest;

    // Load the test page first: the parser previously received an empty document
    // and therefore had nothing to read from.
    GABIZ.Base.HtmlAgilityPack.HtmlDocument document = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    if (!string.IsNullOrEmpty(detailUrl))
    {
        string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(detailUrl, 45, 2);
        if (!string.IsNullOrEmpty(html))
        {
            document.LoadHtml(html);
        }
    }

    ProductParse pp = new ProductParse();
    ProductEntity product = new ProductEntity();
    pp.Analytics(product, document, detailUrl, config, config.Domain);

    string strDataShow = "";
    strDataShow += string.Format("\r\n Name: {0}", product.Name);

    // NOTE(review): strDataShow is built but never handed to frmShow; nothing in
    // this method shows how the form is meant to receive it - confirm and wire it up.
    frmShow.Visible = true;
    frmShow.Show();
}
/// <summary>
/// Downloads a page, optionally cleans it, and rewrites its form tags to div tags.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="UseClearHtml">When true the HTML is passed through <c>Common.TidyCleanR</c>.</param>
/// <param name="timeOut">Timeout value forwarded to the downloader.</param>
/// <param name="loopTry">Retry count forwarded to the downloader.</param>
/// <returns>The processed HTML, or the downloader's result unchanged when it is null or empty.</returns>
public static string GetHtmlFromUrl(string url, bool UseClearHtml, int timeOut = 15, int loopTry = 2)
{
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, timeOut, loopTry);
    if (string.IsNullOrEmpty(html))
    {
        return html;
    }

    if (UseClearHtml)
    {
        html = QT.Entities.Common.TidyCleanR(html);
    }

    return html.Replace("<form", "<div").Replace("</form", "</div");
}
/// <summary>
/// Click handler: starts a background task that runs a query against the
/// SaleNews database and, for each returned row, loads a document and runs a
/// node selection on it.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnRun_Click(object sender, EventArgs e)
{
    // NOTE(review): this looks unfinished - the SQL text and the XPath are empty
    // strings, the format string has no placeholder for its argument, and the
    // value handed to Load is a URL. Confirm the intent before relying on it.
    Action backgroundWork = () =>
    {
        SqlDb database = new SqlDb(@"Data Source=WIN-6ICNIQVFE0A;Initial Catalog=SaleNews;Integrated Security=True");
        DataTable rows = database.GetTblData("", CommandType.Text, new System.Data.SqlClient.SqlParameter[] { });

        foreach (DataRow rowInfo in rows.Rows)
        {
            string key = "";
            GABIZ.Base.HtmlAgilityPack.HtmlDocument page = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            string searchUrl = string.Format(@"https://www.google.com/?gws_rd=ssl#safe=off&q=con+ch%C3%B3", key.Replace(" ", "-"));
            page.Load(searchUrl);
            page.DocumentNode.SelectNodes("");
        }
    };

    Task.Factory.StartNew(backgroundWork);
}
/// <summary>
/// Click handler: downloads the test URL and shows the raw HTML in a dialog.
/// Any failure is reported in an error message box.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnGetHTML_Click(object sender, EventArgs e)
{
    try
    {
        string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(urlTestTextBox.Text, 45, 2, true);

        // A form shown with ShowDialog is not disposed when it closes, so dispose it here.
        using (FrmDataShow fr = new FrmDataShow(html))
        {
            fr.ShowDialog();
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
/// <summary>
/// Requests the first ten Google result pages for a keyword and collects the
/// text of the nodes matching //div[@class='ads-visurl']/cite.
/// </summary>
/// <param name="Keyword">Search phrase; it is URL-encoded before use.</param>
/// <returns>
/// The collected texts in page order. A page that fails to download or parse is
/// skipped, so the list may be empty.
/// </returns>
public static List<string> GetWebsiteInGoogle(string Keyword)
{
    const string xpath = @"//div[@class='ads-visurl']/cite";
    List<string> listLinks = new List<string>();
    string encodedKeyword = System.Web.HttpUtility.UrlEncode(Keyword);

    for (int i = 0; i < 10; i++)
    {
        string url = "https://www.google.com.vn/search?q=" + encodedKeyword + "&safe=off&start=" + (i * 10).ToString();
        try
        {
            Uri urlRoot = new Uri(url, UriKind.RelativeOrAbsolute);
            HttpWebRequest oReq = (HttpWebRequest)WebRequest.Create(urlRoot);
            oReq.AllowAutoRedirect = true;
            oReq.UserAgent = @"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.152 Safari/537.22";
            oReq.Timeout = 3000;

            // using blocks release the connection even when parsing throws.
            using (HttpWebResponse resp = (HttpWebResponse)oReq.GetResponse())
            {
                string contentType = resp.ContentType ?? "";
                if (!contentType.StartsWith("text/html", StringComparison.InvariantCultureIgnoreCase))
                {
                    continue;
                }

                // A missing or unknown charset falls back to UTF-8 instead of
                // discarding the page.
                Encoding encoding;
                try
                {
                    encoding = string.IsNullOrEmpty(resp.CharacterSet)
                        ? Encoding.UTF8
                        : Encoding.GetEncoding(resp.CharacterSet);
                }
                catch (ArgumentException)
                {
                    encoding = Encoding.UTF8;
                }

                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                using (var resultStream = resp.GetResponseStream())
                {
                    doc.Load(resultStream, encoding);
                }

                GABIZ.Base.HtmlAgilityPack.HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
                if (nodes != null)
                {
                    foreach (GABIZ.Base.HtmlAgilityPack.HtmlNode item in nodes)
                    {
                        listLinks.Add(item.InnerText);
                    }
                }
            }
        }
        catch (Exception ex01)
        {
            // Best effort: a failed page must not stop the remaining pages,
            // but the failure is traced rather than silently dropped.
            System.Diagnostics.Trace.TraceWarning("GetWebsiteInGoogle: request failed for " + url + ": " + ex01.Message);
        }
    }

    return listLinks;
}
/// <summary>
/// Parses a product from a detail page. A valid product that is not yet known
/// is either published as new or, when its duplicate hash is already
/// registered, reported to the duplicate-product queue.
/// </summary>
/// <param name="jobCrawl">Job describing the page.</param>
/// <param name="doc">Parsed HTML of the page.</param>
private void Analysic(JobFindNew jobCrawl, HtmlDocument doc)
{
    if (_company.Status == Common.CompanyStatus.TIN)
    {
        // For this status the product is parsed but nothing is stored.
        var parsedOnly = new Product();
        parsedOnly.Analytics(doc, jobCrawl.Url, _config, false, _company.Domain);
        return;
    }

    var product = new ProductEntity();
    var productParse = new ProductParse();
    productParse.Analytics(product, doc, jobCrawl.Url, _config, _company.Domain);
    if (!product.IsSuccessData(_config.CheckPrice))
    {
        return;
    }

    product.Valid = false;
    if (IsExistsProduct(product.ID))
    {
        return;
    }

    if (_dicDuplicate.ContainsKey(product.GetHashDuplicate()))
    {
        // Same content as an already registered product: report the pair.
        var duplicateReport = new ProductDuplicate()
        {
            CId = _companyId,
            Id = product.ID,
            Hash = product.GetHashDuplicate(),
            IdDup = _dicDuplicate[product.GetHashDuplicate()],
            Url = product.DetailUrl
        };
        _producerDuplicateProduct.PublishString(
            Newtonsoft.Json.JsonConvert.SerializeObject(duplicateReport),
            true);
        return;
    }

    product.StatusChange.IsNew = true;
    PushChangeProduct(product);
    _dicDuplicate.Add(product.GetHashDuplicate(), product.ID);
    _crcProductOldGroup.Add(product.ID);
    _countNewProduct++;
}
/// <summary>
/// Collects the outer HTML of the first node matched by each configured
/// short-description XPath.
/// </summary>
/// <param name="html">HTML of the product page.</param>
/// <param name="configXPath">Configuration supplying <c>ShortDescriptionXPath</c>.</param>
/// <returns>The matched fragments joined with "||||"; an empty string when nothing matched.</returns>
private string GetDescription(string html, Configuration configXPath)
{
    List<string> fragments = new List<string>();
    GABIZ.Base.HtmlAgilityPack.HtmlDocument page = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    page.LoadHtml(html);

    var xpaths = configXPath.ShortDescriptionXPath;
    int xpathCount = (xpaths != null) ? xpaths.Count : 0;

    for (int index = 0; index < xpathCount; index++)
    {
        // Blank entries are ignored.
        if (xpaths[index].Trim() == "")
        {
            continue;
        }

        var match = page.DocumentNode.SelectSingleNode(xpaths[index]);
        if (match == null)
        {
            continue;
        }

        fragments.Add(match.OuterHtml);
    }

    return string.Join("||||", fragments);
}
/// <summary>
/// Downloads every root link of every configured website and collects the href
/// values of the nodes selected by that website's XPath.
/// </summary>
/// <returns>All collected href values, in configuration order.</returns>
public List<string> GetProxy()
{
    List<string> lstProxy = new List<string>();
    foreach (ConfigWebsite configItem in FactoryConfigWebsite.Instance().GetListConfigWebsite())
    {
        foreach (string url in configItem.RootLinks)
        {
            GABIZ.Base.HtmlAgilityPack.HtmlDocument document = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            document.LoadHtml(GetHtmlCode(url));

            var nodes = document.DocumentNode.SelectNodes(configItem.XPath);
            if (nodes == null)
            {
                continue;
            }

            foreach (var aNode in nodes)
            {
                // The configured XPath may select nodes that have no href;
                // skip those instead of failing on a null attribute.
                var href = aNode.Attributes["href"];
                if (href != null && href.Value != null)
                {
                    lstProxy.Add(href.Value);
                }
            }
        }
    }

    return lstProxy;
}
/// <summary>
/// Reads grouped name/value pairs from the table rows of an HTML fragment.
/// A row without attributes sets the current group name; any other row
/// contributes one property to that group.
/// </summary>
/// <param name="html">HTML containing the property table.</param>
/// <returns>One entry per usable property row, numbered from 1; an empty list when there is nothing to read.</returns>
private static List<PropertyEntyties> GetListPropertiesVatGia(string html)
{
    List<PropertyEntyties> rlist = new List<PropertyEntyties>();
    if (string.IsNullOrEmpty(html))
    {
        return rlist;
    }

    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);
    var nodes = doc.DocumentNode.SelectNodes("//tr");
    if (nodes == null)
    {
        return rlist;
    }

    string tenNhom = "";
    int stt = 1;
    foreach (var row in nodes)
    {
        var cells = row.ChildNodes;

        if (row.Attributes.Count == 0)
        {
            // Group-name row: the name is read from child node 1.
            if (cells.Count > 1)
            {
                tenNhom = cells[1].InnerText.Trim();
            }

            continue;
        }

        // Property row: name from child node 1, value from child node 3.
        // Rows with fewer child nodes are skipped instead of throwing.
        if (cells.Count < 4)
        {
            continue;
        }

        string tenthuoctinh = cells[1].InnerText.Trim();
        string giatri = cells[3].InnerText.Trim();

        PropertyEntyties item = new PropertyEntyties();
        item.ID = Common.GetID_Properties(tenthuoctinh + tenNhom);
        item.IDType = Common.GetID_Properties(tenNhom);
        item.IDValue = Common.GetID_Properties(giatri);
        item.Name = tenthuoctinh;
        item.NameType = tenNhom;
        item.Value = giatri;
        item.STT = stt;
        stt++;
        rlist.Add(item);
    }

    return rlist;
}
/// <summary>
/// Runs the configured test link of every company through the product parser
/// and builds a "ConfigSuccess" update statement from each result.
/// Errors for a single company are logged and do not stop the run.
/// </summary>
public void Start()
{
    string patternQuery = "update company set ConfigSuccess = {0} where id = {1}";

    // NOTE(review): the statements collected here are never executed anywhere in
    // this method - confirm whether a flush to the database is missing.
    List<string> pendingUpdates = new List<string>();

    QT.Entities.Server.ConnectionString = this.connectionString;
    ProductAdapter productAdapter = new ProductAdapter(new SqlDb(QT.Entities.Server.ConnectionString));

    foreach (DataRow row in productAdapter.GetLinkTestCrawlerAllCompany().Rows)
    {
        try
        {
            // One second pause before each company.
            Thread.Sleep(1000);

            long companyId = Convert.ToInt64(row["Id"]);
            string domain = Convert.ToString(row["Domain"]);
            string testLink = Convert.ToString(row["LinkAutoTest"]);
            var config = new Configuration(companyId);
            Product product = new Product();

            string rawHtml = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(testLink, 45, 2);
            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHtmlNomarlTag(rawHtml);
            if (html == "")
            {
                continue;
            }

            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            product.Analytics(doc, testLink, config, true, domain, null);

            bool parsedOk = product.IsSuccessData(config.CheckPrice);
            pendingUpdates.Add(string.Format(patternQuery, parsedOk ? "1" : "0", companyId));
        }
        catch (Exception ex01)
        {
            log.Error(ex01);
        }
    }
}
/// <summary>
/// Key handler: on F10, downloads the test URL and shows, for each XPath typed
/// into the sending rich text box, the inner text of every matching node.
/// </summary>
/// <param name="sender">The rich text box holding the XPath list.</param>
/// <param name="e">Key event data.</param>
private void EventCheckXPaths(object sender, KeyEventArgs e)
{
    if (e.KeyCode != Keys.F10)
    {
        return;
    }

    // Only rich text boxes carry an XPath list; ignore any other sender
    // instead of dereferencing a failed cast.
    RichTextBox xpathBox = sender as RichTextBox;
    if (xpathBox == null)
    {
        return;
    }

    try
    {
        string url = urlTestTextBox.Text;
        string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2);
        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        var sXPaths = xpathBox.Text.Trim().Split(SqlDb.arSplit, 100, StringSplitOptions.RemoveEmptyEntries);
        foreach (var xPath in sXPaths)
        {
            var nodes = doc.DocumentNode.SelectNodes(xPath);
            if (nodes == null)
            {
                MessageBox.Show("NoData", "NoData", MessageBoxButtons.OK, MessageBoxIcon.Warning);
                continue;
            }

            foreach (var node1 in nodes)
            {
                MessageBox.Show(node1.InnerText);
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "Error XPaths Config", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
/// <summary>
/// Click handler: downloads every URL listed in the test box, collects the
/// distinct links that match the product regexes of the selected configuration
/// and shows them in a dialog, one per line.
/// </summary>
/// <param name="sender">Control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btnGetProductLInk_Click(object sender, EventArgs e)
{
    try
    {
        var config = this.raovatSqlAdapter.GetConfigByID((int)this.configXPathIDSpinEdit.Value);
        List<string> lstLink = Common.GetListXPathFromString(urlTestTextBox.Text);

        // The list keeps first-seen order for display; the set gives O(1)
        // duplicate checks instead of scanning the list for every link.
        List<string> lstExtractLink = new List<string>();
        HashSet<string> seenLinks = new HashSet<string>();

        foreach (string str in lstLink)
        {
            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(str, 45, 2, true);
            doc.LoadHtml(html);

            var nodes = doc.DocumentNode.SelectNodes(@"//a[@href]");
            if (nodes == null)
            {
                continue;
            }

            foreach (var node in nodes)
            {
                string url = Common.GetAbsoluteUrl(node.Attributes["href"].Value.Trim(), config.domain);
                if (!QT.Entities.Common.CheckRegex(url, config.ProductUrlsRegex, config.NoProductUrlRegex, false))
                {
                    continue;
                }

                string trimmedUrl = url.Trim();
                if (seenLinks.Add(trimmedUrl))
                {
                    lstExtractLink.Add(trimmedUrl);
                }
            }
        }

        // A form shown with ShowDialog is not disposed when it closes, so dispose it here.
        using (FrmDataShow fr = new FrmDataShow(QT.Entities.Common.ConvertToString(lstExtractLink, "\n")))
        {
            fr.ShowDialog();
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
/// <summary>
/// Re-analyses the company's products from the collected detail URLs: stored
/// products whose URL was not collected are marked invalid, then every
/// collected URL is downloaded, parsed and inserted when the data is valid.
/// Progress goes to the rich text box and a summary message box closes the run.
/// </summary>
public void Analysic()
{
    try
    {
        ProductAdapter productAdapter = new ProductAdapter(sqldb);
        Configuration configXPath = new Configuration(companyID);
        QT.Entities.Company company = new Entities.Company(companyID);

        DataTable tblProduct = sqldb.GetTblData(
            "Select ID,DetailUrl From Product Where Company = @CompanyID",
            CommandType.Text,
            new SqlParameter[] { SqlDb.CreateParamteterSQL("@CompanyID", companyID, SqlDbType.BigInt) },
            null,
            true);

        // Step 1: invalidate stored products that are missing from the collected URLs.
        foreach (DataRow rowInfo in tblProduct.Rows)
        {
            long storedId = QT.Entities.Common.Obj2Int64(rowInfo["ID"]);
            if (DicDetailUrl.ContainsKey(QT.Entities.Common.Obj2String(rowInfo["DetailUrl"])))
            {
                continue;
            }

            sqldb.RunQuery(
                "update Product set Valid = 0 where Company = @CompanyID and ID = @productID",
                CommandType.Text,
                new SqlParameter[]
                {
                    sqldb.CreateParamteter("@CompanyID", companyID, SqlDbType.BigInt),
                    sqldb.CreateParamteter("@productID", storedId, SqlDbType.BigInt)
                });
        }

        // Step 2: parse every collected detail URL.
        foreach (var DetailUrl in DicDetailUrl)
        {
            string strDetailUrl = DetailUrl.Key.ToString();
            long productID = QT.Entities.Common.GetIDProduct(strDetailUrl);

            if (this.bDeleteProductData)
            {
                sqldb.RunQuery(
                    "delete product where id = @id",
                    CommandType.Text,
                    new SqlParameter[] { SqlDb.CreateParamteterSQL("@id", productID, SqlDbType.BigInt) });
            }

            Product pt = new Product();
            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(GetHtmlCode(strDetailUrl, false));
            pt.Analytics(doc, strDetailUrl, configXPath, false, company.Domain, null);

            bool saved = pt.IsSuccessData(this.config.CheckPrice);
            if (saved)
            {
                productAdapter.InsertProduct(pt);
                lstProductIDChangeImage.Add(productID);
            }

            // UI updates go through Invoke.
            string progressLine = (saved ? "\r\nSuccess link: " : "\r\nNo product link: ") + strDetailUrl;
            this.Invoke(new Action(() => { richTextBox1.AppendText(progressLine); }));
        }
    }
    catch (Exception ex)
    {
        this.Invoke(new Action(() => { MessageBox.Show(ex.Message + ex.StackTrace); }));
    }

    int crawledCount = DicDetailUrl.Count;
    int successCount = lstProductIDChangeImage.Count;
    MessageBox.Show(string.Format("Crawler {0} \nSuccess Link {1} \nFail link {2}", crawledCount, successCount, crawledCount - successCount));
}
/// <summary>
/// Main loop of the crawler: takes jobs from the wait queue until the crawler
/// has ended, the outside check asks it to stop, or no more jobs can be added.
/// Each page is downloaded, its links are queued (when extraction is allowed
/// for the job) and product pages are handed to <c>ProcessProductData</c>.
/// Start, job and end events are raised when handlers are attached.
/// </summary>
public void StartCrawler()
{
    log.InfoFormat("START CRALWER:{0}", this.idCrawler);
    if (this.eventWhenStart != null)
    {
        this.eventWhenStart(this, "Started");
    }

    while (!this.IsEnded && !(this.eventCheckOutSide != null && this.eventCheckOutSide(this)))
    {
        Job task = queueWaitRun.GetJob();
        if (task == null)
        {
            // Queue drained: try to refill it, otherwise stop.
            if (AddJobToQueue())
            {
                continue;
            }

            break;
        }

        if (this.eventWhenGetJob != null)
        {
            this.eventWhenGetJob(this, task.ToString());
        }

        if (CheckStopCrawler(task))
        {
            continue;
        }

        string html = GetHtmlOfWeb(task.url);

        // Treat a null result like an empty one so that LoadHtml is never given null.
        if (!string.IsNullOrEmpty(html))
        {
            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // Link extraction.
            if (CheckExtractionLink(task))
            {
                var a_nodes = doc.DocumentNode.SelectNodes("//a[@href]");
                if (a_nodes != null)
                {
                    for (int i = 0; i < a_nodes.Count; i++)
                    {
                        string s = QT.Entities.Common.GetAbsoluteUrl(a_nodes[i].Attributes["href"].Value, this.Domain);
                        string compactLink = QT.Entities.Common.CompactUrl(s);

                        // Math.Abs throws OverflowException for int.MinValue, so
                        // that single value is mapped explicitly.
                        var rawCrc = GABIZ.Base.Tools.getCRC32(compactLink);
                        int s_crc = (rawCrc == int.MinValue) ? int.MaxValue : Math.Abs(rawCrc);

                        if (CheckRegexVisit(s) && !setAddedQueue.Exists(s_crc))
                        {
                            try
                            {
                                // Add to the set of links already queued.
                                this.setAddedQueue.Add(s_crc, s);

                                // Push the new job onto the queue.
                                this.PushQueue(new Job() { deep = task.deep + 1, url = s });
                            }
                            catch (Exception ex2)
                            {
                                // Log the message as data, not as a format string:
                                // braces in the message would otherwise break the format call.
                                log.Error(ex2.Message, ex2);
                            }
                        }
                    }
                }
            }

            // Product analysis.
            if (CheckRegexProduct(QT.Entities.Common.CompactUrl(task.url)))
            {
                ProcessProductData(task, doc);
            }
        }

        this.UpdateProcessedJob(task);
    }

    if (this.eventWhenEnd != null)
    {
        this.eventWhenEnd(this, "End");
    }

    UpdateWhenEnd();
    CleanDataAfterCrawler();
}
/// <summary>
/// Service start: creates the RabbitMQ workers and starts one task per worker.
/// Each worker handles "save product" messages by downloading the URL(s) in the
/// message, parsing a product from each and writing valid products to the database.
/// </summary>
/// <param name="args">Service start arguments (unused).</param>
protected override void OnStart(string[] args)
{
    log.Info("Start service");
    try
    {
        InitializeComponent();
        cancelTokenSource = new CancellationTokenSource();
        string rabbitMQServerName = ConfigurationManager.AppSettings["rabbitMQServerName"];
        workers = new Worker[workerCount];
        rabbitMQServer = RabbitMQManager.GetRabbitMQServer(rabbitMQServerName);

        // SECURITY NOTE(review): database credentials are hard-coded in source.
        // They should be moved to protected configuration and rotated.
        string connectToSQL = @"Data Source=172.22.30.86,1455;Initial Catalog=QT_2;Persist Security Info=True;User ID=qt_vn;Password=@F4sJ=l9/ryJt9MT;connection timeout=200";
        string connectToConnection = @"Data Source=42.112.28.93;Initial Catalog=QT_2;Persist Security Info=True;User ID=wss_price;Password=HzlRt4$$axzG-*UlpuL2gYDu;connection timeout=200";
        CrawlerProductAdapter crawlerProductAdapter = new CrawlerProductAdapter(new SqlDb(connectToSQL));
        ProductAdapter productAdapter = new ProductAdapter(new SqlDb(connectToConnection));

        for (int i = 0; i < workerCount; i++)
        {
            // "{0}" is the valid positional placeholder; the previous "{i}" was not.
            log.InfoFormat("Start worker {0}", i.ToString());
            var worker = new Worker(AddProductToSqlJobName, false, rabbitMQServer);
            workers[i] = worker;
            var token = this.cancelTokenSource.Token;
            Task workerTask = new Task(() =>
            {
                worker.JobHandler = (downloadImageJob) =>
                {
                    try
                    {
                        token.ThrowIfCancellationRequested();
                        QT.Entities.CrawlerProduct.RabbitMQ.MsSaveProduct Mss = QT.Entities.CrawlerProduct.RabbitMQ.MsSaveProduct.GetDataFromMessage(downloadImageJob.Data);
                        string Url = Mss.Url;
                        string Domain = QT.Entities.Common.GetDomainFromUrl(Url);
                        long CompanyID = QT.Entities.Common.GetIDCompany(Domain);
                        QT.Entities.Configuration config = new QT.Entities.Configuration(CompanyID);

                        // NOTE(review): _company is a member of the service, not derived
                        // from the message - confirm it is the company meant here.
                        if (_company.Status == Common.CompanyStatus.WEB_CRAWLERDOMAIN)
                        {
                            // This branch only downloads and parses the page; nothing is stored.
                            List<QT.Entities.Company> ls = new List<QT.Entities.Company>();
                            QT.Entities.CrawlerDomain obj = new CrawlerDomain();
                            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(Url.Trim(), 15, 1);
                            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                            html = html.Replace("<form", "<div");
                            html = html.Replace("</form", "</div");
                            doc.LoadHtml(html);
                        }
                        else
                        {
                            int numberItemSaved = 0;

                            // The message may carry several links separated by SqlDb.arSplit.
                            string[] arLink = Url.Trim().Split(SqlDb.arSplit, StringSplitOptions.RemoveEmptyEntries);
                            foreach (var item in arLink)
                            {
                                QT.Entities.Product _product = new Product();
                                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(item, 45, 2);

                                if (config.ContentAnanyticXPath.Count >= 1)
                                {
                                    // Cut the HTML down to the part between the first and
                                    // second configured marker strings, when they are found.
                                    int i1 = 0, i2 = 0;
                                    i1 = html.IndexOf(config.ContentAnanyticXPath[0]);
                                    if (i1 >= 0)
                                    {
                                        html = html.Substring(i1);
                                        if (config.ContentAnanyticXPath.Count >= 2)
                                        {
                                            i2 = html.IndexOf(config.ContentAnanyticXPath[1]);
                                            if (i2 >= 0)
                                            {
                                                html = html.Substring(0, i2 + config.ContentAnanyticXPath[1].Length);
                                            }
                                        }
                                    }

                                    html = html.Replace("<form", "<div");
                                    html = html.Replace("</form", "</div");
                                    html = Common.TidyCleanR(html);
                                }

                                _htmlSource = html;
                                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                                html = html.Replace("<form", "<div");
                                html = html.Replace("</form", "</div");
                                doc.LoadHtml(html);

                                List<Product> lstUpdateProduct = new List<Product>();
                                List<Product> lstInsertProduct = new List<Product>();
                                _product.Analytics(doc, item, config, true, _company.Domain);
                                if (_product != null && _product.IsSuccessData(config.CheckPrice))
                                {
                                    numberItemSaved++;
                                    if (productAdapter.CheckExistInDb(_product.ID))
                                    {
                                        lstUpdateProduct.Add(_product);
                                    }
                                    else
                                    {
                                        lstInsertProduct.Add(_product);
                                    }

                                    productAdapter.UpdateProductsChangeToDb(lstUpdateProduct);
                                    productAdapter.InsertListProduct(lstInsertProduct);
                                    productAdapter.PushQueueIndexCompany(config.CompanyID);
                                    productAdapter.PushQueueChangeChangeImage(new MQChangeImage() { ProductID = _product.ID, Type = 1 });
                                    log.InfoFormat("Saved {0} item product!", _product.Name);
                                }
                            }
                        }

                        return true;
                    }
                    catch (OperationCanceledException)
                    {
                        log.Info("End worker");
                        return false;
                    }
                };
                worker.Start();
            }, token);
            workerTask.Start();
            log.InfoFormat("Worker {0} started", i);
        }
    }
    catch (Exception ex)
    {
        log.Error("Start error", ex);
        throw;
    }
}
/// <summary>
/// Endless crawl loop for a classified-ads website. Each round seeds the queue
/// with the root links, crawls until the queue is empty or the crawler is
/// paused, then waits ten seconds before the next round. The loop ends when the
/// thread is aborted.
/// </summary>
private void DoCrawler()
{
    Dictionary<long, int[]> dicMapClassAndCategori = this.raovatSqlAdapter.GetDicMapClassificationAndCategories(this.websiteRaoVat.id);
    Dictionary<int, string[]> dicMapCity = this.raovatSqlAdapter.GetDicCityAndRegex();

    while (true)
    {
        try
        {
            int igone = 0;

            // Initialise the state of this round.
            Queue<JobCrawlerSale> queueUrl = new Queue<JobCrawlerSale>();
            Dictionary<long, string> dicVisited = new Dictionary<long, string>();
            foreach (var rootLink in this.RunnerCrawler.root_link)
            {
                queueUrl.Enqueue(new JobCrawlerSale() { deep = 0, url = rootLink });
            }

            this.ShowQueue(queueUrl.Count);

            while (!this.Pause && queueUrl != null && queueUrl.Count > 0)
            {
                JobCrawlerSale job = queueUrl.Dequeue();
                ShowUrlCurrent(job.url);
                ShowQueue(queueUrl.Count);

                if (configXPath.TimeDelay > 0)
                {
                    Thread.Sleep(configXPath.TimeDelay);
                }

                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(job.url, 45, 2);
                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(html);

                // Link extraction.
                var anchorNodes = doc.DocumentNode.SelectNodes("//a[@href]");
                if (anchorNodes != null)
                {
                    foreach (var anchorNode in anchorNodes)
                    {
                        string absoluteUrl = QT.Entities.Common.GetAbsoluteUrl(anchorNode.Attributes["href"].Value, this.websiteRaoVat.base_link);
                        string compactLink = QT.Entities.Common.CompactUrl(absoluteUrl);
                        long linkCrc = Math.Abs(GABIZ.Base.Tools.getCRC64(compactLink));
                        if (dicVisited.ContainsKey(linkCrc))
                        {
                            continue;
                        }

                        dicVisited.Add(linkCrc, "");
                        ShowVisited(dicVisited.Count);

                        bool isProductLink = QT.Entities.Common.CheckRegex(compactLink, configXPath.ProductUrlsRegex, configXPath.NoProductUrlRegex, false);
                        bool isVisitLink = QT.Entities.Common.CheckRegex(compactLink, configXPath.VisitUrlsRegex, configXPath.NoVisitUrlRegex, false);

                        // Visit links are limited by depth; links that are only
                        // product links are always queued.
                        bool shouldQueue = isVisitLink
                            ? (job.deep + 1 < this.RunnerCrawler.max_deep)
                            : isProductLink;
                        if (shouldQueue)
                        {
                            queueUrl.Enqueue(new JobCrawlerSale() { url = absoluteUrl, deep = job.deep + 1 });
                            ShowQueue(queueUrl.Count);
                        }
                    }
                }

                // Product analysis of the current page.
                bool currentIsProduct = QT.Entities.Common.CheckRegex(
                    QT.Entities.Common.CompactUrl(job.url),
                    configXPath.ProductUrlsRegex,
                    configXPath.NoProductUrlRegex,
                    false);
                if (!currentIsProduct)
                {
                    continue;
                }

                QT.Entities.RaoVat.HandlerContentOfHtml handlerContentHtml = new Entities.RaoVat.HandlerContentOfHtml();
                ProductSaleNew productSaleNew = new ProductSaleNew();
                handlerContentHtml.AnalyticsProductSaleNew(websiteRaoVat.domain, job.url, doc, configXPath, productSaleNew, dicMapClassAndCategori, dicMapCity);

                if (!productSaleNew.IsDetailSucess)
                {
                    ShowIgone(igone++);
                    continue;
                }

                // Save the classification; a failure here is ignored on purpose
                // so that the product itself is still stored.
                try
                {
                    this.raovatSqlAdapter.SaveClassification(productSaleNew.website_id, productSaleNew.web_category);
                }
                catch (Exception)
                {
                }

                if (this.mongoDbAdapter.CheckExistsProductSalenew(productSaleNew.id))
                {
                    this.mongoDbAdapter.UpdateProduct(productSaleNew);
                }
                else
                {
                    this.mongoDbAdapter.InsertProduct(productSaleNew);
                }

                ShowProduct(productSaleNew);
            }

            this.Invoke(new Action(() => { richTextBox1.AppendText("\n\rWait to next run!"); }));
            Thread.Sleep(10000);
        }
        catch (ThreadAbortException)
        {
            return;
        }
    }
}