/// <summary>
/// Determines the product status by probing the configured XPath expressions in order.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="StatusXPath">Candidate XPath expressions; entries that are blank after trimming are skipped.</param>
/// <returns>
/// The first status other than <c>NotDefine</c> that the status regex yields for a matched node.
/// When no expression matches any node the initial <c>LienHe</c> value is returned; otherwise the
/// last computed value is returned, which may be <c>NotDefine</c>.
/// </returns>
private Common.ProductStatus ParseStatus(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List<string> StatusXPath)
{
    Common.ProductStatus status = Common.ProductStatus.LienHe;

    foreach (string xpath in StatusXPath)
    {
        if (xpath.Trim() == "")
        {
            continue;
        }

        var statusNodes = doc.DocumentNode.SelectNodes(xpath);
        if (statusNodes == null || statusNodes.Count == 0)
        {
            continue;
        }

        foreach (var statusNode in statusNodes)
        {
            string strippedText = Tools.removeHTML(statusNode.InnerText).Trim().Replace(" ", "");
            string statusText = QT.Entities.Common.ChuanHoaUnicode(strippedText).ToLower();

            // NOTE(review): if the Replace above removes ordinary spaces, this comparison
            // can never be true - confirm which whitespace character is intended.
            if (statusText == "không còn hàng")
            {
                statusText = statusText.Replace(" còn hàng", "");
            }

            status = QT.Entities.CrawlerProduct.ProductStatusRegex.Instance().GetStatusProduct(statusText);
            if (status != Common.ProductStatus.NotDefine)
            {
                // First recognised status wins; no further nodes or expressions are tried.
                return status;
            }
        }
    }

    return status;
}
/// <summary>
/// Reads header/content cell pairs from the rows of the <c>tableproperties</c> table and
/// passes each pair to <c>ParseData</c> to fill a <see cref="DocInfo"/>.
/// </summary>
/// <param name="doc">Parsed HTML page.</param>
/// <param name="url">Address of the page; not used by this method.</param>
/// <returns>
/// The populated info object, or <c>null</c> when the page contains no rows inside a
/// <c>tableproperties</c> table.
/// </returns>
public DocInfo ParseInfoDoc(HtmlDocument doc, string url)
{
    var trNodes = doc.DocumentNode.SelectNodes("//table[@class='tableproperties']//tr");
    if (trNodes == null)
    {
        return null;
    }

    DocInfo info = new DocInfo();
    foreach (var row in trNodes)
    {
        var tdNodes = row.SelectNodes("./td");
        if (tdNodes == null)
        {
            continue;
        }

        // Stop one cell before the end: a header cell is only meaningful together with the
        // cell that follows it, and reading tdNodes[i + 1] on the last cell would be out of range.
        for (int i = 0; i + 1 < tdNodes.Count; i++)
        {
            var headerCell = tdNodes[i];
            var contentCell = tdNodes[i + 1];

            bool isHeader = headerCell.Attributes.Contains("class")
                            && headerCell.GetAttributeValue("class", "") == "headerproperties";
            if (!isHeader || contentCell.GetAttributeValue("class", "") != "contentproperties")
            {
                continue;
            }

            // The header text is lower-cased before being handed to ParseData.
            string file = headerCell.InnerText.Trim().ToLower();
            string properties = contentCell.InnerText.Trim();
            ParseData(file, properties, info);
        }
    }

    return info;
}
/// <summary>
/// Fills <paramref name="document"/> from the <c>detailcontent</c> table of the page and groups
/// its paragraphs into a chapter structure before calling <c>ParseDieu</c>.
/// </summary>
/// <param name="document">Document to populate (text, HTML, id, URL and structure list).</param>
/// <param name="doc">Parsed HTML page.</param>
/// <param name="url">Address of the page; stored on the document and used to derive its id.</param>
/// <returns>
/// <c>true</c> when the content table and its paragraphs were found; <c>false</c> when the
/// content table, the justified detail block, or its paragraphs are missing. When only the
/// detail block or paragraphs are missing, the text/HTML/id/URL fields have already been set.
/// </returns>
public bool Parse(ref Documet document, HtmlDocument doc, string url)
{
    var nodeData = doc.DocumentNode.SelectSingleNode("//table[@class='detailcontent']");
    if (nodeData == null)
    {
        return false;
    }

    document.TextDoc = nodeData.InnerText;
    document.HtmlDoc = nodeData.InnerHtml;
    document.Id = Common.CrcProductID(url);
    document.Url = url;

    var nodeDetail = doc.DocumentNode.SelectSingleNode(@"//table[@class='detailcontent']//tr/td/div[@align='justify']");
    if (nodeDetail == null)
    {
        // Without the detail block there are no paragraphs to structure; report failure
        // instead of dereferencing a null node.
        return false;
    }

    var nodeParagrap = nodeDetail.SelectNodes(".//p");
    if (nodeParagrap == null)
    {
        return false;
    }

    var structurtData = new List<Tuple<HtmlNode, List<HtmlNode>, List<String>>>();
    foreach (var paragraph in nodeParagrap)
    {
        var nodeTreeNav = paragraph.SelectSingleNode(".//a[@name]");
        string anchorName = nodeTreeNav == null
            ? ""
            : nodeTreeNav.GetAttributeValue("name", "").ToLower();

        if (anchorName.StartsWith("chuong_"))
        {
            // A "chuong_" anchor opens a new group headed by that anchor node.
            structurtData.Add(new Tuple<HtmlNode, List<HtmlNode>, List<String>>(
                nodeTreeNav, new List<HtmlNode>(), new List<string>()));
        }
        else if (structurtData.Count == 0 && anchorName.StartsWith("dieu_"))
        {
            // A "dieu_" anchor seen before any group exists opens a group headed by an
            // empty placeholder element.
            // NOTE(review): this first "dieu_" paragraph itself is not added to the group's
            // paragraph list - confirm that ParseDieu does not need it.
            structurtData.Add(new Tuple<HtmlNode, List<HtmlNode>, List<String>>(
                new HtmlNode(HtmlNodeType.Element, doc, -1), new List<HtmlNode>(), new List<string>()));
        }
        else if (structurtData.Count > 0)
        {
            // Every other paragraph belongs to the most recently opened group.
            structurtData[structurtData.Count - 1].Item2.Add(paragraph);
        }
    }

    document.LstStructure = structurtData;
    document.ParseDieu();
    return true;
}
/// <summary>
/// Classifies a page by matching its category text against the phone/computer category pattern.
/// </summary>
/// <param name="c_url">Address of the page; not used by this method.</param>
/// <param name="document">Parsed HTML page passed on to <c>GetCategory</c>.</param>
/// <param name="config">XPath configuration passed on to <c>GetCategory</c>.</param>
/// <returns>
/// <c>PhoneComputer</c> when the trimmed category matches the pattern; otherwise <c>None</c>,
/// including when the category is null, empty or whitespace.
/// </returns>
public Model.SaleNews.TypeCrawlerData GetTypeOfLink(string c_url, GABIZ.Base.HtmlAgilityPack.HtmlDocument document, ConfigXPaths config)
{
    string category = GetCategory(document, config);

    // Check for null before trimming so a missing category yields None rather than an exception.
    if (string.IsNullOrEmpty(category))
    {
        return Model.SaleNews.TypeCrawlerData.None;
    }

    if (Regex.IsMatch(category.Trim(), "Nhật tảo->Mua bán->Điện thoại.*"))
    {
        return Model.SaleNews.TypeCrawlerData.PhoneComputer;
    }

    return Model.SaleNews.TypeCrawlerData.None;
}
/// <summary>
/// Builds the short description from the first node matched by each XPath expression.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="xpaths">XPath expressions evaluated in order.</param>
/// <returns>
/// The trimmed inner text of every matched node joined with single spaces; an empty string
/// when nothing matched.
/// </returns>
private string ParseShortDescription(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List<string> xpaths)
{
    var parts = new List<string>();

    for (int index = 0; index < xpaths.Count; index++)
    {
        var matched = doc.DocumentNode.SelectSingleNode(xpaths[index]);
        if (matched == null)
        {
            continue;
        }

        parts.Add(matched.InnerText.Trim());
    }

    return string.Join(" ", parts);
}
/// <summary>
/// Builds the product name from the first node matched by each XPath expression.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="xpaths">XPath expressions evaluated in order.</param>
/// <returns>
/// The non-empty, HTML-stripped and trimmed texts of the matched nodes joined with single
/// spaces; an empty string when nothing usable was found.
/// </returns>
private string ParseName(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List<string> xpaths)
{
    var fragments = new List<string>();

    for (int index = 0; index < xpaths.Count; index++)
    {
        var nameNode = doc.DocumentNode.SelectSingleNode(xpaths[index]);
        if (nameNode == null)
        {
            continue;
        }

        string fragment = Tools.removeHTML(nameNode.InnerText).Trim();
        if (string.IsNullOrEmpty(fragment))
        {
            continue;
        }

        fragments.Add(fragment);
    }

    return string.Join(" ", fragments);
}
/// <summary>
/// Collects VAT-related text from the first node matched by each XPath expression.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="VATInfoXPaths">XPath expressions; entries that are blank after trimming are skipped.</param>
/// <returns>
/// The HTML-stripped texts for which <c>Common.ParseVATInfo</c> returns a value other than -1,
/// joined with single spaces; an empty string when none qualified.
/// </returns>
private string ParseVatInfo(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, IEnumerable<string> VATInfoXPaths)
{
    string collected = "";

    foreach (string xpath in VATInfoXPaths)
    {
        if (xpath.Trim() == "")
        {
            continue;
        }

        var vatNode = doc.DocumentNode.SelectSingleNode(xpath);
        if (vatNode == null)
        {
            continue;
        }

        string candidate = Tools.removeHTML(vatNode.InnerText);
        if (Common.ParseVATInfo(candidate) == -1)
        {
            continue;
        }

        // Separate successive fragments with one space.
        if (collected.Length > 0)
        {
            collected += " ";
        }

        collected += candidate;
    }

    return collected;
}
/// <summary>
/// Collects promotion text from the first node matched by each XPath expression.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="PromotionInfoXPaths">XPath expressions; entries that are blank after trimming are skipped.</param>
/// <returns>
/// The HTML-stripped texts for which <c>Common.ParsePromotionInfo</c> returns a value other
/// than -1, joined with a space and passed through <c>Common.RemoveDumplicateSpace</c>;
/// an empty string when none qualified.
/// </returns>
private string ParsePromotionInfo(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List<string> PromotionInfoXPaths)
{
    var promotionInfo = "";

    foreach (string promotionXPath in PromotionInfoXPaths)
    {
        if (promotionXPath.Trim() == "")
        {
            continue;
        }

        var promotionNode = doc.DocumentNode.SelectSingleNode(promotionXPath);
        if (promotionNode == null)
        {
            continue;
        }

        string tempPromotion = Tools.removeHTML(promotionNode.InnerText);
        if (Common.ParsePromotionInfo(tempPromotion) == -1)
        {
            continue;
        }

        // Append to what has been gathered so far; assigning only the new fragment would
        // discard earlier matches and leave a stray leading separator.
        string separator = string.IsNullOrEmpty(promotionInfo) ? "" : " ";
        promotionInfo = Common.RemoveDumplicateSpace(promotionInfo + separator + tempPromotion);
    }

    return promotionInfo;
}
/// <summary>
/// Finds the warranty value using the first XPath expression whose node text can be parsed.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="WarrantyXPath">XPath expressions tried in order; entries that are blank after trimming are skipped.</param>
/// <returns>
/// The first value other than -1 returned by <c>Common.ParseWarranty</c>; 0 when no
/// expression produced one.
/// </returns>
private int ParseWarranty(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List<string> WarrantyXPath)
{
    foreach (string xpath in WarrantyXPath)
    {
        if (xpath.Trim() == "")
        {
            continue;
        }

        var warrantyNode = doc.DocumentNode.SelectSingleNode(xpath);
        if (warrantyNode == null)
        {
            continue;
        }

        string warrantyText = Tools.removeHTML(warrantyNode.InnerText);
        int parsed = Common.ParseWarranty(warrantyText);
        if (parsed != -1)
        {
            return parsed;
        }
    }

    return 0;
}
/// <summary>
/// Finds the deal end date using the first XPath expression whose node text can be parsed.
/// </summary>
/// <param name="doc">Parsed HTML page to query.</param>
/// <param name="EndDealXPath">XPath expressions tried in order; entries that are blank after trimming are skipped.</param>
/// <returns>
/// The first date returned by <c>Common.ParseDateTime</c> that differs from
/// <c>SqlDb.MinDateDb</c>; <c>SqlDb.MinDateDb</c> when no expression produced one.
/// </returns>
private DateTime ParseEndDeal(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List<string> EndDealXPath)
{
    DateTime fallback = SqlDb.MinDateDb;

    foreach (string xpath in EndDealXPath)
    {
        if (xpath.Trim() == "")
        {
            continue;
        }

        var endDateNode = doc.DocumentNode.SelectSingleNode(xpath);
        if (endDateNode == null)
        {
            continue;
        }

        // Trailing '-', ' ' and '>' characters are removed before parsing.
        string rawDate = Tools.removeHTML(endDateNode.InnerText).TrimEnd('-', ' ', '>');
        DateTime parsed = Common.ParseDateTime(rawDate);
        if (parsed != SqlDb.MinDateDb)
        {
            return parsed;
        }
    }

    return fallback;
}
/// <summary>
/// Starts a queue consumer on a background task that downloads each job URL, extracts the
/// links matching the vatgia detail-page pattern, enqueues them as new jobs and records them
/// in the database. Blocks the calling thread on <c>Console.ReadLine()</c>.
/// </summary>
/// <remarks>
/// The job handler always returns <c>true</c>, including when an exception was logged.
/// Newly recorded link ids are written to the visited-link table only once more than 100
/// have accumulated.
/// NOTE(review): ids still pending below that threshold are never flushed by this method -
/// confirm whether that is intended.
/// </remarks>
public void FindClassification()
{
    List<long> addedToDb = new List<long>();
    Dictionary<long, string> addedLink = this.LoadVisitedLink();
    ILog log = log4net.LogManager.GetLogger(typeof(Program));
    var rabbitMQServer = RabbitMQManager.GetRabbitMQServer("rabbitMQ177");
    var worker = new Worker("VatGia_Queue", false, rabbitMQServer);

    Task workerTask = new Task(() =>
    {
        log.Info("Start consumer!");
        worker.JobHandler = (downloadImageJob) =>
        {
            log.Info("Get job from MQ");
            try
            {
                JobCrawler jobData = JobCrawler.Deserialize(downloadImageJob.Data);
                if (jobData == null)
                {
                    return true;
                }

                Console.WriteLine("Get Job:" + jobData.urlDetail);
                long IDUrl = Math.Abs(GABIZ.Base.Tools.getCRC64(jobData.urlDetail));
                if (addedLink.ContainsKey(IDUrl))
                {
                    // This URL was already handled; nothing to do.
                    return true;
                }

                addedLink.Add(IDUrl, "");
                string regexExtract = @"^.*vatgia.com\/\d+\/[^\/]*html$";
                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(jobData.urlDetail, 120, 2);
                GABIZ.Base.HtmlAgilityPack.HtmlDocument htmlDocument = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                htmlDocument.LoadHtml(html);

                var nodesLink = htmlDocument.DocumentNode.SelectNodes("//a");
                if (nodesLink == null)
                {
                    return true;
                }

                foreach (var nodeLink in nodesLink)
                {
                    // Anchors without an href (e.g. named anchors) carry no link; skip them so
                    // one such anchor does not abort processing of the remaining links.
                    var hrefAttribute = nodeLink.Attributes["href"];
                    if (hrefAttribute == null)
                    {
                        continue;
                    }

                    string url = QT.Entities.Common.GetAbsoluteUrl(hrefAttribute.Value.ToString(), "http://vatgia.com");
                    long IDUrlNew = Math.Abs(GABIZ.Base.Tools.getCRC64(url));
                    if (addedLink.ContainsKey(IDUrlNew))
                    {
                        continue;
                    }

                    if (Regex.IsMatch(url, regexExtract))
                    {
                        PushJobToQueue(new JobCrawler() { level = jobData.level + 1, urlDetail = url });
                        sqlSaveData.RunQuery(
                            "if not exists (select id from VatGiaClassification where id = @id) insert into VatGiaClassification (id, url) values (@id, @url)",
                            CommandType.Text,
                            new System.Data.SqlClient.SqlParameter[]
                            {
                                SqlDb.CreateParamteterSQL("@id", IDUrlNew, SqlDbType.BigInt),
                                SqlDb.CreateParamteterSQL("@url", url, SqlDbType.NVarChar)
                            });
                        addedLink.Add(IDUrlNew, "");
                        addedToDb.Add(IDUrlNew);
                        Console.WriteLine(url);
                    }

                    // Persist the pending ids in one pass once the buffer exceeds 100 entries.
                    if (addedToDb.Count > 100)
                    {
                        foreach (var item in addedToDb)
                        {
                            this.sqlVisited.RunQuery(
                                "if not exists (select id from visitedlink where id = @id) insert into VisitedLink (id) values (@id)",
                                CommandType.Text,
                                new System.Data.SqlClient.SqlParameter[]
                                {
                                    SqlDb.CreateParamteterSQL("@id", item, SqlDbType.BigInt)
                                });
                        }

                        addedToDb.Clear();
                    }
                }

                return true;
            }
            catch (Exception ex01)
            {
                // Failures are logged and the job is still reported as handled.
                log.Error("Exception:", ex01);
                return true;
            }
        };
        worker.Start();
    });

    workerTask.Start();
    Console.ReadLine();
}