Exemple #1
0
 /// <summary>
 /// Determines the product status by probing each XPath in order and classifying the
 /// normalized text of every node it matches.
 /// </summary>
 /// <param name="doc">Parsed HTML document to query.</param>
 /// <param name="StatusXPath">Candidate XPath expressions; entries that are blank after trimming are skipped.</param>
 /// <returns>
 /// The first classification that is not <c>NotDefine</c>. If nodes were matched but none of
 /// them classified, <c>NotDefine</c>; if no XPath matched any node at all, <c>LienHe</c>.
 /// </returns>
 private Common.ProductStatus ParseStatus(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List <string> StatusXPath)
 {
     Common.ProductStatus status = Common.ProductStatus.LienHe;

     foreach (string xpath in StatusXPath)
     {
         if (xpath.Trim() == "")
         {
             continue;
         }

         var matches = doc.DocumentNode.SelectNodes(xpath);
         if (matches == null || matches.Count == 0)
         {
             continue;
         }

         foreach (var match in matches)
         {
             // Strip markup, trim, drop literal "&nbsp;" entities, then normalize Unicode and lower-case.
             string stripped = Tools.removeHTML(match.InnerText).Trim().Replace("&nbsp;", "");
             string text     = QT.Entities.Common.ChuanHoaUnicode(stripped).ToLower();

             // The exact phrase "không còn hàng" is reduced to "không" before classification.
             if (text == "không còn hàng")
             {
                 text = text.Replace(" còn hàng", "");
             }

             status = QT.Entities.CrawlerProduct.ProductStatusRegex.Instance().GetStatusProduct(text);
             if (status != Common.ProductStatus.NotDefine)
             {
                 // First definite answer wins; no further nodes or XPaths are examined.
                 return status;
             }
         }
     }

     return status;
 }
Exemple #2
0
        /// <summary>
        /// Reads the header/content cell pairs of the <c>tableproperties</c> table and feeds each
        /// pair to <c>ParseData</c> to populate a <see cref="DocInfo"/>.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="url">Source URL of the document; not used by this method.</param>
        /// <returns>
        /// The populated info object, or <c>null</c> when the document contains no rows under a
        /// <c>table[@class='tableproperties']</c>.
        /// </returns>
        public DocInfo ParseInfoDoc(HtmlDocument doc, string url)
        {
            DocInfo info = new DocInfo();
            var     rows = doc.DocumentNode.SelectNodes("//table[@class='tableproperties']//tr");

            if (rows == null)
            {
                return null;
            }

            foreach (var row in rows)
            {
                var cells = row.SelectNodes("./td");
                if (cells == null)
                {
                    continue;
                }

                // Stop one cell short of the end: a header in the last cell has no content cell
                // after it, and indexing cells[i + 1] there would throw.
                for (int i = 0; i + 1 < cells.Count; i++)
                {
                    // GetAttributeValue falls back to "" when the attribute is missing, so a
                    // separate "has class attribute" check is unnecessary.
                    if (cells[i].GetAttributeValue("class", "") != "headerproperties")
                    {
                        continue;
                    }

                    if (cells[i + 1].GetAttributeValue("class", "") != "contentproperties")
                    {
                        continue;
                    }

                    string field = cells[i].InnerText.Trim().ToLower();
                    string value = cells[i + 1].InnerText.Trim();
                    ParseData(field, value, info);
                }
            }

            return info;
        }
Exemple #3
0
        /// <summary>
        /// Fills <paramref name="document"/> from the <c>detailcontent</c> table of the page and
        /// groups its paragraphs into a structure list keyed by "chuong_" anchors.
        /// </summary>
        /// <param name="document">Document to populate. Text, HTML, id and URL are set as soon as the content table is found.</param>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="url">Source URL; stored on the document and used to derive its id.</param>
        /// <returns>
        /// <c>true</c> when the content table, the justified detail block and at least one paragraph
        /// were found and the structure was built; otherwise <c>false</c>.
        /// </returns>
        public bool Parse(ref Documet document, HtmlDocument doc, string url)
        {
            var nodeData = doc.DocumentNode.SelectSingleNode("//table[@class='detailcontent']");
            if (nodeData == null)
            {
                return false;
            }

            document.TextDoc = nodeData.InnerText;
            document.HtmlDoc = nodeData.InnerHtml;
            document.Id      = Common.CrcProductID(url);
            document.Url     = url;

            var nodeDetail =
                doc.DocumentNode.SelectSingleNode(@"//table[@class='detailcontent']//tr/td/div[@align='justify']");

            // The detail block may be missing even when the outer table exists; treat that the
            // same way as "no paragraphs" instead of dereferencing null.
            var paragraphs = nodeDetail == null ? null : nodeDetail.SelectNodes(".//p");
            if (paragraphs == null)
            {
                return false;
            }

            var structure = new List <Tuple <HtmlNode, List <HtmlNode>, List <String> > >();

            foreach (var paragraph in paragraphs)
            {
                var    anchor     = paragraph.SelectSingleNode(".//a[@name]");
                string anchorName = anchor == null ? "" : anchor.GetAttributeValue("name", "").ToLower();

                if (anchorName.StartsWith("chuong_"))
                {
                    // A "chuong_" anchor opens a new group headed by that anchor.
                    structure.Add(new Tuple <HtmlNode, List <HtmlNode>, List <String> >(
                                      anchor, new List <HtmlNode>(), new List <string>()));
                }
                else if (structure.Count == 0 && anchorName.StartsWith("dieu_"))
                {
                    // A "dieu_" anchor seen before any "chuong_" group opens a group headed by a
                    // placeholder node. The paragraph carrying the anchor is not added to the group.
                    structure.Add(new Tuple <HtmlNode, List <HtmlNode>, List <String> >(
                                      new HtmlNode(HtmlNodeType.Element, doc, -1),
                                      new List <HtmlNode>(), new List <string>()));
                }
                else if (structure.Count > 0)
                {
                    // Every other paragraph belongs to the most recently opened group.
                    structure[structure.Count - 1].Item2.Add(paragraph);
                }
            }

            document.LstStructure = structure;
            document.ParseDieu();
            return true;
        }
Exemple #4
0
        /// <summary>
        /// Classifies a page by matching its category text against the phone listing pattern.
        /// </summary>
        /// <param name="c_url">URL of the page; not used by this method.</param>
        /// <param name="document">Parsed HTML document passed to <c>GetCategory</c>.</param>
        /// <param name="config">XPath configuration passed to <c>GetCategory</c>.</param>
        /// <returns>
        /// <c>PhoneComputer</c> when the trimmed category matches the pattern; otherwise <c>None</c>,
        /// including when the category is null, empty or whitespace.
        /// </returns>
        public Model.SaleNews.TypeCrawlerData GetTypeOfLink(string c_url, GABIZ.Base.HtmlAgilityPack.HtmlDocument document, ConfigXPaths config)
        {
            string category = GetCategory(document, config);

            // Check for null before trimming; trimming first would throw on a null category.
            if (string.IsNullOrEmpty(category))
            {
                return(Model.SaleNews.TypeCrawlerData.None);
            }

            category = category.Trim();
            if (category.Length > 0 && Regex.IsMatch(category, "Nhật tảo->Mua bán->Điện thoại.*"))
            {
                return(Model.SaleNews.TypeCrawlerData.PhoneComputer);
            }

            return(Model.SaleNews.TypeCrawlerData.None);
        }
Exemple #5
0
        /// <summary>
        /// Builds a short description from the trimmed inner text of the first node matched by each XPath.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="xpaths">XPath expressions, evaluated in order.</param>
        /// <returns>The collected texts joined by a single space; an empty string when nothing matched.</returns>
        private string ParseShortDescription(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List <string> xpaths)
        {
            var parts = new List <string>();

            for (int index = 0; index < xpaths.Count; index++)
            {
                var node = doc.DocumentNode.SelectSingleNode(xpaths[index]);
                if (node == null)
                {
                    continue;
                }

                // Empty texts are kept on purpose, so they still contribute a separator.
                string text = node.InnerText.Trim();
                parts.Add(text);
            }

            return string.Join(" ", parts);
        }
Exemple #6
0
        /// <summary>
        /// Builds the product name from the first node matched by each XPath, with markup removed.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="xpaths">XPath expressions, evaluated in order.</param>
        /// <returns>The non-empty name fragments joined by a single space; an empty string when none were found.</returns>
        private string ParseName(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List <string> xpaths)
        {
            List <string> fragments = new List <string>();

            foreach (string xpath in xpaths)
            {
                var node = doc.DocumentNode.SelectSingleNode(xpath);
                if (node == null)
                {
                    continue;
                }

                string fragment = Tools.removeHTML(node.InnerText).Trim();
                if (string.IsNullOrEmpty(fragment))
                {
                    // Nodes that contain only markup or whitespace contribute nothing.
                    continue;
                }

                fragments.Add(fragment);
            }

            return string.Join(" ", fragments);
        }
Exemple #7
0
        /// <summary>
        /// Collects the VAT-related text found at the given XPaths.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="VATInfoXPaths">XPath expressions, evaluated in order; blank entries are skipped.</param>
        /// <returns>
        /// The texts for which <c>Common.ParseVATInfo</c> did not return -1, separated by single
        /// spaces; an empty string when there were none.
        /// </returns>
        private string ParseVatInfo(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, IEnumerable <string> VATInfoXPaths)
        {
            string collected = "";

            foreach (string xpath in VATInfoXPaths)
            {
                if (xpath.Trim() == "")
                {
                    continue;
                }

                var node = doc.DocumentNode.SelectSingleNode(xpath);
                if (node == null)
                {
                    continue;
                }

                string text = Tools.removeHTML(node.InnerText);
                if (Common.ParseVATInfo(text) == -1)
                {
                    // Text that does not parse as VAT information is ignored.
                    continue;
                }

                // Insert a separator only once something has already been collected.
                if (string.IsNullOrEmpty(collected))
                {
                    collected = collected + text;
                }
                else
                {
                    collected = collected + " " + text;
                }
            }

            return collected;
        }
Exemple #8
0
        /// <summary>
        /// Collects the promotion-related text found at the given XPaths.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="PromotionInfoXPaths">XPath expressions, evaluated in order; blank entries are skipped.</param>
        /// <returns>
        /// The texts for which <c>Common.ParsePromotionInfo</c> did not return -1, joined by single
        /// spaces and passed through <c>Common.RemoveDumplicateSpace</c>; an empty string when there were none.
        /// </returns>
        private string ParsePromotionInfo(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List <string> PromotionInfoXPaths)
        {
            var promotionInfo = "";

            foreach (string PromotionInfoXPath in PromotionInfoXPaths)
            {
                if (PromotionInfoXPath.Trim() == "")
                {
                    continue;
                }

                var promotionNode = doc.DocumentNode.SelectSingleNode(PromotionInfoXPath);
                if (promotionNode == null)
                {
                    continue;
                }

                string tempPromotion = Tools.removeHTML(promotionNode.InnerText);
                if (Common.ParsePromotionInfo(tempPromotion) == -1)
                {
                    continue;
                }

                // Append to what has been collected so far. Assigning only the separator plus the
                // new text would discard earlier matches and leave a leading space.
                string separator = string.IsNullOrEmpty(promotionInfo) ? "" : " ";
                promotionInfo = Common.RemoveDumplicateSpace(promotionInfo + separator + tempPromotion);
            }

            return(promotionInfo);
        }
Exemple #9
0
        /// <summary>
        /// Extracts the warranty value from the first XPath whose text can be parsed.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="WarrantyXPath">XPath expressions, evaluated in order; blank entries are skipped.</param>
        /// <returns>
        /// The first value from <c>Common.ParseWarranty</c> that is not -1, or 0 when no XPath produced one.
        /// </returns>
        private int ParseWarranty(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List <string> WarrantyXPath)
        {
            foreach (string xpath in WarrantyXPath)
            {
                if (xpath.Trim() == "")
                {
                    continue;
                }

                var node = doc.DocumentNode.SelectSingleNode(xpath);
                if (node == null)
                {
                    continue;
                }

                string text   = Tools.removeHTML(node.InnerText);
                int    parsed = Common.ParseWarranty(text);
                if (parsed != -1)
                {
                    // First parsable value wins.
                    return parsed;
                }
            }

            return 0;
        }
Exemple #10
0
        /// <summary>
        /// Extracts the deal end date from the first XPath whose text parses to a real date.
        /// </summary>
        /// <param name="doc">Parsed HTML document to query.</param>
        /// <param name="EndDealXPath">XPath expressions, evaluated in order; blank entries are skipped.</param>
        /// <returns>
        /// The first value from <c>Common.ParseDateTime</c> that differs from <c>SqlDb.MinDateDb</c>,
        /// or <c>SqlDb.MinDateDb</c> when no XPath produced one.
        /// </returns>
        private DateTime ParseEndDeal(GABIZ.Base.HtmlAgilityPack.HtmlDocument doc, List <string> EndDealXPath)
        {
            DateTime fallback = SqlDb.MinDateDb;

            foreach (string xpath in EndDealXPath)
            {
                if (xpath.Trim() == "")
                {
                    continue;
                }

                var node = doc.DocumentNode.SelectSingleNode(xpath);
                if (node == null)
                {
                    continue;
                }

                // Remove trailing '-', ' ' and '>' characters left over after the markup is stripped.
                string   text   = Tools.removeHTML(node.InnerText).TrimEnd(new char[] { '-', ' ', '>' });
                DateTime parsed = Common.ParseDateTime(text);
                if (parsed != SqlDb.MinDateDb)
                {
                    return parsed;
                }
            }

            return fallback;
        }
Exemple #11
0
        /// <summary>
        /// Starts a queue consumer that downloads each job's page, extracts links matching the
        /// classification URL pattern, queues them as new jobs and records them in the database.
        /// Blocks on <c>Console.ReadLine</c> after starting the worker task.
        /// </summary>
        /// <remarks>
        /// Visited-link ids are written to the VisitedLink table in batches once more than 100 have
        /// accumulated. The job handler always returns <c>true</c>, including after an exception,
        /// which is logged.
        /// </remarks>
        public void FindClassification()
        {
            List <long> addedToDb = new List <long>();
            Dictionary <long, string> addedLink = this.LoadVisitedLink();
            ILog log = log4net.LogManager.GetLogger(typeof(Program));

            var  rabbitMQServer = RabbitMQManager.GetRabbitMQServer("rabbitMQ177");
            var  worker         = new Worker("VatGia_Queue", false, rabbitMQServer);
            Task workerTask     = new Task(() =>
            {
                log.Info("Start consumer!");
                worker.JobHandler = (downloadImageJob) =>
                {
                    log.Info("Get job from MQ");
                    try
                    {
                        JobCrawler jobData = JobCrawler.Deserialize(downloadImageJob.Data);
                        if (jobData == null)
                        {
                            return(true);
                        }

                        Console.WriteLine("Get Job:" + jobData.urlDetail);
                        long IDUrl = Math.Abs(GABIZ.Base.Tools.getCRC64(jobData.urlDetail));
                        if (addedLink.ContainsKey(IDUrl))
                        {
                            // This page has already been processed.
                            return(true);
                        }

                        addedLink.Add(IDUrl, "");
                        string regexExtract = @"^.*vatgia.com\/\d+\/[^\/]*html$";
                        string html         = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(jobData.urlDetail, 120, 2);

                        GABIZ.Base.HtmlAgilityPack.HtmlDocument htmlDocument = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                        htmlDocument.LoadHtml(html);
                        var nodesLink = htmlDocument.DocumentNode.SelectNodes("//a");
                        if (nodesLink == null)
                        {
                            return(true);
                        }

                        foreach (var nodeLink in nodesLink)
                        {
                            // Anchors without an href (e.g. named anchors) carry no link. Skip them
                            // so one such element does not abort the rest of the page via the catch below.
                            var hrefAttribute = nodeLink.Attributes["href"];
                            if (hrefAttribute == null || hrefAttribute.Value == null)
                            {
                                continue;
                            }

                            string url    = QT.Entities.Common.GetAbsoluteUrl(hrefAttribute.Value, "http://vatgia.com");
                            long IDUrlNew = Math.Abs(GABIZ.Base.Tools.getCRC64(url));
                            if (addedLink.ContainsKey(IDUrlNew))
                            {
                                continue;
                            }

                            if (Regex.IsMatch(url, regexExtract))
                            {
                                PushJobToQueue(new JobCrawler()
                                {
                                    level     = jobData.level + 1,
                                    urlDetail = url
                                });

                                // IDUrlNew is the same CRC-based id the insert needs; reuse it.
                                sqlSaveData.RunQuery("if not exists (select id from VatGiaClassification where id = @id) insert into VatGiaClassification (id, url) values (@id, @url)", CommandType.Text,
                                                     new System.Data.SqlClient.SqlParameter[] {
                                    SqlDb.CreateParamteterSQL("@id", IDUrlNew, SqlDbType.BigInt),
                                    SqlDb.CreateParamteterSQL("@url", url, SqlDbType.NVarChar)
                                });

                                addedLink.Add(IDUrlNew, "");
                                addedToDb.Add(IDUrlNew);

                                Console.WriteLine(url);
                            }

                            // Persist visited ids in batches once more than 100 have accumulated.
                            if (addedToDb.Count > 100)
                            {
                                foreach (var item in addedToDb)
                                {
                                    this.sqlVisited.RunQuery("if not exists (select id from visitedlink where id = @id) insert into VisitedLink (id) values (@id)"
                                                             , CommandType.Text
                                                             , new System.Data.SqlClient.SqlParameter[] {
                                        SqlDb.CreateParamteterSQL("@id", item, SqlDbType.BigInt)
                                    });
                                }
                                addedToDb.Clear();
                            }
                        }

                        return(true);
                    }
                    catch (Exception ex01)
                    {
                        log.Error("Exception:", ex01);
                        return(true);
                    }
                };
                worker.Start();
            });

            workerTask.Start();
            Console.ReadLine();
        }