Exemplo n.º 1
0
 /// <summary>
 /// Handles one raw message received from the queue identified by <c>queueName</c>.
 /// </summary>
 /// <remarks>
 /// Only the "product_crawler_new" queue is handled: the payload is deserialized with
 /// <c>ProtobufTool.DeSerialize</c> into an <c>MQTask_NewProduct</c> and handed to
 /// <c>CrawlerNewProduct</c>. A message from any other queue is ignored after a trace warning.
 /// Processing is best effort: no exception escapes this method, but every failure is written
 /// to the trace listeners instead of being dropped silently.
 /// </remarks>
 /// <param name="body">Serialized task payload as received from the queue.</param>
 private void ProcessATaskMQ(byte[] body)
 {
     try
     {
         if (queueName == "product_crawler_new")
         {
             MQTask_NewProduct taskCrawler = Websosanh.Core.Common.BAL.ProtobufTool.DeSerialize<MQTask_NewProduct>(body);
             CrawlerNewProduct(taskCrawler);
         }
         else
         {
             // No handler exists for this queue; leave a trace so the dropped message is visible.
             System.Diagnostics.Trace.TraceWarning(
                 "ProcessATaskMQ: no handler for queue '{0}'; message ignored.", queueName);
         }
     }
     catch (Exception ex)
     {
         // Keep the worker alive (do not rethrow), but record what went wrong.
         System.Diagnostics.Trace.TraceError(
             "ProcessATaskMQ: failed to process a message from queue '{0}': {1}", queueName, ex);
     }
 }
Exemplo n.º 2
0
        /// <summary>
        /// Crawls a company's website starting at <c>company.Website</c> and parses every page
        /// whose URL matches the configured product-URL patterns into a <c>Product</c>.
        /// </summary>
        /// <remarks>
        /// Pages are taken from a FIFO queue. Every anchor found on a downloaded page is made
        /// absolute, normalized, and identified by its CRC64; a link is queued at most once, and
        /// only when <c>IsRelevantUrl</c> accepts it. Crawling stops when the queue is empty or when
        /// the number of parsed products reaches <c>config.ItemReCrawler</c>. Progress and errors are
        /// appended to a log file named "&lt;host&gt;.txt". A missing task, company or configuration
        /// is reported through <c>ReportData</c> and nothing is crawled.
        /// </remarks>
        /// <param name="taskCrawler">Task carrying the company to crawl and its crawl configuration.</param>
        private void CrawlerNewProduct(MQTask_NewProduct taskCrawler)
        {
            if (taskCrawler == null)
            {
                ReportData("Task null");
                return;
            }
            if (taskCrawler.company == null)
            {
                ReportData("Company null in task");
                return;
            }
            if (taskCrawler.Configuration == null)
            {
                ReportData("Config null in task");
                return;
            }

            string        rootUrl         = taskCrawler.company.Website;
            Configuration config          = taskCrawler.Configuration;
            List<string>  detailLinkRegex = config.ProductUrlsRegex;
            List<Product> Products        = new List<Product>();
            Queue<string> crawlerLink     = new Queue<string>();
            HashSet<long> visitedCRC      = new HashSet<long>();
            Uri           rootUri         = new Uri(rootUrl);
            string        logFile         = rootUri.Host + ".txt";

            // Every log line shares the same timestamp prefix and target file.
            Action<string> writeLog = message =>
                FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + message, logFile);

            // Mark the start page as seen so that links pointing back to it do not queue it again.
            visitedCRC.Add(GABIZ.Base.Tools.getCRC64(LinkCanonicalization.NormalizeLink(rootUrl)));
            crawlerLink.Enqueue(rootUrl);

            // NOTE(review): config.VisitUrlsRegex is not consulted here; presumably IsRelevantUrl
            // applies the visit patterns itself - confirm.
            while (crawlerLink.Count > 0)
            {
                if (Products.Count >= config.ItemReCrawler)
                {
                    break;
                }

                try
                {
                    string c_url = crawlerLink.Dequeue();
                    writeLog(c_url);

                    // NOTE(review): 45 and 2 are presumably timeout and retry settings - confirm against getHTML.
                    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(c_url, 45, 2);
                    if (!string.IsNullOrEmpty(html))
                    {
                        if (config.UseClearHtml)
                        {
                            html = Common.TidyCleanR(html);
                        }

                        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(html);

                        // Collect outgoing links; SelectNodes returns null when nothing matches.
                        var a_nodes = doc.DocumentNode.SelectNodes("//a[@href]");
                        if (a_nodes != null)
                        {
                            for (int i = 0; i < a_nodes.Count; i++)
                            {
                                string s     = Common.GetAbsoluteUrl(a_nodes[i].Attributes["href"].Value, rootUri);
                                long   s_crc = GABIZ.Base.Tools.getCRC64(LinkCanonicalization.NormalizeLink(s));

                                // Add returns false for a link that was already seen, so each link
                                // is evaluated for relevance only once.
                                if (visitedCRC.Add(s_crc) && IsRelevantUrl(s))
                                {
                                    crawlerLink.Enqueue(s);
                                }
                            }
                        }

                        if (IsDetailUrl(c_url, detailLinkRegex))
                        {
                            QT.Entities.Product p = new Product();
                            p.Analytics(doc, c_url, config, false, taskCrawler.company.Domain);

                            // A product without a usable name is treated as a failed parse.
                            if (p.Name == null || p.Name.Trim() == "")
                            {
                                writeLog("Product not name");
                            }
                            else
                            {
                                Products.Add(p);
                            }
                        }
                    }

                    // Delay between page requests, as configured.
                    Thread.Sleep(config.TimeDelay);
                }
                catch (Exception ex)
                {
                    // One failing page must not stop the crawl; log it and move on to the next URL.
                    writeLog(ex.ToString());
                }
            }

            // NOTE(review): Products is not read after the loop, so the parsed products are
            // discarded as far as this method shows - confirm whether persisting them is missing.
        }