/// <summary>
/// Handles one raw message taken from the queue identified by <c>queueName</c>.
/// </summary>
/// <remarks>
/// Only the "product_crawler_new" queue is processed: the payload is deserialized with
/// <c>ProtobufTool.DeSerialize</c> into an <c>MQTask_NewProduct</c> and passed to
/// <c>CrawlerNewProduct</c>. Messages for any other queue name are ignored.
/// Exceptions are never propagated to the caller; they are written to the trace
/// listeners so that a failed task is no longer lost silently.
/// </remarks>
/// <param name="body">Serialized task payload as received from the queue.</param>
private void ProcessATaskMQ(byte[] body)
{
    try
    {
        if (queueName == "product_crawler_new")
        {
            MQTask_NewProduct taskCrawler =
                Websosanh.Core.Common.BAL.ProtobufTool.DeSerialize<MQTask_NewProduct>(body);
            CrawlerNewProduct(taskCrawler);
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract (a bad message must not take the consumer down),
        // but record what went wrong instead of discarding the exception.
        System.Diagnostics.Trace.TraceError(
            "ProcessATaskMQ failed for queue '{0}': {1}", queueName, ex);
    }
}
/// <summary>
/// Crawls a company's website starting from its root URL, following links through a
/// FIFO queue, and collects products parsed from pages whose URL is a detail URL.
/// </summary>
/// <remarks>
/// The crawl stops when the queue is empty or when the number of collected products
/// reaches <c>config.ItemReCrawler</c>. Every visited URL and every exception is appended
/// to a log file named after the root host. A missing task, company or configuration is
/// reported through <c>ReportData</c> and nothing is crawled.
/// NOTE(review): the collected products are neither returned nor stored by this method
/// in the code visible here - confirm whether persisting them is still missing.
/// </remarks>
/// <param name="taskCrawler">Task carrying the company and the crawl configuration.</param>
private void CrawlerNewProduct(MQTask_NewProduct taskCrawler)
{
    if (taskCrawler == null)
    {
        ReportData("Task null");
        return;
    }

    if (taskCrawler.company == null)
    {
        ReportData("Company null in task");
        return;
    }

    if (taskCrawler.Configuration == null)
    {
        ReportData("Config null in task");
        return;
    }

    Configuration config = taskCrawler.Configuration;
    List<string> detailLinkRegex = config.ProductUrlsRegex;

    string rootUrl = taskCrawler.company.Website;
    Uri rootUri = new Uri(rootUrl);
    string logFile = rootUri.Host + ".txt";

    List<Product> products = new List<Product>();
    Queue<string> pendingUrls = new Queue<string>();

    // CRCs of every normalized link already seen, so each link is queued at most once.
    HashSet<long> seenLinkCrcs = new HashSet<long>();

    pendingUrls.Enqueue(rootUrl);

    // Mark the root as seen too; otherwise a link pointing back to the home page
    // would queue it a second time.
    long rootCrc = GABIZ.Base.Tools.getCRC64(LinkCanonicalization.NormalizeLink(rootUrl));
    seenLinkCrcs.Add(rootCrc);

    while (pendingUrls.Count > 0 && products.Count < config.ItemReCrawler)
    {
        try
        {
            string pageUrl = pendingUrls.Dequeue();
            FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + pageUrl, logFile);

            // NOTE(review): 45 and 2 are presumably a timeout and a retry count - confirm
            // against HTMLTransmitter.getHTML.
            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(pageUrl, 45, 2);

            // Treat a null response like an empty one instead of letting LoadHtml throw.
            if (!string.IsNullOrEmpty(html))
            {
                if (config.UseClearHtml)
                {
                    html = Common.TidyCleanR(html);
                }

                GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(html);

                // Discover outgoing links and queue the relevant ones not seen before.
                var anchorNodes = doc.DocumentNode.SelectNodes("//a[@href]");
                if (anchorNodes != null)
                {
                    for (int i = 0; i < anchorNodes.Count; i++)
                    {
                        string link = Common.GetAbsoluteUrl(anchorNodes[i].Attributes["href"].Value, rootUri);
                        long linkCrc = GABIZ.Base.Tools.getCRC64(LinkCanonicalization.NormalizeLink(link));

                        // Add returns false when the CRC is already present. Irrelevant
                        // links are remembered as well so they are not re-checked.
                        if (seenLinkCrcs.Add(linkCrc) && IsRelevantUrl(link))
                        {
                            pendingUrls.Enqueue(link);
                        }
                    }
                }

                if (IsDetailUrl(pageUrl, detailLinkRegex))
                {
                    QT.Entities.Product product = new Product();
                    product.Analytics(doc, pageUrl, config, false, taskCrawler.company.Domain);

                    if (product.Name != null)
                    {
                        if (product.Name.Trim() != "")
                        {
                            products.Add(product);
                        }
                        else
                        {
                            FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + "Product not name", logFile);
                        }
                    }
                }
            }

            // Pause between requests as configured.
            Thread.Sleep(config.TimeDelay);
        }
        catch (Exception ex)
        {
            // A failure on one page is logged and the crawl continues with the next URL.
            FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + ex.ToString(), logFile);
        }
    }
}