// Breadth-first crawler over rootUrl. Deduplicates visited URLs via a sorted
// List<long> of CRC64 hashes (BinarySearch + Insert at ~index), optionally
// discovers new "company" domains (chkFind) and inserts them into the DB,
// and mirrors progress into the UI when showLog is set.
// NOTE(review): runs on a worker thread (crawlerThread); at the end it aborts
// its own thread — see trailing comment. Assumes fields dtCom, adtCom, webCRC,
// visitedCRC, crawlerLink, rootUri, finish, pause, countWeb, visitedCount,
// currentUrl, ignoredCount are declared on the enclosing form — confirm.
void doCrawler() {
    // Fresh table + adapter for the Company table; connection string comes from app config.
    dtCom = new DB.CompanyDataTable();
    adtCom = new DBTableAdapters.CompanyTableAdapter();
    adtCom.Connection.ConnectionString = QT.Entities.Server.ConnectionString;
    if (adtCom.Connection.State == ConnectionState.Closed) {
        adtCom.Connection.Open();
    }
    if (chkFind.Checked == true) {
        // "Find new websites" mode: preload IDs of all known companies into a
        // sorted list so new domains can be detected with a binary search.
        webCRC = new List<long>();
        adtCom.Fill(dtCom);
        int i0 = 0;
        foreach (var dr in dtCom) {
            i0 = webCRC.BinarySearch(dr.ID);
            if (i0 < 0) {
                // ~i0 is the insertion point that keeps the list sorted.
                webCRC.Insert(~i0, dr.ID);
            }
        }
        //adtCom.Connection.Close();
        //adtCom.Dispose();
        //dtCom.Dispose();
    }
    // Reset crawl state and seed the BFS queue with the root URL.
    visitedCount = 0;
    crawlerLink = new Queue<string>();
    visitedCRC = new List<long>();
    rootUri = new Uri(rootUrl);
    crawlerLink.Enqueue(rootUrl);
    while (crawlerLink.Count > 0) {
        if (finish) {
            break;  // external stop request (field flipped elsewhere)
        }
        if (!pause) {
            // NOTE(review): when pause is true the loop busy-spins without
            // sleeping or dequeuing — presumably intentional, but worth confirming.
            string c_url = crawlerLink.Dequeue();
            try {
                // Fetch page HTML (45s timeout, 2 retries — presumably; confirm against getHTML).
                string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(c_url, 45, 2);
                if (html != "") {
                    GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                    doc.LoadHtml(html);
                    var a_nodes = doc.DocumentNode.SelectNodes("//a[@href]");
                    if (a_nodes != null) {
                        #region add link to process
                        for (int i = 0; i < a_nodes.Count; i++) {
                            // Resolve relative hrefs against the crawl root.
                            string s = Common.GetAbsoluteUrl(a_nodes[i].Attributes["href"].Value, rootUri);
                            if (!IsNoVisitUrl(s)) {
                                // Dedupe on CRC64 of the canonicalized link.
                                long s_crc = Tools.getCRC64(LinkCanonicalization.NormalizeLink(s));
                                int index = visitedCRC.BinarySearch(s_crc);
                                if (index < 0) {
                                    if (IsRelevantUrl(s)) {
                                        crawlerLink.Enqueue(s);
                                    }
                                    visitedCRC.Insert(~index, s_crc);
                                    if (chkFind.Checked == true) {
                                        // Off-site / irrelevant link: candidate new company website.
                                        if (!IsRelevantUrl(s)) {
                                            Uri uri = new Uri(s);
                                            TimeSpan timestartup = new TimeSpan(0, 1, 1, 0);
                                            TimeSpan timeSleep = new TimeSpan(0, 1, 1, 0);
                                            String domain = uri.Host.ToLower();
                                            domain = domain.Replace("www.", "");
                                            long idcom = Common.GetIDCompany(domain);
                                            int index1 = webCRC.BinarySearch(idcom);
                                            if (index1 < 0) {
                                                // Unknown domain: look up Alexa rank, throttle, then insert.
                                                Alexa a = new Alexa();
                                                a = Common.GetRankAlexa(uri.Host);
                                                Thread.Sleep(Common.Obj2Int(txtDelay.Text.Trim()));
                                                countWeb++;
                                                webCRC.Insert(~index1, idcom);
                                                // NOTE(review): ~45 positional args — fragile; the runtime
                                                // string below ("Found from <url>", Vietnamese) must stay as-is.
                                                adtCom.Insert(
                                                    idcom, "", "Tìm thấy từ 
" + txtURL.Text, domain, domain, DateTime.Now, "", "", "", "", Common.CompanyStatus.WEB_ADDNEWS, false, "", a.AlexaRankContries, a.AlexaRank, timestartup, timeSleep, 500, 0, DateTime.Now, DateTime.Now, 30, 0, 0, 0, "", DateTime.Now, "", 0, DateTime.Now, 0, "", "", true, false, false, true, true, true, null, null, false, "", 3);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                        #endregion
                    }
                    if (showLog) {
                        #region show log
                        // Marshal progress counters onto the UI thread.
                        this.Invoke((MethodInvoker) delegate {
                            lblVisited.Text = visitedCount.ToString();
                            lblQueue.Text = crawlerLink.Count.ToString();
                            lblProduct.Text = countWeb.ToString();
                            txtUrlCurrent.Text = currentUrl;
                            var xx = DateTime.Now - start;
                            // Quirk: formats the elapsed TimeSpan by treating its ticks
                            // as a DateTime — only correct while elapsed < 24h.
                            DateTime mydate = new DateTime(xx.Ticks);
                            lblTime.Text = mydate.ToString("HH:mm:ss");
                            lblIgnored.Text = ignoredCount.ToString();
                        });
                        #endregion
                    }
                }
                visitedCount++;
                currentUrl = c_url;
            } catch (Exception ex) {
                // Per-URL failures are logged to <host>.csv and the crawl continues.
                FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + c_url + "\r\n" + ex.ToString(), rootUri.Host + ".csv");
            }
        }
    }
    finish = true;
    crawlerLink.Clear();
    crawlerLink = null;
    this.timer1.Start();
    // NOTE(review): this method appears to run ON crawlerThread, so this aborts
    // the current thread; Thread.Abort is deprecated/unsupported on modern .NET
    // and Join after self-Abort is unreachable — consider a cooperative flag.
    if (crawlerThread != null) {
        if (crawlerThread.IsAlive) {
            crawlerThread.Abort();
            crawlerThread.Join();
            crawlerThread = null;
        }
    }
}
/// <summary>
/// Breadth-first crawl of a company's website driven by an MQ task: follows
/// relevant links, parses pages whose URL matches the product-detail regexes,
/// and collects up to <c>config.ItemReCrawler</c> products into a local list.
/// Per-URL progress and errors are appended to "&lt;host&gt;.txt".
/// </summary>
/// <param name="taskCrawler">Task carrying the company and crawl configuration;
/// both must be non-null or the task is reported and skipped.</param>
private void CrawlerNewProduct(MQTask_NewProduct taskCrawler) {
    if (taskCrawler.company == null) {
        ReportData("Company null in task");
    } else if (taskCrawler.Configuration == null) {
        ReportData("Config null in task");
    } else {
        string rootUrl = taskCrawler.company.Website;
        int visitedCount = 0;
        Configuration config = taskCrawler.Configuration;
        List<string> detailLinkRegex = config.ProductUrlsRegex;
        List<Product> Products = new List<Product>();
        // BFS frontier plus a sorted CRC64 list used for O(log n) visited checks.
        Queue<string> crawlerLink = new Queue<string>();
        List<long> visitedCRC = new List<long>();
        Uri rootUri = new Uri(rootUrl);
        crawlerLink.Enqueue(rootUrl);
        string currentUrl = "";
        bool finish = false;
        bool pause = false;
        int ignoredCount = 0;
        while (crawlerLink.Count > 0) {
            // Stop once enough products were collected for this re-crawl batch.
            if (Products.Count >= config.ItemReCrawler) {
                break;
            }
            if (finish) {
                break;
            }
            if (!pause) {
                try {
                    string c_url = crawlerLink.Dequeue();
                    FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + c_url, rootUri.Host + ".txt");
                    // Fetch page HTML (45s timeout, 2 retries — presumably; confirm against getHTML).
                    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(c_url, 45, 2);
                    if (html != "") {
                        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                        if (config.UseClearHtml) {
                            html = Common.TidyCleanR(html);
                        }
                        doc.LoadHtml(html);
                        // Enqueue every unseen, relevant absolute link found on the page.
                        var a_nodes = doc.DocumentNode.SelectNodes("//a[@href]");
                        if (a_nodes != null) {
                            for (int i = 0; i < a_nodes.Count; i++) {
                                string s = Common.GetAbsoluteUrl(a_nodes[i].Attributes["href"].Value, rootUri);
                                long s_crc = GABIZ.Base.Tools.getCRC64(LinkCanonicalization.NormalizeLink(s));
                                int index = visitedCRC.BinarySearch(s_crc);
                                if (index < 0) {
                                    if (IsRelevantUrl(s)) {
                                        crawlerLink.Enqueue(s);
                                    }
                                    // ~index is the insertion point that keeps the list sorted.
                                    visitedCRC.Insert(~index, s_crc);
                                }
                            }
                        }
                        if (IsDetailUrl(c_url, detailLinkRegex)) {
                            QT.Entities.Product p = new Product();
                            p.Analytics(doc, c_url, config, false, taskCrawler.company.Domain);
                            // BUG FIX: the original guarded "if (p != null)" immediately
                            // after "new Product()" — always true — so its else branch
                            // (ignoredCount++) was unreachable. Pages whose parsed product
                            // has no usable name are now counted as ignored instead;
                            // ignoredCount is a local never read back, so observable
                            // behavior is unchanged.
                            if (p.Name != null && p.Name.Trim() != "") {
                                Products.Add(p);
                            } else {
                                if (p.Name != null) {
                                    // Name present but blank: log it, as before.
                                    FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + "Product not name", rootUri.Host + ".txt");
                                }
                                ignoredCount++;
                            }
                        }
                    }
                    visitedCount++;
                    currentUrl = c_url;
                    // Politeness delay between requests, from configuration.
                    Thread.Sleep(config.TimeDelay);
                } catch (Exception ex) {
                    // Per-URL failures are logged and the crawl continues.
                    FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + ex.ToString(), rootUri.Host + ".txt");
                }
            }
        }
        finish = true;
    }
}