Example #1
0
        /// <summary>
        /// Crawls breadth-first from <c>rootUrl</c>: every page taken from the queue is downloaded,
        /// its <c>&lt;a href&gt;</c> links are de-duplicated by CRC64 of the normalized link, and
        /// links accepted by <c>IsRelevantUrl</c> are queued for a later visit.
        /// When <c>chkFind</c> is checked at the start of the crawl, every new link that is NOT
        /// relevant is treated as a candidate external site: its host is turned into a company id
        /// and, if that id is not already known, a row is inserted through <c>adtCom</c>.
        /// </summary>
        /// <remarks>
        /// The loop stops when the queue is empty or <c>finish</c> is set, and idles while
        /// <c>pause</c> is set. Per-page failures are appended to "&lt;root host&gt;.csv" and the
        /// crawl continues with the next queued URL.
        /// </remarks>
        void doCrawler()
        {
            // How long to idle between two checks of the pause flag.
            const int pausePollMilliseconds = 100;

            dtCom  = new DB.CompanyDataTable();
            adtCom = new DBTableAdapters.CompanyTableAdapter();
            adtCom.Connection.ConnectionString = QT.Entities.Server.ConnectionString;
            if (adtCom.Connection.State == ConnectionState.Closed)
            {
                adtCom.Connection.Open();
            }

            try
            {
                // Snapshot the option once: webCRC is only built when it is set here, so
                // re-reading the checkbox per link could hit a null webCRC if the user
                // ticks the box while the crawl is already running.
                bool findCompanies = chkFind.Checked == true;
                if (findCompanies)
                {
                    // Sorted list of the company ids already stored, used for binary search.
                    webCRC = new List <long>();
                    adtCom.Fill(dtCom);
                    foreach (var dr in dtCom)
                    {
                        int i0 = webCRC.BinarySearch(dr.ID);
                        if (i0 < 0)
                        {
                            webCRC.Insert(~i0, dr.ID);
                        }
                    }
                }

                visitedCount = 0;
                crawlerLink  = new Queue <string>();
                visitedCRC   = new List <long>();
                rootUri      = new Uri(rootUrl);
                crawlerLink.Enqueue(rootUrl);

                while (crawlerLink.Count > 0)
                {
                    if (finish)
                    {
                        break;
                    }
                    if (pause)
                    {
                        // Yield while paused instead of spinning at full CPU.
                        Thread.Sleep(pausePollMilliseconds);
                        continue;
                    }

                    string c_url = crawlerLink.Dequeue();
                    try
                    {
                        string html    = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(c_url, 45, 2);
                        bool   hasHtml = !string.IsNullOrEmpty(html);

                        if (hasHtml)
                        {
                            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                            doc.LoadHtml(html);

                            var a_nodes = doc.DocumentNode.SelectNodes("//a[@href]");
                            if (a_nodes != null)
                            {
                                for (int i = 0; i < a_nodes.Count; i++)
                                {
                                    string s = Common.GetAbsoluteUrl(a_nodes[i].Attributes["href"].Value, rootUri);
                                    if (IsNoVisitUrl(s))
                                    {
                                        continue;
                                    }

                                    long s_crc = Tools.getCRC64(LinkCanonicalization.NormalizeLink(s));
                                    int  index = visitedCRC.BinarySearch(s_crc);
                                    if (index >= 0)
                                    {
                                        // Link already seen.
                                        continue;
                                    }

                                    // Evaluate once; the result drives both the queueing and the company lookup.
                                    bool relevant = IsRelevantUrl(s);
                                    if (relevant)
                                    {
                                        crawlerLink.Enqueue(s);
                                    }
                                    visitedCRC.Insert(~index, s_crc);

                                    if (!findCompanies || relevant)
                                    {
                                        continue;
                                    }

                                    // Non-relevant link: register its host as a company if the id is new.
                                    Uri    uri    = new Uri(s);
                                    String domain = uri.Host.ToLower();
                                    domain = domain.Replace("www.", "");

                                    long idcom  = Common.GetIDCompany(domain);
                                    int  index1 = webCRC.BinarySearch(idcom);
                                    if (index1 < 0)
                                    {
                                        TimeSpan timestartup = new TimeSpan(0, 1, 1, 0);
                                        TimeSpan timeSleep   = new TimeSpan(0, 1, 1, 0);

                                        Alexa a = Common.GetRankAlexa(uri.Host);
                                        // Delay (value of txtDelay) after each rank lookup.
                                        Thread.Sleep(Common.Obj2Int(txtDelay.Text.Trim()));
                                        countWeb++;
                                        webCRC.Insert(~index1, idcom);
                                        adtCom.Insert(
                                            idcom,
                                            "",
                                            "Tìm thấy từ " + txtURL.Text, // Vietnamese: "Found from <url>"
                                            domain,
                                            domain,
                                            DateTime.Now,
                                            "",
                                            "",
                                            "",
                                            "",
                                            Common.CompanyStatus.WEB_ADDNEWS,
                                            false,
                                            "",
                                            a.AlexaRankContries,
                                            a.AlexaRank,
                                            timestartup,
                                            timeSleep,
                                            500,
                                            0,
                                            DateTime.Now,
                                            DateTime.Now,
                                            30,
                                            0,
                                            0,
                                            0, "", DateTime.Now, "", 0, DateTime.Now, 0, "", "", true, false, false, true, true, true, null, null, false, "", 3);
                                    }
                                }
                            }
                        }

                        // Update the counters before the status refresh so the labels describe
                        // the page that was just processed rather than the previous one.
                        visitedCount++;
                        currentUrl = c_url;

                        if (hasHtml && showLog)
                        {
                            // Marshal the label updates onto the thread that owns the controls.
                            this.Invoke((MethodInvoker) delegate
                            {
                                lblVisited.Text    = visitedCount.ToString();
                                lblQueue.Text      = crawlerLink.Count.ToString();
                                lblProduct.Text    = countWeb.ToString();
                                txtUrlCurrent.Text = currentUrl;
                                var xx             = DateTime.Now - start;
                                DateTime mydate    = new DateTime(xx.Ticks);
                                lblTime.Text       = mydate.ToString("HH:mm:ss");
                                lblIgnored.Text    = ignoredCount.ToString();
                            });
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: log the failing URL and carry on with the next one.
                        FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + c_url + "\r\n" + ex.ToString(), rootUri.Host + ".csv");
                    }
                }
            }
            finally
            {
                // The connection was opened explicitly above; release it once the crawl is over.
                if (adtCom.Connection.State != ConnectionState.Closed)
                {
                    adtCom.Connection.Close();
                }
            }

            finish = true;
            crawlerLink.Clear();
            crawlerLink = null;
            this.timer1.Start();
            // NOTE(review): if this method itself runs on crawlerThread, Abort() below targets the
            // current thread and the two statements after it are never reached — confirm intent.
            if (crawlerThread != null)
            {
                if (crawlerThread.IsAlive)
                {
                    crawlerThread.Abort();
                    crawlerThread.Join();
                    crawlerThread = null;
                }
            }
        }
Example #2
0
        /// <summary>
        /// Crawls the website of the task's company, starting at <c>company.Website</c>, and
        /// analyses every visited page whose URL matches the configured product-URL patterns.
        /// The crawl ends when the queue is empty or when <c>config.ItemReCrawler</c> named
        /// products have been collected.
        /// </summary>
        /// <param name="taskCrawler">
        /// Task carrying the company and its crawl configuration. If either is null the problem
        /// is reported through <c>ReportData</c> and nothing is crawled.
        /// </param>
        /// <remarks>
        /// Every visited URL and every failure is appended to "&lt;root host&gt;.txt".
        /// NOTE(review): the collected products are only counted here; nothing visible in this
        /// method stores or returns them — confirm whether a persistence step is missing.
        /// </remarks>
        private void CrawlerNewProduct(MQTask_NewProduct taskCrawler)
        {
            if (taskCrawler.company == null)
            {
                ReportData("Company null in task");
                return;
            }
            if (taskCrawler.Configuration == null)
            {
                ReportData("Config null in task");
                return;
            }

            string         rootUrl         = taskCrawler.company.Website;
            Configuration  config          = taskCrawler.Configuration;
            List <string>  detailLinkRegex = config.ProductUrlsRegex;
            List <Product> Products        = new List <Product>();
            Queue <string> crawlerLink     = new Queue <string>();
            // Sorted CRC64 values of the links seen so far, searched with BinarySearch.
            List <long>    visitedCRC      = new List <long>();
            Uri            rootUri         = new Uri(rootUrl);
            crawlerLink.Enqueue(rootUrl);

            while (crawlerLink.Count > 0 && Products.Count < config.ItemReCrawler)
            {
                try
                {
                    string c_url = crawlerLink.Dequeue();
                    FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + c_url, rootUri.Host + ".txt");
                    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(c_url, 45, 2);
                    if (!string.IsNullOrEmpty(html))
                    {
                        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                        if (config.UseClearHtml)
                        {
                            html = Common.TidyCleanR(html);
                        }
                        doc.LoadHtml(html);

                        var a_nodes = doc.DocumentNode.SelectNodes("//a[@href]");
                        if (a_nodes != null)
                        {
                            for (int i = 0; i < a_nodes.Count; i++)
                            {
                                string s = Common.GetAbsoluteUrl(a_nodes[i].Attributes["href"].Value, rootUri);

                                long s_crc = GABIZ.Base.Tools.getCRC64(LinkCanonicalization.NormalizeLink(s));
                                int  index = visitedCRC.BinarySearch(s_crc);
                                if (index < 0)
                                {
                                    if (IsRelevantUrl(s))
                                    {
                                        crawlerLink.Enqueue(s);
                                    }
                                    // Remember the link even when it is not queued so it is judged only once.
                                    visitedCRC.Insert(~index, s_crc);
                                }
                            }
                        }

                        if (IsDetailUrl(c_url, detailLinkRegex))
                        {
                            Product p = new Product();
                            p.Analytics(doc, c_url, config, false, taskCrawler.company.Domain);

                            // A product whose name is null is skipped silently; a blank name is logged.
                            if (p.Name != null)
                            {
                                if (p.Name.Trim() != "")
                                {
                                    Products.Add(p);
                                }
                                else
                                {
                                    FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + "Product not name", rootUri.Host + ".txt");
                                }
                            }
                        }
                    }

                    // Delay between two page requests, taken from the crawl configuration.
                    Thread.Sleep(config.TimeDelay);
                }
                catch (Exception ex)
                {
                    // Best effort: log the failure and continue with the next queued URL.
                    FileLog.WriteAppendText(DateTime.Now.ToString("dd/MM HH:mm:ss") + "\t, " + ex.ToString(), rootUri.Host + ".txt");
                }
            }
        }