Exemplo n.º 1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, parses it into an HTML document and
        /// returns what <c>QT.Entities.Common.GetTextInNode</c> extracts for <paramref name="xpath"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="xpath">XPath expression passed through to <c>GetTextInNode</c>.</param>
        /// <returns>The list produced by <c>GetTextInNode</c> for the downloaded document.</returns>
        public List <string> GetListTag(string url, string xpath)
        {
            string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2, true);

            // The crawler loop in this class guards the same downloader with
            // string.IsNullOrEmpty, so a null result is evidently possible. Load it as an
            // empty document instead of handing null to the parser.
            // NOTE(review): upstream HtmlAgilityPack's LoadHtml rejects null; presumably the
            // bundled GABIZ fork behaves the same - confirm.
            GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html ?? string.Empty);

            return QT.Entities.Common.GetTextInNode(doc, xpath);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Crawler worker loop. Each run seeds a queue with the configured root links, then
        /// repeatedly downloads the next queued page, enqueues the links found on it that match
        /// the visit/product URL rules, and - when the page itself matches the product URL
        /// rules - analyses it and inserts or updates the resulting product in MongoDB.
        /// After a run ends (queue empty or <c>Pause</c> set) the method waits and starts a
        /// new run from the root links.
        /// </summary>
        /// <remarks>
        /// Only <see cref="ThreadAbortException"/> is caught at the run level; any other
        /// exception raised while crawling propagates out of this method.
        /// </remarks>
        private void DoCrawler()
        {
            const int delayBetweenRunsMs = 10000;

            Dictionary <long, int[]>   dicMapClassAndCategori = this.raovatSqlAdapter.GetDicMapClassificationAndCategories(this.websiteRaoVat.id);
            Dictionary <int, string[]> dicMapCity             = this.raovatSqlAdapter.GetDicCityAndRegex();

            while (true)
            {
                try
                {
                    int ignoredCount = 0;

                    // Initialise: every root link starts at depth 0.
                    Queue <JobCrawlerSale> queueUrl = new Queue <JobCrawlerSale>();

                    // CRC64 of every compacted URL seen during this run. The raw CRC is used as
                    // the key: Math.Abs would throw OverflowException for long.MinValue and
                    // would also map +x and -x onto the same entry.
                    HashSet <long> visited = new HashSet <long>();

                    foreach (var rootLink in this.RunnerCrawler.root_link)
                    {
                        queueUrl.Enqueue(new JobCrawlerSale() { deep = 0, url = rootLink });
                    }
                    this.ShowQueue(queueUrl.Count);

                    while (!this.Pause && queueUrl.Count > 0)
                    {
                        JobCrawlerSale job = queueUrl.Dequeue();

                        ShowUrlCurrent(job.url);
                        ShowQueue(queueUrl.Count);

                        if (configXPath.TimeDelay > 0)
                        {
                            Thread.Sleep(configXPath.TimeDelay);
                        }

                        string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(job.url, 45, 2);
                        if (string.IsNullOrEmpty(html))
                        {
                            // Nothing was downloaded; move on to the next queued URL.
                            continue;
                        }

                        GABIZ.Base.HtmlAgilityPack.HtmlDocument doc = new GABIZ.Base.HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(html);

                        // Extraction: collect the not-yet-seen links of this page.
                        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
                        if (anchors != null)
                        {
                            foreach (var anchor in anchors)
                            {
                                string absoluteUrl = QT.Entities.Common.GetAbsoluteUrl(anchor.Attributes["href"].Value, this.websiteRaoVat.base_link);
                                string compactUrl  = QT.Entities.Common.CompactUrl(absoluteUrl);
                                long   crc         = GABIZ.Base.Tools.getCRC64(compactUrl);

                                // Add returns false when the key is already present.
                                if (!visited.Add(crc))
                                {
                                    continue;
                                }
                                ShowVisited(visited.Count);

                                bool isProductUrl = QT.Entities.Common.CheckRegex(compactUrl, configXPath.ProductUrlsRegex, configXPath.NoProductUrlRegex, false);
                                bool isVisitUrl   = QT.Entities.Common.CheckRegex(compactUrl, configXPath.VisitUrlsRegex, configXPath.NoVisitUrlRegex, false);

                                // Visit URLs are followed only while below the depth limit;
                                // URLs that are not visit URLs are followed when they match
                                // the product rules, regardless of depth.
                                bool shouldEnqueue = isVisitUrl
                                    ? job.deep + 1 < this.RunnerCrawler.max_deep
                                    : isProductUrl;

                                if (shouldEnqueue)
                                {
                                    queueUrl.Enqueue(new JobCrawlerSale()
                                    {
                                        url  = absoluteUrl,
                                        deep = job.deep + 1
                                    });
                                    ShowQueue(queueUrl.Count);
                                }
                            }
                        }

                        // Analysis: only pages whose own URL matches the product rules.
                        bool isProductPage = QT.Entities.Common.CheckRegex(
                            QT.Entities.Common.CompactUrl(job.url), configXPath.ProductUrlsRegex, configXPath.NoProductUrlRegex, false);
                        if (!isProductPage)
                        {
                            continue;
                        }

                        QT.Entities.RaoVat.HandlerContentOfHtml handlerContentHtml = new Entities.RaoVat.HandlerContentOfHtml();
                        ProductSaleNew productSaleNew = new ProductSaleNew();
                        handlerContentHtml.AnalyticsProductSaleNew(websiteRaoVat.domain, job.url, doc, configXPath
                                                                   , productSaleNew, dicMapClassAndCategori, dicMapCity);

                        if (!productSaleNew.IsDetailSucess)
                        {
                            // Report the number of ignored pages including this one.
                            ShowIgone(++ignoredCount);
                            continue;
                        }

                        // Saving the classification is best-effort: a failure must not stop
                        // the product from being stored, but it is no longer swallowed silently.
                        try
                        {
                            this.raovatSqlAdapter.SaveClassification(productSaleNew.website_id, productSaleNew.web_category);
                        }
                        catch (Exception ex)
                        {
                            System.Diagnostics.Trace.TraceWarning("SaveClassification failed for {0}: {1}", job.url, ex);
                        }

                        if (this.mongoDbAdapter.CheckExistsProductSalenew(productSaleNew.id))
                        {
                            this.mongoDbAdapter.UpdateProduct(productSaleNew);
                        }
                        else
                        {
                            this.mongoDbAdapter.InsertProduct(productSaleNew);
                        }
                        ShowProduct(productSaleNew);
                    }

                    // Marshal the status message onto the UI thread.
                    this.Invoke(new Action(() =>
                    {
                        richTextBox1.AppendText("\n\rWait to next run!");
                    }));

                    Thread.Sleep(delayBetweenRunsMs);
                }
                catch (ThreadAbortException)
                {
                    // The worker thread is being aborted: leave the crawl loop.
                    return;
                }
            }
        }