예제 #1
0
        /// <summary>
        /// Walks the search result pages for <paramref name="keywords"/> and collects the
        /// "scheme://host" root of every result link found on them.
        /// </summary>
        /// <param name="keywords">Search keywords; spaces are replaced with '+' before being formatted into <c>patternUrl</c>.</param>
        /// <returns>The distinct "scheme://host" strings that were found; empty when no link could be extracted.</returns>
        private HashSet <string> GetDomain(string keywords)
        {
            HashSet <string> lst = new HashSet <string>();

            // Base used to resolve relative result links; it never changes, so build it once.
            Uri    baseUri = new Uri("http://google.com");
            string xpath   = "//div[@class='srg']//div[@class='g']//a[@href]";

            for (int i = 0; i < page; i++)
            {
                string url  = string.Format(patternUrl, numInPage, i * numInPage, keywords.Replace(" ", "+"));
                string html =
                    System.Web.HttpUtility.HtmlDecode(GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2));

                // HtmlDocument.LoadHtml rejects null, so a failed download would otherwise
                // abort the remaining result pages. Skip just this page instead.
                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(html);

                var nodesSite = htmlDocument.DocumentNode.SelectNodes(xpath);
                if (nodesSite == null)
                {
                    continue;
                }

                foreach (var node in nodesSite)
                {
                    string directLink = System.Web.HttpUtility.UrlDecode(node.GetAttributeValue("href", ""));
                    directLink = Common.GetAbsoluteUrl(directLink, baseUri);

                    // A single malformed href must not throw away everything collected so far.
                    Uri uri;
                    if (!Uri.TryCreate(directLink, UriKind.Absolute, out uri))
                    {
                        continue;
                    }

                    // NOTE(review): this looks up the bare domain, but the set stores
                    // "scheme://host" values, so the check practically never filters anything;
                    // de-duplication really comes from HashSet.Add. Confirm whether per-domain
                    // de-duplication was intended before changing it.
                    string siteUrl = Common.GetDomainFromUrl(uri.Host);
                    if (!lst.Contains(siteUrl))
                    {
                        lst.Add(uri.Scheme + "://" + uri.Host);
                    }
                }
            }

            return(lst);
        }
예제 #2
0
        /// <summary>
        /// Collects data from the page at <c>naverlink</c> and then from every page referenced
        /// by the pagination anchors (<c>div.co_paginate</c>) of that first page.
        /// </summary>
        /// <remarks>
        /// The exception types caught below are reported on the console and end the collection;
        /// they are not rethrown.
        /// </remarks>
        public void storedata()
        {
            try
            {
                // Collect the data of the first page.
                web       = new HtmlAgilityPack.HtmlWeb();
                document  = web.Load(naverlink);
                document3 = web.Load(naverlink);
                collectdata(document);

                // Collect the data of the remaining pages.
                // nvMid is the text between the first '=' and the first '&' of the link;
                // query is everything that follows "query=".
                int    tmp2  = naverlink.IndexOf("=") + 1;
                int    tmp3  = naverlink.IndexOf("&");
                int    tmp4  = naverlink.IndexOf("query=") + 6;
                String nvMid = naverlink.Substring(tmp2, tmp3 - tmp2);
                String query = naverlink.Substring(tmp4);

                var anchors = document.DocumentNode.SelectSingleNode(".//div[@class='co_paginate']").Descendants().Where(x => x.Name == "a");

                foreach (var anchor in anchors)
                {
                    // The page number is the first argument of the onclick handler: name(page, ...).
                    String onclick    = anchor.GetAttributeValue("onclick", "");
                    int    argStart   = onclick.IndexOf("(") + 1;
                    int    commaIndex = onclick.IndexOf(",");

                    // Anchors whose onclick carries no "page," argument cannot be followed;
                    // skip them rather than letting Substring throw and abort the other pages.
                    if (commaIndex < argStart)
                    {
                        continue;
                    }

                    String page = onclick.Substring(argStart, commaIndex - argStart);
                    String url  = "http://shopping.naver.com/detail/section_price_compare.nhn?nvMid=" + nvMid +
                                  "&pkey=0&pkey2=0&mallSeq=all&fee=all&page=" + page + "&frm=NVSHATC&query=" + query;

                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method  = "GET";
                    request.Referer = "http://shopping.naver.com/detail/detail.nhn?nv_mid=9535864708&cat_id=50000151&frm=NVSHATC&query=%EC%82%BC%EC%84%B1%EC%A0%84%EC%9E%90+%EB%85%B8%ED%8A%B8%EB%B6%819+metal+NT900X3L-K58S";

                    // Dispose the response and reader so the underlying connection is released
                    // after every page instead of waiting for finalization.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                    {
                        document3.LoadHtml(reader.ReadToEnd());
                    }

                    collectdata(document3);
                }
            }
            catch (WebException)
            {
                Console.WriteLine("네이버url 변수 WebException");
            }
            catch (ArgumentException)
            {
                // Raised by Substring when the link does not have the expected "=...&...query=" shape.
                Console.WriteLine("네이버url 변수 ArgumentException");
            }
            catch (HtmlWebException)
            {
                Console.WriteLine("네이버url 변수 HtmlWebException");
            }
            catch (UriFormatException)
            {
                Console.WriteLine("네이버url 변수 UriFormatException");
            }
            catch (NullReferenceException)
            {
                // Raised when the pagination container is missing from the first page.
                Console.WriteLine("네이버url 변수 NullReferenceException");
            }
        }
예제 #3
0
        /// <summary>
        /// Downloads every configured hot-product category page, follows the product links
        /// selected by the configured XPath, parses each product page and upserts the products
        /// that pass <c>IsSuccessData</c>.
        /// </summary>
        /// <remarks>
        /// A failure while processing one product link is logged and does not stop the
        /// remaining links.
        /// </remarks>
        public void StartRun()
        {
            WebExceptionStatus outS       = WebExceptionStatus.UnknownError;
            char[]             separators = new char[] { ',', '\n', ';' };

            foreach (var rawLink in this.configurationHotProduct.HotProduct_Link.Split(separators, StringSplitOptions.RemoveEmptyEntries))
            {
                // Entries separated by "\r\n" or padded with spaces would otherwise keep stray
                // whitespace inside the URL.
                string linkExtraction = rawLink.Trim();
                if (linkExtraction.Length == 0)
                {
                    continue;
                }

                this.LogData("Get html of cat page");
                string html = downloadHtml.GetHTML(linkExtraction, 45, 2, out outS, this.configurationHotProduct.HotProduct_UseSelenium);
                if (string.IsNullOrEmpty(html))
                {
                    continue;
                }

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(html);
                var nodeLinks = htmlDocument.DocumentNode.SelectNodes(this.configurationHotProduct.HotProduct_Xpath);
                if (nodeLinks == null)
                {
                    continue;
                }

                foreach (var nodeLink in nodeLinks)
                {
                    try
                    {
                        string shortLink = nodeLink.GetAttributeValue("href", "");

                        string fullLink = Common.GetAbsoluteUrl(shortLink, new Uri(this.company.Website));
                        this.LogData(string.Format("Process link product {0}", fullLink));
                        string htmlLinkProduct = this.downloadHtml.GetHTML(fullLink, 45, 2, out outS);

                        // Check the download result BEFORE handing it to LoadHtml: LoadHtml
                        // rejects null, which used to surface as an exception with a stack
                        // trace for what is just a failed download.
                        if (string.IsNullOrEmpty(htmlLinkProduct))
                        {
                            this.LogData(string.Format("Empty html for link product {0}", fullLink));
                            continue;
                        }

                        HtmlDocument productDocument = new HtmlDocument();
                        productDocument.LoadHtml(htmlLinkProduct);

                        ProductEntity productEntity = new ProductEntity();
                        this.producerParser.Analytics(productEntity, productDocument, fullLink, this.configuration, this.company.Domain);
                        if (productEntity.IsSuccessData(true))
                        {
                            this.productAdapter.UpsertProductHot(productEntity);
                            this.LogData(string.Format("Saved a product to database. {0} {1} {2}", productEntity.ID,
                                                       productEntity.Name, productEntity.Price));
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort per link: record the failure and move on to the next one.
                        LogData(string.Format("Error: {0} {1}", ex.Message, ex.StackTrace));
                    }
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Enqueues a crawl job, one level deeper than <paramref name="jobNodeCrawler"/>, for
        /// every anchor in <paramref name="htmlDocument"/> whose href passes
        /// <c>CheckAllowVisit</c>.
        /// </summary>
        /// <param name="htmlDocument">The parsed page whose anchors are scanned.</param>
        /// <param name="jobNodeCrawler">The job that produced the page; only its <c>Deep</c> value is read.</param>
        private void Extraction(HtmlDocument htmlDocument, JobNodeCrawler jobNodeCrawler)
        {
            var nodesUrl = htmlDocument.DocumentNode.SelectNodes("//a");
            if (nodesUrl == null)
            {
                return;
            }

            foreach (var anchor in nodesUrl)
            {
                string urlNew = anchor.GetAttributeValue("href", "");
                if (string.IsNullOrEmpty(urlNew) || !CheckAllowVisit(urlNew))
                {
                    continue;
                }

                _queueCrawler.Enqueue(new JobNodeCrawler()
                {
                    Deep = jobNodeCrawler.Deep + 1,
                    // Queue the discovered link. The previous code copied the parent's Url,
                    // which re-queued the page being processed once per anchor and never
                    // reached any new page.
                    Url  = urlNew
                });
            }
        }
예제 #5
0
        /// <summary>
        /// Follows <paramref name="link"/> and extracts information from it.
        /// </summary>
        /// <param name="link">Address to download; the response is decoded as EUC-KR.</param>
        /// <param name="code">
        /// 1: append the <c>alt</c> text of every image under <c>div#free_interest_layer_02</c> to <c>profit2</c>;
        /// 2: re-enter with the card-benefit script URL and code 3;
        /// 3: parse the card installment list out of the downloaded script and append it to <c>profit3</c>.
        /// Any other value only downloads the page.
        /// </param>
        public void getif(String link, int code)
        {
            try
            {
                HtmlAgilityPack.HtmlDocument document2 = new HtmlAgilityPack.HtmlDocument();

                // Download the page exactly once (it used to be fetched a second time through
                // HtmlWeb.Load and that result was immediately overwritten), and dispose the
                // client, stream and reader deterministically.
                using (var client = new WebClient())
                using (var stream = client.OpenRead(link))
                using (var reader = new StreamReader(stream, Encoding.GetEncoding("euc-kr")))
                {
                    document2.LoadHtml(reader.ReadToEnd());
                }

                switch (code)
                {
                case 1:
                    HtmlNode tmpnode = document2.DocumentNode.SelectSingleNode(".//div[@id='free_interest_layer_02']");

                    // The layer is absent on some pages; nothing to collect then.
                    if (tmpnode == null)
                    {
                        break;
                    }

                    foreach (var image in tmpnode.Descendants().Where(x => x.Name == "img"))
                    {
                        profit2 += image.GetAttributeValue("alt", " ");
                    }

                    break;

                case 2:
                    getif("http://promotion.gmarket.co.kr/Event/Common/gen/cardbenefit_gen.js", 3);

                    break;

                case 3:
                    // JSON embedded in the downloaded script.
                    String jsonstr = System.Text.RegularExpressions.Regex.Unescape(document2.DocumentNode.InnerText);

                    // Needs revision: the names are matched to the list entries purely by position.
                    String[] card = { "현대카드", "kb국민카드", "신한카드", "citi", "삼성카드", "롯데카드", "하나", "농협" };

                    // Cut out the object that starts at {"card_ and ends just before the
                    // card_halbu_shop_goods_list member, then close it again. When either
                    // marker is missing Substring throws, which is reported by the
                    // ArgumentException handler below.
                    int start = jsonstr.IndexOf("{\"card_");
                    int end   = jsonstr.IndexOf("card_halbu_shop_goods_list") - 2;
                    jsonstr = jsonstr.Substring(start, end - start).Trim() + "}";

                    JObject jobj      = JObject.Parse(jsonstr);
                    var     halbuList = jobj["card_halbu_list"];
                    if (halbuList == null)
                    {
                        break;
                    }

                    JArray jarr      = JArray.Parse(halbuList.ToString());
                    int    cardIndex = 0;

                    foreach (JObject entry in jarr)
                    {
                        // NOTE(review): entries beyond the known card names are appended
                        // without a label instead of failing with an index error; confirm
                        // this is the desired fallback.
                        String cardName = cardIndex < card.Length ? card[cardIndex] : "";
                        profit3 += cardName + entry["date"] + entry["halbu"] + entry["condition"];
                        cardIndex++;
                    }

                    break;
                }
            }
            catch (WebException)
            {
                Console.WriteLine("g마켓링크타서 정보빼오는 함수 WebException");
            }
            catch (JsonReaderException)
            {
                Console.WriteLine("g마켓링크타서 정보빼오는 함수 JsonReaderException");
            }
            catch (ArgumentException)
            {
                Console.WriteLine("g마켓링크타서 정보빼오는 함수 ArgumentException");
            }
        }
예제 #6
0
        /// <summary>
        /// Collects data from the search result page at <c>link + PdtName</c> and then from
        /// every page referenced by the pagination anchors (<c>div.co_paginate</c>) of that
        /// first page, except the last anchor.
        /// </summary>
        /// <remarks>
        /// The exception types caught below are reported on the console and end the collection;
        /// they are not rethrown.
        /// </remarks>
        public void storedata()
        {
            try
            {
                // Collect the data of the first page.
                web      = new HtmlAgilityPack.HtmlWeb();
                document = web.Load(link + PdtName);
                collectdata(document);

                // Collect the data of the remaining pages.
                document3 = web.Load(link + PdtName);

                var anchors = document.DocumentNode.SelectSingleNode(".//div[@class='co_paginate']").Descendants().Where(x => x.Name == "a");

                // Detach the last anchor from the document. The query is deferred, so the loop
                // below re-reads the document and no longer sees it.
                // NOTE(review): presumably the last anchor is a "next" link rather than a page number - confirm.
                anchors.Last().Remove();

                foreach (var anchor in anchors)
                {
                    // The page number is the first argument of the onclick handler: name(page, ...).
                    String onclick    = anchor.GetAttributeValue("onclick", "");
                    int    argStart   = onclick.IndexOf("(") + 1;
                    int    commaIndex = onclick.IndexOf(",");

                    // An anchor without a "page," argument used to make Substring throw, which
                    // ended the whole loop through the ArgumentException handler. Skip only
                    // that anchor so the remaining pages are still collected.
                    if (commaIndex < argStart)
                    {
                        continue;
                    }

                    String page = onclick.Substring(argStart, commaIndex - argStart);
                    String url  = "http://shopping.naver.com/search/all.nhn?query=" + PdtName +
                                  "&pagingIndex=" + page + "&pagingSize=40&productSet=total&viewType=list&sort=rel&frm=NVSHPAG&sps=N";

                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method  = "GET";
                    request.Referer = "http://shopping.naver.com/search/all.nhn?query=%EB%82%98%EC%9D%B4%ED%82%A4%20%EC%97%90%EC%96%B4%ED%8F%AC%EC%8A%A4%20%EA%B2%80%2F%ED%9D%B0&pagingIndex=2&pagingSize=40&productSet=total&viewType=list&sort=rel&frm=NVSHPAG&sps=N";

                    // Dispose the response and reader so the underlying connection is released
                    // after every page instead of waiting for finalization.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                    {
                        document3.LoadHtml(reader.ReadToEnd());
                    }

                    collectdata(document3);
                }
            }
            catch (WebException)
            {
                Console.WriteLine("네이버url 변수 WebException");
            }
            catch (ArgumentException)
            {
                Console.WriteLine("네이버url 변수 ArgumentException");
            }
            catch (HtmlWebException)
            {
                Console.WriteLine("네이버url 변수 HtmlWebException");
            }
            catch (UriFormatException)
            {
                Console.WriteLine("네이버url 변수 UriFormatException");
            }
            catch (NullReferenceException)
            {
                // Raised when the pagination container is missing from the first page.
                Console.WriteLine("네이버url 변수 NullReferenceException");
            }
            catch (InvalidOperationException)
            {
                // Raised by Last() when the pagination container holds no anchors.
                Console.WriteLine("네이버url 변수 InvalidOperationException");
            }
        }
예제 #7
0
        /// <summary>
        /// Handles one queued link: downloads the page, stores a <c>LinkItbook</c> row when the
        /// link matches the product pattern, publishes every not-yet-visited link of the page
        /// that matches the visit rules, and finally acknowledges the message.
        /// </summary>
        /// <param name="message">Delivery whose body is the UTF-8 encoded link to process.</param>
        public override void ProcessMessage(BasicDeliverEventArgs message)
        {
            // Fixed pause before every download.
            Thread.Sleep(500);
            string linkProcess = UTF8Encoding.UTF8.GetString(message.Body);
            string html        = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(linkProcess, 45, 2);

            if (!string.IsNullOrEmpty(html))
            {
                HtmlDocument document = new HtmlDocument();
                document.LoadHtml(html);

                if (Regex.IsMatch(linkProcess, ConfigDownloadBook.RegexProduct, RegexOptions.None))
                {
                    InsertBookLinkFromDocument(linkProcess, document);
                }

                var nodeLinks = document.DocumentNode.SelectNodes("//a");
                if (nodeLinks != null)
                {
                    foreach (HtmlNode anchor in nodeLinks)
                    {
                        string link     = anchor.GetAttributeValue("href", "");
                        string fullLink = Common.GetFullUrlFromLink(link, uriRoot);
                        var    crc      = Common.CrcProductID(fullLink);

                        bool shouldVisit = !CheckVisitCrc(crc) &&
                                           !Regex.IsMatch(fullLink, ConfigDownloadBook.RegexNoVisit, RegexOptions.None) &&
                                           Regex.IsMatch(fullLink, ConfigDownloadBook.RegexVisit, RegexOptions.None);
                        if (!shouldVisit)
                        {
                            continue;
                        }

                        _producerBasicDownloadHtml.PublishString(fullLink);
                        hashCrcVisited.SetCrcVisited(crc);

                        if (Regex.IsMatch(fullLink, ConfigDownloadBook.RegexProduct, RegexOptions.None))
                        {
                            // NOTE(review): this stores data extracted from the CURRENT page
                            // under the LINKED page's URL, and the linked page was just
                            // published for its own processing. Behavior is kept as it was;
                            // confirm whether this insert is intended.
                            InsertBookLinkFromDocument(fullLink, document);
                        }
                    }
                }
            }

            _log.InfoFormat("Processed link {0}", linkProcess);
            this.GetChannel().BasicAck(message.DeliveryTag, true);
        }

        /// <summary>
        /// Inserts a <c>LinkItbook</c> row for <paramref name="link"/>, using the last
        /// <c>td a</c> link in <paramref name="document"/> whose href matches
        /// <c>filepi.com/i/.*</c> as download link and book name (both empty when none matches).
        /// </summary>
        private void InsertBookLinkFromDocument(string link, HtmlDocument document)
        {
            string urlDownload = "";
            string nameBook    = "";
            int    countPage   = 0;

            var nodeFindLink = document.DocumentNode.SelectNodes(@"//td//a/@href");

            // SelectNodes yields null when nothing matches; enumerating it used to throw and
            // left the message unacknowledged.
            if (nodeFindLink != null)
            {
                foreach (var varData in nodeFindLink)
                {
                    string href = varData.GetAttributeValue("href", "");
                    if (Regex.IsMatch(href, "filepi.com/i/.*"))
                    {
                        urlDownload = href;
                        nameBook    = varData.InnerText;
                    }
                }
            }

            this.sqldb.RunQuery("insert into LinkItbook (Link, Name, CountPage, LinkDownload) values (@Link, @Name, @CountPage, @LinkDownload)"
                                , CommandType.Text, new SqlParameter[]
            {
                SqlDb.CreateParamteterSQL("Link", link, SqlDbType.NVarChar),
                SqlDb.CreateParamteterSQL("Name", nameBook, SqlDbType.NVarChar),
                SqlDb.CreateParamteterSQL("CountPage", countPage, SqlDbType.Int),
                SqlDb.CreateParamteterSQL("LinkDownload", urlDownload, SqlDbType.NVarChar)
            });
        }