/// <summary>
/// Walks the configured number of search-result pages for <paramref name="keywords"/> and
/// collects the distinct site roots ("scheme://host") of every result link found.
/// </summary>
/// <param name="keywords">Search keywords; spaces are replaced with '+' before being placed in the URL.</param>
/// <returns>The set of "scheme://host" strings; empty when no result link could be read.</returns>
private HashSet<string> GetDomain(string keywords)
{
    const string xpath = "//div[@class='srg']//div[@class='g']//a[@href]";
    HashSet<string> lst = new HashSet<string>();

    for (int i = 0; i < page; i++)
    {
        string url = string.Format(patternUrl, numInPage, i * numInPage, keywords.Replace(" ", "+"));
        string html = System.Web.HttpUtility.HtmlDecode(GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(url, 45, 2));

        // A failed download yields null/empty HTML; LoadHtml(null) would throw, so skip the page.
        if (string.IsNullOrEmpty(html))
        {
            continue;
        }

        HtmlDocument htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(html);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        var nodesSite = htmlDocument.DocumentNode.SelectNodes(xpath);
        if (nodesSite == null)
        {
            continue;
        }

        foreach (var anchor in nodesSite)
        {
            string directLink = System.Web.HttpUtility.UrlDecode(anchor.GetAttributeValue("href", ""));

            // NOTE(review): the base-URL literal is empty in this view of the file and
            // new Uri("") throws UriFormatException; restore the real base URL.
            directLink = Common.GetAbsoluteUrl(directLink, new Uri(""));

            // Skip links that are not valid absolute URIs instead of aborting the whole crawl.
            Uri uri;
            if (!Uri.TryCreate(directLink, UriKind.Absolute, out uri))
            {
                continue;
            }

            // HashSet.Add already ignores duplicates. The former Contains(siteUrl) check compared
            // a bare domain against "scheme://host" entries and therefore never matched.
            lst.Add(uri.Scheme + "://" + uri.Host);
        }
    }

    return lst;
}
/// <summary>
/// Loads the page at <c>naverlink</c>, passes it to <c>collectdata</c>, then requests every page
/// referenced by the anchors inside the <c>co_paginate</c> div and passes each one to
/// <c>collectdata</c> as well.
/// </summary>
/// <remarks>
/// WebException, HtmlWebException, UriFormatException and NullReferenceException are caught and
/// reported on the console; any other exception propagates to the caller.
/// </remarks>
public void storedata()
{
    try
    {
        // Collect the data of the first page.
        web = new HtmlAgilityPack.HtmlWeb();
        document = web.Load(naverlink);
        document3 = web.Load(naverlink);
        collectdata(document);

        // Collect the data of the remaining pages.
        // The first query-string value is the text between the first '=' and the first '&';
        // the search query is everything after "query=".
        int valueStart = naverlink.IndexOf("=") + 1;
        int valueEnd = naverlink.IndexOf("&");
        int queryStart = naverlink.IndexOf("query=") + 6;
        String nvMid = naverlink.Substring(valueStart, valueEnd - valueStart);
        String query = naverlink.Substring(queryStart, naverlink.Length - queryStart);

        var pageAnchors = document.DocumentNode
            .SelectSingleNode(".//div[@class='co_paginate']")
            .Descendants()
            .Where(x => x.Name == "a");

        foreach (var anchor in pageAnchors)
        {
            // The page number is the text between '(' and the first ',' of the onclick handler.
            String page = anchor.GetAttributeValue("onclick", "");
            int argStart = page.IndexOf("(") + 1;
            int argEnd = page.IndexOf(",");
            page = page.Substring(argStart, argEnd - argStart);

            // NOTE(review): the leading base-URL literal and the Referer value are empty in this
            // view of the file - confirm the intended values.
            String url = "" + nvMid + "&pkey=0&pkey2=0&mallSeq=all&fee=all&page=" + page + "&frm=NVSHATC&query=" + query;

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.Referer = "";

            // Dispose the response and reader so the underlying connection is released per page.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                document3.LoadHtml(reader.ReadToEnd());
            }

            collectdata(document3);
        }
    }
    catch (WebException)
    {
        Console.WriteLine("네이버url 변수 WebException");
    }
    catch (HtmlWebException)
    {
        // Message now names the exception actually caught (was "WebException").
        Console.WriteLine("네이버url 변수 HtmlWebException");
    }
    catch (UriFormatException)
    {
        // Message now names the exception actually caught (was "WebException").
        Console.WriteLine("네이버url 변수 UriFormatException");
    }
    catch (NullReferenceException)
    {
        Console.WriteLine("네이버url 변수 NullReferenceException");
    }
}
/// <summary>
/// Crawls every category link listed in <c>configurationHotProduct.HotProduct_Link</c>
/// (split on ',', '\n' and ';', empty entries removed) and stores the products found there.
/// </summary>
/// <remarks>
/// For each category link the HTML is fetched through <c>downloadHtml.GetHTML</c>; when it is
/// non-empty, the nodes matching <c>HotProduct_Xpath</c> are selected. For each node the
/// <c>href</c> is resolved with <c>Common.GetAbsoluteUrl</c>, the product page is downloaded,
/// parsed with <c>producerParser.Analytics</c> into a <c>ProductEntity</c>, and - when
/// <c>IsSuccessData(true)</c> holds - saved with <c>productAdapter.UpsertProductHot</c>.
/// Any exception raised while handling a single product link is caught and logged through
/// <c>LogData</c>, so the loop moves on to the next link.
///
/// NOTE(review): two calls are syntactically incomplete in this view of the file -
/// <c>new Uri(;</c> and <c>Analytics(productEntity, h, fullLink, this.configuration,;</c> -
/// their missing arguments must be restored before this compiles.
/// NOTE(review): <c>h.LoadHtml(htmlLinkProduct)</c> runs before the
/// <c>string.IsNullOrEmpty(htmlLinkProduct)</c> check - confirm this ordering is intended.
/// </remarks>
public void StartRun() { WebExceptionStatus outS = WebExceptionStatus.UnknownError; foreach (var linkExtraction in this.configurationHotProduct.HotProduct_Link.Split(new char[] { ',', '\n', ';' }, StringSplitOptions.RemoveEmptyEntries)) { this.LogData("Get html of cat page"); string html = downloadHtml.GetHTML(linkExtraction, 45, 2, out outS, this.configurationHotProduct.HotProduct_UseSelenium); if (!string.IsNullOrEmpty(html)) { HtmlDocument htmlDocument = new HtmlDocument(); htmlDocument.LoadHtml(html); var nodeLinks = htmlDocument.DocumentNode.SelectNodes(this.configurationHotProduct.HotProduct_Xpath); if (nodeLinks != null) { foreach (var VARIABLE in nodeLinks) { try { string shortLink = VARIABLE.GetAttributeValue("href", ""); string fullLink = Common.GetAbsoluteUrl(shortLink, new Uri(; this.LogData(string.Format("Process link product {0}", fullLink)); string htmlLinkProduct = this.downloadHtml.GetHTML(fullLink, 45, 2, out outS); HtmlDocument h = new HtmlDocument(); h.LoadHtml(htmlLinkProduct); if (!string.IsNullOrEmpty(htmlLinkProduct)) { ProductEntity productEntity = new ProductEntity(); this.producerParser.Analytics(productEntity, h, fullLink, this.configuration,; if (productEntity.IsSuccessData(true)) { this.productAdapter.UpsertProductHot(productEntity); this.LogData(string.Format("Saved a product to database. {0} {1} {2}", productEntity.ID, productEntity.Name, productEntity.Price)); } } } catch (Exception ex) { LogData(string.Format("Error: {0} {1}", ex.Message, ex.StackTrace)); } } } } } }
/// <summary>
/// Scans every anchor of <paramref name="htmlDocument"/> and enqueues a new crawl job, one level
/// deeper than <paramref name="jobNodeCrawler"/>, for each href accepted by <c>CheckAllowVisit</c>.
/// </summary>
/// <param name="htmlDocument">Parsed page whose links are extracted.</param>
/// <param name="jobNodeCrawler">The job that produced the page; supplies the current depth.</param>
private void Extraction(HtmlDocument htmlDocument, JobNodeCrawler jobNodeCrawler)
{
    // SelectNodes returns null when the page has no anchors.
    var nodesUrl = htmlDocument.DocumentNode.SelectNodes("//a");
    if (nodesUrl == null)
    {
        return;
    }

    foreach (var anchor in nodesUrl)
    {
        string urlNew = anchor.GetAttributeValue("href", "");
        if (string.IsNullOrEmpty(urlNew) || !CheckAllowVisit(urlNew))
        {
            continue;
        }

        // Enqueue the discovered link itself. The previous code enqueued jobNodeCrawler.Url,
        // i.e. the page being processed, so new links were never visited.
        _queueCrawler.Enqueue(new JobNodeCrawler()
        {
            Deep = jobNodeCrawler.Deep + 1,
            Url = urlNew
        });
    }
}
// Follows a link and extracts information from the page behind it.
/// <summary>
/// Downloads <paramref name="link"/> (decoded as euc-kr) and, depending on
/// <paramref name="code"/>, appends extracted text to <c>profit2</c> or <c>profit3</c>.
/// </summary>
/// <param name="link">Address of the page to download.</param>
/// <param name="code">
/// 1: append the <c>alt</c> text of every img under the <c>free_interest_layer_02</c> div to <c>profit2</c>;
/// 2: re-invoke this method with code 3;
/// 3: parse the embedded JSON and append each <c>card_halbu_list</c> entry to <c>profit3</c>.
/// </param>
/// <remarks>
/// WebException, JsonReaderException and ArgumentException are caught and reported on the console.
/// </remarks>
public void getif(String link, int code)
{
    try
    {
        // Download once, decoding as euc-kr. The earlier HtmlWeb.Load(link) result was
        // overwritten immediately, so that extra request has been dropped.
        HtmlAgilityPack.HtmlDocument document2 = new HtmlAgilityPack.HtmlDocument();
        using (var client = new WebClient())
        using (var stream = client.OpenRead(link))
        using (var reader = new StreamReader(stream, Encoding.GetEncoding("euc-kr")))
        {
            document2.LoadHtml(reader.ReadToEnd());
        }

        switch (code)
        {
            case 1:
            {
                HtmlNode tmpnode = document2.DocumentNode.SelectSingleNode(".//div[@id='free_interest_layer_02']");

                // SelectSingleNode returns null when the layer is absent; nothing to collect then.
                if (tmpnode == null)
                {
                    break;
                }

                foreach (var image in tmpnode.Descendants().Where(x => x.Name == "img"))
                {
                    profit2 += image.GetAttributeValue("alt", " ");
                }
                break;
            }

            case 2:
                // NOTE(review): the link literal is empty in this view of the file - confirm the intended URL.
                getif("", 3);
                break;

            case 3:
            {
                // JSON
                String tmp = document2.DocumentNode.InnerText;
                String jsonstr = System.Text.RegularExpressions.Regex.Unescape(tmp);

                // Needs revision: card names are matched to the JSON entries purely by position.
                String[] card = { "현대카드", "kb국민카드", "신한카드", "citi", "삼성카드", "롯데카드", "하나", "농협" };

                // Cut out the object that starts at {"card_ and ends just before
                // card_halbu_shop_goods_list, then close it again with '}'.
                int start = jsonstr.IndexOf("{\"card_");
                int end = jsonstr.IndexOf("card_halbu_shop_goods_list") - 2;
                jsonstr = jsonstr.Substring(start, end - start).Trim() + "}";

                JObject jobj = JObject.Parse(jsonstr);
                JArray jarr = JArray.Parse(jobj["card_halbu_list"].ToString());

                int cardIndex = 0;
                foreach (JObject jobj2 in jarr)
                {
                    // Guard against more JSON entries than known card names.
                    String cardName = cardIndex < card.Length ? card[cardIndex] : "";
                    profit3 += cardName + jobj2["date"] + jobj2["halbu"] + jobj2["condition"];
                    cardIndex++;
                }
                break;
            }
        }
    }
    catch (WebException)
    {
        Console.WriteLine("g마켓링크타서 정보빼오는 함수 WebException");
    }
    catch (JsonReaderException)
    {
        Console.WriteLine("g마켓링크타서 정보빼오는 함수 JsonReaderException");
    }
    catch (ArgumentException)
    {
        // Message now names the exception actually caught (was "JsonReaderException").
        Console.WriteLine("g마켓링크타서 정보빼오는 함수 ArgumentException");
    }
}
/// <summary>
/// Loads the page at <c>link + PdtName</c>, passes it to <c>collectdata</c>, then requests every
/// page referenced by the anchors inside the <c>co_paginate</c> div (except the last anchor)
/// and passes each one to <c>collectdata</c> as well.
/// </summary>
/// <remarks>
/// WebException, ArgumentException, HtmlWebException, UriFormatException, NullReferenceException
/// and InvalidOperationException are caught and reported on the console.
/// </remarks>
public void storedata()
{
    try
    {
        // Collect the data of the first page.
        web = new HtmlAgilityPack.HtmlWeb();
        document = web.Load(link + PdtName);
        collectdata(document);

        // Collect the data of the remaining pages.
        document3 = web.Load(link + PdtName);

        var pageAnchors = document.DocumentNode
            .SelectSingleNode(".//div[@class='co_paginate']")
            .Descendants()
            .Where(x => x.Name == "a");

        // Detach the last anchor from the document. The query is deliberately left lazy, so the
        // loop below re-enumerates it after the removal. Presumably the last anchor is a
        // "next" control rather than a page number - TODO confirm.
        pageAnchors.Last().Remove();

        foreach (var anchor in pageAnchors)
        {
            // The page number is the text between '(' and the first ',' of the onclick handler.
            String page = anchor.GetAttributeValue("onclick", "");
            int argStart = page.IndexOf("(") + 1;
            int argEnd = page.IndexOf(",");
            page = page.Substring(argStart, argEnd - argStart);

            // NOTE(review): the leading base-URL literal and the Referer value are empty in this
            // view of the file - confirm the intended values.
            String url = "" + PdtName + "&pagingIndex=" + page + "&pagingSize=40&productSet=total&viewType=list&sort=rel&frm=NVSHPAG&sps=N";

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";
            request.Referer = "";

            // Dispose the response and reader so the underlying connection is released per page.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                document3.LoadHtml(reader.ReadToEnd());
            }

            collectdata(document3);
        }
    }
    catch (WebException)
    {
        Console.WriteLine("네이버url 변수 WebException");
    }
    catch (ArgumentException)
    {
        Console.WriteLine("네이버url 변수 ArgumentException");
    }
    catch (HtmlWebException)
    {
        Console.WriteLine("네이버url 변수 HtmlWebException");
    }
    catch (UriFormatException)
    {
        Console.WriteLine("네이버url 변수 UriFormatException");
    }
    catch (NullReferenceException)
    {
        Console.WriteLine("네이버url 변수 NullReferenceException");
    }
    catch (InvalidOperationException)
    {
        Console.WriteLine("네이버url 변수 InvalidOperationException");
    }
}
/// <summary>
/// Handles one queued link: downloads the page, records it in <c>LinkItbook</c> when the link
/// matches <c>ConfigDownloadBook.RegexProduct</c>, publishes every not-yet-visited link on the
/// page that passes the visit filters, and finally acknowledges the message.
/// </summary>
/// <param name="message">Delivery whose body is the UTF-8 encoded link to process.</param>
public override void ProcessMessage(BasicDeliverEventArgs message)
{
    Thread.Sleep(500);

    string linkProcess = UTF8Encoding.UTF8.GetString(message.Body);
    HtmlDocument document = new HtmlDocument();
    string html = GABIZ.Base.HtmlUrl.HTMLTransmitter.getHTML(linkProcess, 45, 2);

    if (!string.IsNullOrEmpty(html))
    {
        document.LoadHtml(html);

        if (Regex.IsMatch(linkProcess, ConfigDownloadBook.RegexProduct, RegexOptions.None))
        {
            InsertLinkItbook(document, linkProcess);
        }

        var nodeLinks = document.DocumentNode.SelectNodes("//a");
        if (nodeLinks != null)
        {
            foreach (HtmlNode anchor in nodeLinks)
            {
                string link = anchor.GetAttributeValue("href", "");
                string fullLink = Common.GetFullUrlFromLink(link, uriRoot);

                // Computed once; it is used for both the visited check and the visited mark.
                var crc = Common.CrcProductID(fullLink);

                if (!CheckVisitCrc(crc)
                    && !Regex.IsMatch(fullLink, ConfigDownloadBook.RegexNoVisit, RegexOptions.None)
                    && Regex.IsMatch(fullLink, ConfigDownloadBook.RegexVisit, RegexOptions.None))
                {
                    _producerBasicDownloadHtml.PublishString(fullLink);
                    hashCrcVisited.SetCrcVisited(crc);

                    if (Regex.IsMatch(fullLink, ConfigDownloadBook.RegexProduct, RegexOptions.None))
                    {
                        // As before, the download link is searched in the CURRENT page's document,
                        // not in the page behind fullLink.
                        InsertLinkItbook(document, fullLink);
                    }
                }
            }
        }
    }

    _log.InfoFormat("Processed link {0}", linkProcess);
    this.GetChannel().BasicAck(message.DeliveryTag, true);
}

/// <summary>
/// Looks for the download anchor inside <paramref name="document"/> and inserts one
/// <c>LinkItbook</c> row for <paramref name="link"/> (page count is always written as 0).
/// </summary>
private void InsertLinkItbook(HtmlDocument document, string link)
{
    string urlDownload = "";
    string nameBook = "";
    int countPage = 0;

    // SelectNodes returns null when nothing matches; the row is then inserted with empty values.
    var nodeFindLink = document.DocumentNode.SelectNodes(@"//td//a/@href");
    if (nodeFindLink != null)
    {
        foreach (var candidate in nodeFindLink)
        {
            string href = candidate.GetAttributeValue("href", "");

            // NOTE(review): "*" is not a valid .NET regular expression (quantifier following
            // nothing) and makes Regex.IsMatch throw ArgumentException; the real pattern appears
            // to be missing from this view of the file - restore it.
            if (Regex.IsMatch(href, "*"))
            {
                urlDownload = href;
                nameBook = candidate.InnerText;
            }
        }
    }

    bool inserted = this.sqldb.RunQuery(
        "insert into LinkItbook (Link, Name, CountPage, LinkDownload) values (@Link, @Name, @CountPage, @LinkDownload)",
        CommandType.Text,
        new SqlParameter[]
        {
            SqlDb.CreateParamteterSQL("Link", link, SqlDbType.NVarChar),
            SqlDb.CreateParamteterSQL("Name", nameBook, SqlDbType.NVarChar),
            SqlDb.CreateParamteterSQL("CountPage", countPage, SqlDbType.Int),
            SqlDb.CreateParamteterSQL("LinkDownload", urlDownload, SqlDbType.NVarChar)
        });

    // The result used to be discarded silently; surface failures in the log.
    if (!inserted)
    {
        _log.InfoFormat("Insert into LinkItbook failed for link {0}", link);
    }
}