/// <summary>
/// Downloads the Rozetka "all categories" page through the configured proxies and passes
/// its category blocks to <c>GetInsidingHrefsInHrefs</c> for link collection.
/// </summary>
/// <remarks>
/// Entries of <c>proxys</c> are tried in order. The first entry for which the whole attempt
/// (download, parse, link collection) completes without an exception ends the loop; any
/// failure is logged and the next entry is tried. If <c>proxys</c> is empty nothing happens.
/// </remarks>
public void GetPagesAuto()
{
    const string allCategoriesUrl = "https://rozetka.com.ua/ua/all-categories-goods/";

    HashSet<string> listOfHrefs = new HashSet<string>();

    foreach (var proxy in proxys)
    {
        try
        {
            WebRequest webRequest = WebRequest.Create(allCategoriesUrl);
            webRequest.Method = "GET";

            // Each proxy entry is split on ':' into host and port.
            string[] fulladress = proxy.Split(":");
            var (adress, port) = (
                fulladress[0],
                int.Parse(fulladress[1], System.Globalization.CultureInfo.InvariantCulture));
            webRequest.Proxy = new WebProxy(adress, port) { BypassProxyOnLocal = false };

            // NOTE(review): presumably throttles outgoing requests - confirm against ThreadDelay.
            ThreadDelay.Delay();

            string html;
            using (var response = webRequest.GetResponse())
            using (var streamReader = new StreamReader(response.GetResponseStream()))
            {
                html = streamReader.ReadToEnd();
            }

            HtmlParser parser = new HtmlParser();
            var htmlDocument = parser.ParseDocument(html);
            var bodyDiv = htmlDocument.GetElementsByClassName("all-cat-b-l-i-parent");
            if (bodyDiv != null)
            {
                GetInsidingHrefsInHrefs(bodyDiv, ref listOfHrefs);
            }

            // This proxy worked; the remaining ones are not needed.
            break;
        }
        catch (Exception ex)
        {
            // Best effort: a dead or malformed proxy must not abort the run, but the
            // failure is recorded instead of being silently discarded.
            System.Diagnostics.Trace.TraceWarning(
                $"GetPagesAuto: attempt through proxy '{proxy}' failed: {ex.Message}");
        }
    }
}
/// <summary>
/// Visits the page linked from each element of <paramref name="hrefs"/>, collects links from
/// it into <paramref name="listOfHrefs"/>, and finally serializes the collected set to
/// "RozetkaHrefs.txt".
/// </summary>
/// <param name="hrefs">Elements whose first <c>a</c> descendant carries the page URL to visit.</param>
/// <param name="listOfHrefs">Set that receives the collected links; it is never reassigned.</param>
/// <remarks>
/// For every element the entries of <c>proxys</c> are tried in order until one yields a page
/// that contains either an element with id "block_with_goods" (the page URL itself is stored)
/// or a "portal-automatic" section (its "arrow-link" targets are stored). A failed request, or
/// a page with neither marker, moves on to the next proxy. The output file is written even
/// when nothing was collected.
/// </remarks>
private void GetInsidingHrefsInHrefs(IHtmlCollection<IElement> hrefs, ref HashSet<string> listOfHrefs)
{
    HtmlParser parser = new HtmlParser();

    foreach (var href in hrefs)
    {
        // Resolve the target once per block: a block without a usable link cannot succeed
        // through any proxy, so retrying it would only burn delays.
        var anchor = href.GetElementsByTagName("a").FirstOrDefault();
        string fullHref = anchor?.GetAttribute("href");
        if (string.IsNullOrEmpty(fullHref))
        {
            continue;
        }

        // Every configured proxy is a candidate, including the last one.
        foreach (var proxyEntry in proxys)
        {
            try
            {
                // NOTE(review): presumably throttles outgoing requests - confirm against ThreadDelay.
                ThreadDelay.Delay();

                var webRequest = WebRequest.Create(fullHref);

                // Each proxy entry is split on ':' into host and port.
                string[] fulladress = proxyEntry.Split(":");
                var (adress, port) = (
                    fulladress[0],
                    int.Parse(fulladress[1], System.Globalization.CultureInfo.InvariantCulture));
                webRequest.Proxy = new WebProxy(adress, port) { BypassProxyOnLocal = false };

                string html;
                using (var response = webRequest.GetResponse())
                using (var streamReader = new StreamReader(response.GetResponseStream()))
                {
                    html = streamReader.ReadToEnd();
                }

                var htmlDocument = parser.ParseDocument(html);

                if (htmlDocument.GetElementById("block_with_goods") != null)
                {
                    // The page itself has the goods block: keep its URL.
                    listOfHrefs.Add(fullHref);
                    break;
                }

                var portal = htmlDocument.GetElementsByClassName("portal-automatic");
                if (portal.Length != 0)
                {
                    // Portal page: keep the "arrow-link" targets of every "p-auto-block".
                    foreach (var p_auto in portal[0].GetElementsByClassName("p-auto-block"))
                    {
                        foreach (var link in p_auto.GetElementsByClassName("arrow-link"))
                        {
                            string target = link.GetAttribute("href");
                            if (!string.IsNullOrEmpty(target))
                            {
                                listOfHrefs.Add(target);
                            }
                        }
                    }

                    break;
                }

                // Neither marker was found in the response: fall through and retry the
                // same page with the next proxy.
            }
            catch (Exception ex)
            {
                // Best effort: a dead or malformed proxy must not abort the crawl, but the
                // failure is recorded instead of being silently discarded.
                System.Diagnostics.Trace.TraceWarning(
                    $"GetInsidingHrefsInHrefs: '{fullHref}' through proxy '{proxyEntry}' failed: {ex.Message}");
            }
        }
    }

    // SECURITY NOTE(review): BinaryFormatter is obsolete and unsafe to deserialize from
    // untrusted data. It is kept here only so that existing readers of "RozetkaHrefs.txt"
    // keep working; migrate writer and readers together (e.g. to System.Text.Json).
    using (FileStream fs = new FileStream("RozetkaHrefs.txt", FileMode.Create, FileAccess.Write))
    {
        BinaryFormatter bf = new BinaryFormatter();
        bf.Serialize(fs, listOfHrefs.ToList());
    }
}