/// <summary>
/// Scrapes proxies from a site by following the pages linked from its search page.
/// The host is the text found in <paramref name="data"/> between the URL scheme and
/// "/favicon.ico"; the search page requested is "http://{host}/search?max-results=10".
/// </summary>
/// <param name="data">HTML of the source page; it is searched for the favicon URL.</param>
/// <returns>
/// Every proxy extracted from the discovered pages. The list is empty when no host can be
/// determined or when <c>GetPages</c> returns null. Never null.
/// </returns>
public List<Proxy> Scrape(string data)
{
    List<Proxy> scraped = new List<Proxy>();
    if (string.IsNullOrEmpty(data))
    {
        return scraped;
    }

    Generic g = new Generic();

    // Group 2 is the part between the scheme and "/favicon.ico".
    var url = data.RegexMatch(@"(http:\/\/|https:\/\/)(.*?)\/favicon.ico").Groups[2].Value;
    if (string.IsNullOrEmpty(url))
    {
        // Without a host the search URL would be the malformed "http:///search?...".
        return scraped;
    }

    var searchPageURL = "http://" + url + "/search?max-results=10";
    var searchPage = HTTP.DoWebRequest(searchPageURL);
    string[] pages = GetPages(searchPage, url);
    if (pages == null)
    {
        return scraped;
    }

    var options = new ParallelOptions { MaxDegreeOfParallelism = 10 };

    // Parallel.ForEach only returns once every page has been processed, so no extra
    // Task wrapper is needed to wait for completion.
    Parallel.ForEach(pages, options, (item) =>
    {
        try
        {
            string html = HTTP.DoWebRequest(item);
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            // The shared parser call and the shared result list are both kept under the lock.
            lock (scraped)
            {
                var found = g.Scrape(html);
                if (found != null)
                {
                    scraped.AddRange(found);
                }
            }
        }
        catch (System.Exception ex)
        {
            // Best effort: one failing page must not abort the remaining pages.
            System.Diagnostics.Debug.WriteLine("Scrape failed for " + item + ": " + ex.Message);
        }
    });

    return scraped;
}
/// <summary>
/// Scrapes proxies from the proxyrox.com listing, requesting pages p=0 through p=22
/// sorted by reliability in descending order.
/// </summary>
/// <param name="data">Not used by this scraper; the page URLs are fixed.</param>
/// <returns>Every proxy extracted from the pages that could be downloaded. Never null.</returns>
public List<Proxy> Scrape(string data)
{
    // Number of listing pages requested (p=0 .. p=22).
    const int pageCount = 23;

    List<string> pages = new List<string>(pageCount);
    List<Proxy> scraped = new List<Proxy>();
    Generic g = new Generic();
    var options = new ParallelOptions { MaxDegreeOfParallelism = 10 };

    for (int i = 0; i < pageCount; i++)
    {
        pages.Add("https://proxyrox.com/?p=" + i + "&sortdir=desc&sort=reliability");
    }

    // Parallel.ForEach only returns once every page has been processed, so no extra
    // Task wrapper is needed to wait for completion.
    Parallel.ForEach(pages, options, (item) =>
    {
        try
        {
            string html = HTTP.DoWebRequest(item);
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            // The shared parser call and the shared result list are both kept under the lock.
            lock (scraped)
            {
                var found = g.Scrape(html);
                if (found != null)
                {
                    scraped.AddRange(found);
                }
            }
        }
        catch (System.Exception ex)
        {
            // Best effort: one failing page must not abort the remaining pages.
            System.Diagnostics.Debug.WriteLine("Scrape failed for " + item + ": " + ex.Message);
        }
    });

    return scraped;
}
/// <summary>
/// Handles the Scrape button: downloads every selected source on background threads,
/// extracts the proxies, de-duplicates them by <c>Proxy_</c> and appends them to
/// <c>lvProxies</c>, then reports the elapsed time.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void btnScrape_Click(object sender, EventArgs e)
{
    var hosts = new List<string>();
    if (rbCustom.Checked)
    {
        if (CustomSources.Count == 0)
        {
            // Validation runs before the button is disabled, so bailing out here
            // cannot leave the button stuck in the disabled state.
            MessageBox.Show("You have selected custom source list. Please load some before scraping.", "Form Validation Failed", MessageBoxButtons.OK, MessageBoxIcon.Information);
            return;
        }

        hosts.AddRange(CustomSources.ToArray());
    }

    // Built-in sources (all currently disabled):
    // hosts.Add("https://orca.tech/?action=real-time-proxy-list");
    // hosts.Add("http://free-proxy-list.net/anonymous-proxy.html");
    // hosts.Add("http://www.us-proxy.org/");
    // hosts.Add("www.sslproxies.org");
    // hosts.Add("http://irc-proxies24.blogspot.com/2016/08/26-08-16-irc-proxy-servers-900_26.html");
    // hosts.Add("http://www.samair.ru/proxy/");
    // hosts.Add("https://www.hide-my-ip.com/proxylist.shtml");
    // hosts.Add("http://fineproxy.org/eng/?p=6");
    // hosts.Add("http://www.blackhatworld.com/seo/new-fresh-big-proxy-lists-worldwide-usa-and-elite-proxies-updated-daily.753956/page-21");
    // hosts.Add("https://us-proxy-server.blogspot.com/");
    // hosts.Add("http://txt.proxyspy.net/proxy.txt");
    // hosts.Add("http://proxyrox.com");
    // hosts.Add("https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters[0][name]=proxy-country&searchParameters[0][value]=&searchParameters[1][name]=proxy-ports&searchParameters[1][value]=&offset=25&limit=10000&action=getProxies");
    // BLOGSPOT
    // hosts.Add("http://proxyserverlist-24.blogspot.com/");
    // hosts.Add("http://sslproxies24.blogspot.ro");

    bool checkLimit = cbLimit.Checked;
    int limit = (int)numLimit.Value;

    btnScrape.Enabled = false;
    try
    {
        lvProxies.BeginUpdate();
        Stopwatch timer = Stopwatch.StartNew();
        try
        {
            // Network and parsing work runs off the UI thread; the await resumes on it.
            Hashtable found = await Task.Run(() => CollectProxiesFromHosts(hosts, checkLimit, limit));

            foreach (DictionaryEntry entry in found)
            {
                if (checkLimit && lvProxies.Items.Count >= limit)
                {
                    break;
                }

                AppendProxyRow((Proxy)entry.Value);
            }

            timer.Stop();
        }
        finally
        {
            // Always pair BeginUpdate with EndUpdate, even if populating the list throws.
            lvProxies.EndUpdate();
        }

        MessageBox.Show("Done!\r\nTime Elapsed: " + timer.Elapsed);
    }
    finally
    {
        btnScrape.Enabled = true;
    }
}

/// <summary>
/// Downloads and scrapes each host in parallel (at most 10 at a time) and merges the
/// results into a table keyed by <c>Proxy_</c>, so duplicates are stored once.
/// </summary>
/// <param name="hosts">Source addresses; "http://" is prepended when no scheme is present.</param>
/// <param name="checkLimit">When true, stop collecting once <paramref name="limit"/> entries exist.</param>
/// <param name="limit">Maximum number of entries to collect when <paramref name="checkLimit"/> is true.</param>
/// <returns>Table mapping <c>Proxy_</c> to its <c>Proxy</c> instance.</returns>
private Hashtable CollectProxiesFromHosts(List<string> hosts, bool checkLimit, int limit)
{
    var found = new Hashtable();
    var scraper = new Scraper.Scraper();
    var options = new ParallelOptions { MaxDegreeOfParallelism = 10 };

    Parallel.ForEach(hosts, options, (host) =>
    {
        try
        {
            lock (found)
            {
                // Skip the download entirely once enough proxies have been collected.
                if (checkLimit && found.Count >= limit)
                {
                    return;
                }
            }

            string url = host;
            if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
                !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                url = "http://" + url;
            }

            string html = HTTP.DoWebRequest(url);
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            List<Proxy> proxies = scraper.Scrape(url, html);
            if (proxies == null)
            {
                return;
            }

            // Every insert needs the lock anyway, so merge sequentially under a single
            // lock; checking the limit inside the lock also keeps it from being exceeded.
            lock (found)
            {
                foreach (Proxy proxy in proxies)
                {
                    if (checkLimit && found.Count >= limit)
                    {
                        break;
                    }

                    // A null key would make Hashtable throw and drop the rest of this host.
                    if (proxy == null || proxy.Proxy_ == null)
                    {
                        continue;
                    }

                    if (!found.Contains(proxy.Proxy_))
                    {
                        found.Add(proxy.Proxy_, proxy);
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: one failing source must not abort the remaining sources.
            Debug.WriteLine("Scrape failed for " + host + ": " + ex.Message);
        }
    });

    return found;
}

/// <summary>
/// Appends one row for <paramref name="proxy"/> to <c>lvProxies</c>, loading the country
/// flag into <c>imageList</c> the first time its country code is seen.
/// Must be called on the UI thread.
/// </summary>
/// <param name="proxy">Proxy to display.</param>
private void AppendProxyRow(Proxy proxy)
{
    var row = new ListViewItem((lvProxies.Items.Count + 1).ToString());

    var countryCode = CountryInfo.GetCode(proxy.Country);
    if (!imageList.Images.Keys.Contains(countryCode))
    {
        imageList.Images.Add(countryCode, Image.FromFile(@"Flags\" + countryCode + ".png"));
    }

    row.ImageKey = countryCode;
    // row.UseItemStyleForSubItems = false;
    row.SubItems.Add(proxy.Proxy_);
    row.SubItems.Add(proxy.Anonymity);
    row.SubItems.Add(proxy.Country);
    // Three empty sub-items are appended after the country, as in the original.
    row.SubItems.Add("");
    row.SubItems.Add("");
    row.SubItems.Add("");
    lvProxies.Items.Add(row);
}
/// <summary>
/// Scrapes proxies from the samair.ru listing. Each listing page (proxy-1, proxy-02 ...
/// proxy-29) is downloaded, the "/proxy/ip-port/{id}.html" link found in it is followed,
/// and the proxies are extracted from that linked page.
/// </summary>
/// <param name="data">Not used by this scraper; the page URLs are fixed.</param>
/// <returns>Every proxy extracted from the pages that could be downloaded. Never null.</returns>
public List<Proxy> Scrape(string data)
{
    List<Proxy> scraped = new List<Proxy>();
    List<string> pages = new List<string>();
    Generic g = new Generic();
    var options = new ParallelOptions { MaxDegreeOfParallelism = 10 };

    // The first page is requested as "proxy-1"; pages 2-9 are zero-padded to two digits.
    pages.Add("http://www.samair.ru/proxy/proxy-1.htm");
    for (int i = 2; i < 30; i++)
    {
        pages.Add("http://www.samair.ru/proxy/proxy-" + i.ToString("D2") + ".htm");
    }

    // Parallel.ForEach only returns once every page has been processed, so no extra
    // Task wrapper is needed to wait for completion.
    Parallel.ForEach(pages, options, (item) =>
    {
        try
        {
            string html = HTTP.DoWebRequest(item);
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            // Link shape: <a href="/proxy/ip-port/977482367.html">You can do it there</a>
            var page = html.GetBetween("<a href=\"/proxy/ip-port/", ".html");
            if (string.IsNullOrEmpty(page))
            {
                // No link id extracted, so there is nothing to follow.
                // NOTE(review): assumes GetBetween yields null/empty when the markers are missing - confirm.
                return;
            }

            var linkToPage = "http://www.samair.ru/proxy/ip-port/" + page + ".html";
            var page2 = HTTP.DoWebRequest(linkToPage);
            if (string.IsNullOrEmpty(page2))
            {
                return;
            }

            // The shared parser call and the shared result list are both kept under the lock.
            lock (scraped)
            {
                var found = g.Scrape(page2);
                if (found != null)
                {
                    scraped.AddRange(found);
                }
            }
        }
        catch (System.Exception ex)
        {
            // Best effort: one failing page must not abort the remaining pages.
            System.Diagnostics.Debug.WriteLine("Scrape failed for " + item + ": " + ex.Message);
        }
    });

    return scraped;
}