/// <summary>
/// Type initializer: assigns a new default-constructed <c>MyWebClient</c>
/// to the static <c>webClient</c> member.
/// </summary>
static HtmlDownloader()
{
    webClient = new MyWebClient();
}
/// <summary>
/// Gets up to <paramref name="maxAmount"/> URLs from Google by searching for
/// <paramref name="searchString"/>, requesting successive result pages until
/// enough URLs were collected or a page yields no results.
/// </summary>
/// <param name="searchString">The string to search for.</param>
/// <param name="maxAmount">The maximum number of results.</param>
/// <returns>
/// The URLs collected so far. If a <see cref="WebException"/> occurs, a message box
/// is shown and the URLs gathered before the failure are returned.
/// </returns>
private List<string> GetGoogleResults(string searchString, int maxAmount)
{
    List<string> result = new List<string>();
    int pageNum = 1;

    // WebClient is IDisposable; the using block releases it on every exit path,
    // including when a request throws.
    using (WebClient client = new MyWebClient(Crawler.WEB_TIMEOUT))
    {
        try
        {
            while (result.Count < maxAmount)
            {
                string query = ConstructGoogleSearch(searchString, pageNum);

                // Pause for GOOGLE_SLEEP_MS before each request so that Google does not
                // consider us as a bot.
                // Note: This will stop working if you'd create a thread for each topic.
                Thread.Sleep(GOOGLE_SLEEP_MS);

                string page = client.DownloadString(query);
                List<string> results = ExtractGoogleResults(page);

                // An empty page means there is nothing more to fetch.
                if (results.Count == 0)
                {
                    break;
                }

                // Never collect more than maxAmount URLs in total.
                result.AddRange(results.Take(maxAmount - result.Count));
                pageNum++;
            }
        }
        catch (WebException e)
        {
            ShowMessageBox("Could not generate training sets for " + searchString + ": " + e.Message + "\n");
        }
    }

    return result;
}