/// <summary>
/// Static constructor: assigns a new <c>MyWebClient</c> (created with its
/// parameterless constructor) to the static <c>webClient</c> member.
/// </summary>
static HtmlDownloader() => webClient = new MyWebClient();
Esempio n. 2
0
 /// <summary>
 /// Static constructor: assigns a new <c>MyWebClient</c> (created with its
 /// parameterless constructor) to the static <c>webClient</c> member.
 /// </summary>
 static HtmlDownloader() => webClient = new MyWebClient();
Esempio n. 3
0
        /// <summary>
        /// Gets up to maxAmount URLs from Google searching the searchString.
        /// </summary>
        /// <remarks>
        /// Result pages are requested one after another, starting at page 1, until
        /// enough URLs have been collected or a page yields no results. If a
        /// <see cref="WebException"/> is thrown while downloading, a message box is
        /// shown and the URLs collected so far are returned.
        /// </remarks>
        /// <param name="searchString">The string to search for.</param>
        /// <param name="maxAmount">The maximum number of results.</param>
        /// <returns>The URLs found; may hold fewer than maxAmount entries (possibly none).</returns>
        private List<string> GetGoogleResults(string searchString, int maxAmount)
        {
            List<string> result = new List<string>();

            // WebClient is IDisposable: the using block releases it on every exit path,
            // including when DownloadString throws.
            using (WebClient client = new MyWebClient(Crawler.WEB_TIMEOUT))
            {
                int pageNum = 1;

                try
                {
                    while (result.Count < maxAmount)
                    {
                        string query = ConstructGoogleSearch(searchString, pageNum);

                        // Wait GOOGLE_SLEEP_MS before each request (5 s according to the
                        // original author's note) so that Google does not consider us a bot.
                        // Note: this stops working if a separate thread is created for each topic.
                        Thread.Sleep(GOOGLE_SLEEP_MS);

                        string page = client.DownloadString(query);
                        List<string> results = ExtractGoogleResults(page);
                        if (results.Count == 0)
                        {
                            // Nothing extracted from this page: stop paging.
                            break;
                        }

                        // Only take as many URLs as are still missing so the cap is never exceeded.
                        result.AddRange(results.Take(maxAmount - result.Count));
                        pageNum++;
                    }
                }
                catch (WebException e)
                {
                    // Best effort: report the failure and fall through to return the partial result.
                    ShowMessageBox("Could not generate training sets for " + searchString + ": " + e.Message + "\n");
                }
            }

            return result;
        }