/// <summary>
/// Returns a config built from the entered parameters.
/// </summary>
/// <param name="userAgent">The user agent of the crawler.</param>
/// <param name="connectionString">The string which is used for connecting to the database.</param>
/// <param name="maxWaitForWebpages">The max webpages the crawler will wait for finishing the process of them.</param>
/// <param name="timeoutInSeconds">The time in seconds that a webpage has to be loaded.</param>
/// <param name="timeoutForKeywordsParsingInMinutes">The time in minutes the keyword-parsing step may run before timing out (presumably — TODO confirm against WebCrawler usage).</param>
/// <param name="id">The Id of this crawler.</param>
/// <returns>A new <see cref="WebCrawlerConfig"/>.</returns>
public static WebCrawlerConfig Create(string userAgent, string connectionString, int maxWaitForWebpages, int timeoutInSeconds, int timeoutForKeywordsParsingInMinutes, int id)
{
    // Fixed: <returns> previously referenced DataHelperConfig (copy-paste error)
    // and timeoutForKeywordsParsingInMinutes was undocumented.
    return new WebCrawlerConfig(userAgent, connectionString, maxWaitForWebpages, timeoutInSeconds, timeoutForKeywordsParsingInMinutes, id);
}
/// <summary>
/// The constructor of the <see cref="WebCrawler"/>.
/// </summary>
/// <param name="config">The configuration this crawler instance runs with.</param>
public WebCrawler(WebCrawlerConfig config)
{
    // The stemmer is created up front; the two assignments are independent.
    stemmer = new PorterStemmer();
    this.config = config;
}
/// <summary>
/// Entry point: loads (or scaffolds) the program config, verifies the database
/// connection, starts the configured number of crawlers, then processes console
/// commands ("stop all", "stop &lt;n&gt;", "exit") until the user exits.
/// </summary>
private static async Task Main()
{
    #region Config
    pathToProgramConfig = AppDomain.CurrentDomain.BaseDirectory + pathToProgramConfig;
    if (!File.Exists(pathToProgramConfig))
    {
        // First run: write a template config and exit so the user can fill it in.
        var txt = JsonConvert.SerializeObject(new ProgramConfig(), Formatting.Indented);
        try
        {
            File.WriteAllText(pathToProgramConfig, txt);
        }
        catch (Exception e)
        {
            LogMaster($"Couldn't create or write to `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
            return;
        }
        LogMaster($"Created config at `{pathToProgramConfig}`. Please fill it and restart the program.", DebugLevel.Warning);
        return;
    }
    else
    {
        try
        {
            var txt = File.ReadAllText(pathToProgramConfig);
            config = JsonConvert.DeserializeObject<ProgramConfig>(txt);
        }
        catch (Exception e)
        {
            LogMaster($"Couldn't read or deserialize the config at `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
            return;
        }
    }
    // Fixed log wording: the config was deserialized here, not serialized.
    LogMaster("Config deserialized successfully.", DebugLevel.Info);
    #endregion

    #region Check Connection
    var (isSucceeded, isFirstTime) = CheckConnection();
    if (!isSucceeded)
    {
        return;
    }
    if (isFirstTime)
    {
        // On the very first run: single crawler, effectively unlimited parsing timeout.
        config.NumberOfCrawlers = 1;
        config.Crawler_TimeoutForKeywordsParsingInMinutes = 1000000; // Don't timeout the process.

        // Setting case-sensitive collation on the keywords column. The interpolated
        // identifiers come from nameof(), not user input, so this is not injectable.
        using var dh = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
        dh.Database.ExecuteSqlCommand(
            $"ALTER TABLE [{nameof(DataHelper.Keywords)}] ALTER COLUMN [{nameof(Keyword.RootKeywordForm)}] " +
            "nvarchar(64) COLLATE SQL_Latin1_General_CP1_CS_AS;");
    }
    #endregion

    webCrawlers = new List<WebCrawler>();
    for (int i = 0; i < config.NumberOfCrawlers; i++)
    {
        // Create a web crawler and assign config.
        var wc = new WebCrawler(WebCrawlerConfig.Create(
            userAgent: config.Crawler_UserAgent,
            connectionString: config.Crawler_ConnectionString,
            maxWaitForWebpages: config.Crawler_MaxWaitForWebpages,
            timeoutInSeconds: config.Crawler_TimeoutInSeconds,
            timeoutForKeywordsParsingInMinutes: config.Crawler_TimeoutForKeywordsParsingInMinutes,
            id: i
        ));
        wc.Log += Log;
        // NOTE(review): awaiting here means the next crawler is only created after
        // StartAsync returns — presumably StartAsync just kicks off the crawl loop;
        // verify it doesn't run the whole crawl to completion.
        await Task.Run(wc.StartAsync);
        webCrawlers.Add(wc);
    }

    // Command loop — keeps the application alive until "exit".
    string cmd;
    do
    {
        // ReadLine returns null on end-of-stream (e.g. redirected input);
        // treat that as "exit" instead of throwing NullReferenceException.
        // ToLowerInvariant avoids culture-dependent casing (e.g. Turkish 'I').
        cmd = Console.ReadLine()?.ToLowerInvariant() ?? "exit";
        if (cmd == "stop all")
        {
            while (webCrawlers.Count > 0)
            {
                var wc = webCrawlers[0];
                webCrawlers.Remove(wc);
                // Await instead of .Wait(): never block on async code in an async method.
                await wc.StopAsync();
            }
        }
        else
        {
            var match = Regex.Match(cmd, @"stop (\d+)");
            // TryParse guards against digit strings that overflow Int32.
            if (match.Success && int.TryParse(match.Groups[1].Value, out var n))
            {
                // Fixed off-by-one: `i <= webCrawlers.Count` indexed past the end
                // whenever n exceeded the number of crawlers.
                // NOTE(review): unlike "stop all", stopped crawlers are not removed
                // from the list here — confirm whether that is intentional.
                for (int i = 0; i < n && i < webCrawlers.Count; i++)
                {
                    await webCrawlers[i].StopAsync();
                }
            }
        }
    } while (cmd != "exit");
}