/// <summary>
        /// Factory method that builds a <see cref="WebCrawlerConfig"/> from the supplied values.
        /// All arguments are forwarded unchanged to the <see cref="WebCrawlerConfig"/> constructor.
        /// </summary>
        /// <param name="userAgent">The user agent of the crawler.</param>
        /// <param name="connectionString">The connection string used to connect to the database.</param>
        /// <param name="maxWaitForWebpages">The maximum number of webpages the crawler will wait on to finish processing.</param>
        /// <param name="timeoutInSeconds">The time, in seconds, within which a webpage has to be loaded.</param>
        /// <param name="timeoutForKeywordsParsingInMinutes">The timeout, in minutes, applied to keyword parsing.</param>
        /// <param name="id">The Id of this crawler.</param>
        /// <returns>A new <see cref="WebCrawlerConfig"/>.</returns>
        public static WebCrawlerConfig Create(string userAgent, string connectionString, int maxWaitForWebpages,
                                              int timeoutInSeconds, int timeoutForKeywordsParsingInMinutes, int id)
            => new WebCrawlerConfig(
                userAgent,
                connectionString,
                maxWaitForWebpages,
                timeoutInSeconds,
                timeoutForKeywordsParsingInMinutes,
                id);
Exemple #2
0
 /// <summary>
 /// Creates a <see cref="WebCrawler"/> that uses the supplied configuration.
 /// </summary>
 /// <param name="config">The configuration this crawler keeps a reference to.</param>
 public WebCrawler(WebCrawlerConfig config)
 {
     // Each crawler gets its own Porter stemmer instance.
     stemmer = new PorterStemmer();

     // Store the caller's configuration as-is (no copy, no validation).
     this.config = config;
 }
Exemple #3
0
        /// <summary>
        /// Program entry point. Loads the program config (or creates a template and exits when
        /// none exists), checks the database connection, starts the configured number of web
        /// crawlers and then processes console commands until <c>exit</c> is entered or the
        /// input stream ends.
        /// </summary>
        /// <remarks>
        /// Supported console commands (case-insensitive):
        /// <c>stop all</c> stops and removes every crawler,
        /// <c>stop N</c> stops the first N crawlers in the list,
        /// <c>exit</c> leaves the command loop.
        /// </remarks>
        /// <returns>A task that completes when the command loop ends or start-up is aborted.</returns>
        private static async Task Main()
        {
            #region Config

            // Resolve the config path relative to the application's base directory.
            pathToProgramConfig = AppDomain.CurrentDomain.BaseDirectory + pathToProgramConfig;

            if (!File.Exists(pathToProgramConfig))
            {
                // First run: write a default config for the user to fill in, then stop.
                var txt = JsonConvert.SerializeObject(new ProgramConfig(), Formatting.Indented);
                try
                {
                    File.WriteAllText(pathToProgramConfig, txt);
                }
                catch (Exception e)
                {
                    LogMaster($"Couldn't create or write to `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
                    return;
                }
                LogMaster($"Created config at `{pathToProgramConfig}`. Please fill it and restart the program.", DebugLevel.Warning);
                return;
            }
            else
            {
                try
                {
                    var txt = File.ReadAllText(pathToProgramConfig);
                    config = JsonConvert.DeserializeObject <ProgramConfig>(txt);
                }
                catch (Exception e)
                {
                    LogMaster($"Couldn't read or deserialize the config at `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
                    return;
                }
            }

            LogMaster("Config serialized successfully.", DebugLevel.Info);

            #endregion

            #region Check Connection

            var(isSucceeded, isFirstTime) = CheckConnection();

            if (!isSucceeded)
            {
                return;
            }
            if (isFirstTime)
            {
                // On the first run only a single crawler is used.
                config.NumberOfCrawlers = 1;
                config.Crawler_TimeoutForKeywordsParsingInMinutes = 1000000; // Don't timeout the process.
                // Make the root keyword form column case-sensitive (CS collation).
                using var dh = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
                dh.Database.ExecuteSqlCommand(
                    $"ALTER TABLE [{nameof(DataHelper.Keywords)}] ALTER COLUMN [{nameof(Keyword.RootKeywordForm)}] " +
                    "nvarchar(64) COLLATE SQL_Latin1_General_CP1_CS_AS;");
            }

            #endregion

            webCrawlers = new List <WebCrawler>();

            for (int i = 0; i < config.NumberOfCrawlers; i++)
            {
                // Create a web crawler and assign config
                var wc = new WebCrawler(WebCrawlerConfig.Create(
                                            userAgent: config.Crawler_UserAgent,
                                            connectionString: config.Crawler_ConnectionString,
                                            maxWaitForWebpages: config.Crawler_MaxWaitForWebpages,
                                            timeoutInSeconds: config.Crawler_TimeoutInSeconds,
                                            timeoutForKeywordsParsingInMinutes: config.Crawler_TimeoutForKeywordsParsingInMinutes,
                                            id: i
                                            ));

                wc.Log += Log;

                // NOTE(review): StartAsync is awaited before the next crawler is created and
                // before this one is added to the list. If StartAsync only completes when
                // crawling ends, crawlers run one after another - confirm this is intended.
                await Task.Run(wc.StartAsync);

                webCrawlers.Add(wc);
            }

            // Don't close the application: process console commands until "exit" or end of input.
            while (true)
            {
                // ReadLine returns null once the input stream is closed (e.g. redirected stdin
                // reaching EOF). Treat that like "exit" instead of throwing on a null reference.
                var line = Console.ReadLine();
                if (line == null)
                {
                    break;
                }

                // Invariant lowering keeps command matching independent of the current culture.
                var cmd = line.ToLowerInvariant();
                if (cmd == "exit")
                {
                    break;
                }

                if (cmd == "stop all")
                {
                    // Remove each crawler from the list before stopping it.
                    while (webCrawlers.Count > 0)
                    {
                        var wc = webCrawlers[0];
                        webCrawlers.RemoveAt(0);
                        await wc.StopAsync();
                    }

                    continue;
                }

                var match = Regex.Match(cmd, @"stop (\d+)");

                // TryParse ignores digit runs that do not fit in an int rather than throwing.
                if (match.Success && int.TryParse(match.Groups[1].Value, out var n))
                {
                    // Never index past the end of the list when N exceeds the crawler count.
                    var limit = Math.Min(n, webCrawlers.Count);

                    // NOTE(review): unlike "stop all", the stopped crawlers stay in the list,
                    // so a later command will call StopAsync on them again - confirm intended.
                    for (int i = 0; i < limit; i++)
                    {
                        await webCrawlers[i].StopAsync();
                    }
                }
            }
        }