static void Main(string[] args)
{
    // Wire up the shared component registry and its eagerly-initialized members.
    DI.Initialize();
    DI.webCrawler.Initialize();
    DI.DPC.Initialize();

    // Optionally re-crawl before indexing (controlled by the `crawl` flag).
    if (crawl)
    {
        DI.webCrawler.Run();
    }

    // Build the index, then rank, over the configured number of pages.
    Indexer.Indexer indexer = DI.indexer;
    DI.indexer.Initialize(arePagesInMemory, numberOfPagesToLoad);
    DI.ranker.Initialize();
    indexer.Run(numberOfPagesToLoad);

    // Query the freshly built index and report the matching URLs.
    List<string> matches = QueryPages(indexer);
    if (matches.Count == 0)
    {
        Console.WriteLine("No matching pages");
    }
    else
    {
        Console.WriteLine("Url of matching pages:");
        matches.ForEach(Console.WriteLine);
    }

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
public static void Initialize()
{
    // Composition root: construct every shared service exactly once.
    // Other components resolve their collaborators through these static
    // references, so this must run before anything else uses DI.
    // Construction order is preserved from the original implementation.
    ranker = new Ranker.Ranker();
    indexer = new Indexer.Indexer();
    urlFilter = new UrlFilter();
    urlFrontier = new UrlFrontier();
    indexCreator = new IndexCreator();
    pageRetriever = new PageRetriever();
    termContructor = new TermConstructor();
    tokenizer = new Tokenizer();
    webCrawler = new WebCrawler.WebCrawler();
    pageDB = new PageDB();
    DPC = new DuplicatePageChecker();
    DUC = new DuplicateURLChecker();
    pageFetcher = new PageFetcher();
    pageParser = new PageParser();
}
private static async Task Main()
{
    #region Config
    // Resolve the config file next to the executable; on first run, write a
    // template config and exit so the user can fill it in.
    pathToProgramConfig = AppDomain.CurrentDomain.BaseDirectory + pathToProgramConfig;
    if (!File.Exists(pathToProgramConfig))
    {
        var txt = JsonConvert.SerializeObject(new ProgramConfig(), Formatting.Indented);
        try
        {
            File.WriteAllText(pathToProgramConfig, txt);
        }
        catch (Exception e)
        {
            LogMaster($"Couldn't create or write to `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
            return;
        }

        LogMaster($"Created config at `{pathToProgramConfig}`. Please fill it and restart the program.", DebugLevel.Warning);
        return;
    }
    else
    {
        try
        {
            var txt = File.ReadAllText(pathToProgramConfig);
            config = JsonConvert.DeserializeObject<ProgramConfig>(txt);
        }
        catch (Exception e)
        {
            LogMaster($"Couldn't read or deserialize the config at `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
            return;
        }
    }

    LogMaster("Config serialized successfully.", DebugLevel.Info);
    #endregion

    #region Check Connection
    var (isSucceeded, isFirstTime) = CheckConnection();
    if (!isSucceeded)
    {
        return;
    }

    if (isFirstTime)
    {
        // First run: a single crawler and an effectively infinite keyword-parsing
        // timeout so the initial keyword seeding is never cut short.
        config.NumberOfCrawlers = 1;
        config.Crawler_TimeoutForKeywordsParsingInMinutes = 1000000; // Don't timeout the process.

        // Setting case-sensitive collation on the keywords column.
        // NOTE(review): the interpolated identifiers come from nameof(), not user
        // input, so this dynamic SQL is not an injection vector.
        using var dh = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
        dh.Database.ExecuteSqlCommand(
            $"ALTER TABLE [{nameof(DataHelper.Keywords)}] ALTER COLUMN [{nameof(Keyword.RootKeywordForm)}] " +
            "nvarchar(64) COLLATE SQL_Latin1_General_CP1_CS_AS;");
    }
    #endregion

    webCrawlers = new List<WebCrawler>();
    for (int i = 0; i < config.NumberOfCrawlers; i++)
    {
        // Create a web crawler and assign config.
        var wc = new WebCrawler(WebCrawlerConfig.Create(
            userAgent: config.Crawler_UserAgent,
            connectionString: config.Crawler_ConnectionString,
            maxWaitForWebpages: config.Crawler_MaxWaitForWebpages,
            timeoutInSeconds: config.Crawler_TimeoutInSeconds,
            timeoutForKeywordsParsingInMinutes: config.Crawler_TimeoutForKeywordsParsingInMinutes,
            id: i
        ));
        wc.Log += Log;
        await Task.Run(wc.StartAsync);
        webCrawlers.Add(wc);
    }

    // Command loop — keeps the application alive until the user types "exit".
    // Supported commands: "stop all", "stop <n>", "exit".
    string cmd;
    do
    {
        // ReadLine() returns null when stdin is closed; treat that as "exit"
        // instead of throwing a NullReferenceException. ToLowerInvariant avoids
        // culture-dependent command matching (CA1304).
        cmd = Console.ReadLine()?.ToLowerInvariant() ?? "exit";
        if (cmd == "stop all")
        {
            // Stop and remove every crawler.
            while (webCrawlers.Count > 0)
            {
                var wc = webCrawlers[0];
                webCrawlers.Remove(wc);
                // await instead of .Wait(): don't block the async Main thread.
                await wc.StopAsync();
            }
        }
        else
        {
            var match = Regex.Match(cmd, @"stop (\d+)");
            if (match.Success)
            {
                // Stop the first n crawlers, bounded by the list size.
                // Fix: the original bound `i <= webCrawlers.Count` allowed an
                // out-of-range index into webCrawlers[i].
                var n = int.Parse(match.Groups[1].Value);
                for (int i = 0; i < n && i < webCrawlers.Count; i++)
                {
                    await webCrawlers[i].StopAsync();
                }
            }
        }
    } while (cmd != "exit");
}