Esempio n. 1
0
        /// <summary>
        /// Entry point: wires up the DI container, optionally runs the crawler,
        /// builds the index/ranker, and prints the URLs matching the user's query.
        /// </summary>
        static void Main(string[] args)
        {
            DI.Initialize();
            DI.webCrawler.Initialize();
            DI.DPC.Initialize();

            // Crawling is optional; indexing can run over previously stored pages.
            if (crawl)
            {
                DI.webCrawler.Run();
            }

            Indexer.Indexer indexer = DI.indexer;
            indexer.Initialize(arePagesInMemory, numberOfPagesToLoad);
            DI.ranker.Initialize();
            indexer.Run(numberOfPagesToLoad);

            List<string> matchedUrls = QueryPages(indexer);
            if (matchedUrls.Count == 0)
            {
                Console.WriteLine("No matching pages");
            }
            else
            {
                Console.WriteLine("Url of matching pages:");
                matchedUrls.ForEach(Console.WriteLine);
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
Esempio n. 2
0
 /// <summary>
 /// Composition root: constructs every shared service instance used across
 /// the application. Call once at startup, before any component is used.
 /// </summary>
 public static void Initialize()
 {
     // Crawling pipeline
     webCrawler     = new WebCrawler.WebCrawler();
     urlFrontier    = new UrlFrontier();
     urlFilter      = new UrlFilter();
     pageFetcher    = new PageFetcher();
     pageParser     = new PageParser();
     pageRetriever  = new PageRetriever();
     DUC            = new DuplicateURLChecker();
     DPC            = new DuplicatePageChecker();

     // Indexing and ranking
     tokenizer      = new Tokenizer();
     termContructor = new TermConstructor();
     indexCreator   = new IndexCreator();
     indexer        = new Indexer.Indexer();
     ranker         = new Ranker.Ranker();

     // Storage
     pageDB         = new PageDB();
 }
Esempio n. 3
0
        /// <summary>
        /// Entry point: loads (or scaffolds) the JSON config, verifies the database
        /// connection, spins up the configured number of crawlers, then serves a
        /// small console command loop: "stop all", "stop N", "exit".
        /// </summary>
        private static async Task Main()
        {
            #region Config

            pathToProgramConfig = AppDomain.CurrentDomain.BaseDirectory + pathToProgramConfig;

            if (!File.Exists(pathToProgramConfig))
            {
                // First run: write a template config and ask the user to fill it in.
                var txt = JsonConvert.SerializeObject(new ProgramConfig(), Formatting.Indented);
                try
                {
                    File.WriteAllText(pathToProgramConfig, txt);
                }
                catch (Exception e)
                {
                    LogMaster($"Couldn't create or write to `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
                    return;
                }
                LogMaster($"Created config at `{pathToProgramConfig}`. Please fill it and restart the program.", DebugLevel.Warning);
                return;
            }
            else
            {
                try
                {
                    var txt = File.ReadAllText(pathToProgramConfig);
                    config = JsonConvert.DeserializeObject<ProgramConfig>(txt);
                }
                catch (Exception e)
                {
                    LogMaster($"Couldn't read or deserialize the config at `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
                    return;
                }
            }

            LogMaster("Config serialized successfully.", DebugLevel.Info);

            #endregion

            #region Check Connection

            var(isSucceeded, isFirstTime) = CheckConnection();

            if (!isSucceeded)
            {
                return;
            }
            if (isFirstTime)
            {
                config.NumberOfCrawlers = 1;
                config.Crawler_TimeoutForKeywordsParsingInMinutes = 1000000; // Don't timeout the process.
                // Setting case-sensitive collation on the keywords column.
                // The interpolated identifiers come from nameof(), not user input,
                // so this string-built DDL statement is not an injection risk.
                using var dh = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
                dh.Database.ExecuteSqlCommand(
                    $"ALTER TABLE [{nameof(DataHelper.Keywords)}] ALTER COLUMN [{nameof(Keyword.RootKeywordForm)}] " +
                    "nvarchar(64) COLLATE SQL_Latin1_General_CP1_CS_AS;");
            }

            #endregion

            webCrawlers = new List <WebCrawler>();

            for (int i = 0; i < config.NumberOfCrawlers; i++)
            {
                // Create a web crawler and assign config
                var wc = new WebCrawler(WebCrawlerConfig.Create(
                                            userAgent: config.Crawler_UserAgent,
                                            connectionString: config.Crawler_ConnectionString,
                                            maxWaitForWebpages: config.Crawler_MaxWaitForWebpages,
                                            timeoutInSeconds: config.Crawler_TimeoutInSeconds,
                                            timeoutForKeywordsParsingInMinutes: config.Crawler_TimeoutForKeywordsParsingInMinutes,
                                            id: i
                                            ));

                wc.Log += Log;
                await Task.Run(wc.StartAsync);

                webCrawlers.Add(wc);
            }

            // Command loop; keeps the process alive until the user types "exit".
            string cmd;
            do
            {
                // BUGFIX: ReadLine() returns null when stdin is closed; treat that as
                // "exit" instead of throwing a NullReferenceException on ToLower().
                cmd = Console.ReadLine()?.ToLower() ?? "exit";
                if (cmd == "stop all")
                {
                    while (webCrawlers.Count > 0)
                    {
                        var wc = webCrawlers[0];
                        webCrawlers.Remove(wc);
                        // We're already in an async method: await instead of .Wait()
                        // to avoid blocking a thread-pool thread.
                        await wc.StopAsync();
                    }
                }
                else
                {
                    var match = Regex.Match(cmd, @"stop (\d+)");
                    if (match.Success)
                    {
                        var n = int.Parse(match.Groups[1].Value);
                        // BUGFIX: the original condition `i <= webCrawlers.Count` could
                        // index one past the end of the list. Also remove each stopped
                        // crawler (matching "stop all") so a later "stop N" doesn't
                        // re-stop the same instances.
                        for (int i = 0; i < n && webCrawlers.Count > 0; i++)
                        {
                            var wc = webCrawlers[0];
                            webCrawlers.Remove(wc);
                            await wc.StopAsync();
                        }
                    }
                }
            } while(cmd != "exit");
        }