public Crawler(string seeds) { _logger = Log.Instance; _visitedServers = new ConcurrentDictionary<int, long>(); _domainToIpMap = new ConcurrentDictionary<int, int>(); _urlFrontier = new SimpleUrlFrontier(seeds); _store = new Store(); _parser = new Parser(); _robotsTxts = new ConcurrentDictionary<int, RobotsTxt>(); _cts = new CancellationTokenSource(); }
static void Main(string[] args) { Crawler crawler = new Crawler("Ressources\\Seeds.txt"); Console.WriteLine("Starting..."); crawler.Start(1000); Console.WriteLine("Finished crawling..."); Console.WriteLine("Press any key to start indexing..."); Console.ReadLine(); Store fileStore = new Store(); Indexer indexer = new Indexer(); Dictionary<string, string> filesMap = fileStore.LoadFileMap(); Console.WriteLine("Indexing started"); DateTime start = DateTime.Now; int filesIndexed = 0; foreach (var file in filesMap) { string fileContent = File.ReadAllText(Path.Combine(fileStore.OutputPath, file.Value)); indexer.Index(file.Key, fileContent); Console.SetCursorPosition(0, 8); Console.WriteLine("Pages indexed {0} / {1}", ++filesIndexed, filesMap.Count); } DateTime end = DateTime.Now; TimeSpan timeElapsed = end.Subtract(start); double pagesPerSec = filesMap.Count / timeElapsed.TotalSeconds; Console.WriteLine("Indexed {0} pages per second", pagesPerSec); Console.WriteLine("Indexing finished in {0}", timeElapsed.ToString(@"hh\:mm\:ss")); Ranker ranker = new Ranker(); Console.Write("The search engine ready for your query: "); string searchQuery = Console.ReadLine(); Tokenizer tokenizer = new Tokenizer(); List<string> queryTerms = tokenizer.TokenizeQuery(searchQuery).ToList(); for (int i = 0; i < queryTerms.Count(); i++) { Stemmer stemmer = new Stemmer(); stemmer.Stem(queryTerms[i]); queryTerms[i] = stemmer.ToString(); } IEnumerable<KeyValuePair<int, double>> scores = ranker.Rank(queryTerms, indexer.Terms, indexer.TotalDocuments); int resultCount = 1; foreach (var item in scores) { string url = indexer.GetUrlFromHash(item.Key); if (url == null) continue; Console.WriteLine("{0}: {1}", resultCount++, url); if (resultCount > 10) break; } PageRank pageRank = new PageRank(); float[] vector = pageRank.DoRank(); for (int i = 0; i < vector.Length; i++) { Console.Write(vector[i]); } Console.WriteLine(); Console.WriteLine("Press any key to exit..."); Console.ReadLine(); }