示例#1
0
        /// <summary>
        /// Indexes one document: registers the URI under a posting id derived from its hash code,
        /// tokenizes and stems the content, and records the posting id on every resulting term.
        /// </summary>
        /// <param name="uri">Document address; <c>uri.GetHashCode()</c> is used as the posting id.</param>
        /// <param name="fileContent">Document text passed to the tokenizer.</param>
        public void Index(string uri, string fileContent)
        {
            int postingId = uri.GetHashCode();

            // NOTE(review): if _postingMap is a Dictionary, Add throws on an existing key, so
            // indexing the same URI twice (or two URIs whose hash codes collide) would fail
            // here - confirm whether that is intended.
            _postingMap.Add(postingId, uri);

            IEnumerable<string> unStemmedTokens = _tokenizer.Tokenize(fileContent);
            foreach (var token in unStemmedTokens)
            {
                // A fresh stemmer is created per token, as in the original code; the stemmed
                // form is read back through ToString().
                Stemmer stemmer = new Stemmer();
                stemmer.Stem(token);
                string stemmedToken = stemmer.ToString();

                // Single lookup: fetch the existing term, or create and register a new one.
                Term term;
                if (!_terms.TryGetValue(stemmedToken, out term))
                {
                    term = new Term(stemmedToken);
                    _terms.Add(stemmedToken, term);
                }

                term.AddPosting(postingId);
            }
        }
示例#2
0
        /// <summary>
        /// Console entry point. Runs the crawler, indexes every file listed in the store's file map,
        /// reads one search query and prints up to ten ranked URLs, then prints the PageRank vector.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Crawler crawler = new Crawler("Ressources\\Seeds.txt");

            Console.WriteLine("Starting...");
            // NOTE(review): 1000 is presumably a page limit for the crawl - confirm against Crawler.Start.
            crawler.Start(1000);
            Console.WriteLine("Finished crawling...");
            Console.WriteLine("Press any key to start indexing...");
            // ReadLine waits for Enter, not for an arbitrary key.
            Console.ReadLine();

            Store fileStore = new Store();
            Indexer indexer = new Indexer();
            Dictionary<string, string> filesMap = fileStore.LoadFileMap();
            Console.WriteLine("Indexing started");

            // Stopwatch measures elapsed time independently of wall-clock adjustments,
            // unlike subtracting two DateTime.Now readings.
            System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();
            int filesIndexed = 0;
            foreach (var file in filesMap)
            {
                // The map key is passed to the indexer as the URI; the value is the file name
                // under the store's output path.
                string fileContent = File.ReadAllText(Path.Combine(fileStore.OutputPath, file.Value));
                indexer.Index(file.Key, fileContent);

                // Rewrites the same console row each time so the progress counter updates in place.
                Console.SetCursorPosition(0, 8);
                Console.WriteLine("Pages indexed {0} / {1}", ++filesIndexed, filesMap.Count);
            }
            stopwatch.Stop();

            TimeSpan timeElapsed = stopwatch.Elapsed;
            double pagesPerSec = filesMap.Count / timeElapsed.TotalSeconds;
            Console.WriteLine("Indexed {0} pages per second", pagesPerSec);
            Console.WriteLine("Indexing finished in {0}", timeElapsed.ToString(@"hh\:mm\:ss"));

            Ranker ranker = new Ranker();

            Console.Write("The search engine ready for your query: ");
            // ReadLine returns null when the input stream has ended; treat that as an empty query.
            string searchQuery = Console.ReadLine() ?? string.Empty;

            // Stem the query terms the same way Main's indexing path stems tokens
            // (new Stemmer, Stem, ToString), replacing each term in place.
            Tokenizer tokenizer = new Tokenizer();
            List<string> queryTerms = tokenizer.TokenizeQuery(searchQuery).ToList();
            for (int i = 0; i < queryTerms.Count; i++)
            {
                Stemmer stemmer = new Stemmer();
                stemmer.Stem(queryTerms[i]);
                queryTerms[i] = stemmer.ToString();
            }

            IEnumerable<KeyValuePair<int, double>> scores = ranker.Rank(queryTerms, indexer.Terms, indexer.TotalDocuments);

            // Print at most ten results; entries whose key does not resolve to a URL are skipped
            // and do not count toward the ten.
            int resultCount = 1;
            foreach (var item in scores)
            {
                string url = indexer.GetUrlFromHash(item.Key);
                if (url == null)
                {
                    continue;
                }

                Console.WriteLine("{0}: {1}", resultCount++, url);
                if (resultCount > 10)
                {
                    break;
                }
            }

            PageRank pageRank = new PageRank();
            float[] vector = pageRank.DoRank();
            // Values are written back-to-back with no separator, followed by a single newline.
            for (int i = 0; i < vector.Length; i++)
            {
                Console.Write(vector[i]);
            }
            Console.WriteLine();

            Console.WriteLine("Press any key to exit...");
            Console.ReadLine();
        }