Beispiel #1
0
        /// <summary>
        /// Builds an inverted index of the requested kind over every page returned by
        /// <c>fetchPages</c>. Each page is preprocessed into tokens and added to the
        /// index in parallel.
        /// </summary>
        /// <param name="kind">
        /// The index implementation to build: <c>IndexType.Boolean</c> or
        /// <c>IndexType.ContentBased</c>.
        /// </param>
        /// <returns>
        /// The populated index. For <c>IndexType.ContentBased</c>, <c>InitialiseIndex</c>
        /// has been called once after all pages were added.
        /// </returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="kind"/> is neither <c>IndexType.Boolean</c> nor
        /// <c>IndexType.ContentBased</c>.
        /// </exception>
        public IInvertedIndex CreateIndex(IndexType kind)
        {
            bool isContentBased = kind.Equals(IndexType.ContentBased);

            // Reject unsupported kinds up front; otherwise the index would stay null and
            // the failure would only surface inside the parallel loop (or not at all
            // when there are no pages).
            if (!isContentBased && !kind.Equals(IndexType.Boolean))
            {
                throw new System.ArgumentOutOfRangeException(nameof(kind), kind, "Unsupported index type.");
            }

            // Materialise once: the page sequence is needed both for its count and for
            // the loop, and enumerating a lazy sequence twice could fetch the pages twice.
            List<Page> pages = new List<Page>(fetchPages());

            IInvertedIndex index;
            if (isContentBased)
            {
                index = new InvertedIndexTF(pages.Count);
            }
            else
            {
                index = new InvertedIndexBoolean();
            }

            LanguageFactory languageFactory = new LanguageFactory();

            // Every worker gets its own PageProcesser: PreprocessPage stores the current
            // language's stop words on the processer instance, so sharing one instance
            // across threads would let pages of different languages overwrite each
            // other's stop-word list mid-run.
            // NOTE(review): AddDocumentToIndex and GetLanguage are still called
            // concurrently, as before; confirm both implementations are thread-safe.
            Parallel.ForEach<Page, PageProcesser>(
                pages,
                () => new PageProcesser(),
                (page, loopState, workerProcesser) =>
                {
                    ILanguageBehaviour language = languageFactory.GetLanguage(page.Url);
                    page.Tokens = workerProcesser.PreprocessPage(page.SiteText, language);
                    index.AddDocumentToIndex(page);
                    return workerProcesser;
                },
                finishedProcesser => { });

            if (isContentBased)
            {
                index.InitialiseIndex();
            }

            return index;
        }
Beispiel #2
0
        /// <summary>
        /// Runs the preprocessing pipeline on the text of a single page: tokenise,
        /// prettify the tokens, remove the language's stop words, then stem.
        /// </summary>
        /// <param name="pageToIndex">The raw page text; it is passed to <c>tokenise</c> unchanged.</param>
        /// <param name="language">Supplies the stop-word list and the stemmer for this page.</param>
        /// <returns>The tokens produced by <c>language.Stem</c> as the last pipeline step.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="language"/> is <c>null</c>.</exception>
        /// <remarks>
        /// This method overwrites the instance field <c>_stopWords</c> on every call, so a
        /// single instance must not preprocess pages from several threads at the same time.
        /// </remarks>
        public IEnumerable<string> PreprocessPage(String pageToIndex, ILanguageBehaviour language)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // halfway through the pipeline, after tokenising has already been done.
            if (language == null)
            {
                throw new ArgumentNullException(nameof(language));
            }

            IEnumerable<String> tokens = tokenise(pageToIndex);
            tokens = prettifyTokens(tokens);

            // Presumably removeStopWords reads _stopWords, so it must be set before that call.
            _stopWords = language.StopWords;
            tokens     = removeStopWords(tokens);

            return language.Stem(tokens);
        }
Beispiel #3
0
 /// <summary>
 /// Preprocesses several pages with the same language, yielding one token sequence
 /// per input page in input order. Pages are processed lazily, as the result is enumerated.
 /// </summary>
 /// <param name="pagesToIndex">The raw texts of the pages to preprocess.</param>
 /// <param name="language">The language behaviour passed to the single-page overload for every page.</param>
 /// <returns>A lazily evaluated sequence with the result of the single-page overload for each page.</returns>
 /// <exception cref="ArgumentNullException"><paramref name="pagesToIndex"/> is <c>null</c>.</exception>
 public IEnumerable<IEnumerable<string>> PreprocessPage(IEnumerable<string> pagesToIndex, ILanguageBehaviour language)
 {
     // Validate here rather than in the iterator: an iterator body does not run until
     // the first MoveNext, which would delay the failure to wherever the caller
     // happens to enumerate the result.
     if (pagesToIndex == null)
     {
         throw new ArgumentNullException(nameof(pagesToIndex));
     }

     return preprocessPagesIterator(pagesToIndex, language);
 }

 // Lazily preprocesses each page in turn; arguments are validated by the public wrapper.
 private IEnumerable<IEnumerable<string>> preprocessPagesIterator(IEnumerable<string> pagesToIndex, ILanguageBehaviour language)
 {
     foreach (String page in pagesToIndex)
     {
         yield return PreprocessPage(page, language);
     }
 }