/// <summary>
        /// Parses a search query into the list of processed terms used for Ranked Retrieval.
        /// </summary>
        /// <remarks>
        /// The query is split on single spaces and every piece is run through a
        /// <c>StemmingTokenProcesor</c>; one piece may expand into several terms.
        /// Queries containing a wildcard ('*') are not handled here and yield an empty list,
        /// as do null or blank queries. Blank pieces (from leading, trailing or repeated
        /// spaces) and zero-length processed terms are dropped, since an empty term has
        /// nothing to match against.
        /// </remarks>
        /// <param name="query">query to be parsed into terms</param>
        /// <returns>the processed terms in query order; empty when there is nothing to search for</returns>
        public List <string> ParseQuery(string query)
        {
            // Nothing to parse for null/blank input; wildcard queries are rejected outright.
            if (string.IsNullOrWhiteSpace(query) || query.Contains('*'))
            {
                return(new List <string>());
            }

            ITokenProcessor processor  = new StemmingTokenProcesor();
            List <string>   finalTerms = new List <string>();

            // RemoveEmptyEntries: repeated/leading/trailing spaces would otherwise produce "" tokens.
            string[] tokens = query.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);

            foreach (string token in tokens)
            {
                // A single token can be processed into more than one term; flatten as we go.
                foreach (string term in processor.ProcessToken(token))
                {
                    if (!string.IsNullOrEmpty(term))
                    {
                        finalTerms.Add(term);
                    }
                }
            }

            return(finalTerms);
        }
        /// <summary>
        /// Produces the stemmed form of a single term.
        /// </summary>
        /// <param name="term">string to be stemmed</param>
        /// <returns>the value returned by <c>StemmingTokenProcesor.StemWords</c> for <paramref name="term"/></returns>
        public string StemTerm(string term)
        {
            // A fresh processor is created on every call; the term is handed to it as-is.
            StemmingTokenProcesor stemmer = new StemmingTokenProcesor();

            return(stemmer.StemWords(term));
        }
Example #3
0
        /// <summary>
        /// Constructs an index from a corpus of documents.
        /// </summary>
        /// <remarks>
        /// Clears the positional, SoundEx and k-gram stores created with <c>Indexer.path</c>,
        /// runs every token of every document through a stemming token processor, records
        /// per-document statistics (token count, byte size, average term frequency, document
        /// weight), then builds the k-gram structure and saves the positional and SoundEx
        /// indexes. Also updates <c>Indexer.averageDocLength</c> and reports progress and
        /// elapsed time on the console.
        /// </remarks>
        /// <param name="corpus">a corpus to be indexed</param>
        /// <returns>the positional index that was populated and saved</returns>
        public static IIndex IndexCorpus(IDocumentCorpus corpus)
        {
            Console.WriteLine($"[Indexer] Indexing {corpus.CorpusSize} documents in the corpus...");
            // Time how long it takes to index the corpus
            Stopwatch elapsedTime = Stopwatch.StartNew();

            // Set the index type and token processor to use
            DiskPositionalIndex index   = new DiskPositionalIndex(Indexer.path);
            DiskSoundEx         soundEx = new DiskSoundEx(Indexer.path);
            DiskKGram           kGram   = new DiskKGram(Indexer.path);

            // Start from a clean slate
            index.Clear();
            soundEx.Clear();
            kGram.Clear();

            ITokenProcessor processor = new StemmingTokenProcesor();

            // Vocabulary handed to the k-gram builder once all documents are processed
            HashSet <string> unstemmedVocabulary = new HashSet <string>();

            // Index the documents
            foreach (IDocument doc in corpus.GetDocuments())
            {
                //Tokenize the document
                ITokenStream stream = new EnglishTokenStream(doc.GetContent());

                try
                {
                    IEnumerable <string> tokens = stream.GetTokens();

                    //keep track of tokens per document
                    int tokenCount = 0;

                    //position of the next indexed token within this document
                    int position = 0;

                    foreach (string token in tokens)
                    {
                        tokenCount++;
                        //Process token to term(s)
                        List <string> terms = processor.ProcessToken(token);

                        //Add every non-empty term to the index at the same position
                        bool termsIsAdded = false;

                        foreach (string term in terms)
                        {
                            if (term.Length > 0)
                            {
                                index.AddTerm(term, doc.DocumentId, position);

                                termsIsAdded = true;
                            }
                        }

                        //Only advance the position when the token contributed at least one term
                        if (termsIsAdded)
                        {
                            position++;
                        }

                        //Keep track of vocabularies for K-gram
                        // NOTE(review): processor is created as a StemmingTokenProcesor above; whether this
                        // cast yields unstemmed terms depends on how ProcessToken is declared - confirm.
                        foreach (string term in ((NormalTokenProcessor)processor).ProcessToken(token))
                        {
                            unstemmedVocabulary.Add(term);
                        }
                    }

                    //Add token count per document
                    index.AddTokensPerDocument(doc.DocumentId, tokenCount);

                    //record the file size for this document
                    // NOTE(review): FileInfo.Length is already in bytes, so this stores bytes / 8 - confirm
                    // the divisor is intentional before changing it (stored index values depend on it).
                    string docFilePath    = doc.FilePath;
                    int    fileSizeInByte = (int)(new FileInfo(docFilePath).Length / 8f);
                    index.AddByteSize(doc.DocumentId, fileSizeInByte);

                    //calculates Average term Frequency for a specific document
                    index.CalcAveTermFreq(doc.DocumentId);

                    //calculate L_{d} for the document and store it in the index so that we can write it to disk later
                    index.CalculateDocWeight(doc.DocumentId);

                    // NOTE(review): recomputed after every document; presumably only the final value
                    // matters - verify before hoisting this out of the loop.
                    Indexer.averageDocLength = index.calculateAverageDocLength();

                    //Add author to SoundEx Index
                    soundEx.AddDocIdByAuthor(doc.Author, doc.DocumentId);
                }
                finally
                {
                    // Release the token stream even when indexing this document throws
                    stream.Dispose();
                }
            }

            kGram.buildKGram(unstemmedVocabulary);
            index.Save();
            soundEx.Save();

            elapsedTime.Stop();
            Console.WriteLine("[Indexer] Done Indexing! Time Elapsed " + elapsedTime.Elapsed.ToString("mm':'ss':'fff"));
            // Explicit collection kept from the original implementation.
            GC.Collect();
            return(index);
        }
        /// <summary>
        /// Runs a query against the index and returns the matching documents as strings.
        /// </summary>
        /// <remarks>
        /// When <c>mode</c> is false the query is answered with ranked retrieval, otherwise with
        /// boolean retrieval. On a hit, the first entry of the result is the number of matches,
        /// followed by two entries per document: its title (prefixed with rank and score in
        /// ranked retrieval) and its document id. With no matches, ranked retrieval returns an
        /// empty list while boolean retrieval returns a single "0" entry. If anything throws,
        /// the failure is written to the console and an empty list is returned.
        /// </remarks>
        /// <param name="query">the query which the user is making to the search engine</param>
        /// <returns>the match count followed by title/id pairs, or an empty list (see remarks)</returns>
        public List <string> SearchQuery(string query)
        {
            try
            {
                //the list of strings to return
                List <string> results = new List <string>();

                if (mode == false)
                {
                    Console.WriteLine("In Ranked Retrieval");
                    Console.WriteLine("Query:" + query);

                    //parse the query into its processed terms (done once)
                    RankedRetrievalParser parser     = new RankedRetrievalParser();
                    List <string>         finalTerms = parser.ParseQuery(query);

                    //retrieve the top documents for the processed terms
                    RankedRetrieval rv = new RankedRetrieval(corpus, index, RankedRetrievalMode);

                    IList <MaxPriorityQueue.InvertedIndex> topTenDocs = rv.GetTopTen(finalTerms);

                    if (topTenDocs.Count > 0)
                    {
                        //add the count of the postings to the list of strings to be returned
                        results.Add(topTenDocs.Count.ToString());

                        //for each posting...
                        int numberRank = 1;
                        foreach (MaxPriorityQueue.InvertedIndex p in topTenDocs)
                        {
                            //use the document id to access the document
                            IDocument doc = corpus.GetDocument(p.GetDocumentId());

                            //add the rank, score and title to the list of strings to be returned
                            results.Add("#" + numberRank + ": (" + Math.Round(p.GetRank(), 5).ToString() + ") " + doc.Title);

                            //add the document id to the list of strings to be returned
                            results.Add(doc.DocumentId.ToString());
                            Console.WriteLine(p.GetDocumentId() + "" + doc.Title);
                            numberRank++;
                        }
                    }

                    return(results);
                }

                // boolean retrieval
                Console.WriteLine(query);

                //create a stemming token processor
                ITokenProcessor processor = new StemmingTokenProcesor();
                //create a boolean query parser and parse the query
                BooleanQueryParser parser2   = new BooleanQueryParser();
                IQueryComponent    component = parser2.ParseQuery(query);

                //get the postings
                IList <Posting> postings = component.GetPostings(index, processor);

                //if there are any postings...
                if (postings.Count > 0)
                {
                    //add the count of the postings to the list of strings to be returned
                    results.Add(postings.Count.ToString());
                    //for each posting...
                    foreach (Posting p in postings)
                    {
                        //use the document id to access the document
                        IDocument doc = corpus.GetDocument(p.DocumentId);
                        //add the title to the list of strings to be returned
                        results.Add(doc.Title);
                        //add the document id to the list of strings to be returned
                        results.Add(doc.DocumentId.ToString());
                    }
                    Console.WriteLine(results.Count);
                }
                //if there aren't any postings...
                else
                {
                    //add a zero to the list of strings to be returned
                    results.Add("0");
                }
                //return the list of strings
                return(results);
            }
            catch (Exception e)
            {
                // Callers still get an empty result, but the failure is no longer silent.
                Console.WriteLine("[SearchQuery] Query failed: " + e.Message);
                return(new List <string>());
            }
        }
Example #5
0
 /// <summary>
 /// Creates the two token processors used by the tests: a normal one and a stemming one.
 /// </summary>
 public TokenProcessorsUnitTest()
 {
     this.normalProcesser   = new NormalTokenProcessor();
     this.stemmingProcessor = new StemmingTokenProcesor();
 }