/// <summary>
/// Parses a search query into the list of processed (stemmed) terms used for Ranked Retrieval.
/// </summary>
/// <param name="query">query to be parsed into processed terms</param>
/// <returns>the processed terms; empty if the query contains a wildcard</returns>
public List<string> ParseQuery(string query)
{
    // Wildcard queries are not supported in ranked retrieval mode.
    if (query.Contains('*'))
    {
        return new List<string>();
    }

    ITokenProcessor processor = new StemmingTokenProcesor();

    // Process each whitespace-separated term and flatten the per-term results
    // directly into the final list (the original built an intermediate
    // List<List<string>> and flattened it with nested loops).
    List<string> finalTerms = new List<string>();
    foreach (string term in query.Split(' '))
    {
        finalTerms.AddRange(processor.ProcessToken(term));
    }
    return finalTerms;
}
/// <summary>
/// Returns the stemmed version of a string.
/// </summary>
/// <param name="term">string to be stemmed</param>
/// <returns>the stemmed term</returns>
public string StemTerm(string term)
{
    // Delegate directly to the stemming token processor.
    return new StemmingTokenProcesor().StemWords(term);
}
/// <summary>
/// Constructs an index from a corpus of documents.
/// </summary>
/// <param name="corpus">a corpus to be indexed</param>
/// <returns>the on-disk positional index built from the corpus</returns>
public static IIndex IndexCorpus(IDocumentCorpus corpus)
{
    Console.WriteLine($"[Indexer] Indexing {corpus.CorpusSize} documents in the corpus...");

    // Time how long it takes to index the corpus.
    Stopwatch elapsedTime = new Stopwatch();
    elapsedTime.Start();

    // Set up the on-disk indexes and wipe any previous contents.
    DiskPositionalIndex index = new DiskPositionalIndex(Indexer.path);
    DiskSoundEx soundEx = new DiskSoundEx(Indexer.path);
    DiskKGram kGram = new DiskKGram(Indexer.path);
    index.Clear();
    soundEx.Clear();
    kGram.Clear();

    ITokenProcessor processor = new StemmingTokenProcesor();
    HashSet<string> unstemmedVocabulary = new HashSet<string>();

    // Index each document in the corpus.
    foreach (IDocument doc in corpus.GetDocuments())
    {
        // Tokenize the document; 'using' guarantees the stream is disposed even
        // if an exception is thrown while indexing this document (the original
        // only disposed on the success path).
        using (ITokenStream stream = new EnglishTokenStream(doc.GetContent()))
        {
            IEnumerable<string> tokens = stream.GetTokens();

            // Track tokens per document and the current term position.
            int tokenCount = 0;
            int position = 0;
            foreach (string token in tokens)
            {
                tokenCount++;

                // Process the token into zero or more terms.
                List<string> terms = processor.ProcessToken(token);

                // Add each non-empty term to the index.
                bool termsIsAdded = false;
                foreach (string term in terms)
                {
                    if (term.Length > 0)
                    {
                        index.AddTerm(term, doc.DocumentId, position);
                        termsIsAdded = true;
                    }
                }

                // Only advance the position when at least one term was indexed.
                if (termsIsAdded)
                {
                    position++;
                }

                // Keep track of the unstemmed vocabulary for the K-gram index.
                // NOTE(review): assumes StemmingTokenProcesor derives from
                // NormalTokenProcessor so this downcast is valid — confirm.
                foreach (string term in ((NormalTokenProcessor)processor).ProcessToken(token))
                {
                    unstemmedVocabulary.Add(term);
                }
            }

            // Record token count per document.
            index.AddTokensPerDocument(doc.DocumentId, tokenCount);

            // Record the document's file size.
            // NOTE(review): FileInfo.Length is already in bytes, so dividing by 8
            // looks like a bits/bytes mix-up; kept as-is because the stored value
            // must stay consistent with whatever reads it back — confirm intent.
            string docFilePath = doc.FilePath;
            int fileSizeInByte = (int)(new FileInfo(docFilePath).Length / 8f);
            index.AddByteSize(doc.DocumentId, fileSizeInByte);

            // Calculate the average term frequency for this document.
            index.CalcAveTermFreq(doc.DocumentId);

            // Calculate L_{d} for the document and store it in the index so that
            // we can write it to disk later.
            index.CalculateDocWeight(doc.DocumentId);

            // Add the author to the SoundEx index.
            soundEx.AddDocIdByAuthor(doc.Author, doc.DocumentId);
        }
    }

    // The average document length only needs to be computed once, after every
    // document has been indexed (the original recomputed it on every loop
    // iteration; the final stored value is identical).
    Indexer.averageDocLength = index.calculateAverageDocLength();

    kGram.buildKGram(unstemmedVocabulary);
    index.Save();
    soundEx.Save();

    elapsedTime.Stop();
    Console.WriteLine("[Indexer] Done Indexing! Time Elapsed " + elapsedTime.Elapsed.ToString("mm':'ss':'fff"));

    // NOTE(review): forcing a collection in production code is usually a smell;
    // kept to preserve the original's memory behavior after a large indexing run.
    GC.Collect();
    return index;
}
/// <summary>
/// Returns search results for a query, in either ranked or boolean mode.
/// </summary>
/// <param name="query">the query which the user is making to the search engine</param>
/// <returns>
/// Ranked mode: the result count followed by (ranked title, document id) pairs.
/// Boolean mode: the posting count followed by (title, document id) pairs, or
/// "0" when there are no postings. Empty list if an error occurs.
/// </returns>
public List<string> SearchQuery(string query)
{
    try
    {
        // The list of strings to return.
        List<string> results = new List<string>();

        if (mode == false)
        {
            // ----- Ranked retrieval -----
            Console.WriteLine("In Ranked Retrieval");
            Console.WriteLine("Query:" + query);

            // Parse the query once (the original parsed it a second time into an
            // unused local, doing the work twice).
            RankedRetrievalParser parser = new RankedRetrievalParser();
            List<string> finalTerms = parser.ParseQuery(query);

            // Retrieve the top ten documents for the normalized tokens.
            RankedRetrieval rv = new RankedRetrieval(corpus, index, RankedRetrievalMode);
            IList<MaxPriorityQueue.InvertedIndex> topTenDocs = rv.GetTopTen(finalTerms);

            if (topTenDocs.Count > 0)
            {
                // Add the count of the results first.
                results.Add(topTenDocs.Count.ToString());

                int numberRank = 1;
                foreach (MaxPriorityQueue.InvertedIndex p in topTenDocs)
                {
                    // Use the document id to access the document.
                    IDocument doc = corpus.GetDocument(p.GetDocumentId());

                    // Add the ranked title, then the document id.
                    results.Add("#" + numberRank + ": (" + Math.Round(p.GetRank(), 5).ToString() + ") " + doc.Title);
                    results.Add(doc.DocumentId.ToString());
                    Console.WriteLine(p.GetDocumentId() + "" + doc.Title);
                    numberRank++;
                }
            }
            return results;
        }
        else
        {
            // ----- Boolean retrieval -----
            Console.WriteLine(query);

            // Create a stemming token processor and a boolean query parser.
            ITokenProcessor processor = new StemmingTokenProcesor();
            BooleanQueryParser parser = new BooleanQueryParser();

            // Parse the query and fetch its postings.
            IQueryComponent component = parser.ParseQuery(query);
            IList<Posting> postings = component.GetPostings(index, processor);

            if (postings.Count > 0)
            {
                // Add the count of the postings first.
                results.Add(postings.Count.ToString());

                foreach (Posting p in postings)
                {
                    // Use the document id to access the document.
                    IDocument doc = corpus.GetDocument(p.DocumentId);

                    // Add the title, then the document id.
                    results.Add(doc.Title);
                    results.Add(doc.DocumentId.ToString());
                }
                Console.WriteLine(results.Count);
            }
            else
            {
                // No postings: return a zero count.
                results.Add("0");
            }
            return results;
        }
    }
    catch (Exception e)
    {
        // Don't swallow failures silently (the original discarded the exception
        // entirely) — surface it on the console before returning an empty result.
        Console.WriteLine("[SearchQuery] Query failed: " + e);
        return new List<string>();
    }
}
/// <summary>
/// Sets up one token processor of each kind for the tests in this class.
/// </summary>
public TokenProcessorsUnitTest()
{
    stemmingProcessor = new StemmingTokenProcesor();
    normalProcesser = new NormalTokenProcessor();
}