/// <summary>
        /// Runs a SoundEx query for an author name and returns the matching documents
        /// as a flat list of display strings.
        /// </summary>
        /// <param name="name">the author name being queried</param>
        /// <returns>
        /// A list whose first element is the number of postings found. When that number is
        /// greater than zero it is followed by two entries per posting, in posting order:
        /// "Title (Author: author)" and then the document id. When the name is blank,
        /// nothing matches, or the lookup fails, the list contains only "0".
        /// </returns>
        public List <string> SearchSoundexQuery(string name)
        {
            //list of strings to return
            List <string> results = new List <string>();

            //a blank name cannot match any author, so report "no results" right away
            if (string.IsNullOrWhiteSpace(name))
            {
                results.Add("0");
                return(results);
            }

            try
            {
                //get a list of postings given the name
                IList <Posting> postings = new DiskSoundEx(Indexer.path).GetPostings(name);

                if (postings == null || postings.Count == 0)
                {
                    //if there are no postings just return a list with a zero in it
                    results.Add("0");
                    return(results);
                }

                //the header entry tells the caller how many (title, id) pairs follow
                results.Add(postings.Count.ToString());

                foreach (Posting p in postings)
                {
                    //use the posting's id to access the document
                    IDocument doc = corpus.GetDocument(p.DocumentId);
                    //add the title and name of the author to the list of strings to be returned
                    results.Add($"{doc.Title} (Author: {doc.Author})");
                    //also add the document id to the list of strings to be returned
                    results.Add(doc.DocumentId.ToString());
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);

                //Discard anything collected so far: a count header without its matching
                //entries (or an empty list) would not follow the documented layout.
                results.Clear();
                results.Add("0");
            }

            return(results);
        }
예제 #2
0
        /// <summary>
        /// Constructs an index from a corpus of documents.
        /// Clears the positional, SoundEx and K-gram indexes created for Indexer.path,
        /// feeds every document of the corpus into them, then saves the positional and
        /// SoundEx indexes and builds the K-gram index from the collected vocabulary.
        /// </summary>
        /// <param name="corpus">a corpus to be indexed</param>
        /// <returns>the positional index that was populated and saved</returns>
        public static IIndex IndexCorpus(IDocumentCorpus corpus)
        {
            Console.WriteLine($"[Indexer] Indexing {corpus.CorpusSize} documents in the corpus...");
            // Time how long it takes to index the corpus
            Stopwatch elapsedTime = new Stopwatch();

            elapsedTime.Start();

            // Set the index type and token processor to use
            DiskPositionalIndex index   = new DiskPositionalIndex(Indexer.path);
            DiskSoundEx         soundEx = new DiskSoundEx(Indexer.path);
            DiskKGram           kGram   = new DiskKGram(Indexer.path);

            // Start from a clean slate before adding anything
            index.Clear();
            soundEx.Clear();
            kGram.Clear();

            ITokenProcessor processor = new StemmingTokenProcesor();

            // Vocabulary handed to the K-gram builder once every document has been read
            HashSet <string> unstemmedVocabulary = new HashSet <string>();

            // Index the documents
            foreach (IDocument doc in corpus.GetDocuments())
            {
                //Tokenize the document
                ITokenStream stream = new EnglishTokenStream(doc.GetContent());

                try
                {
                    IEnumerable <string> tokens = stream.GetTokens();

                    //keep track of tokens per document
                    int tokenCount = 0;

                    //position of the current token's terms within the document;
                    //only advances for tokens that produced at least one non-empty term
                    int position = 0;

                    foreach (string token in tokens)
                    {
                        tokenCount++;
                        //Process token to term
                        List <string> terms = processor.ProcessToken(token);

                        //Add term to the index; all terms of one token share the same position
                        bool termsIsAdded = false;

                        foreach (string term in terms)
                        {
                            if (term.Length > 0)
                            {
                                index.AddTerm(term, doc.DocumentId, position);

                                termsIsAdded = true;
                            }
                        }

                        //Increase the position num
                        if (termsIsAdded)
                        {
                            position++;
                        }

                        //Keep track of vocabularies for K-gram
                        //NOTE(review): processor is created as a StemmingTokenProcesor, so this cast
                        //only succeeds if that type derives from NormalTokenProcessor, and whether the
                        //terms come back unstemmed depends on how ProcessToken is declared - confirm.
                        foreach (string term in ((NormalTokenProcessor)processor).ProcessToken(token))
                        {
                            unstemmedVocabulary.Add(term);
                        }
                    }

                    //Add token count per document
                    index.AddTokensPerDocument(doc.DocumentId, tokenCount);

                    //record the file size for the document
                    //NOTE(review): FileInfo.Length is already a byte count, yet it is divided by 8
                    //before being stored as a byte size - confirm the intended unit. Left unchanged
                    //because the value is written into the index.
                    string docFilePath    = doc.FilePath;
                    int    fileSizeInByte = (int)(new FileInfo(docFilePath).Length / 8f);
                    index.AddByteSize(doc.DocumentId, fileSizeInByte);

                    //calculates Average term Frequency for a specific document
                    index.CalcAveTermFreq(doc.DocumentId);

                    //calculate L_{d} for the document and store it index so that we can write it to disk later
                    index.CalculateDocWeight(doc.DocumentId);

                    //NOTE(review): refreshed after every document; presumably only the final value
                    //matters - confirm nothing reads it mid-indexing before hoisting it out of the loop.
                    Indexer.averageDocLength = index.calculateAverageDocLength();

                    //Add author to SoundEx Index
                    soundEx.AddDocIdByAuthor(doc.Author, doc.DocumentId);
                }
                finally
                {
                    //release the token stream even when indexing this document throws
                    stream.Dispose();
                }
            }

            kGram.buildKGram(unstemmedVocabulary);
            index.Save();
            soundEx.Save();

            elapsedTime.Stop();
            Console.WriteLine("[Indexer] Done Indexing! Time Elapsed " + elapsedTime.Elapsed.ToString("mm':'ss':'fff"));
            GC.Collect();
            return(index);
        }