Beispiel #1
0
        /// <summary>
        /// Builds the soundex index for the test corpus and checks the count it reports.
        /// </summary>
        public void SoundexIndexTest()
        {
            //Arrange
            IDocumentCorpus corpus     = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     soundIndex = new DiskSoundEx("./");

            try
            {
                //Act
                soundIndex.BuildSoundexIndex(corpus);

                //Assert
                soundIndex.GetCount().Should().Be(5);
            }
            finally
            {
                //Run the cleanup step even when the build or the assertion throws,
                //so a failing test does not skip Clear()
                soundIndex.Clear();
            }
        }
        /// <summary>
        /// Indexes the corpus, reopens the index from the files under the index path, and
        /// prints the postings returned for the terms "hello" and "world".
        /// </summary>
        /// <remarks>
        /// Smoke test only: it writes to the console and makes no assertions.
        /// </remarks>
        public void TestingTierIndex()
        {
            //Upper bound on the number of lines collected for printing
            const int maxResultLines = 20;

            //Path to where all the bin files will be written
            string pathToIndex = Path.Join(corpusDir, "/index/");

            //Let Indexer know where it should write all bin files
            Indexer.path = pathToIndex;

            //Read corpus
            IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

            //Make sure the index directory exists before anything is written to it
            //(no-op when it is already there)
            Directory.CreateDirectory(pathToIndex);

            //Build the index; the instance returned here is not used
            Indexer.IndexCorpus(corpus);

            //Create a new DiskPositionalIndex from the on-disk files
            IIndex index = new DiskPositionalIndex(pathToIndex);

            List<string> terms = new List<string> { "hello", "world" };

            //Get the postings
            IList<Posting> postings = index.GetPositionalPostings(terms);

            //First line is the posting count, followed by title and id of each document
            IList<string> results = new List<string>();
            results.Add(postings.Count.ToString());

            foreach (Posting posting in postings)
            {
                //Stop once the output cap is reached; nothing more would be added
                if (results.Count >= maxResultLines)
                {
                    break;
                }

                //Use the document id to access the document
                IDocument doc = corpus.GetDocument(posting.DocumentId);
                results.Add(doc.Title);
                results.Add(doc.DocumentId.ToString());
            }

            foreach (string line in results)
            {
                Console.WriteLine(line);
            }

        } //end TestingTierIndex()
 /// <summary>
 /// Adds every document that has an author to the soundex index, keyed by that author,
 /// and then calls <c>Save()</c>.
 /// </summary>
 /// <param name="corpus">the corpus of documents</param>
 public void BuildSoundexIndex(IDocumentCorpus corpus)
 {
     foreach (IDocument document in corpus.GetDocuments())
     {
         //Only documents with an author field contribute to the index
         if (document.Author != null)
         {
             AddDocIdByAuthor(document.Author, document.DocumentId);
         }
     }

     Save();
 }
Beispiel #4
0
        /// <summary>
        /// Checks that a query made of two names ("yashua ovando") yields two postings.
        /// </summary>
        public void GetPostingsTest_MultipleNames()
        {
            //Arrange
            IDocumentCorpus corpus      = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     authorIndex = new DiskSoundEx("./");

            try
            {
                authorIndex.BuildSoundexIndex(corpus);

                //Act
                var actual = authorIndex.GetPostings("yashua ovando");

                //Assert
                actual.Should().HaveCount(2);
            }
            finally
            {
                //Run the cleanup step even when the build or the assertion throws
                authorIndex.Clear();
            }
        }
Beispiel #5
0
        /// <summary>
        /// Checks that looking up a name with no matching author ("hella") returns no postings.
        /// </summary>
        public void GetPostingsTest_NotExistingName_ReturnsEmpty()
        {
            //Arrange
            IDocumentCorpus corpus      = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     authorIndex = new DiskSoundEx("./");

            try
            {
                authorIndex.BuildSoundexIndex(corpus);

                //Act
                var actual = authorIndex.GetPostings("hella");

                //Assert
                actual.Should().BeEmpty();
            }
            finally
            {
                //Run the cleanup step even when the build or the assertion throws
                authorIndex.Clear();
            }
        }
        /// <summary>
        /// Creates a ranked-retrieval instance over the given corpus and index.
        /// </summary>
        /// <param name="corpus">the corpus stored on this instance</param>
        /// <param name="index">the index stored on this instance</param>
        /// <param name="RankedRetrievalMode">mode name handed to SetRankedRetrievalMode to pick the ranking variant</param>
        public RankedRetrieval(IDocumentCorpus corpus, IIndex index, string RankedRetrievalMode)
        {
            //Start from zeroed weights and empty working collections
            query2termWeight = default(int);
            doc2termWeight   = default(int);
            accumulator      = new Dictionary<int, double>();
            documentIds      = new List<int>();

            //The ranking variant is resolved before index and corpus are assigned
            rankVariant = SetRankedRetrievalMode(RankedRetrievalMode);
            this.index  = index;
            this.corpus = corpus;

            //Corpus size is looked up using the index location configured on Indexer
            corpusSize = this.GetCorpusSize(Indexer.path);
        }
        /// <summary>
        /// Indexes the test corpus and checks that the term "sun" yields exactly the
        /// postings described by "(3,[3])", in that order.
        /// </summary>
        public void PostionalIndexTest_OnePosition()
        {
            //Arrange
            const string   searchTerm       = "sun";
            IList<Posting> expectedPostings = UnitTest.GeneratePostings("(3,[3])");

            //Act
            IDocumentCorpus testCorpus     = DirectoryCorpus.LoadTextDirectory(directory);
            IIndex          builtIndex     = Indexer.IndexCorpus(testCorpus);
            var             actualPostings = builtIndex.GetPostings(searchTerm);

            //Assert: same number of postings, then same content in the same order
            actualPostings.Should().HaveSameCount(expectedPostings);
            actualPostings.Should().BeEquivalentTo(expectedPostings, options => options.WithStrictOrdering());
        }
Beispiel #8
0
        /// <summary>
        /// Checks that two similar-sounding names ("bloclic" and "blacklock") return
        /// equivalent postings.
        /// </summary>
        public void GetPostingsTest_SimilarSoundingName()
        {
            //Arrange
            IDocumentCorpus corpus      = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     authorIndex = new DiskSoundEx("./");

            try
            {
                authorIndex.BuildSoundexIndex(corpus);

                //Act
                var result1 = authorIndex.GetPostings("bloclic");
                var result2 = authorIndex.GetPostings("blacklock");

                //Assert
                result1.Should().BeEquivalentTo(result2);
            }
            finally
            {
                //Run the cleanup step even when the build or the assertion throws
                authorIndex.Clear();
            }
        }
Beispiel #9
0
        /// <summary>
        /// Sets up the instance: runs InitializeComponent, creates the document corpus,
        /// inverted index and text-progress objects, subscribes the build, progress and
        /// saving handlers, and finally calls InitBackGrounWorkers.
        /// </summary>
        public fmInvIndex()
        {
            // NOTE(review): presumably the designer-generated UI setup — confirm
            InitializeComponent();

            DocumentCorpus = new DocumentCorpus();

            // Inverted index plus the handlers notified when a build starts and finishes
            InvertedIndex              = new InvertedIndex();
            InvertedIndexBuildBegin   += OnInvertedIndexBuildBegin;
            InvertedIndexHasBeenBuilt += OnInvertedIndexHasBeenBuilt;

            // Progress object; its IsChanged event is routed to ProgressInfoIsChanged
            TextProgress            = new DisplayTextProgressMergeToBegin();
            TextProgress.IsChanged += ProgressInfoIsChanged;

            // Handlers for the start and end of saving
            SavingBegin += OnSavingBegin;
            SavingEnd   += OnSavingEnd;

            // Called last, after all objects and subscriptions above are in place
            InitBackGrounWorkers();
        }
        /// <summary>
        /// Indexes the test corpus and checks the postings (document ids and positions)
        /// returned for the term "hello", in strict order.
        /// </summary>
        /// <remarks>
        /// The expected document ids differ on macOS.
        /// NOTE(review): presumably because files are enumerated in a different order there — confirm.
        /// </remarks>
        public void PostionalIndexTest_MultiplePositions()
        {
            //Arrange
            const string searchTerm = "hello";

            //UnitTest.GeneratePostings("(0,[0,1]), (2,[0,2,3])") is not used here because,
            //per the original note, it does not work for macOS yet
            bool runningOnMac = RuntimeInformation.IsOSPlatform(OSPlatform.OSX);

            System.Console.WriteLine(runningOnMac ? " for MacOSX" : " for Windows and other OSs");

            IList<Posting> expectedPostings = runningOnMac
                ? new List<Posting>
                  {
                      new Posting(2, new List<int> { 0, 2, 3 }),
                      new Posting(4, new List<int> { 0, 1 })
                  }
                : new List<Posting>
                  {
                      new Posting(0, new List<int> { 0, 1 }),
                      new Posting(2, new List<int> { 0, 2, 3 })
                  };

            //Act
            IDocumentCorpus testCorpus     = DirectoryCorpus.LoadTextDirectory(directory);
            IIndex          builtIndex     = Indexer.IndexCorpus(testCorpus);
            var             actualPostings = builtIndex.GetPostings(searchTerm);

            //Assert: same number of postings, then same content in the same order
            actualPostings.Should().HaveSameCount(expectedPostings);
            actualPostings.Should().BeEquivalentTo(expectedPostings, options => options.WithStrictOrdering());
        }
        /// <summary>
        /// Indexes the test corpus and checks that the vocabulary has as many entries
        /// as the expected term list.
        /// </summary>
        public void VocabTest_WithStemmer()
        {
            //Arrange
            IDocumentCorpus corpus        = DirectoryCorpus.LoadTextDirectory(directory);
            IIndex          index         = Indexer.IndexCorpus(corpus);
            var             expectedVocab = new List <string> {
                "hello", "world", "it", "is", "snow",
                "the", "full", "of", "mystery", "mr.snowman",
                "love", "sun", "a"
            };  //expected vocabulary; only its size is compared below

            expectedVocab.Sort();

            //Check for null BEFORE the index is dereferenced; asserting it after
            //GetVocabulary() could never fail, a null index would already have thrown
            index.Should().NotBeNull("because indexCorpus shouldn't return null");

            //Act
            var actual = index.GetVocabulary();

            //Assert
            actual.Should().HaveSameCount(expectedVocab, "because the index used StemmingTokenProcessor");

            //TODO: a content comparison (actual.Should().BeEquivalentTo(expectedVocab)) is disabled
            //because "mystery" comes back as "mysteri".
            //NOTE(review): presumably the stemmer is Porter-style, which rewrites a trailing "y" to "i";
            //the expected list would need the stemmed forms before this can be enabled — confirm.
        }
        /// <summary>
        /// Loads the corpus at the given path and either opens the index already on disk
        /// or generates a new one.
        /// </summary>
        /// <remarks>
        /// Any exception is written to the console and not rethrown; the closing
        /// "Ready to go!" banner is printed in either case.
        /// </remarks>
        /// <param name="path">the path to the selected corpus</param>
        public void GetIndex(string path)
        {
            Console.WriteLine("\n-------------------------------------------------------------------------------");
            Console.WriteLine($"Path: {path}");

            try
            {
                //Index files live in the "index" folder under the corpus path
                string indexDirectory = Path.Join(path, "/index/");
                Indexer.path = indexDirectory;

                //Only the presence of the directory is checked, not whether it holds any files
                bool indexAlreadyOnDisk = Directory.Exists(indexDirectory);

                //Make a corpus out of the selected directory path
                corpus = DirectoryCorpus.LoadTextDirectory(path);

                if (!indexAlreadyOnDisk)
                {
                    Console.WriteLine("Generating new index.");
                    GenerateIndex(path);
                }
                else
                {
                    Console.WriteLine("Reading the existing on-disk index.");
                    index = new DiskPositionalIndex(indexDirectory);
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }

            Console.WriteLine("Ready to go!");
            Console.WriteLine("-------------------------------------------------------------------------------");
        }
Beispiel #13
0
        /// <summary>
        /// Constructs an index from a corpus of documents
        /// </summary>
        /// <remarks>
        /// Clears and rebuilds the positional, soundex and k-gram indexes located at
        /// <c>Indexer.path</c>, updates <c>Indexer.averageDocLength</c>, and prints
        /// progress and the elapsed time to the console.
        /// </remarks>
        /// <param name="corpus">a corpus to be indexed</param>
        /// <returns>the positional index that was built</returns>
        public static IIndex IndexCorpus(IDocumentCorpus corpus)
        {
            Console.WriteLine($"[Indexer] Indexing {corpus.CorpusSize} documents in the corpus...");

            // Time how long it takes to index the corpus
            Stopwatch elapsedTime = Stopwatch.StartNew();

            // Set the index type and token processor to use
            DiskPositionalIndex index   = new DiskPositionalIndex(Indexer.path);
            DiskSoundEx         soundEx = new DiskSoundEx(Indexer.path);
            DiskKGram           kGram   = new DiskKGram(Indexer.path);

            index.Clear();
            soundEx.Clear();
            kGram.Clear();

            ITokenProcessor processor = new StemmingTokenProcesor();

            HashSet <string> unstemmedVocabulary = new HashSet <string>();

            // Index the documents
            foreach (IDocument doc in corpus.GetDocuments())
            {
                //Tokenize the document
                ITokenStream stream = new EnglishTokenStream(doc.GetContent());

                try
                {
                    //Number of tokens seen in this document
                    int tokenCount = 0;

                    //Position of the next term within this document
                    int position = 0;

                    foreach (string token in stream.GetTokens())
                    {
                        tokenCount++;

                        //Process token to term(s); every term from one token shares a position
                        bool termsIsAdded = false;

                        foreach (string term in processor.ProcessToken(token))
                        {
                            if (term.Length > 0)
                            {
                                index.AddTerm(term, doc.DocumentId, position);
                                termsIsAdded = true;
                            }
                        }

                        //Advance the position only when the token produced at least one term
                        if (termsIsAdded)
                        {
                            position++;
                        }

                        //Keep track of vocabulary for K-gram
                        //NOTE(review): the cast assumes StemmingTokenProcesor derives from
                        //NormalTokenProcessor; whether this call yields unstemmed terms depends
                        //on how ProcessToken is declared there - confirm
                        foreach (string term in ((NormalTokenProcessor)processor).ProcessToken(token))
                        {
                            unstemmedVocabulary.Add(term);
                        }
                    }

                    //Add token count per document
                    index.AddTokensPerDocument(doc.DocumentId, tokenCount);

                    //Record the document's file size
                    //NOTE(review): FileInfo.Length is already a byte count; the division by 8 is
                    //kept unchanged because stored values may be relied on elsewhere - confirm intent
                    string docFilePath    = doc.FilePath;
                    int    fileSizeInByte = (int)(new FileInfo(docFilePath).Length / 8f);
                    index.AddByteSize(doc.DocumentId, fileSizeInByte);

                    //Calculate the average term frequency for this document
                    index.CalcAveTermFreq(doc.DocumentId);

                    //Calculate L_{d} for the document and store it in the index so it can be written to disk later
                    index.CalculateDocWeight(doc.DocumentId);

                    Indexer.averageDocLength = index.calculateAverageDocLength();

                    //Add author to the SoundEx index; documents without an author are
                    //skipped, matching what BuildSoundexIndex does
                    if (doc.Author != null)
                    {
                        soundEx.AddDocIdByAuthor(doc.Author, doc.DocumentId);
                    }
                }
                finally
                {
                    //Release the token stream even if indexing this document throws
                    stream.Dispose();
                }
            }

            kGram.buildKGram(unstemmedVocabulary);
            index.Save();
            soundEx.Save();

            elapsedTime.Stop();
            Console.WriteLine("[Indexer] Done Indexing! Time Elapsed " + elapsedTime.Elapsed.ToString("mm':'ss':'fff"));

            //NOTE(review): forcing a collection is generally discouraged; kept to preserve existing behavior
            GC.Collect();
            return(index);
        }
        /// <summary>
        /// Indexes the corpus, reopens the index from disk, and checks the order and rank
        /// of the top results for the query "hello world" under each ranking mode
        /// ("Default", "Tf-idf", "Okapi", "Wacky").
        /// </summary>
        public void GetRankedDocumentsTest()
        {
            //Maximum allowed absolute difference between an expected and an actual rank.
            //BeApproximately takes a tolerance, not a digit count: the former value of 9
            //accepted any rank within +/-9 of the expectation, so nothing was really checked.
            const double rankTolerance = 1e-6;

            //Path to where all the bin files will be written
            string pathToIndex = Path.Join(corpusDir, "/index/");

            //Let Indexer know where it should write all bin files
            Indexer.path = pathToIndex;

            //Read corpus
            IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

            //Create the directory for the bin files if it doesn't exist
            Directory.CreateDirectory(pathToIndex);

            //Initialize the index.
            IIndex index = Indexer.IndexCorpus(corpus);

            List<string> terms = new List<string> { "hello", "world" };

            //Testing ranked retrieval AND accumulated values
            index = new DiskPositionalIndex(pathToIndex);
            RankedRetrieval rv = new RankedRetrieval(corpus, index, "Default");
            IList<MaxPriorityQueue.InvertedIndex> actual = rv.GetTopTen(terms);
            actual[0].GetDocumentId().Should().Be(0); //should be document 1 which is of doc id 0
            actual[0].GetRank().Should().BeApproximately(1.183748156, rankTolerance); //A_{document} = 3.10195041 L_{1} = 2.620447934
            actual[1].GetDocumentId().Should().Be(2);
            actual[2].GetDocumentId().Should().Be(1);
            actual[3].GetDocumentId().Should().Be(4);

            //tests tf-idf
            rv = new RankedRetrieval(corpus, index, "Tf-idf");
            IList<MaxPriorityQueue.InvertedIndex> actual1 = rv.GetTopTen(terms);
            actual1[0].GetDocumentId().Should().Be(2);
            actual1[0].GetRank().Should().BeApproximately(0.948215482, rankTolerance);
            actual1[1].GetDocumentId().Should().Be(0);
            actual1[1].GetRank().Should().BeApproximately(0.893296803, rankTolerance);
            actual1[2].GetDocumentId().Should().Be(1);
            actual1[2].GetRank().Should().BeApproximately(0.150554959, rankTolerance);
            actual1[3].GetDocumentId().Should().Be(4);
            actual1[3].GetRank().Should().BeApproximately(0.150554959, rankTolerance);


            //tests Okapi BM25
            rv = new RankedRetrieval(corpus, index, "Okapi");
            IList<MaxPriorityQueue.InvertedIndex> actual2 = rv.GetTopTen(terms);
            actual2[0].GetDocumentId().Should().Be(0);
            actual2[0].GetRank().Should().BeApproximately(0.66590893, rankTolerance);
            actual2[1].GetDocumentId().Should().Be(2);
            actual2[1].GetRank().Should().BeApproximately(0.507521667, rankTolerance);
            actual2[2].GetDocumentId().Should().Be(1);
            actual2[2].GetRank().Should().BeApproximately(0.1089371981, rankTolerance);
            actual2[3].GetDocumentId().Should().Be(4);
            actual2[3].GetRank().Should().BeApproximately(0.1084371981, rankTolerance);


            //tests Wacky
            rv = new RankedRetrieval(corpus, index, "Wacky");
            IList<MaxPriorityQueue.InvertedIndex> actual3 = rv.GetTopTen(terms);
            actual3[0].GetDocumentId().Should().Be(0);
            actual3[0].GetRank().Should().BeApproximately(0.284824391, rankTolerance);
            actual3[1].GetDocumentId().Should().Be(2);
            actual3[1].GetRank().Should().BeApproximately(0.259673474, rankTolerance);
            actual3[2].GetDocumentId().Should().Be(1);
            actual3[2].GetRank().Should().Be(0.0);
            actual3[3].GetDocumentId().Should().Be(4);
            actual3[3].GetRank().Should().Be(0.0);

        }// end of GetRankedDocumentsTest()