예제 #1
0
        /// <summary>
        /// Builds a soundex index over the test corpus and checks the count it reports.
        /// </summary>
        public void SoundexIndexTest()
        {
            //Arrange
            IDocumentCorpus corpus     = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     soundIndex = new DiskSoundEx("./");

            soundIndex.BuildSoundexIndex(corpus);

            try
            {
                //Assert
                soundIndex.GetCount().Should().Be(5);
            }
            finally
            {
                // A failing assertion throws; the finally block guarantees Clear() still
                // runs so this test leaves the same state behind whether it passes or fails.
                soundIndex.Clear();
            }
        }
        /// <summary>
        /// Indexes the corpus into the "/index/" folder under <c>corpusDir</c>, reopens the
        /// result as a <c>DiskPositionalIndex</c>, queries the positional postings for the
        /// terms "hello" and "world", and prints the posting count followed by the title
        /// and id of the matched documents.
        /// </summary>
        public void TestingTierIndex()
        {
            //Path where all the bin files will be written
            string pathToIndex = Path.Join(corpusDir, "/index/");

            //Let Indexer know where it should write all bin files
            Indexer.path = pathToIndex;

            //Read corpus
            IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

            //Create the index folder if it doesn't exist yet (no-op when it already does),
            //matching what GetRankedDocumentsTest does before indexing
            Directory.CreateDirectory(pathToIndex);

            //Load corpus into the index; the returned in-memory index is not needed here
            Indexer.IndexCorpus(corpus);

            //Create new DiskPositionalIndex from the on-disk files
            IIndex index = new DiskPositionalIndex(pathToIndex);

            //Query terms
            List<string> terms = new List<string> { "hello", "world" };

            //Get the postings
            IList<Posting> postings = index.GetPositionalPostings(terms);

            //The first output line is the number of postings
            IList<String> results = new List<string>();
            results.Add(postings.Count.ToString());

            foreach (Posting p in postings)
            {
                //Stop once 20 or more lines have been collected; the list only grows,
                //so no later posting could be added anyway
                if (results.Count >= 20)
                {
                    break;
                }

                //Use the document id to access the document
                IDocument doc = corpus.GetDocument(p.DocumentId);
                results.Add(doc.Title);
                results.Add(doc.DocumentId.ToString());
            }

            foreach (string s in results)
            {
                Console.WriteLine(s);
            }

        } //end TestingTierIndex()
예제 #3
0
        /// <summary>
        /// Queries the soundex index with two space-separated names and expects two postings.
        /// </summary>
        public void GetPostingsTest_MultipleNames()
        {
            //Arrange
            IDocumentCorpus corpus      = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     authorIndex = new DiskSoundEx("./");

            authorIndex.BuildSoundexIndex(corpus);

            try
            {
                //Act
                var actual = authorIndex.GetPostings("yashua ovando");

                //Assert
                actual.Should().HaveCount(2);
            }
            finally
            {
                // Runs even when the query or the assertion throws, so Clear() is never skipped.
                authorIndex.Clear();
            }
        }
예제 #4
0
        /// <summary>
        /// Queries the soundex index with the name "hella" and expects no postings back.
        /// </summary>
        public void GetPostingsTest_NotExistingName_ReturnsEmpty()
        {
            //Arrange
            IDocumentCorpus corpus      = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     authorIndex = new DiskSoundEx("./");

            authorIndex.BuildSoundexIndex(corpus);

            try
            {
                //Act
                var actual = authorIndex.GetPostings("hella");

                //Assert
                actual.Should().BeEmpty();
            }
            finally
            {
                // Runs even when the query or the assertion throws, so Clear() is never skipped.
                authorIndex.Clear();
            }
        }
        /// <summary>
        /// Indexes the test corpus and checks that the postings returned for "sun" match
        /// the postings generated from the description "(3,[3])", in the same order.
        /// </summary>
        public void PostionalIndexTest_OnePosition()
        {
            //Arrange
            // NOTE(review): the description presumably reads (documentId,[positions]) - confirm
            // against UnitTest.GeneratePostings.
            const string    queryTerm        = "sun";
            IList <Posting> expectedPostings = UnitTest.GeneratePostings("(3,[3])");

            //Act
            IDocumentCorpus testCorpus     = DirectoryCorpus.LoadTextDirectory(directory);
            IIndex          corpusIndex    = Indexer.IndexCorpus(testCorpus);
            var             actualPostings = corpusIndex.GetPostings(queryTerm);

            //Assert: same number of postings, then same content in the same order
            actualPostings.Should().HaveSameCount(expectedPostings);
            actualPostings.Should().BeEquivalentTo(
                expectedPostings,
                options => options.WithStrictOrdering());
        }
예제 #6
0
        /// <summary>
        /// Checks that the names "bloclic" and "blacklock" yield equivalent postings
        /// from the soundex index.
        /// </summary>
        public void GetPostingsTest_SimilarSoundingName()
        {
            //Arrange
            IDocumentCorpus corpus      = DirectoryCorpus.LoadTextDirectory(directory);
            DiskSoundEx     authorIndex = new DiskSoundEx("./");

            authorIndex.BuildSoundexIndex(corpus);

            try
            {
                //Act
                var result1 = authorIndex.GetPostings("bloclic");
                var result2 = authorIndex.GetPostings("blacklock");

                //Assert
                result1.Should().BeEquivalentTo(result2);
            }
            finally
            {
                // Runs even when a query or the assertion throws, so Clear() is never skipped.
                authorIndex.Clear();
            }
        }
        /// <summary>
        /// Indexes the test corpus and checks the postings returned for "hello".
        /// The expected document ids depend on the platform: (2, 4) on macOS and
        /// (0, 2) everywhere else, with the position lists listed below.
        /// </summary>
        public void PostionalIndexTest_MultiplePositions()
        {
            //Arrange
            const string queryTerm    = "hello";
            bool         runningOnMac = RuntimeInformation.IsOSPlatform(OSPlatform.OSX);

            // UnitTest.GeneratePostings("(0,[0,1]), (2,[0,2,3])") is not used here because
            // it does not work for macOS yet, so the expectations are spelled out by hand.
            System.Console.WriteLine(runningOnMac ? " for MacOSX" : " for Windows and other OSs");

            IList <Posting> expectedPostings = runningOnMac
                ? new List <Posting>
                {
                    new Posting(2, new List <int> { 0, 2, 3 }),
                    new Posting(4, new List <int> { 0, 1 })
                }
                : new List <Posting>
                {
                    new Posting(0, new List <int> { 0, 1 }),
                    new Posting(2, new List <int> { 0, 2, 3 })
                };

            //Act
            IDocumentCorpus testCorpus     = DirectoryCorpus.LoadTextDirectory(directory);
            IIndex          corpusIndex    = Indexer.IndexCorpus(testCorpus);
            var             actualPostings = corpusIndex.GetPostings(queryTerm);

            //Assert: same number of postings, then same content in the same order
            actualPostings.Should().HaveSameCount(expectedPostings);
            actualPostings.Should().BeEquivalentTo(
                expectedPostings,
                options => options.WithStrictOrdering());
        }
        /// <summary>
        /// Indexes the test corpus and checks that the vocabulary has as many entries
        /// as the expected term list.
        /// </summary>
        public void VocabTest_WithStemmer()
        {
            //Arrange
            IDocumentCorpus corpus        = DirectoryCorpus.LoadTextDirectory(directory);
            IIndex          index         = Indexer.IndexCorpus(corpus);
            var             expectedVocab = new List <string> {
                "hello", "world", "it", "is", "snow",
                "the", "full", "of", "mystery", "mr.snowman",
                "love", "sun", "a"
            };  //expected vocabulary; only its size is compared below

            expectedVocab.Sort();

            // Guard before the index is dereferenced; otherwise a null index would surface
            // as a NullReferenceException from GetVocabulary() instead of this message.
            index.Should().NotBeNull("because indexCorpus shouldn't return null");

            //Act
            var actual = index.GetVocabulary();

            //Assert
            actual.Should().HaveSameCount(expectedVocab, "because the index used StemmingTokenProcessor");

            //TODO: enable once expectedVocab holds the stemmed forms:
            //actual.Should().BeEquivalentTo(expectedVocab);
            // NOTE(review): "mystery" comes back as "mysteri"; presumably the stemmer is
            // Porter-style, which rewrites a terminal 'y' to 'i' - confirm against the
            // stemmer used by StemmingTokenProcessor.
        }
        /// <summary>
        /// Gets the on-disk index or generates a new index out of the selected corpus.
        /// Progress and any exception are written to the console; exceptions are not rethrown.
        /// </summary>
        /// <param name="path">the path to the selected corpus</param>
        public void GetIndex(string path)
        {
            Console.WriteLine("\n-------------------------------------------------------------------------------");
            Console.WriteLine($"Path: {path}");

            // Tracks whether loading/generation completed, so "Ready to go!" is not
            // printed after a failure that was only logged.
            bool isReady = false;

            try
            {
                string pathToIndex = Path.Join(path, "/index/");
                Indexer.path = pathToIndex;

                // NOTE: only the directory's existence is checked, so an existing but empty
                // "/index/" folder is still treated as an existing on-disk index.
                bool doesOnDiskIndexExist = Directory.Exists(pathToIndex);

                //make corpus out of the selected directory path
                corpus = DirectoryCorpus.LoadTextDirectory(path);

                if (doesOnDiskIndexExist)
                {
                    Console.WriteLine("Reading the existing on-disk index.");
                    index = new DiskPositionalIndex(pathToIndex);
                }
                else
                {
                    Console.WriteLine("Generating new index.");
                    GenerateIndex(path);
                }

                isReady = true;
            }
            catch (Exception e)
            {
                // Best-effort: report the failure and return normally, as before.
                Console.WriteLine(e);
            }

            Console.WriteLine(isReady ? "Ready to go!" : "Failed to load or generate the index.");
            Console.WriteLine("-------------------------------------------------------------------------------");
        }
        /// <summary>
        /// Indexes the corpus to disk, reopens it as a <c>DiskPositionalIndex</c>, and checks
        /// the document order and rank values returned by <c>RankedRetrieval.GetTopTen</c>
        /// for the terms "hello" and "world" under the "Default", "Tf-idf", "Okapi" and
        /// "Wacky" weighting schemes.
        /// </summary>
        public void GetRankedDocumentsTest()
        {
            // Maximum absolute difference allowed between an expected and an actual rank.
            // BeApproximately takes the allowed delta, not a number of decimal places; the
            // previous argument of 9 accepted any rank within +/-9, so the rank assertions
            // could never fail.
            // NOTE(review): the expected values below were never effectively checked before;
            // if an assertion now fails, verify the expected value against the formula.
            const double rankTolerance = 1e-8;

            //Path where all the bin files will be written
            string pathToIndex = Path.Join(corpusDir, "/index/");

            //Let Indexer know where it should write all bin files
            Indexer.path = pathToIndex;

            //Read corpus
            IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

            //Create directory for the bin files if it doesn't exist
            Directory.CreateDirectory(pathToIndex);

            //Build the index; this writes the on-disk files read back below
            Indexer.IndexCorpus(corpus);

            //Query terms
            List<string> terms = new List<string> { "hello", "world" };

            //Testing ranked retrieval AND accumulated values
            IIndex index = new DiskPositionalIndex(pathToIndex);
            RankedRetrieval rv = new RankedRetrieval(corpus, index, "Default");
            IList<MaxPriorityQueue.InvertedIndex> actual = rv.GetTopTen(terms);
            actual[0].GetDocumentId().Should().Be(0); //should be document 1 which is of doc id 0
            actual[0].GetRank().Should().BeApproximately(1.183748156, rankTolerance); //A_{document} = 3.10195041 L_{1} = 2.620447934
            actual[1].GetDocumentId().Should().Be(2);
            actual[2].GetDocumentId().Should().Be(1);
            actual[3].GetDocumentId().Should().Be(4);

            //tests tf-idf
            rv = new RankedRetrieval(corpus, index, "Tf-idf");
            IList<MaxPriorityQueue.InvertedIndex> actual1 = rv.GetTopTen(terms);
            actual1[0].GetDocumentId().Should().Be(2);
            actual1[0].GetRank().Should().BeApproximately(0.948215482, rankTolerance);
            actual1[1].GetDocumentId().Should().Be(0);
            actual1[1].GetRank().Should().BeApproximately(0.893296803, rankTolerance);
            actual1[2].GetDocumentId().Should().Be(1);
            actual1[2].GetRank().Should().BeApproximately(0.150554959, rankTolerance);
            actual1[3].GetDocumentId().Should().Be(4);
            actual1[3].GetRank().Should().BeApproximately(0.150554959, rankTolerance);

            //tests Okapi BM25
            rv = new RankedRetrieval(corpus, index, "Okapi");
            IList<MaxPriorityQueue.InvertedIndex> actual2 = rv.GetTopTen(terms);
            actual2[0].GetDocumentId().Should().Be(0);
            actual2[0].GetRank().Should().BeApproximately(0.66590893, rankTolerance);
            actual2[1].GetDocumentId().Should().Be(2);
            actual2[1].GetRank().Should().BeApproximately(0.507521667, rankTolerance);
            actual2[2].GetDocumentId().Should().Be(1);
            actual2[2].GetRank().Should().BeApproximately(0.1089371981, rankTolerance);
            actual2[3].GetDocumentId().Should().Be(4);
            actual2[3].GetRank().Should().BeApproximately(0.1084371981, rankTolerance);

            //tests Wacky
            rv = new RankedRetrieval(corpus, index, "Wacky");
            IList<MaxPriorityQueue.InvertedIndex> actual3 = rv.GetTopTen(terms);
            actual3[0].GetDocumentId().Should().Be(0);
            actual3[0].GetRank().Should().BeApproximately(0.284824391, rankTolerance);
            actual3[1].GetDocumentId().Should().Be(2);
            actual3[1].GetRank().Should().BeApproximately(0.259673474, rankTolerance);
            actual3[2].GetDocumentId().Should().Be(1);
            actual3[2].GetRank().Should().Be(0.0);
            actual3[3].GetDocumentId().Should().Be(4);
            actual3[3].GetRank().Should().Be(0.0);

        }// end of GetRankedDocumentsTest()