Пример #1
0
        public void Test_LinearSearch_vs_InvertedIndexLookup_InMemory()
        {
            // Informal timing comparison between an inverted-index lookup and a
            // forward-index (linear) scan for the same term. The elapsed times are
            // written to the test output; nothing is asserted.

            // Setup the in-memory index for search
            var indexOnDiskPath = Path.Join(_testDirectory, "TestFixtures", "TestIndex");
            var sut             = new SimpleBooleanSearchEngine(indexOnDiskPath);

            sut.LoadIndexFromDisk();

            // Setup the in-memory flat (forward) index for linear search
            // NOTE(review): unlike the index path above, this path is not rooted at
            // _testDirectory, so it resolves against the current working directory — confirm intended.
            var rawDataOnDiskPath = Path.Join("TestFixtures", "Data.zip");
            var forwardIndexer    = new ForwardIndexer();

            forwardIndexer.BuildInMemoryForwardIndex(rawDataOnDiskPath);

            // Time the in-memory inverted index search.
            // Stopwatch is used instead of DateTime.Now deltas: it is a monotonic,
            // high-resolution timer and is unaffected by system clock adjustments.
            var invTimer = System.Diagnostics.Stopwatch.StartNew();
            _ = sut.GetDocumentsContainingTerm("california");
            invTimer.Stop();

            // Time the in-memory forward index (linear) search
            var fwdTimer = System.Diagnostics.Stopwatch.StartNew();
            _ = forwardIndexer.GetDocumentsContainingTerm(" california ");
            fwdTimer.Stop();

            // Elapsed is a TimeSpan, so the printed format matches a DateTime difference.
            _output.WriteLine("Inverted Index Lookup time taken : " + invTimer.Elapsed);
            _output.WriteLine("Forward Index Lookup time taken : " + fwdTimer.Elapsed);

            // Note that grepping with a forward index will give us char-level substrings -
            // we will need to improve our linguistic preprocessing to identify instances
            // of the tokens we're looking for. Variants like "Californian", "California's",
            // "California-esque" will be identified via grepping, but lost on our rudimentary
            // indexer due to (crude) whitespace tokenization.
        }
Пример #2
0
        public void Test_GetDocumentsContainingTerm_Returns_ExpectedDocuments()
        {
            // Loads the on-disk test index and checks how many postings come back
            // for the term "california".

            // Arrange
            var engine = new SimpleBooleanSearchEngine(
                Path.Join(_testDirectory, "TestFixtures", "TestIndex"));
            engine.LoadIndexFromDisk();

            // Act
            var postingList = engine.GetDocumentsContainingTerm("california");

            // Assert
            postingList.Postings.Count.Should().Be(21);
        }
Пример #3
0
        public void Test_Intersection_Returns_ExpectedDocuments()
        {
            // Loads the on-disk test index and checks the size of the result of an
            // intersection query over the terms "red" and "dry".

            // Arrange
            var engine = new SimpleBooleanSearchEngine(
                Path.Join(_testDirectory, "TestFixtures", "TestIndex"));
            engine.LoadIndexFromDisk();

            // Act
            var matches = engine.IntersectionQuery(new List <string> { "red", "dry" });

            // Assert
            matches.Count.Should().Be(84);
        }
Пример #4
0
        public void Test_GetDocumentsContainingTerm_Returns_ExpectedDocuments()
        {
            // Builds a three-term index in memory, loads it into the engine and
            // checks that looking up "blue" returns that term's posting list.

            // Arrange: one posting list per term
            var redPostings   = new PostingList { Postings = new List <long> { 1, 2, 3, 4, 5 } };
            var bluePostings  = new PostingList { Postings = new List <long> { 2, 3, 4, 7 } };
            var greenPostings = new PostingList { Postings = new List <long> { 6, 7 } };

            var termIndex = new SortedDictionary <string, PostingList>
            {
                { "red", redPostings },
                { "blue", bluePostings },
                { "green", greenPostings },
            };

            var engine = new SimpleBooleanSearchEngine();
            engine.LoadIndexFromMemory(new IndexSegment { Index = termIndex });

            // A separate instance with the same content as the "blue" entry, so the
            // equivalence assertion compares two distinct objects.
            var expected = new PostingList { Postings = new List <long> { 2, 3, 4, 7 } };

            // Act
            var actual = engine.GetDocumentsContainingTerm("blue");

            // Assert
            actual.Postings.Count.Should().Be(4);
            actual.Should().BeEquivalentTo(expected);
        }
Пример #5
0
        public void Test_Intersection_Returns_ExpectedDocuments(string t1, string t2, long expectedCount)
        {
            // Builds a three-term index in memory, loads it into the engine and
            // checks the size of the intersection query result for terms t1 and t2
            // against expectedCount.

            // Arrange: one posting list per term
            var redPostings   = new PostingList { Postings = new List <long> { 1, 2, 3, 4, 5 } };
            var bluePostings  = new PostingList { Postings = new List <long> { 2, 3, 4, 7 } };
            var greenPostings = new PostingList { Postings = new List <long> { 6, 7 } };

            var termIndex = new SortedDictionary <string, PostingList>
            {
                { "red", redPostings },
                { "blue", bluePostings },
                { "green", greenPostings },
            };

            var engine = new SimpleBooleanSearchEngine();
            engine.LoadIndexFromMemory(new IndexSegment { Index = termIndex });

            // Act
            var matches = engine.IntersectionQuery(new List <string> { t1, t2 });

            // Assert: expectedCount arrives as long, so it is narrowed to int
            // before being compared with Count.
            matches.Count.Should().Be((int)expectedCount);
        }