// Informal timing comparison between an inverted-index lookup and a linear
// scan over a forward index for the term "california". Both elapsed times are
// written to the test output; the test makes no assertions.
public void Test_LinearSearch_vs_InvertedIndexLookup_InMemory()
{
    // Setup the in-memory index for search
    var indexOnDiskPath = Path.Join(_testDirectory, "TestFixtures", "TestIndex");
    var sut = new SimpleBooleanSearchEngine(indexOnDiskPath);
    sut.LoadIndexFromDisk();

    // Setup the in-memory flat (forward) index for linear search
    // NOTE(review): unlike the index path above, this path is not rooted at
    // _testDirectory, so it is resolved by whatever the indexer does with a
    // relative path - confirm this is intentional.
    var rawDataOnDiskPath = Path.Join("TestFixtures", "Data.zip");
    var forwardIndexer = new ForwardIndexer();
    forwardIndexer.BuildInMemoryForwardIndex(rawDataOnDiskPath);

    // Time the in-memory inverted index search.
    // Stopwatch is used instead of subtracting DateTime.Now values: it is a
    // monotonic timer intended for measuring elapsed time, whereas the wall
    // clock can be too coarse for a lookup this short.
    var invertedTimer = System.Diagnostics.Stopwatch.StartNew();
    _ = sut.GetDocumentsContainingTerm("california");
    invertedTimer.Stop();

    // Time the in-memory forward index (linear) search
    var forwardTimer = System.Diagnostics.Stopwatch.StartNew();
    _ = forwardIndexer.GetDocumentsContainingTerm(" california ");
    forwardTimer.Stop();

    _output.WriteLine($"Inverted Index Lookup time taken : {invertedTimer.Elapsed}");
    _output.WriteLine($"Forward Index Lookup time taken : {forwardTimer.Elapsed}");

    // Note that grepping with a forward index will give us char-level substrings -
    // we will need to improve our linguistic preprocessing to identify instances
    // of the tokens we're looking for. Forms like "Californian", "California's" and
    // "California-esque" will be identified via grepping, but lost on our
    // rudimentary indexer due to (crude) whitespace tokenization.
}
// Loads the on-disk test index fixture and checks that a single-term lookup
// for "california" yields 21 postings.
public void Test_GetDocumentsContainingTerm_Returns_ExpectedDocuments()
{
    // Arrange
    var fixtureIndexPath = Path.Join(_testDirectory, "TestFixtures", "TestIndex");
    var engine = new SimpleBooleanSearchEngine(fixtureIndexPath);
    engine.LoadIndexFromDisk();

    // Act
    var matches = engine.GetDocumentsContainingTerm("california");

    // Assert
    var postingCount = matches.Postings.Count;
    postingCount.Should().Be(21);
}
// Loads the on-disk test index fixture and checks that the intersection query
// for the terms "red" and "dry" yields 84 results.
public void Test_Intersection_Returns_ExpectedDocuments()
{
    // Arrange
    var fixtureIndexPath = Path.Join(_testDirectory, "TestFixtures", "TestIndex");
    var engine = new SimpleBooleanSearchEngine(fixtureIndexPath);
    engine.LoadIndexFromDisk();

    var terms = new List<string>();
    terms.Add("red");
    terms.Add("dry");

    // Act
    var intersection = engine.IntersectionQuery(terms);

    // Assert
    intersection.Count.Should().Be(84);
}
// Builds a three-term index in memory (red, blue, green) and checks that a
// lookup for "blue" returns exactly the posting list stored under that term.
public void Test_GetDocumentsContainingTerm_Returns_ExpectedDocuments()
{
    // Arrange: hand-built postings keyed by term.
    var postingsByTerm = new SortedDictionary<string, PostingList>();
    postingsByTerm["red"] = new PostingList { Postings = new List<long> { 1, 2, 3, 4, 5 } };
    postingsByTerm["blue"] = new PostingList { Postings = new List<long> { 2, 3, 4, 7 } };
    postingsByTerm["green"] = new PostingList { Postings = new List<long> { 6, 7 } };

    var segment = new IndexSegment { Index = postingsByTerm };

    var engine = new SimpleBooleanSearchEngine();
    engine.LoadIndexFromMemory(segment);

    // Act
    var found = engine.GetDocumentsContainingTerm("blue");

    // Assert: first the size, then structural equivalence with the expected list.
    var expected = new PostingList { Postings = new List<long> { 2, 3, 4, 7 } };
    found.Postings.Count.Should().Be(4);
    found.Should().BeEquivalentTo(expected);
}
// Builds a three-term index in memory (red, blue, green), intersects the
// posting lists of the two supplied terms and checks the size of the result.
//   t1, t2        - the two terms passed to the intersection query.
//   expectedCount - the number of results the query should return
//                   (presumably supplied by test-case data not shown here).
public void Test_Intersection_Returns_ExpectedDocuments(string t1, string t2, long expectedCount)
{
    // Arrange: hand-built postings keyed by term.
    var postingsByTerm = new SortedDictionary<string, PostingList>();
    postingsByTerm["red"] = new PostingList { Postings = new List<long> { 1, 2, 3, 4, 5 } };
    postingsByTerm["blue"] = new PostingList { Postings = new List<long> { 2, 3, 4, 7 } };
    postingsByTerm["green"] = new PostingList { Postings = new List<long> { 6, 7 } };

    var segment = new IndexSegment { Index = postingsByTerm };

    var engine = new SimpleBooleanSearchEngine();
    engine.LoadIndexFromMemory(segment);

    var terms = new List<string>();
    terms.Add(t1);
    terms.Add(t2);

    // Act
    var intersection = engine.IntersectionQuery(terms);

    // Assert: the expected count arrives as a long and is narrowed to int
    // before the comparison, exactly as the assertion requires.
    var expected = (int)expectedCount;
    intersection.Count.Should().Be(expected);
}