/// <summary>
/// Builds a <c>DiskSoundEx</c> index (created with the path "./") from the test corpus
/// and asserts that <c>GetCount()</c> reports 5.
/// </summary>
public void SoundexIndexTest()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx soundIndex = new DiskSoundEx("./");
    soundIndex.BuildSoundexIndex(corpus);

    try
    {
        //Assert
        soundIndex.GetCount().Should().Be(5);
    }
    finally
    {
        //A failing assertion throws; finally keeps Clear() from being skipped in that case.
        soundIndex.Clear();
    }
}
/// <summary>
/// Indexes the corpus found under <c>corpusDir</c>, reopens the index through
/// <c>DiskPositionalIndex</c> and prints the number of postings returned for the terms
/// "hello" and "world", followed by document titles and ids.
/// Nothing is asserted; the output is only written to the console.
/// </summary>
public void TestingTierIndex()
{
    //Folder that all the bin files will be written to
    string indexFolder = Path.Join(corpusDir, "/index/");

    //Tell the Indexer where it should write its bin files
    Indexer.path = indexFolder;

    //Read the corpus and index it; the in-memory result is not needed afterwards
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);
    Indexer.IndexCorpus(corpus);

    //Work against a DiskPositionalIndex created from the on-disk files
    IIndex index = new DiskPositionalIndex(indexFolder);

    //Query terms
    var terms = new List<string> { "hello", "world" };

    //Check info of postings collected from tier 1
    IList<Posting> postings = index.GetPositionalPostings(terms);

    //First output line is the posting count; title/id pairs follow
    var output = new List<string> { postings.Count.ToString() };

    const int maxLines = 20;
    foreach (Posting posting in postings)
    {
        //Stop adding once the cap is reached (the remaining postings are still enumerated)
        if (output.Count >= maxLines)
        {
            continue;
        }

        //Use the document id to access the document
        IDocument doc = corpus.GetDocument(posting.DocumentId);
        output.Add(doc.Title);
        output.Add(doc.DocumentId.ToString());
    }

    foreach (string line in output)
    {
        Console.WriteLine(line);
    }
} //end TestingTierIndex()
/// <summary>
/// Builds a <c>DiskSoundEx</c> index from the test corpus and asserts that querying
/// "yashua ovando" returns two postings.
/// </summary>
public void GetPostingsTest_MultipleNames()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx authorIndex = new DiskSoundEx("./");
    authorIndex.BuildSoundexIndex(corpus);

    try
    {
        //Act
        var actual = authorIndex.GetPostings("yashua ovando");

        //Assert
        actual.Should().HaveCount(2);
    }
    finally
    {
        //A failing assertion throws; finally keeps Clear() from being skipped in that case.
        authorIndex.Clear();
    }
}
/// <summary>
/// Builds a <c>DiskSoundEx</c> index from the test corpus and asserts that querying
/// "hella" returns an empty result.
/// </summary>
public void GetPostingsTest_NotExistingName_ReturnsEmpty()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx authorIndex = new DiskSoundEx("./");
    authorIndex.BuildSoundexIndex(corpus);

    try
    {
        //Act
        var actual = authorIndex.GetPostings("hella");

        //Assert
        actual.Should().BeEmpty();
    }
    finally
    {
        //A failing assertion throws; finally keeps Clear() from being skipped in that case.
        authorIndex.Clear();
    }
}
/// <summary>
/// Indexes the test corpus and checks that the postings returned for the term "sun"
/// match, in order, the postings generated from "(3,[3])".
/// </summary>
public void PostionalIndexTest_OnePosition()
{
    //Arrange
    const string queryTerm = "sun";
    IList<Posting> expectedPostings = UnitTest.GeneratePostings("(3,[3])");

    //Act
    IDocumentCorpus testCorpus = DirectoryCorpus.LoadTextDirectory(directory);
    IIndex builtIndex = Indexer.IndexCorpus(testCorpus);
    var actualPostings = builtIndex.GetPostings(queryTerm);

    //Assert
    actualPostings.Should().HaveSameCount(expectedPostings);
    actualPostings.Should().BeEquivalentTo(
        expectedPostings,
        options => options.WithStrictOrdering());
}
/// <summary>
/// Builds a <c>DiskSoundEx</c> index from the test corpus and asserts that the queries
/// "bloclic" and "blacklock" return equivalent postings.
/// </summary>
public void GetPostingsTest_SimilarSoundingName()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx authorIndex = new DiskSoundEx("./");
    authorIndex.BuildSoundexIndex(corpus);

    try
    {
        //Act
        var result1 = authorIndex.GetPostings("bloclic");
        var result2 = authorIndex.GetPostings("blacklock");

        //Assert
        result1.Should().BeEquivalentTo(result2);
    }
    finally
    {
        //A failing assertion throws; finally keeps Clear() from being skipped in that case.
        authorIndex.Clear();
    }
}
/// <summary>
/// Indexes the test corpus and checks that the postings returned for the term "hello"
/// match, in order, a list of expected postings that is chosen per operating system.
/// </summary>
public void PostionalIndexTest_MultiplePositions()
{
    //Arrange
    const string queryTerm = "hello";

    //A single expectation built with UnitTest.GeneratePostings("(0,[0,1]), (2,[0,2,3])")
    //does not work for macOS yet, so the expected postings are selected per platform.
    bool runningOnMac = RuntimeInformation.IsOSPlatform(OSPlatform.OSX);
    IList<Posting> expectedPostings;
    if (!runningOnMac)
    {
        System.Console.WriteLine(" for Windows and other OSs");
        expectedPostings = new List<Posting>
        {
            new Posting(0, new List<int> { 0, 1 }),
            new Posting(2, new List<int> { 0, 2, 3 })
        };
    }
    else
    {
        System.Console.WriteLine(" for MacOSX");
        expectedPostings = new List<Posting>
        {
            new Posting(2, new List<int> { 0, 2, 3 }),
            new Posting(4, new List<int> { 0, 1 })
        };
    }

    //Act
    IDocumentCorpus testCorpus = DirectoryCorpus.LoadTextDirectory(directory);
    IIndex builtIndex = Indexer.IndexCorpus(testCorpus);
    var actualPostings = builtIndex.GetPostings(queryTerm);

    //Assert
    actualPostings.Should().HaveSameCount(expectedPostings);
    actualPostings.Should().BeEquivalentTo(
        expectedPostings,
        options => options.WithStrictOrdering());
}
/// <summary>
/// Indexes the test corpus and checks that the vocabulary reported by the index has as
/// many entries as the expected term list. Only the counts are compared.
/// </summary>
public void VocabTest_WithStemmer()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    IIndex index = Indexer.IndexCorpus(corpus);

    //expected vocabulary with stemmed terms
    var expectedVocab = new List<string>
    {
        "hello", "world", "it", "is", "snow", "the", "full", "of",
        "mystery", "mr.snowman", "love", "sun", "a"
    };
    expectedVocab.Sort();

    //Assert the index exists BEFORE it is dereferenced; checking after GetVocabulary()
    //could never fail, because a null index would already have thrown.
    index.Should().NotBeNull("because indexCorpus shouldn't return null");

    //Act
    var actual = index.GetVocabulary();

    //Assert
    actual.Should().HaveSameCount(expectedVocab, "because the index used StemmingTokenProcessor");

    //TODO: why "mystery" became "mysteri"?? Until the expected terms are listed in their
    //stemmed form, actual.Should().BeEquivalentTo(expectedVocab) cannot be enabled.
    //NOTE(review): a Porter-style stemmer rewrites a trailing "y" to "i", which would
    //explain "mysteri" - confirm against the stemmer StemmingTokenProcessor uses.
}
/// <summary>
/// Gets the on-disk index, or generates a new index, for the selected corpus.
/// The index folder is "<paramref name="path"/>/index/"; when that folder exists the
/// index is read from it, otherwise <c>GenerateIndex</c> is called.
/// Any exception is written to the console instead of being propagated, and
/// "Ready to go!" is printed only when no exception occurred.
/// </summary>
/// <param name="path">the path to the selected corpus</param>
public void GetIndex(string path)
{
    Console.WriteLine("\n-------------------------------------------------------------------------------");
    Console.WriteLine($"Path: {path}");

    //Tracks whether loading/generating finished, so success is not reported after a failure.
    bool isReady = false;
    try
    {
        string pathToIndex = Path.Join(path, "/index/");
        Indexer.path = pathToIndex;

        //Only the folder's existence is checked, not whether it contains any files.
        bool doesOnDiskIndexExist = Directory.Exists(pathToIndex);

        //make corpus out of the selected directory path
        corpus = DirectoryCorpus.LoadTextDirectory(path);

        if (doesOnDiskIndexExist)
        {
            Console.WriteLine("Reading the existing on-disk index.");
            index = new DiskPositionalIndex(pathToIndex);
        }
        else
        {
            Console.WriteLine("Generating new index.");
            GenerateIndex(path);
        }

        isReady = true;
    }
    catch (Exception e)
    {
        //Best effort: report the problem and return normally instead of crashing the caller.
        Console.WriteLine(e);
    }

    if (isReady)
    {
        Console.WriteLine("Ready to go!");
    }
    else
    {
        Console.WriteLine("Could not load or generate the index.");
    }
    Console.WriteLine("-------------------------------------------------------------------------------");
}
/// <summary>
/// Indexes the corpus under <c>corpusDir</c>, reopens the index through
/// <c>DiskPositionalIndex</c> and checks the document ids and ranks that
/// <c>RankedRetrieval.GetTopTen</c> returns for the terms "hello" and "world" with each
/// of the scheme names "Default", "Tf-idf", "Okapi" and "Wacky".
/// </summary>
public void GetRankedDocumentsTest()
{
    //BeApproximately takes an ABSOLUTE tolerance, not a number of decimal places.
    //Passing 9 accepted any rank within +/-9 of the expected value, which made every
    //rank assertion meaningless. The expected values are written to about nine decimal
    //places, so compare with a tolerance of 1e-9 instead.
    const double RankTolerance = 1e-9;

    //Path to where all the bin files will be written to
    string pathToIndex = Path.Join(corpusDir, "/index/");

    //Let Indexer know where it should write all bin files
    Indexer.path = pathToIndex;

    //Read corpus
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

    //Create the directory for the bin files if it doesn't exist
    Directory.CreateDirectory(pathToIndex);

    //Initialize the index.
    IIndex index = Indexer.IndexCorpus(corpus);

    //Query terms
    List<string> terms = new List<string> { "hello", "world" };

    //Testing ranked retrieval AND accumulated values
    index = new DiskPositionalIndex(pathToIndex);
    RankedRetrieval rv = new RankedRetrieval(corpus, index, "Default");
    IList<MaxPriorityQueue.InvertedIndex> actual = rv.GetTopTen(terms);

    actual[0].GetDocumentId().Should().Be(0); //should be document 1 which is of doc id 0
    actual[0].GetRank().Should().BeApproximately(1.183748156, RankTolerance); //A_{document} = 3.10195041 L_{1} = 2.620447934
    actual[1].GetDocumentId().Should().Be(2);
    actual[2].GetDocumentId().Should().Be(1);
    actual[3].GetDocumentId().Should().Be(4);

    //tests tf-idf
    rv = new RankedRetrieval(corpus, index, "Tf-idf");
    IList<MaxPriorityQueue.InvertedIndex> actual1 = rv.GetTopTen(terms);

    actual1[0].GetDocumentId().Should().Be(2);
    actual1[0].GetRank().Should().BeApproximately(0.948215482, RankTolerance);
    actual1[1].GetDocumentId().Should().Be(0);
    actual1[1].GetRank().Should().BeApproximately(0.893296803, RankTolerance);
    actual1[2].GetDocumentId().Should().Be(1);
    actual1[2].GetRank().Should().BeApproximately(0.150554959, RankTolerance);
    actual1[3].GetDocumentId().Should().Be(4);
    actual1[3].GetRank().Should().BeApproximately(0.150554959, RankTolerance);

    //tests Okapi BM25
    rv = new RankedRetrieval(corpus, index, "Okapi");
    IList<MaxPriorityQueue.InvertedIndex> actual2 = rv.GetTopTen(terms);

    actual2[0].GetDocumentId().Should().Be(0);
    actual2[0].GetRank().Should().BeApproximately(0.66590893, RankTolerance);
    actual2[1].GetDocumentId().Should().Be(2);
    actual2[1].GetRank().Should().BeApproximately(0.507521667, RankTolerance);
    //NOTE(review): the next two expected ranks differ only in one digit (0.1089... vs
    //0.1084...); with the old +/-9 tolerance a typo here would have gone unnoticed.
    //Confirm both values against a hand calculation.
    actual2[2].GetDocumentId().Should().Be(1);
    actual2[2].GetRank().Should().BeApproximately(0.1089371981, RankTolerance);
    actual2[3].GetDocumentId().Should().Be(4);
    actual2[3].GetRank().Should().BeApproximately(0.1084371981, RankTolerance);

    //tests Wacky
    rv = new RankedRetrieval(corpus, index, "Wacky");
    IList<MaxPriorityQueue.InvertedIndex> actual3 = rv.GetTopTen(terms);

    actual3[0].GetDocumentId().Should().Be(0);
    actual3[0].GetRank().Should().BeApproximately(0.284824391, RankTolerance);
    actual3[1].GetDocumentId().Should().Be(2);
    actual3[1].GetRank().Should().BeApproximately(0.259673474, RankTolerance);
    actual3[2].GetDocumentId().Should().Be(1);
    actual3[2].GetRank().Should().Be(0.0);
    actual3[3].GetDocumentId().Should().Be(4);
    actual3[3].GetRank().Should().Be(0.0);
}// end of GetRankedDocumentsTest()