/// <summary>
/// Builds the soundex index over the test corpus and expects GetCount() to be 5.
/// </summary>
public void SoundexIndexTest()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx soundIndex = new DiskSoundEx("./");

    try
    {
        //Act
        soundIndex.BuildSoundexIndex(corpus);

        //Assert
        soundIndex.GetCount().Should().Be(5);
    }
    finally
    {
        //Run the cleanup even when the build or the assertion above throws,
        //so a failing test does not skip Clear()
        soundIndex.Clear();
    }
}
/// <summary>
/// Indexes the corpus under corpusDir, reopens the index from its on-disk files and
/// prints the positional postings returned for the terms "hello" and "world".
/// </summary>
public void TestingTierIndex()
{
    //Directory that receives all of the index bin files
    string indexDirectory = Path.Join(corpusDir, "/index/");
    //Tell the Indexer where it should write those bin files
    Indexer.path = indexDirectory;

    //Read the corpus and index it
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);
    Indexer.IndexCorpus(corpus);

    //Work against a DiskPositionalIndex opened on the files just written
    IIndex index = new DiskPositionalIndex(indexDirectory);

    //Query terms
    List<string> queryTerms = new List<string> { "hello", "world" };

    //Get the postings
    IList<Posting> hits = index.GetPositionalPostings(queryTerms);

    //The first output line is the number of postings
    IList<string> output = new List<string> { hits.Count.ToString() };

    foreach (Posting hit in hits)
    {
        //Nothing more is collected once 20 lines have been gathered
        if (output.Count >= 20)
        {
            continue;
        }

        //Use the document id to access the document
        IDocument document = corpus.GetDocument(hit.DocumentId);
        output.Add(document.Title);
        output.Add(document.DocumentId.ToString());
    }

    foreach (string line in output)
    {
        Console.WriteLine(line);
    }
} //end TestingTierIndex()
/// <summary>
/// Builds the soundex index from the authors of the documents in the corpus, then saves it
/// </summary>
/// <param name="corpus">the corpus whose documents are indexed by author</param>
public void BuildSoundexIndex(IDocumentCorpus corpus)
{
    foreach (IDocument document in corpus.GetDocuments())
    {
        //Only documents that carry an author field are indexed
        if (document.Author != null)
        {
            AddDocIdByAuthor(document.Author, document.DocumentId);
        }
    }

    Save();
}
/// <summary>
/// Looks up a query made of two names ("yashua ovando") and expects two postings.
/// </summary>
public void GetPostingsTest_MultipleNames()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx authorIndex = new DiskSoundEx("./");

    try
    {
        authorIndex.BuildSoundexIndex(corpus);

        //Act
        var actual = authorIndex.GetPostings("yashua ovando");

        //Assert
        actual.Should().HaveCount(2);
    }
    finally
    {
        //Run the cleanup even when the lookup or the assertion throws,
        //so a failing test does not skip Clear()
        authorIndex.Clear();
    }
}
/// <summary>
/// Looks up a name ("hella") that is expected to have no postings and checks the result is empty.
/// </summary>
public void GetPostingsTest_NotExistingName_ReturnsEmpty()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx authorIndex = new DiskSoundEx("./");

    try
    {
        authorIndex.BuildSoundexIndex(corpus);

        //Act
        var actual = authorIndex.GetPostings("hella");

        //Assert
        actual.Should().BeEmpty();
    }
    finally
    {
        //Run the cleanup even when the lookup or the assertion throws,
        //so a failing test does not skip Clear()
        authorIndex.Clear();
    }
}
/// <summary>
/// Creates a ranked-retrieval instance over the given corpus and index
/// </summary>
/// <param name="corpus">the corpus the index was built from</param>
/// <param name="index">the index to query</param>
/// <param name="RankedRetrievalMode">name of the ranking variant, passed to SetRankedRetrievalMode</param>
public RankedRetrieval(IDocumentCorpus corpus, IIndex index, string RankedRetrievalMode)
{
    //Working state starts out zeroed / empty
    query2termWeight = 0;
    doc2termWeight = 0;
    accumulator = new Dictionary<int, double>();
    documentIds = new List<int>();

    //Resolve the ranking variant before storing the collaborators (same order as before)
    rankVariant = SetRankedRetrievalMode(RankedRetrievalMode);
    this.index = index;
    this.corpus = corpus;

    //The corpus size is looked up using the path the Indexer is configured with
    corpusSize = GetCorpusSize(Indexer.path);
}
/// <summary>
/// Indexes the test corpus and checks that the postings for "sun" match the
/// postings generated from "(3,[3])", in the same order.
/// </summary>
public void PostionalIndexTest_OnePosition()
{
    //Arrange
    const string term = "sun";
    IList<Posting> expected = UnitTest.GeneratePostings("(3,[3])");
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);

    //Act
    var result = Indexer.IndexCorpus(corpus).GetPostings(term);

    //Assert
    result.Should().HaveSameCount(expected);
    result.Should().BeEquivalentTo(expected, options => options.WithStrictOrdering());
}
/// <summary>
/// Checks that two similar sounding names ("bloclic" and "blacklock") yield equivalent postings.
/// </summary>
public void GetPostingsTest_SimilarSoundingName()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    DiskSoundEx authorIndex = new DiskSoundEx("./");

    try
    {
        authorIndex.BuildSoundexIndex(corpus);

        //Act
        var result1 = authorIndex.GetPostings("bloclic");
        var result2 = authorIndex.GetPostings("blacklock");

        //Assert
        result1.Should().BeEquivalentTo(result2);
    }
    finally
    {
        //Run the cleanup even when a lookup or the assertion throws,
        //so a failing test does not skip Clear()
        authorIndex.Clear();
    }
}
/// <summary>
/// Sets up the form: creates the corpus, inverted index and progress-text objects,
/// subscribes the form's handlers, and initializes the background workers.
/// </summary>
public fmInvIndex()
{
    InitializeComponent();

    // Objects the form works with
    DocumentCorpus = new DocumentCorpus();
    InvertedIndex = new InvertedIndex();

    // Index build notifications
    InvertedIndexBuildBegin += OnInvertedIndexBuildBegin;
    InvertedIndexHasBeenBuilt += OnInvertedIndexHasBeenBuilt;

    // Progress text and its change notification
    TextProgress = new DisplayTextProgressMergeToBegin();
    TextProgress.IsChanged += ProgressInfoIsChanged;

    // Saving notifications
    SavingBegin += OnSavingBegin;
    SavingEnd += OnSavingEnd;

    InitBackGrounWorkers();
}
/// <summary>
/// Indexes the test corpus and checks the postings of "hello", a term with several
/// positions per document. The expected postings depend on the operating system.
/// </summary>
public void PostionalIndexTest_MultiplePositions()
{
    //Arrange
    const string term = "hello";

    //The expected postings differ on macOS, so pick the list by platform.
    //(UnitTest.GeneratePostings("(0,[0,1]), (2,[0,2,3])") does not work for macOS yet.)
    bool isMacOs = RuntimeInformation.IsOSPlatform(OSPlatform.OSX);
    System.Console.WriteLine(isMacOs ? " for MacOSX" : " for Windows and other OSs");

    IList<Posting> expected = isMacOs
        ? new List<Posting>
        {
            new Posting(2, new List<int> { 0, 2, 3 }),
            new Posting(4, new List<int> { 0, 1 })
        }
        : new List<Posting>
        {
            new Posting(0, new List<int> { 0, 1 }),
            new Posting(2, new List<int> { 0, 2, 3 })
        };

    //Act
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    var result = Indexer.IndexCorpus(corpus).GetPostings(term);

    //Assert
    result.Should().HaveSameCount(expected);
    result.Should().BeEquivalentTo(expected, options => options.WithStrictOrdering());
}
/// <summary>
/// Indexes the test corpus and checks that the vocabulary has as many entries as the expected list.
/// </summary>
public void VocabTest_WithStemmer()
{
    //Arrange
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(directory);
    IIndex index = Indexer.IndexCorpus(corpus);
    var expectedVocab = new List<string>
    {
        "hello", "world", "it", "is", "snow", "the", "full", "of",
        "mystery", "mr.snowman", "love", "sun", "a"
    };
    expectedVocab.Sort();

    //Check for null BEFORE the index is used; placed after GetVocabulary() this
    //assertion could never fire because a null index would already have thrown
    index.Should().NotBeNull("because indexCorpus shouldn't return null");

    //Act
    var actual = index.GetVocabulary();

    //Assert
    actual.Should().HaveSameCount(expectedVocab, "because the index used StemmingTokenProcessor");
    //actual.Should().BeEquivalentTo(expectedVocab);
    //TODO: the equivalence check is disabled because "mystery" is indexed as "mysteri".
    //NOTE(review): presumably the stemmer rewrites the trailing "y" to "i" (Porter-style
    //stemmers do this) - confirm, then list the stemmed forms in expectedVocab and re-enable.
}
/// <summary>
/// Loads the corpus at the given path and either opens its existing on-disk index
/// or generates a new one. Any exception is written to the console.
/// </summary>
/// <param name="path">the path to the selected corpus</param>
public void GetIndex(string path)
{
    Console.WriteLine("\n-------------------------------------------------------------------------------");
    Console.WriteLine($"Path: {path}");

    try
    {
        string indexDirectory = Path.Join(path, "/index/");
        Indexer.path = indexDirectory;

        //An existing index directory counts as an existing index; its files are not inspected
        bool hasOnDiskIndex = Directory.Exists(indexDirectory);

        //Make a corpus out of the selected directory path
        corpus = DirectoryCorpus.LoadTextDirectory(path);

        if (!hasOnDiskIndex)
        {
            Console.WriteLine("Generating new index.");
            GenerateIndex(path);
        }
        else
        {
            Console.WriteLine("Reading the existing on-disk index.");
            index = new DiskPositionalIndex(indexDirectory);
        }
    }
    catch (Exception error)
    {
        //Failures are reported on the console; the method still continues to the banner below
        Console.WriteLine(error);
    }

    Console.WriteLine("Ready to go!");
    Console.WriteLine("-------------------------------------------------------------------------------");
}
/// <summary>
/// Constructs an index from a corpus of documents. Along with the positional index,
/// the soundex (author) and k-gram indexes are filled and written out.
/// </summary>
/// <param name="corpus">a corpus to be indexed</param>
/// <returns>the DiskPositionalIndex that was built and saved</returns>
public static IIndex IndexCorpus(IDocumentCorpus corpus)
{
    Console.WriteLine($"[Indexer] Indexing {corpus.CorpusSize} documents in the corpus...");

    // Time how long it takes to index the corpus
    Stopwatch elapsedTime = new Stopwatch();
    elapsedTime.Start();

    // Create the indexes at the configured path and reset them before indexing
    DiskPositionalIndex index = new DiskPositionalIndex(Indexer.path);
    DiskSoundEx soundEx = new DiskSoundEx(Indexer.path);
    DiskKGram kGram = new DiskKGram(Indexer.path);
    index.Clear();
    soundEx.Clear();
    kGram.Clear();

    // Token processor to use
    ITokenProcessor processor = new StemmingTokenProcesor();
    HashSet<string> unstemmedVocabulary = new HashSet<string>();

    // Index the documents
    foreach (IDocument doc in corpus.GetDocuments())
    {
        // Tokenize the document
        ITokenStream stream = new EnglishTokenStream(doc.GetContent());
        try
        {
            IEnumerable<string> tokens = stream.GetTokens();

            // Number of tokens seen in this document
            int tokenCount = 0;
            // Position assigned to the next token that produces at least one term
            int position = 0;

            foreach (string token in tokens)
            {
                tokenCount++;

                // Process token to terms and add every non-empty term to the index
                List<string> terms = processor.ProcessToken(token);
                bool termsIsAdded = false;
                foreach (string term in terms)
                {
                    if (term.Length > 0)
                    {
                        index.AddTerm(term, doc.DocumentId, position);
                        termsIsAdded = true;
                    }
                }

                // The position only advances when the token contributed a term
                if (termsIsAdded)
                {
                    position++;
                }

                // Keep track of vocabulary for the K-gram index.
                // NOTE(review): the processor is cast to NormalTokenProcessor here; whether
                // this yields unstemmed terms depends on how ProcessToken is declared - confirm.
                foreach (string term in ((NormalTokenProcessor)processor).ProcessToken(token))
                {
                    unstemmedVocabulary.Add(term);
                }
            }

            // Add token count per document
            index.AddTokensPerDocument(doc.DocumentId, tokenCount);

            // Store a size figure for the file.
            // NOTE(review): FileInfo.Length is already a byte count; the division by 8 is
            // kept exactly as it was - confirm the intended unit.
            string docFilePath = doc.FilePath;
            int fileSizeInByte = (int)(new FileInfo(docFilePath).Length / 8f);
            index.AddByteSize(doc.DocumentId, fileSizeInByte);

            // Calculate the average term frequency for this document
            index.CalcAveTermFreq(doc.DocumentId);

            // Calculate L_{d} for the document and store it in the index so it can be written to disk later
            index.CalculateDocWeight(doc.DocumentId);
            Indexer.averageDocLength = index.calculateAverageDocLength();

            // Add author to the SoundEx index
            soundEx.AddDocIdByAuthor(doc.Author, doc.DocumentId);
        }
        finally
        {
            // Release the token stream even when indexing this document throws;
            // previously Dispose() was only reached on the success path
            stream.Dispose();
        }
    }

    kGram.buildKGram(unstemmedVocabulary);
    index.Save();
    soundEx.Save();

    elapsedTime.Stop();
    Console.WriteLine("[Indexer] Done Indexing! Time Elapsed " + elapsedTime.Elapsed.ToString("mm':'ss':'fff"));

    GC.Collect();
    return index;
}
/// <summary>
/// Indexes the corpus under corpusDir, reopens the index from disk and checks the
/// document ids and ranks returned by GetTopTen for the query terms "hello" and "world"
/// under the "Default", "Tf-idf", "Okapi" and "Wacky" ranking modes.
/// </summary>
public void GetRankedDocumentsTest()
{
    //Directory that receives all of the index bin files
    string indexDirectory = Path.Join(corpusDir, "/index/");
    //Tell the Indexer where it should write those bin files
    Indexer.path = indexDirectory;

    //Read corpus
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);
    //Create the directory for the bin files if it doesn't exist
    Directory.CreateDirectory(indexDirectory);
    //Build the index
    Indexer.IndexCorpus(corpus);

    //Query terms
    List<string> terms = new List<string> { "hello", "world" };

    //Rank against an index opened from the files just written
    IIndex index = new DiskPositionalIndex(indexDirectory);

    //NOTE(review): the second argument of BeApproximately is an absolute tolerance, so the
    //value 9 used throughout accepts any rank within +/-9 of the expected one - confirm
    //whether a tolerance such as 1e-9 was intended. The expected values are left untouched.

    //Default ranking and accumulated values
    RankedRetrieval defaultRanking = new RankedRetrieval(corpus, index, "Default");
    IList<MaxPriorityQueue.InvertedIndex> defaultHits = defaultRanking.GetTopTen(terms);
    defaultHits[0].GetDocumentId().Should().Be(0); //should be document 1 which is of doc id 0
    defaultHits[0].GetRank().Should().BeApproximately(1.183748156, 9); //A_{document} = 3.10195041 L_{1} = 2.620447934
    defaultHits[1].GetDocumentId().Should().Be(2);
    //Checks on the third and fourth hit (doc ids 1 and 4) are currently disabled for this mode

    //Tf-idf
    RankedRetrieval tfIdfRanking = new RankedRetrieval(corpus, index, "Tf-idf");
    IList<MaxPriorityQueue.InvertedIndex> tfIdfHits = tfIdfRanking.GetTopTen(terms);
    tfIdfHits[0].GetDocumentId().Should().Be(2);
    tfIdfHits[0].GetRank().Should().BeApproximately(0.948215482, 9);
    tfIdfHits[1].GetDocumentId().Should().Be(0);
    tfIdfHits[1].GetRank().Should().BeApproximately(0.893296803, 9);
    tfIdfHits[2].GetDocumentId().Should().Be(1);
    tfIdfHits[2].GetRank().Should().BeApproximately(0.150554959, 9);
    tfIdfHits[3].GetDocumentId().Should().Be(4);
    tfIdfHits[3].GetRank().Should().BeApproximately(0.150554959, 9);

    //Okapi BM25
    //NOTE(review): the expected ranks for doc ids 1 and 4 differ in a single digit
    //(0.1089371981 vs 0.1084371981) - confirm both values.
    RankedRetrieval okapiRanking = new RankedRetrieval(corpus, index, "Okapi");
    IList<MaxPriorityQueue.InvertedIndex> okapiHits = okapiRanking.GetTopTen(terms);
    okapiHits[0].GetDocumentId().Should().Be(0);
    okapiHits[0].GetRank().Should().BeApproximately(0.66590893, 9);
    okapiHits[1].GetDocumentId().Should().Be(2);
    okapiHits[1].GetRank().Should().BeApproximately(0.507521667, 9);
    okapiHits[2].GetDocumentId().Should().Be(1);
    okapiHits[2].GetRank().Should().BeApproximately(0.1089371981, 9);
    okapiHits[3].GetDocumentId().Should().Be(4);
    okapiHits[3].GetRank().Should().BeApproximately(0.1084371981, 9);

    //Wacky
    RankedRetrieval wackyRanking = new RankedRetrieval(corpus, index, "Wacky");
    IList<MaxPriorityQueue.InvertedIndex> wackyHits = wackyRanking.GetTopTen(terms);
    wackyHits[0].GetDocumentId().Should().Be(0);
    wackyHits[0].GetRank().Should().BeApproximately(0.284824391, 9);
    wackyHits[1].GetDocumentId().Should().Be(2);
    wackyHits[1].GetRank().Should().BeApproximately(0.259673474, 9);
    wackyHits[2].GetDocumentId().Should().Be(1);
    wackyHits[2].GetRank().Should().Be(0.0);
    wackyHits[3].GetDocumentId().Should().Be(4);
    wackyHits[3].GetRank().Should().Be(0.0);
} // end of GetRankedDocumentsTest()