public void TestingTierIndex()
{
    // Path where all the bin files will be written.
    string pathToIndex = Path.Join(corpusDir, "/index/");

    // Tell the Indexer where to write the bin files.
    Indexer.path = pathToIndex;

    // Read the corpus.
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

    // Load the corpus into the index.
    IIndex index = Indexer.IndexCorpus(corpus);

    // Create a new DiskPositionalIndex from the on-disk files.
    index = new DiskPositionalIndex(pathToIndex);

    // Check the info of the postings collected from tier 1.
    IList<string> results = new List<string>();

    // The rest of your code...
    List<string> terms = new List<string>();
    terms.Add("hello");
    terms.Add("world");

    // Get the postings.
    IList<Posting> postings = index.GetPositionalPostings(terms);

    // Add the count of the postings to the list of strings to be returned.
    results.Add(postings.Count.ToString());

    foreach (Posting p in postings)
    {
        if (results.Count < 20)
        {
            // Use the document id to access the document.
            IDocument doc = corpus.GetDocument(p.DocumentId);
            results.Add(doc.Title);
            results.Add(doc.DocumentId.ToString());
        }
    }

    foreach (string s in results)
    {
        Console.WriteLine(s);
    }
} // end TestingTierIndex()
public static List<int> ProcessWildcardQuery(string query, DiskPositionalIndex index)
{
    var orQuery = KGramIndex.GenerateNormalQuery(query);
    var words = SplitOrQuery(orQuery);
    var orQueryItemsResultsDocIds = new List<List<int>>();

    foreach (string word in words)
    {
        var term = PorterStemmer.ProcessToken(word.Trim());
        orQueryItemsResultsDocIds.Add(GetDocIds(index.GetPostings(term, false)));

        // Add the term to the list of found terms.
        if (!FoundTerms.Contains(term))
            FoundTerms.Add(term);
    }

    return MergeOrResults(orQueryItemsResultsDocIds).Last();
}
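ProcessWildcardQuery relies on two helpers that are not shown in this section: GetDocIds, which flattens a postings array into a list of document ids, and MergeOrResults, whose last element holds the union of all the partial result lists (hence the .Last() call). The sketch below only illustrates that contract, assuming GetPostings returns int[][] rows with the document id at index 0; the project's actual helpers may differ.

// Illustrative sketch only; not the project's actual implementation.
private static List<int> GetDocIds(int[][] postings)
{
    var docIds = new List<int>();
    if (postings == null)
        return docIds;
    foreach (var posting in postings)
        docIds.Add(posting[0]); // index 0 holds the document id
    return docIds;
}

private static List<List<int>> MergeOrResults(List<List<int>> resultLists)
{
    var merged = new List<List<int>>();
    var union = new SortedSet<int>();
    foreach (var list in resultLists)
    {
        union.UnionWith(list);
        merged.Add(union.ToList()); // each step accumulates, so Last() is the full union
    }
    return merged;
}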
public static IEnumerable<KeyValuePair<int, double>> ProcessRankQuery(string query, DiskPositionalIndex index, string folder)
{
    if (string.IsNullOrWhiteSpace(query))
        return null;

    double numberOfDocuments = index.FileNames.Count;

    // Accumulator A_d for every document, initialized to 0.
    var ads = new Dictionary<int, double>();
    var reader = new FileStream(Path.Combine(folder, "docWeights.bin"), FileMode.Open, FileAccess.Read);
    for (int i = 0; i < numberOfDocuments; i++)
    {
        ads.Add(i, 0);
    }

    foreach (var term in SplitWhiteSpace(query))
    {
        var processedTerm = PorterStemmer.ProcessToken(term);
        var postings = index.GetPostings(processedTerm, true);
        if (postings != null)
        {
            // Add the term to the list of found terms.
            if (!FoundTerms.Contains(processedTerm))
                FoundTerms.Add(processedTerm);

            double dft = postings.Count();
            double wqt = Math.Log(1.0 + (numberOfDocuments / dft));

            for (int j = 0; j < postings.Count(); j++)
            {
                // postings[j][0] is the doc id; the remaining entries are positions, so tf = length - 1.
                double tftd = postings[j].Count() - 1;
                double wdt = 1.0 + Math.Log(tftd);
                ads[postings[j][0]] += wqt * wdt;
            }
        }
    }

    for (int i = 0; i < ads.Count; i++)
    {
        if (ads.ElementAt(i).Value > 0)
        {
            // Read L_d from docWeights.bin and divide A_d by L_d.
            reader.Seek(ads.ElementAt(i).Key * 8, SeekOrigin.Begin);
            var buffer = new byte[8];
            reader.Read(buffer, 0, buffer.Length);

            // The 8-byte weights are stored big-endian, so reverse on little-endian machines.
            if (BitConverter.IsLittleEndian)
                Array.Reverse(buffer);

            double ld = BitConverter.ToDouble(buffer, 0);
            ads[ads.ElementAt(i).Key] = Math.Truncate(10000000 * ads.ElementAt(i).Value / ld) / 10000000;
        }
    }

    reader.Close();
    return ads.OrderByDescending(i => i.Value);
}
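For reference, in the code's own terms (N = index.FileNames.Count, df_t = postings.Count(), tf_{t,d} = posting row length minus one, and L_d = the 8-byte weight read from docWeights.bin), ProcessRankQuery accumulates and normalizes as:

w_{q,t} = \ln\!\left(1 + \frac{N}{\mathrm{df}_t}\right), \qquad
w_{d,t} = 1 + \ln(\mathrm{tf}_{t,d}), \qquad
A_d = \sum_{t \in q} w_{q,t}\, w_{d,t}, \qquad
\mathrm{score}(d) = \frac{A_d}{L_d}

The Math.Truncate(10000000 * ... ) / 10000000 step simply truncates the final score to seven decimal places.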
public static List<int> ProcessQuery(string query, DiskPositionalIndex index)
{
    // Empty the potential misspelled words.
    PotentialMisspelledWords.Clear();

    // Trim the query.
    query = query.Trim();
    if (query == string.Empty)
        return new List<int>();

    // Verify the syntax is correct.
    if (!IsQuerySyntaxCorrect(query))
        return null;

    // Split on +, which gives us all the Qs. The "OR" is processed later.
    var qList = SplitOrQuery(query);

    // The list that will contain the final result of the query as doc ids.
    var finalResultsDocIds = new List<int>();
    var orQueryItemsResultsDocIds = new List<List<int>>();

    // Process each Q.
    foreach (string qTemp in qList)
    {
        int positiveLiterals = 0;
        bool notQuery;
        string q = qTemp.Trim();
        var andQueryItemsResultsDocIds = new List<List<int>>();
        var andNotQueryItemsResultsDocIds = new List<List<int>>();
        var notQueriesTempList = new List<List<int>>();

        // Parentheses.
        if (Regex.IsMatch(q, @"-\((.+?)\)"))
            notQuery = true;
        else
        {
            notQuery = false;
            positiveLiterals++;
        }

        var parentheses = Regex.Matches(q, @"\((.+?)\)")
            .Cast<Match>()
            .Select(m => m.Groups[1].Value)
            .ToList();

        foreach (string expression in parentheses)
        {
            var andQueryTerms = SplitWhiteSpace(expression);
            var secondAndQueryItemsResultsDocIds = new List<List<int>>();

            foreach (string termTemp in andQueryTerms)
            {
                string term = termTemp.Trim();
                if (term == string.Empty)
                    continue;

                // Wildcard query.
                if (Regex.IsMatch(term, @"(.*\*.*)+"))
                    secondAndQueryItemsResultsDocIds.Add(ProcessWildcardQuery(term, index));
                else
                {
                    var processedTerm = PorterStemmer.ProcessToken(term);
                    var postings = index.GetPostings(processedTerm, false);

                    // Add the term to the list of found terms.
                    if (!FoundTerms.Contains(processedTerm))
                        FoundTerms.Add(processedTerm);

                    if (postings == null)
                    {
                        secondAndQueryItemsResultsDocIds.Add(new List<int>());
                        if (!PotentialMisspelledWords.Contains(term))
                            PotentialMisspelledWords.Add(term);
                    }
                    else
                    {
                        secondAndQueryItemsResultsDocIds.Add(GetDocIds(postings));
                        if (!PotentialMisspelledWords.Contains(term) && postings.Count() < 5)
                            PotentialMisspelledWords.Add(term);
                    }
                }
            }

            if (secondAndQueryItemsResultsDocIds.Count > 0)
                andQueryItemsResultsDocIds.Add(MergeAndResults(secondAndQueryItemsResultsDocIds).Last());
        }

        // Remove parentheses from the Q.
        q = Regex.Replace(q, @"\((.+?)\)", "");

        // Phrase queries, enclosed in double quotes.
        if (Regex.IsMatch(q, "-\"(.+?)\""))
            notQuery = true;
        else
        {
            notQuery = false;
            positiveLiterals++;
        }

        var phraseQueries = Regex.Matches(q, "\"(.+?)\"")
            .Cast<Match>()
            .Select(m => m.Groups[1].Value)
            .ToList();

        foreach (string phraseQuery in phraseQueries)
        {
            var phraseQueryTerms = SplitWhiteSpace(phraseQuery.Trim());
            var results = ProcessPhraseQuery(index, phraseQueryTerms);
            if (results == null)
            {
                if (notQuery)
                    notQueriesTempList.Add(new List<int>());
                else
                    andQueryItemsResultsDocIds.Add(new List<int>());
            }
            else
            {
                if (notQuery)
                    notQueriesTempList.Add(results.Keys.ToList());
                else
                    andQueryItemsResultsDocIds.Add(results.Keys.ToList());
            }
        }

        // Remove phrase queries from the Q.
        if (notQuery)
            q = Regex.Replace(q, "-\"(.+?)\"", "");
        else
            q = Regex.Replace(q, "\"(.+?)\"", "");

        // Only simple words remain in the Q.
        if (q != string.Empty)
        {
            var terms = SplitWhiteSpace(q);
            foreach (string termTemp in terms)
            {
                string term = termTemp.Trim();
                if (term != string.Empty)
                {
                    // Negated wildcard query.
                    if (Regex.IsMatch(term, @"(-.*\*.*)+"))
                    {
                        positiveLiterals++;
                        notQueriesTempList.Add(ProcessWildcardQuery(term, index));
                    }
                    // Wildcard query.
                    else if (Regex.IsMatch(term, @"(.*\*.*)+"))
                        andQueryItemsResultsDocIds.Add(ProcessWildcardQuery(term, index));
                    // NOT query.
                    else if (Regex.IsMatch(term, @"-\S+"))
                    {
                        term = term.Replace("-", "");
                        var processedTerm = PorterStemmer.ProcessToken(term);
                        var postings = index.GetPostings(processedTerm, false);
                        if (postings != null)
                        {
                            // Add the term to the list of found terms.
                            if (!FoundTerms.Contains(processedTerm))
                                FoundTerms.Add(processedTerm);
                            notQueriesTempList.Add(GetDocIds(postings));
                        }
                    }
                    // Simple word.
                    else
                    {
                        var processedTerm = PorterStemmer.ProcessToken(term);
                        var postings = index.GetPostings(processedTerm, false);
                        if (postings == null)
                        {
                            andQueryItemsResultsDocIds.Add(new List<int>());
                            if (!PotentialMisspelledWords.Contains(term))
                                PotentialMisspelledWords.Add(term);
                        }
                        else
                        {
                            andQueryItemsResultsDocIds.Add(GetDocIds(postings));

                            // Add the term to the list of found terms.
                            if (!FoundTerms.Contains(processedTerm))
                                FoundTerms.Add(processedTerm);
                            if (!PotentialMisspelledWords.Contains(term) && postings.Count() < 5)
                                PotentialMisspelledWords.Add(term);
                        }
                    }
                }
            }
        }

        if (positiveLiterals == 0)
            return null;

        // If there are NOT queries.
        if (notQueriesTempList.Count > 0)
        {
            if (andQueryItemsResultsDocIds.Count > 0)
                andNotQueryItemsResultsDocIds.Add(MergeAndResults(andQueryItemsResultsDocIds).Last());
            andNotQueryItemsResultsDocIds.AddRange(notQueriesTempList);
            orQueryItemsResultsDocIds.Add(MergeAndNotResults(andNotQueryItemsResultsDocIds).Last());
        }
        // Merge all the results in an AND query.
        else if (andQueryItemsResultsDocIds.Count > 0)
            orQueryItemsResultsDocIds.Add(MergeAndResults(andQueryItemsResultsDocIds).Last());
    }

    // Merge all the OR query items results.
    if (orQueryItemsResultsDocIds.Count > 0)
    {
        orQueryItemsResultsDocIds = MergeOrResults(orQueryItemsResultsDocIds);
        finalResultsDocIds.AddRange(orQueryItemsResultsDocIds.Last());
    }

    return finalResultsDocIds;
}
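ProcessQuery also depends on MergeAndResults (and MergeAndNotResults), which are not shown here. Below is a minimal sketch of an AND merge under the same convention assumed above (cumulative results, so .Last() holds the final intersection of sorted doc-id lists); it is illustrative only, and the project's real helpers may differ.

// Illustrative sketch only: classic two-pointer intersection of sorted doc-id lists.
private static List<int> IntersectSorted(List<int> a, List<int> b)
{
    var result = new List<int>();
    int i = 0, j = 0;
    while (i < a.Count && j < b.Count)
    {
        if (a[i] == b[j]) { result.Add(a[i]); i++; j++; }
        else if (a[i] < b[j]) i++;
        else j++;
    }
    return result;
}

private static List<List<int>> MergeAndResults(List<List<int>> resultLists)
{
    var merged = new List<List<int>>();
    List<int> current = null;
    foreach (var list in resultLists)
    {
        current = current == null ? new List<int>(list) : IntersectSorted(current, list);
        merged.Add(current); // each step accumulates, so Last() is the final intersection
    }
    return merged;
}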
public static Dictionary<int, List<int>> ProcessPhraseQuery(DiskPositionalIndex index, List<string> wordsList)
{
    int[][] word1Postings = null;

    foreach (var word in wordsList)
    {
        if (word1Postings == null)
        {
            word1Postings = index.GetPostings(PorterStemmer.ProcessToken(word.Trim()), true);

            // Check if the word could be misspelled.
            if (word1Postings != null)
            {
                if (word1Postings.Count() < 5 && !PotentialMisspelledWords.Contains(word))
                    PotentialMisspelledWords.Add(word);
            }
            else if (!PotentialMisspelledWords.Contains(word))
                PotentialMisspelledWords.Add(word);
        }
        else
        {
            var word2Postings = index.GetPostings(PorterStemmer.ProcessToken(word.Trim()), true);
            if (word2Postings == null)
            {
                if (!PotentialMisspelledWords.Contains(word))
                    PotentialMisspelledWords.Add(word);
                return null;
            }

            if (word1Postings.Count() < 5 && !PotentialMisspelledWords.Contains(word))
                PotentialMisspelledWords.Add(word);

            word1Postings = Process2WordPhraseQuery(word1Postings, word2Postings);
        }

        if (word1Postings == null)
            return null;
    }

    // Add the phrase query to the found terms as a group.
    string tmp = "";
    foreach (var word in wordsList)
    {
        tmp += word + " ";
    }
    FoundTerms.Add(tmp.Trim());

    // Convert the postings arrays (doc id followed by positions) into a dictionary keyed by doc id.
    var resultPostingsList = new Dictionary<int, List<int>>();
    for (int i = 0; i < word1Postings.Length; i++)
    {
        resultPostingsList.Add(word1Postings[i][0], new List<int>());
        for (int j = 1; j < word1Postings[i].Length; j++)
        {
            resultPostingsList[word1Postings[i][0]].Add(word1Postings[i][j]);
        }
    }

    return resultPostingsList;
}
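Process2WordPhraseQuery is not shown in this section. Judging from how ProcessPhraseQuery chains its result back into word1Postings and later reads each row as {docId, positions...}, it performs a positional intersection that keeps documents where the second word appears directly after the first. The sketch below is an assumption-laden illustration of that idea, not the project's actual code.

// Illustrative sketch only; assumes each posting row is {docId, pos1, pos2, ...} and,
// for matching documents, keeps the positions of the *second* word so the result can
// be chained for phrases longer than two words. Returns null when nothing matches.
private static int[][] Process2WordPhraseQuery(int[][] word1Postings, int[][] word2Postings)
{
    var results = new List<int[]>();
    int i = 0, j = 0;
    while (i < word1Postings.Length && j < word2Postings.Length)
    {
        int doc1 = word1Postings[i][0], doc2 = word2Postings[j][0];
        if (doc1 < doc2) { i++; continue; }
        if (doc2 < doc1) { j++; continue; }

        // Same document: keep positions of word 2 that directly follow a position of word 1.
        var matched = new List<int> { doc1 };
        for (int p1 = 1; p1 < word1Postings[i].Length; p1++)
            for (int p2 = 1; p2 < word2Postings[j].Length; p2++)
                if (word2Postings[j][p2] == word1Postings[i][p1] + 1)
                    matched.Add(word2Postings[j][p2]);

        if (matched.Count > 1)
            results.Add(matched.ToArray());
        i++;
        j++;
    }
    return results.Count > 0 ? results.ToArray() : null;
}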
private void indexADirectoryToolStripMenuItem_Click(object sender, EventArgs e)
{
    var fbd = new FolderBrowserDialog
    {
        ShowNewFolderButton = false,
        Description = "Choose the directory you want to index"
    };
    fbd.ShowDialog();
    _directoryPath = fbd.SelectedPath;
    if (string.IsNullOrEmpty(_directoryPath))
        return;

    var filenames = Directory.GetFiles(_directoryPath, "*.bin")
        .Select(Path.GetFileNameWithoutExtension)
        .ToArray();

    DialogResult result = DialogResult.No;
    if (filenames.Contains("kGramIndex") && filenames.Contains("kGramVocab") && filenames.Contains("kGram")
        && filenames.Contains("vocab") && filenames.Contains("vocabTable") && filenames.Contains("postings")
        && filenames.Contains("statistics") && filenames.Contains("mostFreqWord") && filenames.Contains("docWeights")
        && filenames.Contains("matrix") && filenames.Contains("vocabMatrix") && filenames.Contains("vocabTableMatrix"))
        result = MessageBox.Show("This directory is already indexed, let's skip the long indexation! :)",
            "Directory already indexed", MessageBoxButtons.YesNo);

    if (result == DialogResult.No)
    {
        if (_index != null)
            _index.Dispose();

        labelIndexing.Show();
        panelArticle.Hide();
        panelResults.Hide();
        panelSearch.Hide();
        labelIndexing.BringToFront();
        progressBar.BringToFront();
        Update();

        var writer = new IndexWriter(_directoryPath);
        writer.BuildIndex(this);

        // Write the KGram Index to the disk
        KGramIndex.ToDisk(_directoryPath);
    }

    // Load the Disk positional index into memory
    _index = new DiskPositionalIndex(_directoryPath);

    // Load the KGram index in memory
    KGramIndex.ToMemory(_directoryPath);

    // Load the matrix to memory
    QueryReformulation.ToMemory(_directoryPath);

    toolStripMenuItemStatistics.Enabled = true;
    labelIndexing.Hide();
    textBoxSearch.Enabled = true;
    textBoxSearch.Select();
    textBoxSearch.Text = "Indexing done ^^";
    textBoxSearch.SelectionStart = 0;
    textBoxSearch.SelectionLength = textBoxSearch.Text.Length;
    checkBoxBool.Enabled = true;
    checkBoxRank.Enabled = true;
}
public void GetRankedDocumentsTest()
{
    // Path where all the bin files will be written.
    string pathToIndex = Path.Join(corpusDir, "/index/");

    // Tell the Indexer where to write the bin files.
    Indexer.path = pathToIndex;

    // Read the corpus.
    IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

    // Create the directory for the bin files if it doesn't exist.
    Directory.CreateDirectory(pathToIndex);

    // Initialize the index.
    IIndex index = Indexer.IndexCorpus(corpus);

    // The rest of your code...
    List<string> terms = new List<string>();
    terms.Add("hello");
    terms.Add("world");

    // Test ranked retrieval and accumulated values (Default strategy).
    index = new DiskPositionalIndex(pathToIndex);
    RankedRetrieval rv = new RankedRetrieval(corpus, index, "Default");
    IList<MaxPriorityQueue.InvertedIndex> actual = rv.GetTopTen(terms);
    actual[0].GetDocumentId().Should().Be(0); // should be document 1, which has doc id 0
    actual[0].GetRank().Should().BeApproximately(1.183748156, 0.000000001); // A_{document} = 3.10195041, L_{1} = 2.620447934
    actual[1].GetDocumentId().Should().Be(2);
    actual[2].GetDocumentId().Should().Be(1);
    actual[3].GetDocumentId().Should().Be(4);

    // Test tf-idf.
    rv = new RankedRetrieval(corpus, index, "Tf-idf");
    IList<MaxPriorityQueue.InvertedIndex> actual1 = rv.GetTopTen(terms);
    actual1[0].GetDocumentId().Should().Be(2);
    actual1[0].GetRank().Should().BeApproximately(0.948215482, 0.000000001);
    actual1[1].GetDocumentId().Should().Be(0);
    actual1[1].GetRank().Should().BeApproximately(0.893296803, 0.000000001);
    actual1[2].GetDocumentId().Should().Be(1);
    actual1[2].GetRank().Should().BeApproximately(0.150554959, 0.000000001);
    actual1[3].GetDocumentId().Should().Be(4);
    actual1[3].GetRank().Should().BeApproximately(0.150554959, 0.000000001);

    // Test Okapi BM25.
    rv = new RankedRetrieval(corpus, index, "Okapi");
    IList<MaxPriorityQueue.InvertedIndex> actual2 = rv.GetTopTen(terms);
    actual2[0].GetDocumentId().Should().Be(0);
    actual2[0].GetRank().Should().BeApproximately(0.66590893, 0.000000001);
    actual2[1].GetDocumentId().Should().Be(2);
    actual2[1].GetRank().Should().BeApproximately(0.507521667, 0.000000001);
    actual2[2].GetDocumentId().Should().Be(1);
    actual2[2].GetRank().Should().BeApproximately(0.1089371981, 0.000000001);
    actual2[3].GetDocumentId().Should().Be(4);
    actual2[3].GetRank().Should().BeApproximately(0.1084371981, 0.000000001);

    // Test Wacky.
    rv = new RankedRetrieval(corpus, index, "Wacky");
    IList<MaxPriorityQueue.InvertedIndex> actual3 = rv.GetTopTen(terms);
    actual3[0].GetDocumentId().Should().Be(0);
    actual3[0].GetRank().Should().BeApproximately(0.284824391, 0.000000001);
    actual3[1].GetDocumentId().Should().Be(2);
    actual3[1].GetRank().Should().BeApproximately(0.259673474, 0.000000001);
    actual3[2].GetDocumentId().Should().Be(1);
    actual3[2].GetRank().Should().Be(0.0);
    actual3[3].GetDocumentId().Should().Be(4);
    actual3[3].GetRank().Should().Be(0.0);
} // end of GetRankedDocumentsTest()
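The expected rank in the first "Default" assertion is just the arithmetic spelled out in its inline comment: the accumulated value divided by the stored document weight,

\mathrm{score}(d_1) = \frac{A_{d_1}}{L_{d_1}} = \frac{3.10195041}{2.620447934} \approx 1.183748156

which matches the value asserted for document id 0.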