コード例 #1
0
        public void TestingTierIndex()
        {
            // Directory (under the corpus directory) that receives the on-disk index files.
            string indexDirectory = Path.Join(corpusDir, "/index/");

            // Tell the indexer where to write its .bin files.
            Indexer.path = indexDirectory;

            // Read the corpus and index it. The returned in-memory index is not used:
            // the index is re-opened from the files on disk right below.
            IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);
            Indexer.IndexCorpus(corpus);
            IIndex index = new DiskPositionalIndex(indexDirectory);

            // Terms whose positional postings are inspected.
            List<string> terms = new List<string> { "hello", "world" };
            IList<Posting> postings = index.GetPositionalPostings(terms);

            // Output lines: the posting count first, then a title/id pair per posting.
            IList<String> output = new List<string> { postings.Count.ToString() };

            foreach (Posting posting in postings)
            {
                // Once 20 or more lines have been collected, nothing further is added.
                if (output.Count >= 20)
                {
                    continue;
                }

                // Resolve the posting's document id to the document itself.
                IDocument document = corpus.GetDocument(posting.DocumentId);
                output.Add(document.Title);
                output.Add(document.DocumentId.ToString());
            }

            foreach (string line in output)
            {
                Console.WriteLine(line);
            }
        } // end TestingTierIndex()
コード例 #2
0
        /// <summary>
        /// Resolves a wildcard term to document ids.
        /// The term is expanded with <c>KGramIndex.GenerateNormalQuery</c>, the expansion is split
        /// with <c>SplitOrQuery</c>, and each resulting word is stemmed and looked up in the index.
        /// </summary>
        /// <param name="query">The wildcard term to expand.</param>
        /// <param name="index">The on-disk index to query.</param>
        /// <returns>
        /// The last list produced by <c>MergeOrResults</c> over the per-word document-id lists
        /// (presumably the union of all of them - confirm against MergeOrResults).
        /// </returns>
        public static List<int> ProcessWildcardQuery(string query, DiskPositionalIndex index)
        {
            var docIdsPerWord = new List<List<int>>();

            foreach (string candidate in SplitOrQuery(KGramIndex.GenerateNormalQuery(query)))
            {
                var stemmed = PorterStemmer.ProcessToken(candidate.Trim());
                var candidatePostings = index.GetPostings(stemmed, false);
                docIdsPerWord.Add(GetDocIds(candidatePostings));

                // Record every expanded term exactly once in the list of found terms.
                if (FoundTerms.Contains(stemmed))
                {
                    continue;
                }
                FoundTerms.Add(stemmed);
            }

            var merged = MergeOrResults(docIdsPerWord);
            return merged.Last();
        }
コード例 #3
0
        /// <summary>
        /// Scores every document of the index against a free-text query and returns the
        /// (document id, score) pairs ordered by descending score.
        /// </summary>
        /// <remarks>
        /// For each query term t with postings: wqt = ln(1 + N / dft), and for each posting
        /// wdt = 1 + ln(tftd), where tftd is the posting's length minus one (element 0 of a
        /// posting is the document id). The accumulated score of every document with a positive
        /// accumulator is divided by the document weight read from "docWeights.bin" and truncated
        /// to 7 decimal places. The weight of document d is read as an 8-byte big-endian double
        /// at byte offset d * 8.
        /// </remarks>
        /// <param name="query">The free-text query; terms are separated with SplitWhiteSpace.</param>
        /// <param name="index">The on-disk index to query.</param>
        /// <param name="folder">Directory that contains "docWeights.bin".</param>
        /// <returns>
        /// Null when <paramref name="query"/> is null, empty or white space; otherwise one entry
        /// per document (including documents with a score of 0), highest score first.
        /// </returns>
        /// <exception cref="EndOfStreamException">
        /// "docWeights.bin" ends before the weight of a scored document could be read.
        /// </exception>
        public static IEnumerable<KeyValuePair<int, double>> ProcessRankQuery(string query,
            DiskPositionalIndex index, string folder)
        {
            if (string.IsNullOrWhiteSpace(query)) return null;

            double numberOfDocuments = index.FileNames.Count;

            var ads = new Dictionary<int, double>();

            // The using statement guarantees the weights file is closed even when scoring throws.
            using (var reader = new FileStream(Path.Combine(folder, "docWeights.bin"), FileMode.Open, FileAccess.Read))
            {
                // One accumulator per document, all starting at 0.
                for (int i = 0; i < numberOfDocuments; i++)
                {
                    ads.Add(i, 0);
                }

                foreach (var term in SplitWhiteSpace(query))
                {
                    var processedTerm = PorterStemmer.ProcessToken(term);
                    var postings = index.GetPostings(processedTerm, true);

                    if (postings == null) continue;

                    // Add the term to the list of found terms.
                    if (!FoundTerms.Contains(processedTerm))
                        FoundTerms.Add(processedTerm);

                    double dft = postings.Length;
                    double wqt = Math.Log(1.0 + (numberOfDocuments / dft));

                    for (int j = 0; j < postings.Length; j++)
                    {
                        // Element 0 is the document id; the remaining elements are counted as occurrences.
                        double tftd = postings[j].Length - 1;
                        double wdt = 1.0 + Math.Log(tftd);

                        ads[postings[j][0]] += wqt * wdt;
                    }
                }

                // Snapshot the keys once: values are updated while walking them, and indexing the
                // dictionary with ElementAt(i) on every step would make this loop quadratic.
                var buffer = new byte[sizeof(double)];
                foreach (int docId in ads.Keys.ToList())
                {
                    double ad = ads[docId];
                    if (ad > 0)
                    {
                        // Read Ld from the file and divide Ad by Ld.
                        reader.Seek((long)docId * sizeof(double), SeekOrigin.Begin);

                        // Stream.Read may return fewer bytes than requested, so fill the buffer in a loop.
                        int filled = 0;
                        while (filled < buffer.Length)
                        {
                            int read = reader.Read(buffer, filled, buffer.Length - filled);
                            if (read == 0)
                                throw new EndOfStreamException(
                                    "docWeights.bin is truncated: no weight found for document " + docId + ".");
                            filled += read;
                        }

                        // The file stores doubles big-endian; BitConverter uses the machine's byte order.
                        if (BitConverter.IsLittleEndian)
                            Array.Reverse(buffer);
                        double ld = BitConverter.ToDouble(buffer, 0);

                        // Keep 7 decimal places.
                        ads[docId] = Math.Truncate(10000000 * ad / ld) / 10000000;
                    }
                }
            }

            return ads.OrderByDescending(i => i.Value);
        }
コード例 #4
0
        /// <summary>
        /// Evaluates a boolean query against the index and returns the matching document ids.
        /// The query is split into sub-queries ("Q"s) with SplitOrQuery. Inside each Q, parenthesised
        /// groups, quoted phrases, wildcard terms, '-' prefixed (NOT) terms and plain terms are
        /// resolved to document-id lists, combined with MergeAndResults / MergeAndNotResults, and the
        /// per-Q results are finally combined with MergeOrResults.
        /// Side effects: clears and refills PotentialMisspelledWords and adds to FoundTerms.
        /// </summary>
        /// <param name="query">The raw query text; it is trimmed before processing.</param>
        /// <param name="index">The on-disk index to query.</param>
        /// <returns>
        /// An empty list when the trimmed query is empty; null when IsQuerySyntaxCorrect rejects the
        /// query or when a Q ends up with no positive literal counted; otherwise the resulting
        /// document ids.
        /// </returns>
        public static List<int> ProcessQuery(string query, DiskPositionalIndex index)
        {
            // Empty the potential misspelled words.
            PotentialMisspelledWords.Clear();

            // Trim the query.
            query = query.Trim();

            if (query == string.Empty)
                return new List<int>();

            // Verify the syntax is correct.
            if (!IsQuerySyntaxCorrect(query))
                return null;

            // Split by +, it gives us all the Qs. We will process the "OR" later.
            var qList = SplitOrQuery(query);

            // The list that will contain the final result of the query as docids.
            var finalResultsDocIds = new List<int>();

            // One entry per Q: the document ids that satisfy that Q.
            var orQueryItemsResultsDocIds = new List<List<int>>();
            // Process each Q:
            foreach (string qTemp in qList)
            {
                // Counts what this Q treats as positive parts; a Q whose count stays at 0 aborts the
                // whole query (see the check after the term loop).
                int positiveLiterals = 0;
                bool notQuery;

                string q = qTemp.Trim();

                // Lists to intersect (AND), the ordered input for the AND NOT merge, and the lists to exclude.
                var andQueryItemsResultsDocIds = new List<List<int>>();
                var andNotQueryItemsResultsDocIds = new List<List<int>>();
                var notQueriesTempList = new List<List<int>>();

                // Parentheses.
                // NOTE(review): the notQuery value assigned here is overwritten in the phrase section
                // below without being read, so a "-( ... )" group is still added to the AND lists like
                // a positive group - confirm whether negated groups are meant to be supported.
                if (Regex.IsMatch(q, @"-\((.+?)\)")) notQuery = true;
                else
                {
                    notQuery = false;
                    positiveLiterals++;
                }
                // Text inside each "( ... )" group (lazy match, so nested parentheses are not handled).
                var parentheses = Regex.Matches(q, @"\((.+?)\)")
                    .Cast<Match>()
                    .Select(m => m.Groups[1].Value)
                    .ToList();
                foreach (string expression in parentheses)
                {
                    var andQueryTerms = SplitWhiteSpace(expression);
                    var secondAndQueryItemsResultsDocIds = new List<List<int>>();

                    foreach (string termTemp in andQueryTerms)
                    {
                        string term = termTemp.Trim();
                        if (term == string.Empty) continue;

                        // If Wildcard query.
                        if (Regex.IsMatch(term, @"(.*\*.*)+"))
                            secondAndQueryItemsResultsDocIds.Add(ProcessWildcardQuery(term, index));
                        else
                        {
                            var processedTerm = PorterStemmer.ProcessToken(term);
                            var postings = index.GetPostings(processedTerm, false);

                            // Add the term to the list of the found term.
                            if (!FoundTerms.Contains(processedTerm))
                                FoundTerms.Add(processedTerm);
                            if (postings == null)
                            {
                                // Unknown term: contributes an empty list and is flagged as a possible misspelling.
                                secondAndQueryItemsResultsDocIds.Add(new List<int>());
                                if (!PotentialMisspelledWords.Contains(term))
                                    PotentialMisspelledWords.Add(term);
                            }
                            else
                            {
                                secondAndQueryItemsResultsDocIds.Add(GetDocIds(postings));
                                // Terms with fewer than 5 postings are also flagged as possible misspellings.
                                if (!PotentialMisspelledWords.Contains(term) && postings.Count() < 5)
                                    PotentialMisspelledWords.Add(term);
                            }
                        }
                    }
                    // The terms of one group are merged with MergeAndResults; the last merged list is
                    // the group's result.
                    if (secondAndQueryItemsResultsDocIds.Count > 0)
                        andQueryItemsResultsDocIds.Add(MergeAndResults(secondAndQueryItemsResultsDocIds).Last());
                }
                // Remove parentheses from the Q.
                q = Regex.Replace(q, @"\((.+?)\)", "");

                // Phrase queries with " ".
                // notQuery becomes true as soon as the Q contains at least one -"..." phrase.
                if (Regex.IsMatch(q, "-\"(.+?)\"")) notQuery = true;
                else
                {
                    notQuery = false;
                    positiveLiterals++;
                }
                var phraseQueries = Regex.Matches(q, "\"(.+?)\"")
                    .Cast<Match>()
                    .Select(m => m.Groups[1].Value)
                    .ToList();
                // NOTE(review): every quoted phrase of the Q is matched here, and all of them go to the
                // NOT list when notQuery is true - including phrases without a leading '-'. Confirm
                // that mixing positive and negated phrases in one Q is not expected.
                foreach (string phraseQuery in phraseQueries)
                {
                    var phraseQueryTerms = SplitWhiteSpace(phraseQuery.Trim());
                    var results = ProcessPhraseQuery(index, phraseQueryTerms);
                    if (results == null)
                    {
                        // No match for the phrase: it contributes an empty list.
                        if (notQuery) notQueriesTempList.Add(new List<int>());
                        else andQueryItemsResultsDocIds.Add(new List<int>());
                    }
                    else
                    {
                        // The keys of the phrase result are the matching document ids.
                        if (notQuery) notQueriesTempList.Add(results.Keys.ToList());
                        else andQueryItemsResultsDocIds.Add(results.Keys.ToList());
                    }
                }
                // Remove phrase queries from the Q.
                // When notQuery is true only the -"..." phrases are removed from q.
                if (notQuery) q = Regex.Replace(q, "-\"(.+?)\"", "");
                else q = Regex.Replace(q, "\"(.+?)\"", "");

                // In the Q, it only remains simple words.
                if (q != string.Empty)
                {
                    var terms = SplitWhiteSpace(q);
                    foreach (string termTemp in terms)
                    {
                        string term = termTemp.Trim();
                        if (term != string.Empty)
                        {
                            // If Wildcard query.
                            // Negated wildcard: the term is passed on with its '-' still in place, and it
                            // increments positiveLiterals.
                            // NOTE(review): counting a negated term as a positive literal looks
                            // unintended - confirm.
                            if (Regex.IsMatch(term, @"(-.*\*.*)+"))
                            {
                                positiveLiterals++;
                                notQueriesTempList.Add(ProcessWildcardQuery(term, index));
                            }
                            else if (Regex.IsMatch(term, @"(.*\*.*)+"))
                                andQueryItemsResultsDocIds.Add(ProcessWildcardQuery(term, index));

                            // Not query.
                            else if (Regex.IsMatch(term, @"-\S+"))
                            {
                                // Replace removes every '-' in the term, not only the leading one.
                                term = term.Replace("-", "");
                                var processedTerm = PorterStemmer.ProcessToken(term);
                                var postings = index.GetPostings(processedTerm, false);
                                // A negated term without postings is ignored.
                                if (postings != null)
                                {
                                    // Add the term to the list of found terms.
                                    if (!FoundTerms.Contains(processedTerm))
                                        FoundTerms.Add(processedTerm);

                                    notQueriesTempList.Add(GetDocIds(postings));
                                }
                            }

                            // Simple word.
                            else
                            {
                                var processedTerm = PorterStemmer.ProcessToken(term);
                                var postings = index.GetPostings(processedTerm, false);
                                if (postings == null)
                                {
                                    // Unknown term: contributes an empty list and is flagged as a possible misspelling.
                                    andQueryItemsResultsDocIds.Add(new List<int>());
                                    if (!PotentialMisspelledWords.Contains(term))
                                        PotentialMisspelledWords.Add(term);
                                }
                                else
                                {
                                    andQueryItemsResultsDocIds.Add(GetDocIds(postings));
                                    // Add the term to the list of found terms.
                                    if (!FoundTerms.Contains(processedTerm))
                                        FoundTerms.Add(processedTerm);
                                    if (!PotentialMisspelledWords.Contains(term) && postings.Count() < 5)
                                        PotentialMisspelledWords.Add(term);
                                }
                            }
                        }
                    }
                }

                // positiveLiterals is still 0 only when the Q contained both a negated "-( ... )" group
                // and a negated -"..." phrase and no negated wildcard term; the whole query is then
                // rejected, even if earlier Qs already produced results.
                if (positiveLiterals == 0) return null;

                // If there are NOT queries.
                if (notQueriesTempList.Count > 0)
                {
                    // The AND-merged positive result (if any) goes first, followed by the lists to exclude.
                    if (andQueryItemsResultsDocIds.Count > 0)
                        andNotQueryItemsResultsDocIds.Add(MergeAndResults(andQueryItemsResultsDocIds).Last());

                    andNotQueryItemsResultsDocIds.AddRange(notQueriesTempList);

                    orQueryItemsResultsDocIds.Add(MergeAndNotResults(andNotQueryItemsResultsDocIds).Last());
                }

                else
                    // Merge all the results in a AND query.
                    if (andQueryItemsResultsDocIds.Count > 0)
                    orQueryItemsResultsDocIds.Add(MergeAndResults(andQueryItemsResultsDocIds).Last());
            }

            // Merge all the OR query items results.
            if (orQueryItemsResultsDocIds.Count > 0)
            {
                // The last list returned by MergeOrResults is used as the final result.
                orQueryItemsResultsDocIds = MergeOrResults(orQueryItemsResultsDocIds);
                finalResultsDocIds.AddRange(orQueryItemsResultsDocIds.Last());
            }

            return finalResultsDocIds;
        }
コード例 #5
0
        /// <summary>
        /// Evaluates a phrase query: the postings of the first word are combined with the postings
        /// of each following word, in order, through Process2WordPhraseQuery.
        /// Side effects: words without postings, or with fewer than 5 postings, are added to
        /// PotentialMisspelledWords; a matched phrase is added to FoundTerms as one
        /// space-separated entry.
        /// </summary>
        /// <param name="index">The on-disk index to query.</param>
        /// <param name="wordsList">The words of the phrase, in order.</param>
        /// <returns>
        /// Null when <paramref name="wordsList"/> is null or empty, when a word has no postings, or
        /// when Process2WordPhraseQuery returns null; otherwise a dictionary that maps each element 0
        /// of the resulting postings (the document id) to the remaining elements of that posting.
        /// </returns>
        public static Dictionary<int, List<int>> ProcessPhraseQuery(DiskPositionalIndex index, List<string> wordsList)
        {
            // An empty phrase cannot match anything (previously this ended in a NullReferenceException).
            if (wordsList == null || wordsList.Count == 0)
                return null;

            int[][] word1Postings = null;

            foreach (var word in wordsList)
            {
                if (word1Postings == null)
                {
                    // First word: its postings seed the running result.
                    word1Postings = index.GetPostings(PorterStemmer.ProcessToken(word.Trim()), true);

                    // Check if the word could be misspelled.
                    if (word1Postings != null)
                    {
                        if (word1Postings.Length < 5 && !PotentialMisspelledWords.Contains(word))
                            PotentialMisspelledWords.Add(word);
                    }
                    else if (!PotentialMisspelledWords.Contains(word))
                        PotentialMisspelledWords.Add(word);
                }
                else
                {
                    int[][] word2Postings = index.GetPostings(PorterStemmer.ProcessToken(word.Trim()), true);
                    if (word2Postings == null)
                    {
                        if (!PotentialMisspelledWords.Contains(word))
                            PotentialMisspelledWords.Add(word);
                        return null;
                    }

                    // The heuristic must look at the postings of the current word; the running phrase
                    // result (word1Postings) says nothing about how rare this word is.
                    if (word2Postings.Length < 5 && !PotentialMisspelledWords.Contains(word))
                        PotentialMisspelledWords.Add(word);

                    word1Postings = Process2WordPhraseQuery(word1Postings, word2Postings);
                }

                // Either the first word is unknown or the phrase merge produced no result.
                if (word1Postings == null)
                    return null;
            }

            // Add the phrase to the found terms as one group, once, like every other found term.
            string phrase = string.Join(" ", wordsList).Trim();
            if (!FoundTerms.Contains(phrase))
                FoundTerms.Add(phrase);

            var resultPostingsList = new Dictionary<int, List<int>>();
            foreach (int[] posting in word1Postings)
            {
                // Element 0 is the key (document id); the remaining elements are copied as its values.
                var positions = new List<int>();
                for (int j = 1; j < posting.Length; j++)
                {
                    positions.Add(posting[j]);
                }
                resultPostingsList.Add(posting[0], positions);
            }
            return resultPostingsList;
        }
コード例 #6
0
        /// <summary>
        /// Menu handler: lets the user pick a directory, builds its index (unless all index files
        /// are already present and the user chooses to reuse them), then loads the positional
        /// index, the k-gram index and the query-reformulation matrix and enables the search UI.
        /// </summary>
        /// <param name="sender">The menu item that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private void indexADirectoryToolStripMenuItem_Click(object sender, EventArgs e)
        {
            string selectedPath;

            // The dialog holds native resources, so dispose it as soon as the choice is known.
            using (var fbd = new FolderBrowserDialog
            {
                ShowNewFolderButton = false,
                Description = "Choose the directory you want to index"
            })
            {
                // A cancelled dialog must not change anything, in particular not _directoryPath,
                // which still describes the index that is currently loaded.
                if (fbd.ShowDialog() != DialogResult.OK) return;
                selectedPath = fbd.SelectedPath;
            }

            if (string.IsNullOrEmpty(selectedPath)) return;
            _directoryPath = selectedPath;

            // Names (without extension) of the .bin files a complete index consists of.
            string[] requiredIndexFiles =
            {
                "kGramIndex", "kGramVocab", "kGram", "vocab", "vocabTable", "postings",
                "statistics", "mostFreqWord", "docWeights", "matrix", "vocabMatrix", "vocabTableMatrix"
            };

            var filenames = Directory.GetFiles(_directoryPath, "*.bin")
                                     .Select(Path.GetFileNameWithoutExtension)
                                     .ToArray();

            // "No" (re-index) is the default; the question is only asked when every index file exists.
            DialogResult result = DialogResult.No;
            if (requiredIndexFiles.All(name => filenames.Contains(name)))
                result = MessageBox.Show("This directory is already indexed, let's skip the long indexation! :)", "Directory already indexed", MessageBoxButtons.YesNo);

            // Release the previously loaded index in both cases: before its files are rewritten,
            // and before it is replaced by the index of the newly selected directory.
            if (_index != null)
            {
                _index.Dispose();
                _index = null;
            }

            if (result == DialogResult.No)
            {
                labelIndexing.Show();
                panelArticle.Hide();
                panelResults.Hide();
                panelSearch.Hide();
                labelIndexing.BringToFront();
                progressBar.BringToFront();
                Update();
                var writer = new IndexWriter(_directoryPath);
                writer.BuildIndex(this);

                // Write the KGram index to the disk.
                KGramIndex.ToDisk(_directoryPath);
            }

            // Load the disk positional index into memory.
            _index = new DiskPositionalIndex(_directoryPath);

            // Load the KGram index in memory.
            KGramIndex.ToMemory(_directoryPath);

            // Load the matrix to memory.
            QueryReformulation.ToMemory(_directoryPath);

            // Indexing is finished: enable searching and pre-select the status text so that the
            // first key press replaces it.
            toolStripMenuItemStatistics.Enabled = true;
            labelIndexing.Hide();
            textBoxSearch.Enabled = true;
            textBoxSearch.Select();
            textBoxSearch.Text = "Indexing done ^^";
            textBoxSearch.SelectionStart = 0;
            textBoxSearch.SelectionLength = textBoxSearch.Text.Length;

            checkBoxBool.Enabled = true;
            checkBoxRank.Enabled = true;
        }
コード例 #7
0
        public void GetRankedDocumentsTest()
        {
            // BeApproximately takes the maximum allowed absolute difference, not a number of
            // decimal places: the former value of 9 accepted any rank within +/-9 of the expected
            // one. The expected ranks below are hand-computed to about 8 significant decimals.
            const double Tolerance = 1e-6;

            // Path to where all the bin files will be written to.
            string pathToIndex = Path.Join(corpusDir, "/index/");

            // Let the Indexer know where it should write all bin files.
            Indexer.path = pathToIndex;

            // Read the corpus.
            IDocumentCorpus corpus = DirectoryCorpus.LoadTextDirectory(corpusDir);

            // Create the directory for the bin files if it doesn't exist.
            Directory.CreateDirectory(pathToIndex);

            // Build the index on disk; the returned in-memory index is not needed because the
            // test works on the index re-opened from disk.
            Indexer.IndexCorpus(corpus);
            IIndex index = new DiskPositionalIndex(pathToIndex);

            List<string> terms = new List<string> { "hello", "world" };

            // Tests the default ranked retrieval and its accumulated values.
            RankedRetrieval rv = new RankedRetrieval(corpus, index, "Default");
            IList<MaxPriorityQueue.InvertedIndex> actual = rv.GetTopTen(terms);
            actual[0].GetDocumentId().Should().Be(0); // should be document 1, which has doc id 0
            actual[0].GetRank().Should().BeApproximately(1.183748156, Tolerance); // A_{document} = 3.10195041, L_{1} = 2.620447934
            actual[1].GetDocumentId().Should().Be(2);
            actual[2].GetDocumentId().Should().Be(1);
            actual[3].GetDocumentId().Should().Be(4);

            // Tests tf-idf.
            rv = new RankedRetrieval(corpus, index, "Tf-idf");
            IList<MaxPriorityQueue.InvertedIndex> actual1 = rv.GetTopTen(terms);
            actual1[0].GetDocumentId().Should().Be(2);
            actual1[0].GetRank().Should().BeApproximately(0.948215482, Tolerance);
            actual1[1].GetDocumentId().Should().Be(0);
            actual1[1].GetRank().Should().BeApproximately(0.893296803, Tolerance);
            actual1[2].GetDocumentId().Should().Be(1);
            actual1[2].GetRank().Should().BeApproximately(0.150554959, Tolerance);
            actual1[3].GetDocumentId().Should().Be(4);
            actual1[3].GetRank().Should().BeApproximately(0.150554959, Tolerance);

            // Tests Okapi BM25.
            // NOTE(review): the expected ranks of documents 1 and 4 differ only in the 4th decimal
            // (0.1089... vs 0.1084...); confirm that this is intended and not a typo.
            rv = new RankedRetrieval(corpus, index, "Okapi");
            IList<MaxPriorityQueue.InvertedIndex> actual2 = rv.GetTopTen(terms);
            actual2[0].GetDocumentId().Should().Be(0);
            actual2[0].GetRank().Should().BeApproximately(0.66590893, Tolerance);
            actual2[1].GetDocumentId().Should().Be(2);
            actual2[1].GetRank().Should().BeApproximately(0.507521667, Tolerance);
            actual2[2].GetDocumentId().Should().Be(1);
            actual2[2].GetRank().Should().BeApproximately(0.1089371981, Tolerance);
            actual2[3].GetDocumentId().Should().Be(4);
            actual2[3].GetRank().Should().BeApproximately(0.1084371981, Tolerance);

            // Tests Wacky.
            rv = new RankedRetrieval(corpus, index, "Wacky");
            IList<MaxPriorityQueue.InvertedIndex> actual3 = rv.GetTopTen(terms);
            actual3[0].GetDocumentId().Should().Be(0);
            actual3[0].GetRank().Should().BeApproximately(0.284824391, Tolerance);
            actual3[1].GetDocumentId().Should().Be(2);
            actual3[1].GetRank().Should().BeApproximately(0.259673474, Tolerance);
            actual3[2].GetDocumentId().Should().Be(1);
            actual3[2].GetRank().Should().Be(0.0);
            actual3[3].GetDocumentId().Should().Be(4);
            actual3[3].GetRank().Should().Be(0.0);

        } // end of GetRankedDocumentsTest()