// Private methods below here -  only available within the SearchUtilities class.

        /*
         * Scans each file and checks every search term (and, when synonym checking
         * is on, its synonyms) against the words of that file.
         * A file is marked as matching only if all search terms are found in it.
         * Returns the list of files that match.
         */
        private List <string> ScanFilesByFile(List <string> files, string[] searchTerms,
                                              Boolean synonymsOn, NewWordsDataSet dataSet)
        {
            // Purpose: return the files in which EVERY search term is matched by at
            // least one word. A word matches a term when their stems are equal or,
            // with synonymsOn, when the word equals the lower-cased term or one of
            // the synonyms returned by db.GetSynonyms(term).
            db      = new Database(dataSet);
            stemmer = new PorterStemmer();
            List <string> fileContainsTerm = new List <string>();

            // Everything derived from a term alone is independent of the file and
            // word being examined, so it is computed once here instead of once per
            // word (the synonym lookup in particular goes through the Database).
            // NOTE(review): assumes GetSynonyms and StemWord return the same result
            // for the same input on every call - confirm.
            int termCount = searchTerms.Length;
            string[]        stemmedTerms = new string[termCount];
            List <string>[] exactMatches = new List <string> [termCount]; // stays null per term when synonyms are off

            for (int t = 0; t < termCount; t++)
            {
                string loweredTerm = searchTerms[t].ToLower();
                stemmedTerms[t] = stemmer.StemWord(loweredTerm);

                if (synonymsOn)
                {
                    List <string> candidates = new List <string> {
                        loweredTerm
                    };
                    List <string> synonyms = db.GetSynonyms(searchTerms[t]);
                    if (synonyms != null)
                    {
                        candidates.AddRange(synonyms);
                    }
                    exactMatches[t] = candidates;
                }
            }

            foreach (string file in files)
            {
                bool[] isInFile  = new bool[termCount]; // one found/not-found flag per search term
                int    remaining = termCount;           // number of terms not yet found in this file

                foreach (string word in ReadFromFile.GetWords(file))
                {
                    if (remaining == 0)
                    {
                        break; // every term already found; the rest of the file cannot change the outcome
                    }

                    string stemmedWord = stemmer.StemWord(word); // stem once per word, not once per term

                    for (int t = 0; t < termCount; t++)
                    {
                        if (isInFile[t])
                        {
                            continue; // this term is already satisfied for this file
                        }

                        bool matched = stemmedWord.Equals(stemmedTerms[t]);
                        if (!matched && exactMatches[t] != null)
                        {
                            matched = exactMatches[t].Contains(word);
                        }

                        if (matched)
                        {
                            isInFile[t] = true;
                            remaining--;
                        }
                    }
                }

                if (remaining == 0) // ALL search terms were found (trivially true when there are no terms)
                {
                    fileContainsTerm.Add(file);
                }
            }
            return fileContainsTerm;
        }
        // Example #2
        ///<summary>
        ///Creates a dictionary that is an inverted index of the collection
        ///</summary>
        ///<param name="folder">The folder containing the collection</param>
        ///<returns>
        ///A dictionary that maps each stemmed word to the number of times it occurs in each file (keyed by file ID)
        ///</returns>
        public Dictionary <string, Dictionary <int, double> > InvertedIndex(string folder)
        {
            // Empty the previous index (if any) before replacing it. Clear() also
            // empties the dictionary object returned by an earlier call, so that
            // side effect is kept on purpose.
            if (internalIndex != null)
            {
                internalIndex.Clear();
            }
            internalIndex = new Dictionary <string, Dictionary <int, double> >(); // stemmed word -> (file ID -> occurrence count)
            searchUtil    = new SearchUtilities();
            dynamic form1 = Application.OpenForms[0];                             // first open form; presumably the main form - TODO confirm

            indexCount = 0;                                                       // number of distinct stemmed words in the index

            stemmer = new PorterStemmer();

            foreach (string file in searchUtil.IndexingFolders(folder))
            {
                int fileID = converter.AssignId(file); // integer ID standing in for the file path inside the index

                foreach (string word in ReadFromFile.GetWords(file))
                {
                    string stemmedWord = stemmer.StemWord(word);

                    // Fetch the per-file counts for this stem, creating them the first
                    // time the stem is seen. TryGetValue avoids a second lookup.
                    Dictionary <int, double> postings;
                    if (!internalIndex.TryGetValue(stemmedWord, out postings))
                    {
                        postings = new Dictionary <int, double>();
                        internalIndex.Add(stemmedWord, postings);
                        indexCount++;
                    }

                    // occurrences is left at 0.0 when the file has no entry yet, so the
                    // first occurrence stores 1.0 and later ones add 1. The count is
                    // incremented numerically rather than via a string round-trip.
                    double occurrences;
                    postings.TryGetValue(fileID, out occurrences);
                    postings[fileID] = occurrences + 1.0;
                }
                form1.ShowIndexLength(false); // called once per indexed file; presumably refreshes the index size shown on the form
            }
            return internalIndex;
        }
        /// <summary>
        /// Scans the collection one search term at a time, keeping only the files that match each term
        /// </summary>
        /// <param name="files">The list of files</param>
        /// <param name="searchTerms">The array of search terms</param>
        /// <param name="synonymsOn">Whether synonym checking is on</param>
        /// <param name="dataSet">The dataset to retrieve synonyms from</param>
        /// <returns>The files that match every search term</returns>
        private List <string> ScanFilesByTerms(List <string> files, string[] searchTerms,
                                               Boolean synonymsOn, NewWordsDataSet dataSet)
        {
            // Purpose: narrow the file list one search term at a time. After term i
            // has been processed, searchFileList holds only the files that matched
            // terms 0..i, so later terms are checked against fewer files.
            // With no search terms the input list itself is returned.
            db      = new Database(dataSet);
            stemmer = new PorterStemmer();
            List <string> searchFileList = files;

            for (int i = 0; i < searchTerms.Length; i++)
            {
                string term = searchTerms[i];

                // Term-level data is the same for every file and word, so it is
                // computed once per term instead of once per word.
                // The term is lower-cased before stemming so this method agrees with
                // ScanFilesByFile, which stems term.ToLower().
                // NOTE(review): assumes GetSynonyms and StemWord return the same result
                // for the same input on every call - confirm.
                string        stemmedTerm = stemmer.StemWord(term.ToLower());
                List <string> synonyms    = synonymsOn ? db.GetSynonyms(term) : null;

                List <string> fileHasTerm = new List <string>();

                foreach (string file in searchFileList)
                {
                    foreach (string word in ReadFromFile.GetWords(file))
                    {
                        bool isSynonym = synonyms != null && synonyms.Contains(word);

                        if (isSynonym || stemmer.StemWord(word).Equals(stemmedTerm))
                        {
                            fileHasTerm.Add(file);
                            break; // one matching word is enough; skip the rest of this file
                        }
                    }
                }
                searchFileList = fileHasTerm;
            }
            return searchFileList;
        }
        /*
         * Reads every file in the collection and returns all of their words as a single array
         */
        private string[] ScanFilesForWords(List <string> files)
        {
            // Collects the words of every file, in file order and then in the order
            // ReadFromFile.GetWords returns them, and hands them back as one array.
            List <string> collected = new List <string>();

            for (int fileIndex = 0; fileIndex < files.Count; fileIndex++)
            {
                List <string> wordsInFile = ReadFromFile.GetWords(files[fileIndex]);

                for (int wordIndex = 0; wordIndex < wordsInFile.Count; wordIndex++)
                {
                    collected.Add(wordsInFile[wordIndex]);
                }
            }

            string[] allWords = collected.ToArray();
            return allWords;
        }