// Private methods below here - only available within the SearchUtilities class.

/// <summary>
/// Scans each file once and keeps the files in which every search term is found.
/// A term counts as found when a word of the file has the same stem as the
/// lower-cased term or, when <paramref name="synonymsOn"/> is set, when a word is
/// exactly equal to the lower-cased term or to one of its synonyms.
/// </summary>
/// <param name="files">The files to scan.</param>
/// <param name="searchTerms">The terms that must all be present in a file.</param>
/// <param name="synonymsOn">Whether synonyms of each term also count as a match.</param>
/// <param name="dataSet">The dataset used to construct the synonym database.</param>
/// <returns>The files, in input order, that contain every search term.</returns>
private List<string> ScanFilesByFile(List<string> files, string[] searchTerms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    db = new Database(dataSet);
    stemmer = new PorterStemmer();

    // Everything that depends only on the term is computed once here instead of
    // once per word of every file: the stemmed term and, when synonyms are
    // enabled, the set of strings that count as an exact match.
    // Assumes GetSynonyms and StemWord are side-effect-free lookups.
    int termCount = searchTerms.Length;
    string[] stemmedTerms = new string[termCount];
    HashSet<string>[] exactMatches = new HashSet<string>[termCount];
    for (int i = 0; i < termCount; i++)
    {
        string loweredTerm = searchTerms[i].ToLower();
        stemmedTerms[i] = stemmer.StemWord(loweredTerm);

        if (synonymsOn)
        {
            HashSet<string> candidates = new HashSet<string> { loweredTerm };
            List<string> synonyms = db.GetSynonyms(searchTerms[i]);
            if (synonyms != null)
            {
                candidates.UnionWith(synonyms);
            }
            exactMatches[i] = candidates;
        }
    }

    List<string> fileContainsTerm = new List<string>();
    foreach (string file in files)
    {
        bool[] termFound = new bool[termCount]; // one flag per search term
        int remaining = termCount;              // terms not yet seen in this file

        foreach (string word in ReadFromFile.GetWords(file))
        {
            if (remaining == 0)
            {
                break; // every term already matched; the rest of the file cannot change the outcome
            }

            string stemmedWord = stemmer.StemWord(word); // stem once per word, not once per term
            for (int i = 0; i < termCount; i++)
            {
                if (termFound[i])
                {
                    continue;
                }

                bool exactHit = synonymsOn && exactMatches[i].Contains(word);
                if (exactHit || stemmedWord.Equals(stemmedTerms[i]))
                {
                    termFound[i] = true;
                    remaining--;
                }
            }
        }

        // With no search terms at all, every file qualifies (nothing is missing).
        if (remaining == 0)
        {
            fileContainsTerm.Add(file);
        }
    }

    return fileContainsTerm;
}
/// <summary>
/// Builds an inverted index of the collection. Each stemmed word maps to the
/// files it occurs in, and each file id maps to the number of times that
/// stemmed word occurs in the file.
/// </summary>
/// <param name="folder">The folder containing the collection</param>
/// <returns>
/// A dictionary keyed by stemmed word whose values map file id to occurrence count.
/// The same dictionary is kept in <c>internalIndex</c>.
/// </returns>
public Dictionary<string, Dictionary<int, double>> InvertedIndex(string folder)
{
    if (internalIndex != null)
    {
        // Empties the existing index before it is replaced. Note that this also
        // empties the dictionary returned by any earlier call.
        internalIndex.Clear();
    }
    internalIndex = new Dictionary<string, Dictionary<int, double>>();
    searchUtil = new SearchUtilities();
    dynamic form1 = Application.OpenForms[0]; // first open form; assumed to be the main form - TODO confirm
    indexCount = 0;                           // number of distinct stemmed words in the index
    stemmer = new PorterStemmer();

    foreach (string file in searchUtil.IndexingFolders(folder))
    {
        int fileID = converter.AssignId(file); // integer id standing in for the file path

        foreach (string word in ReadFromFile.GetWords(file))
        {
            string stemmedWord = stemmer.StemWord(word);

            // Fetch the postings for this stem, creating them on first sight.
            Dictionary<int, double> postings;
            if (!internalIndex.TryGetValue(stemmedWord, out postings))
            {
                postings = new Dictionary<int, double>();
                internalIndex.Add(stemmedWord, postings);
                indexCount++;
            }

            // TryGetValue leaves the count at 0 when the file is not listed yet,
            // so first and repeated occurrences share one code path.
            double occurrences;
            postings.TryGetValue(fileID, out occurrences);
            postings[fileID] = occurrences + 1.0;
        }

        // Called once per indexed file; presumably refreshes the index size shown on the form.
        form1.ShowIndexLength(false);
    }

    return internalIndex;
}
/// <summary>
/// Scans the collection term by term: the candidate files are filtered by the
/// first term, the survivors by the second term, and so on. A file passes a term
/// when one of its words has the same stem as the term or, when
/// <paramref name="synonymsOn"/> is set, is exactly equal to one of the term's synonyms.
/// </summary>
/// <param name="files">The list of files</param>
/// <param name="searchTerms">the array of search terms</param>
/// <param name="synonymsOn">check if synonym checking is on</param>
/// <param name="dataSet">The dataset to retrieve synonyms from</param>
/// <returns>
/// The files that passed every search term, or <paramref name="files"/> itself
/// when there are no search terms.
/// </returns>
private List<string> ScanFilesByTerms(List<string> files, string[] searchTerms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    db = new Database(dataSet);
    stemmer = new PorterStemmer();

    List<string> searchFileList = files;
    for (int i = 0; i < searchTerms.Length; i++)
    {
        // Term-dependent data is computed once per term rather than once per word.
        // Assumes GetSynonyms and StemWord are side-effect-free lookups.
        // NOTE(review): ScanFilesByFile lower-cases each term before comparing and
        // this method does not - confirm whether callers normalise case.
        string term = searchTerms[i];
        string stemmedTerm = stemmer.StemWord(term);

        HashSet<string> synonyms = null;
        if (synonymsOn)
        {
            List<string> synonymList = db.GetSynonyms(term);
            if (synonymList != null)
            {
                synonyms = new HashSet<string>(synonymList);
            }
        }

        List<string> fileHasTerm = new List<string>();
        foreach (string file in searchFileList)
        {
            foreach (string word in ReadFromFile.GetWords(file))
            {
                bool synonymHit = synonyms != null && synonyms.Contains(word);
                if (synonymHit || stemmer.StemWord(word).Equals(stemmedTerm))
                {
                    fileHasTerm.Add(file);
                    break; // one match is enough; skip the rest of the file
                }
            }
        }

        searchFileList = fileHasTerm;
        if (searchFileList.Count == 0)
        {
            break; // no candidates left, so later terms cannot add any
        }
    }

    return searchFileList;
}
/// <summary>
/// Reads every file in the collection and gathers all of their words into one array.
/// </summary>
/// <param name="files">The files to read.</param>
/// <returns>
/// The words of all files, file after file, in the order ReadFromFile.GetWords
/// returns them.
/// </returns>
private string[] ScanFilesForWords(List<string> files)
{
    List<string> collected = new List<string>();

    foreach (string path in files)
    {
        List<string> wordsInFile = ReadFromFile.GetWords(path);
        for (int index = 0; index < wordsInFile.Count; index++)
        {
            collected.Add(wordsInFile[index]);
        }
    }

    string[] result = collected.ToArray();
    return result;
}