// Private methods below here - only available within the SearchUtilities class.

/// <summary>
/// Scans the collection one file at a time and keeps the files in which
/// every search term was found.
/// </summary>
/// <remarks>
/// A term counts as found in a file when some word of the file has the same stem
/// as the lower-cased term. When <paramref name="synonymsOn"/> is true, a word that
/// is equal to the lower-cased term, or to one of the synonyms the database returns
/// for the term, also counts. With an empty <paramref name="searchTerms"/> array
/// every file is returned, because "all terms found" is trivially true.
/// </remarks>
/// <param name="files">Paths of the files to scan.</param>
/// <param name="searchTerms">Terms that must all be present in a file.</param>
/// <param name="synonymsOn">Whether synonyms of each term are matched as well.</param>
/// <param name="dataSet">Data set used to construct the synonym database.</param>
/// <returns>The files that contain every term, in the order they appear in <paramref name="files"/>.</returns>
private List<string> ScanFilesByFile(List<string> files, string[] searchTerms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    db = new Database(dataSet);
    stemmer = new PorterStemmer();

    // Everything derived from a term alone is computed once here instead of once
    // per word of every file: its stem and, if requested, its exact-match set.
    // NOTE(review): assumes GetSynonyms and StemWord return the same result for the
    // same input on every call - confirm against Database / PorterStemmer.
    int termCount = searchTerms.Length;
    string[] termStems = new string[termCount];
    HashSet<string>[] exactMatches = new HashSet<string>[termCount];
    for (int t = 0; t < termCount; t++)
    {
        string loweredTerm = searchTerms[t].ToLower();
        termStems[t] = stemmer.StemWord(loweredTerm);

        if (synonymsOn)
        {
            HashSet<string> candidates = new HashSet<string> { loweredTerm };
            // The synonym lookup is keyed by the term exactly as the caller passed it.
            List<string> synonyms = db.GetSynonyms(searchTerms[t]);
            if (synonyms != null)
            {
                candidates.UnionWith(synonyms);
            }
            exactMatches[t] = candidates;
        }
    }

    List<string> matchingFiles = new List<string>();
    foreach (string file in files)
    {
        // isInFile[t] becomes true once term t (or a synonym) has been seen in this file.
        bool[] isInFile = new bool[termCount];

        foreach (string word in ReadFromFile.GetWords(file))
        {
            string wordStem = stemmer.StemWord(word);
            for (int t = 0; t < termCount; t++)
            {
                if (isInFile[t])
                {
                    continue; // already satisfied, nothing more to learn for this term
                }

                bool exactHit = synonymsOn && exactMatches[t].Contains(word);
                if (exactHit || wordStem.Equals(termStems[t]))
                {
                    isInFile[t] = true;
                }
            }
        }

        // Keep the file only when every term was matched.
        if (isInFile.All(found => found))
        {
            matchingFiles.Add(file);
        }
    }

    return matchingFiles;
}
/// <summary>
/// Scans the collection one search term at a time, narrowing the candidate
/// files after each term to those that contain it.
/// </summary>
/// <remarks>
/// A file contains a term when one of its words has the same stem as the term or,
/// when <paramref name="synonymsOn"/> is true, when one of its words equals a synonym
/// the database returns for the term. With an empty <paramref name="searchTerms"/>
/// array the <paramref name="files"/> list itself is returned.
/// </remarks>
/// <param name="files">The list of files</param>
/// <param name="searchTerms">The array of search terms</param>
/// <param name="synonymsOn">Whether synonym checking is on</param>
/// <param name="dataSet">The data set to retrieve synonyms from</param>
/// <returns>The files that contain every search term, in their original relative order.</returns>
private List<string> ScanFilesByTerms(List<string> files, string[] searchTerms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    db = new Database(dataSet);
    stemmer = new PorterStemmer();

    List<string> remaining = files;
    for (int i = 0; i < searchTerms.Length; i++)
    {
        string term = searchTerms[i];

        // The term's stem and synonyms do not depend on the file or the word, so they
        // are looked up once per term rather than once per word.
        // NOTE(review): ScanFilesByFile lower-cases the term before stemming; this
        // method does not. Left as-is to preserve behaviour - confirm which is intended.
        string termStem = stemmer.StemWord(term);
        List<string> synonyms = synonymsOn ? db.GetSynonyms(term) : null;

        List<string> survivors = new List<string>();
        foreach (string file in remaining)
        {
            foreach (string word in ReadFromFile.GetWords(file))
            {
                bool synonymHit = synonyms != null && synonyms.Contains(word);
                if (synonymHit || stemmer.StemWord(word).Equals(termStem))
                {
                    survivors.Add(file);
                    break; // one match is enough; skip the rest of the file
                }
            }
        }

        // Only files that matched this term are examined for the next one.
        remaining = survivors;
    }

    return remaining;
}
/// <summary>
/// Returns the frequency of search terms from the index.
/// </summary>
/// <remarks>
/// For every index key that equals the stem of a term, one line of the form
/// "<c>term: total</c>" followed by CR LF is emitted, where <c>term</c> is the
/// lower-cased original term and <c>total</c> is the sum of all values stored
/// under that key. Lines appear in index enumeration order.
/// </remarks>
/// <param name="terms">The query terms to report on.</param>
/// <returns>
/// The report text; an empty string when nothing matches, or when
/// <paramref name="terms"/> or the index is null.
/// </returns>
public string GetQueryFrequencyFromIndex(string[] terms)
{
    stemmer = new PorterStemmer();

    // Nothing to report without a query or before an index has been built.
    if (terms == null || internalIndex == null)
    {
        return "";
    }

    // Stem each term once up front instead of once per index key.
    string[] stems = new string[terms.Length];
    for (int i = 0; i < terms.Length; i++)
    {
        stems[i] = stemmer.StemWord(terms[i]);
    }

    System.Text.StringBuilder report = new System.Text.StringBuilder();
    foreach (KeyValuePair<string, Dictionary<int, double>> entry in internalIndex)
    {
        for (int i = 0; i < terms.Length; i++)
        {
            if (!entry.Key.Equals(stems[i]))
            {
                continue;
            }

            // Total of the values stored for this key across all inner entries.
            double freqCount = 0;
            foreach (double count in entry.Value.Values)
            {
                freqCount += count;
            }

            report.Append(terms[i].ToLower()).Append(": ").Append(freqCount).Append("\r\n");
        }
    }

    return report.ToString();
}
/// <summary>
/// Builds an inverted index of the collection: a dictionary from stemmed word
/// to a dictionary from file id to the number of times the word occurs in that file.
/// </summary>
/// <remarks>
/// Any previously built index is discarded, and <c>indexCount</c> is reset and then
/// incremented once per distinct stemmed word. After each file has been processed,
/// <c>ShowIndexLength(false)</c> is invoked on the first open form.
/// </remarks>
/// <param name="folder">The folder containing the collection</param>
/// <returns>The newly built index (also stored in <c>internalIndex</c>).</returns>
public Dictionary<string, Dictionary<int, double>> InvertedIndex(string folder)
{
    // Empty the old index before dropping the reference to it.
    if (internalIndex != null)
    {
        internalIndex.Clear();
    }
    internalIndex = new Dictionary<string, Dictionary<int, double>>();

    searchUtil = new SearchUtilities();

    // Dynamically bound reference to the first open form; it must expose
    // ShowIndexLength(bool). NOTE(review): the indexer will fail if no form is
    // open - confirm this method is only ever called from the UI.
    dynamic form1 = Application.OpenForms[0];

    indexCount = 0;
    stemmer = new PorterStemmer();

    foreach (string file in searchUtil.IndexingFolders(folder))
    {
        int fileID = converter.AssignId(file);

        foreach (string word in ReadFromFile.GetWords(file))
        {
            string stemmedWord = stemmer.StemWord(word);

            // Fetch the per-file counts for this stem, creating them on first sight.
            Dictionary<int, double> postings;
            if (!internalIndex.TryGetValue(stemmedWord, out postings))
            {
                postings = new Dictionary<int, double>();
                internalIndex.Add(stemmedWord, postings);
                indexCount++;
            }

            // TryGetValue leaves 'occurrences' at 0 when the file is not yet listed,
            // so the same increment covers both the first and later occurrences.
            double occurrences;
            postings.TryGetValue(fileID, out occurrences);
            postings[fileID] = occurrences + 1;
        }

        // Progress notification to the form after each file (per the original
        // author's note, this keeps a running total of the index size on screen).
        form1.ShowIndexLength(false);
    }

    return internalIndex;
}
/// <summary>
/// Produces the stemmed form of every word in the collection that is longer
/// than two characters; shorter words are left out of the result.
/// </summary>
/// <param name="folder">Folder passed on to <c>GetWordCollection</c> to obtain the words.</param>
/// <returns>The stems, in the same order as the words they were derived from.</returns>
public string[] GetStemmedCollection(string folder)
{
    stemmer = new PorterStemmer();

    return GetWordCollection(folder)
        .Where(candidate => candidate.Length > 2)
        .Select(candidate => stemmer.StemWord(candidate))
        .ToArray();
}
/// <summary>
/// Searches the inverted index for the query terms and returns the matching files.
/// </summary>
/// <remarks>
/// For each query a dictionary from file path to the value stored in the index for
/// the query's stem is built (empty when the stem is not indexed). The dictionaries
/// are then folded from the last query towards the first: slot <c>i - 1</c> is
/// replaced by the paths present in both slot <c>i</c> and slot <c>i - 1</c>, with
/// their values added. Slot 0 therefore ends up holding only paths found for every
/// query. The whole array is handed to <c>ListOrderByDescending</c>, which produces
/// the returned list.
/// </remarks>
/// <param name="querys">The query list</param>
/// <returns>
/// The list produced by <c>ListOrderByDescending</c> - presumably file paths ordered
/// by descending combined value; verify against that method.
/// </returns>
public List<string> GetFilesFromIndex(string[] querys)
{
    stemmer = new PorterStemmer();

    // One path -> value dictionary per query term.
    Dictionary<string, double>[] lists = new Dictionary<string, double>[querys.Length];
    for (int q = 0; q < querys.Length; q++)
    {
        Dictionary<string, double> hits = new Dictionary<string, double>();
        lists[q] = hits;

        Dictionary<int, double> postings;
        if (internalIndex.TryGetValue(stemmer.StemWord(querys[q]), out postings))
        {
            foreach (KeyValuePair<int, double> posting in postings)
            {
                // The index stores file ids; translate them back to paths.
                hits.Add(converter.GetPath(posting.Key), posting.Value);
            }
        }
    }

    // Fold right-to-left; with zero or one query the loop body never runs.
    for (int i = querys.Length - 1; i > 0; i--)
    {
        Dictionary<string, double> current = lists[i];
        Dictionary<string, double> previous = lists[i - 1];

        // Inner join on path, preserving the enumeration order of 'current'.
        Dictionary<string, double> merged = new Dictionary<string, double>();
        foreach (KeyValuePair<string, double> hit in current)
        {
            double otherValue;
            if (previous.TryGetValue(hit.Key, out otherValue))
            {
                merged.Add(hit.Key, hit.Value + otherValue);
            }
        }

        lists[i - 1] = merged;
    }

    return ListOrderByDescending(lists);
}