// Private methods below here - only available within the SearchUtilities class.

/// <summary>
/// Scans every file and returns the files in which all search terms were found.
/// A term counts as found when some word of the file has the same stem as the
/// lower-cased term or, when synonym checking is on, when a word is exactly equal
/// to the lower-cased term or to one of the term's synonyms.
/// </summary>
/// <param name="files">The files to scan</param>
/// <param name="searchTerms">The terms that must all be found in a file</param>
/// <param name="synonymsOn">Whether exact matches against synonyms also count</param>
/// <param name="dataSet">The dataset to retrieve synonyms from</param>
/// <returns>The files in which every search term was found, in the order they were given</returns>
private List<string> ScanFilesByFile(List<string> files, string[] searchTerms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    db = new Database(dataSet);
    stemmer = new PorterStemmer();

    // The stem and the synonyms of a term do not depend on the file or word being
    // inspected, so they are computed once per term instead of once per word.
    // NOTE(review): assumes GetSynonyms and StemWord return the same result for the
    // same input on every call - confirm.
    string[] stemmedTerms = new string[searchTerms.Length];
    HashSet<string>[] exactMatches = new HashSet<string>[searchTerms.Length];
    for (int i = 0; i < searchTerms.Length; i++)
    {
        string loweredTerm = searchTerms[i].ToLower();
        stemmedTerms[i] = stemmer.StemWord(loweredTerm);

        if (synonymsOn)
        {
            // Words that count as an exact hit: the lower-cased term plus its synonyms
            HashSet<string> candidates = new HashSet<string> { loweredTerm };
            List<string> synonyms = db.GetSynonyms(searchTerms[i]);
            if (synonyms != null)
            {
                candidates.UnionWith(synonyms);
            }
            exactMatches[i] = candidates;
        }
    }

    List<string> fileContainsTerm = new List<string>();
    foreach (string file in files)
    {
        // isInFile[i] becomes true once searchTerms[i] has been found in this file
        bool[] isInFile = new bool[searchTerms.Length];

        foreach (string word in ReadFromFile.GetWords(file))
        {
            string stemmedWord = stemmer.StemWord(word);
            for (int i = 0; i < searchTerms.Length; i++)
            {
                if (isInFile[i])
                {
                    continue; // already found, nothing left to decide for this term
                }

                bool exactHit = synonymsOn && exactMatches[i].Contains(word);
                if (exactHit || stemmedWord.Equals(stemmedTerms[i]))
                {
                    isInFile[i] = true;
                }
            }
        }

        // Keep the file only when ALL search terms were found
        if (isInFile.All(found => found))
        {
            fileContainsTerm.Add(file);
        }
    }

    return fileContainsTerm;
}
/// <summary>
/// Scans the collection term by term: after each term only the files that
/// contained it are kept, so the result holds the files containing every term.
/// A term counts as found when some word of the file has the same stem as the
/// lower-cased term or, when synonym checking is on, when a word is exactly equal
/// to one of the term's synonyms.
/// </summary>
/// <param name="files">The list of files</param>
/// <param name="searchTerms">the array of search terms</param>
/// <param name="synonymsOn">check if synonym checking is on</param>
/// <param name="dataSet">The dataset to retrieve synonyms from</param>
/// <returns>The files that contain every search term; <paramref name="files"/> itself when there are no terms</returns>
private List<string> ScanFilesByTerms(List<string> files, string[] searchTerms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    db = new Database(dataSet);
    stemmer = new PorterStemmer();

    List<string> searchFileList = files;
    for (int i = 0; i < searchTerms.Length; i++)
    {
        string term = searchTerms[i];

        // Lower-case before stemming so this scan agrees with ScanFilesByFile,
        // which stems term.ToLower().
        // NOTE(review): assumes ReadFromFile.GetWords yields lower-cased words - confirm.
        string stemmedTerm = stemmer.StemWord(term.ToLower());

        // The synonyms depend only on the term, so look them up once per term
        HashSet<string> synonyms = null;
        if (synonymsOn)
        {
            List<string> found = db.GetSynonyms(term);
            if (found != null)
            {
                synonyms = new HashSet<string>(found);
            }
        }

        List<string> fileHasTerm = new List<string>();
        foreach (string file in searchFileList)
        {
            foreach (string word in ReadFromFile.GetWords(file))
            {
                bool synonymHit = synonyms != null && synonyms.Contains(word);
                if (synonymHit || stemmer.StemWord(word).Equals(stemmedTerm))
                {
                    fileHasTerm.Add(file);
                    break; // one hit is enough, the rest of the file cannot change the outcome
                }
            }
        }

        // Only files that had this term are searched for the next one
        searchFileList = fileHasTerm;
    }

    return searchFileList;
}
/// <summary>
/// Gets the files from the inverted index that contain the querys
/// and their synonyms. For every query a path-to-value table is built from the
/// index entries of the stemmed query and of its stemmed synonyms; the tables are
/// then intersected from the last query towards the first, summing the values of
/// paths present in both, and the tables are handed to ListOrderByDescending.
/// </summary>
/// <param name="querys">The array of querys</param>
/// <param name="dataSet">The dataset to draw synonyms from</param>
/// <returns>The list produced by ListOrderByDescending for the per-query tables</returns>
public List<string> GetFilesFromIndexWithSynonyms(string[] querys, NewWordsDataSet dataSet)
{
    stemmer = new PorterStemmer();
    Database database = new Database(dataSet);
    Dictionary<string, double>[] lists = new Dictionary<string, double>[querys.Length];

    for (int q = 0; q < querys.Length; q++)
    {
        string query = querys[q];
        lists[q] = new Dictionary<string, double>();

        // The query itself goes first, so its value wins over a synonym's for the same file
        AddIndexedFilesForTerm(stemmer.StemWord(query), lists[q]);

        List<string> synonyms = database.GetSynonyms(query);
        if (synonyms != null)
        {
            foreach (string synonym in synonyms)
            {
                AddIndexedFilesForTerm(stemmer.StemWord(synonym), lists[q]);
            }
        }
    }

    // Fold the tables from the last query down to the first: lists[i - 1] becomes the
    // paths present in both lists[i] and lists[i - 1], with their values added up.
    for (int i = querys.Length - 1; i > 0; i--)
    {
        Dictionary<string, double> current = lists[i];
        Dictionary<string, double> previous = lists[i - 1];
        Dictionary<string, double> shared = new Dictionary<string, double>();

        foreach (KeyValuePair<string, double> entry in current)
        {
            double previousValue;
            if (previous.TryGetValue(entry.Key, out previousValue))
            {
                shared.Add(entry.Key, entry.Value + previousValue);
            }
        }

        lists[i - 1] = shared;
    }

    return ListOrderByDescending(lists);
}

// Copies the index entries of stemmedTerm into target, keyed by file path.
// A path that is already present in target keeps its existing value.
private void AddIndexedFilesForTerm(string stemmedTerm, Dictionary<string, double> target)
{
    if (!internalIndex.ContainsKey(stemmedTerm))
    {
        return;
    }

    foreach (var posting in internalIndex[stemmedTerm])
    {
        string path = converter.GetPath(posting.Key);
        if (!target.ContainsKey(path))
        {
            target.Add(path, posting.Value);
        }
    }
}
// The data set handed to the constructor, kept for the class methods to use
private NewWordsDataSet nwDataSet;

/// <summary>
/// Creates a Database that keeps a reference to the supplied data set.
/// </summary>
/// <param name="dataSet">The data set the class methods will use</param>
public Database(NewWordsDataSet dataSet)
{
    nwDataSet = dataSet;
}
/// <summary>
/// Returns the files found for <paramref name="folder"/> that contain the search
/// terms (or their synonyms, when synonym checking is on), scanning file by file.
/// The files come from IndexingFolders and the scan is done by ScanFilesByFile.
/// </summary>
/// <param name="folder">The folder handed to IndexingFolders</param>
/// <param name="terms">The search terms</param>
/// <param name="synonymsOn">Whether synonyms are checked as well</param>
/// <param name="dataSet">The dataset to retrieve synonyms from</param>
/// <returns>The list returned by ScanFilesByFile</returns>
public List<string> GetFilesContainingTermsByFiles(string folder, string[] terms, Boolean synonymsOn, NewWordsDataSet dataSet)
{
    return ScanFilesByFile(IndexingFolders(folder), terms, synonymsOn, dataSet);
}