/// <summary>
/// Reads the file identified by <paramref name="file"/>, normalizes its text to
/// lower-case ASCII letters and digits, drops stop words, and records the position of
/// every remaining word in <c>invertedIndex</c> under this file's ID.
/// </summary>
/// <param name="file">File ID (an index into <c>files</c>); its string form must parse as an integer.</param>
/// <param name="stopWords">Words to leave out of the index; compared against the lower-cased tokens.</param>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="file"/> does not parse as an integer. Previously the parse
/// result was ignored, so a bad ID silently indexed file 0.
/// </exception>
public void parseFile(object file, string[] stopWords)
{
    int f;
    if (!int.TryParse(file.ToString(), out f)) //Get file ID
    {
        throw new ArgumentException("File ID is not an integer: " + file, "file");
    }

    //Get file as string; the using blocks release the handle even if the read throws
    string whole;
    using (FileStream filestream = new FileStream(files[f], FileMode.Open, FileAccess.Read))
    using (StreamReader streamreader = new StreamReader(filestream))
    {
        whole = streamreader.ReadToEnd();
    }

    //Invariant lower-casing: the filter below only keeps ASCII a-z, so culture-specific
    //mappings (e.g. Turkish dotless i) would otherwise drop letters.
    whole = whole.ToLowerInvariant();

    //Collapse every whitespace run (including a lone \n or a tab) into one space BEFORE
    //stripping special chars, so words on adjacent lines are not glued together.
    whole = Regex.Replace(whole, @"\s+", " ");
    whole = Regex.Replace(whole, @"[^0-9a-z ]+", "");

    //Hash set gives O(1) stop-word checks instead of scanning the array for every token
    HashSet<string> stopWordSet = new HashSet<string>(stopWords);

    //Skip empty tokens so the empty string is never indexed as a word
    List<string> words = whole
        .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .Where(token => !stopWordSet.Contains(token))
        .ToList();

    //Read the stemming switch once, under the same lock as before, rather than per word
    bool stem;
    lock (lock1)
    {
        stem = isStemming ?? false;
    }

    PorterStemmer stemmer = new PorterStemmer();
    for (int i = 0; i < words.Count; i++)
    {
        //If stemming is turned on, stem the current word
        string word = stem ? stemmer.StemWord(words[i]) : words[i];

        //If the word is already in the index; get it, otherwise; add it.
        //The factory overload only allocates when the key is actually missing.
        ConcurrentDictionary<int, List<int>> indexEntry =
            invertedIndex.GetOrAdd(word, key => new ConcurrentDictionary<int, List<int>>());

        //If that word has already occurred in this file; get the occurrence list, otherwise; create it.
        //The recorded position is the word's index in the stop-word-filtered token list.
        indexEntry.GetOrAdd(f, key => new List<int>()).Add(i);

        lock (lock1)
        {
            counter++; //Non essential counter for metrics
        }
    }
}
/// <summary>
/// Runs a query against <c>invertedIndex</c> and lists the matching files in <c>listBox</c>.
/// The query is lower-cased, reduced to ASCII letters and digits, split on whitespace,
/// stripped of stop words, optionally stemmed, and de-duplicated. The file-ID lists of the
/// terms found in the index are then merged according to <c>searchType</c>
/// (1 = <c>IntersectAll</c>, 2 = <c>CombineAll</c>, anything else = <c>SubtractIntersection</c>).
/// The elapsed time is written to the console.
/// </summary>
/// <param name="fq">The query; its <c>ToString()</c> value is used as the search text.</param>
public void searchIndex(object fq)
{
    PorterStemmer stemmer = new PorterStemmer();
    Stopwatch s = Stopwatch.StartNew();

    //Invariant lower-casing: the filter below only keeps ASCII a-z
    string fullQuery = fq.ToString().ToLowerInvariant();

    //Collapse every whitespace run (including a lone \n or a tab) into one space BEFORE
    //stripping special chars, so adjacent terms are not glued together.
    fullQuery = Regex.Replace(fullQuery, @"\s+", " ");
    fullQuery = Regex.Replace(fullQuery, @"[^0-9a-z ]+", "");

    //Turn string of search terms into a list, remove stopwords and empty tokens
    HashSet<string> stopWords = new HashSet<string>(GetStopWords());
    List<string> words = fullQuery
        .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .Where(term => !stopWords.Contains(term))
        .ToList();

    //Read the stemming switch once, under the same lock as before, rather than per term
    bool stem;
    lock (lock1)
    {
        stem = isStemming ?? false;
    }

    if (stem)
    {
        for (int i = 0; i < words.Count; i++)
        {
            words[i] = stemmer.StemWord(words[i]); //If stemming is on; stem the search term
        }
    }

    //Distinct() returns a new sequence, so its result must be kept (it used to be discarded).
    //Done after stemming so that terms sharing a stem are only looked up once.
    words = words.Distinct().ToList();

    List<List<int>> result = new List<List<int>>();
    foreach (string term in words)
    {
        //Single lookup: no window between a ContainsKey check and the read
        ConcurrentDictionary<int, List<int>> postings;
        if (invertedIndex.TryGetValue(term, out postings))
        {
            result.Add(postings.Keys.ToList()); //All file IDs containing the current search term
        }
        //NOTE(review): terms missing from the index are skipped, so they do not narrow an
        //intersection search — confirm this is the intended behaviour.
    }

    if (result.Count == 0)
    {
        listBox.Items.Add("No search terms had a match in the dataset.");
    }
    else
    {
        List<int> held;
        if (searchType == 1)
        {
            held = IntersectAll(result);
        }
        else if (searchType == 2)
        {
            held = CombineAll(result);
        }
        else
        {
            held = SubtractIntersection(result);
        }

        foreach (int i in held)
        {
            if (!listBox.Items.Contains(files[i]))
            {
                listBox.Items.Add(files[i]); //Only list the file if it hasn't been listed already
            }
        }
    }

    s.Stop();
    Console.WriteLine("Search Time: " + s.Elapsed);
}