/// <summary>
/// Extracts the text of <paramref name="file"/>, tokenizes it, and adds every stemmed token
/// that is neither a stop word nor contains a digit to the inverted index under the current
/// document id.
/// </summary>
/// <remarks>
/// <para>
/// Files that do not exist, or whose extension is not in <c>extensions</c>, are ignored
/// entirely: nothing is indexed and <c>docId</c> is not advanced.
/// </para>
/// <para>
/// For every other file the method registers the file in <c>FileMatch</c> under the current
/// <c>docId</c> and then increments <c>docId</c>, even when text extraction failed and a
/// message box was shown instead of indexing.
/// </para>
/// </remarks>
/// <param name="file">The file to index.</param>
private void ProcessFile(FileInfo file)
{
    if (!file.Exists || !extensions.Contains(file.Extension))
    {
        return;
    }

    // \u2022 is the Unicode code point of the bullet symbol.
    var separators = new[]
    {
        ' ', '\u2022', '’', '\"', '“', '!', '\'', '\\', '/', '_', '(', ')', '-', ',', ':', '?', ';', '.',
        '\r', '\n', '|'
    };

    try
    {
        // Compare extensions case-insensitively so that e.g. "REPORT.HTML" or "Deck.PPTX"
        // is routed to the same extraction path as its lower-case counterpart.
        var extension = file.Extension;
        var isMarkup = string.Equals(extension, ".html", StringComparison.OrdinalIgnoreCase)
                       || string.Equals(extension, ".htm", StringComparison.OrdinalIgnoreCase)
                       || string.Equals(extension, ".xml", StringComparison.OrdinalIgnoreCase);

        string document;
        if (!isMarkup && string.Equals(extension, ".pptx", StringComparison.OrdinalIgnoreCase))
        {
            document = ExtractPptxText(file);
        }
        else
        {
            // Use the text parser to extract the file content as a string.
            var parsedText = ParserFactory.CreateText(new ParserContext(file.FullName)).Parse();

            // Markup files still contain their tags after parsing; strip them before indexing.
            document = isMarkup ? RemoveAllTags(parsedText) : parsedText;
        }

        // NOTE(review): ToLower() is culture-sensitive. It is kept as-is because the code that
        // tokenizes search queries is not visible here and presumably lower-cases the same way;
        // confirm before switching both sides to ToLowerInvariant().
        // Split on the separators and drop empty entries.
        foreach (var token in document.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries))
        {
            // Trim BEFORE filtering: tabs and other whitespace are not separators, so a token
            // such as "\tthe" would otherwise slip past the stop-word check, and a
            // whitespace-only token would be indexed as an empty term.
            var word = token.Trim();
            if (word.Length == 0)
            {
                continue;
            }

            // Skip stop words and any token that contains a digit.
            if (stopwords.Contains(word) || Regex.IsMatch(word, "\\d+"))
            {
                continue;
            }

            // Stem the word before adding it to the inverted index.
            InvertedIndex.GetInstance()
                .Add(stemmer.StemWord(word), new InvertedIndex.Tuple(docId, wordPosition++));
        }
    }
    catch (Exception e) when (e is IOException || e is NullReferenceException || e is ZipException)
    {
        MessageBox.Show(@"Please close all programs using the files you want to search.");
    }
    catch (InvalidDataException)
    {
        MessageBox.Show(@"Invalid file format.");
    }

    // Runs whether or not extraction succeeded, so every accepted file consumes a document id.
    FileMatch.GetInstance().Add(docId, file);
    docId++;
}