/// <summary>
/// Writes the current inverted index and the file match table to disk.
/// The index goes to <c>pathToIndexTo</c>; the file match table goes to
/// <c>file.dat</c> under the directory part of that same path.
/// </summary>
public void SaveToDisk()
{
    // The file table is stored alongside the index file.
    string indexDirectory = Path.GetDirectoryName(pathToIndexTo);
    string fileTablePath = indexDirectory + @"\file.dat";

    var indexSerializer = new Serializer<InvertedIndex>(pathToIndexTo);
    indexSerializer.Serialize(InvertedIndex.GetInstance());

    var fileTableSerializer = new Serializer<FileMatch>(fileTablePath);
    fileTableSerializer.Serialize(FileMatch.GetInstance());
}
/// <summary>
/// Gets the shared inverted index, creating it the first time it is requested.
/// </summary>
/// <returns>The single instance of the InvertedIndex class.</returns>
public static InvertedIndex GetInstance()
{
    // Every indexed word goes into this one instance, so it is built only once.
    if (myIndex == null)
    {
        myIndex = new InvertedIndex();
    }

    return myIndex;
}
/// <summary>
/// Extracts the text of one file, splits it into lower-cased tokens, drops
/// stop words and tokens containing digits, and adds the stem of every
/// remaining token to the inverted index under the current document id.
/// </summary>
/// <remarks>
/// The file is registered in the file match table and the document id is
/// advanced even when text extraction fails with one of the handled exceptions.
/// </remarks>
/// <param name="file">
/// The file to index. Nothing happens when it does not exist or when its
/// extension is not contained in <c>extensions</c>.
/// </param>
private void ProcessFile(FileInfo file)
{
    if (!file.Exists || !extensions.Contains(file.Extension))
    {
        return;
    }

    // \u2022 is the unicode for a bullet symbol.
    var separators = new[]
    {
        ' ', '\u2022', '’', '\"', '“', '!', '\'', '\\', '/', '_', '(', ')', '-', ',', ':', '?', ';', '.',
        '\r', '\n', '|'
    };

    try
    {
        string extension = file.Extension;

        // Extensions are compared case-insensitively so that e.g. ".HTML" or ".PPTX"
        // take the same extraction path as their lower-case forms.
        bool isMarkup = string.Equals(extension, ".html", StringComparison.OrdinalIgnoreCase)
                        || string.Equals(extension, ".htm", StringComparison.OrdinalIgnoreCase)
                        || string.Equals(extension, ".xml", StringComparison.OrdinalIgnoreCase);

        string document;
        if (isMarkup)
        {
            // Markup files: extract the raw text, then strip the tags.
            ITextParser markupParser = ParserFactory.CreateText(new ParserContext(file.FullName));
            document = RemoveAllTags(markupParser.Parse());
        }
        else if (string.Equals(extension, ".pptx", StringComparison.OrdinalIgnoreCase))
        {
            document = ExtractPptxText(file);
        }
        else
        {
            // Everything else goes through the generic text parser.
            ITextParser textParser = ParserFactory.CreateText(new ParserContext(file.FullName));
            document = textParser.Parse();
        }

        // Split with separators and ignore empty entries.
        string[] tokens = document.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries);

        // Resolve the shared index once instead of once per word.
        InvertedIndex index = InvertedIndex.GetInstance();

        foreach (string token in tokens)
        {
            // Trim first so the filters below see exactly the text that would be indexed.
            // Whitespace that is not a separator (e.g. a tab) can survive the split.
            string word = token.Trim();
            if (word.Length == 0)
            {
                // A whitespace-only token must not be indexed as an empty term.
                continue;
            }

            // Skip stop words and any token that contains a digit.
            if (stopwords.Contains(word) || Regex.IsMatch(word, "\\d+"))
            {
                continue;
            }

            // Stem the word before adding it to the inverted index.
            index.Add(stemmer.StemWord(word), new InvertedIndex.Tuple(docId, wordPosition++));
        }
    }
    catch (Exception e) when (e is IOException || e is NullReferenceException || e is ZipException)
    {
        MessageBox.Show(@"Please close all programs using the files you want to search.");
    }
    catch (Exception e) when (e is InvalidDataException)
    {
        MessageBox.Show(@"Invalid file format.");
    }

    // Record the file under its document id, then move on to the next id.
    FileMatch.GetInstance().Add(docId, file);
    docId++;
}
/// <summary>
/// Initializes a ranking from the query terms, the documents found and the inverted index.
/// </summary>
/// <param name="terms">The terms in the query.</param>
/// <param name="resultsFound">The documents found.</param>
/// <param name="invertedIndex">The inverted index.</param>
public Ranking(string[] terms, SortedSet<int> resultsFound, InvertedIndex invertedIndex)
{
    this.resultsFound = resultsFound;
    this.invertedIndex = invertedIndex;
    this.terms = terms;
}
/// <summary>
/// Initializes a query for the given string and loads the inverted index it runs against.
/// </summary>
/// <param name="queryString">The string to be queried.</param>
/// <param name="queryPath">Path the inverted index is deserialized from.</param>
public Query(string queryString, string queryPath)
{
    this.queryString = queryString;

    // Load the previously saved index from the given path.
    var indexReader = new Serializer<InvertedIndex>(queryPath);
    invertedIndex = indexReader.Deserialize();
}