Esempio n. 1
0
        /// <summary>
        /// Writes the shared <c>InvertedIndex</c> and <c>FileMatch</c> instances to disk
        /// through <c>Serializer&lt;T&gt;</c>.
        /// </summary>
        /// <remarks>
        /// The index goes to <c>pathToIndexTo</c>; the file table goes to <c>file.dat</c>
        /// under the directory portion of that same path.
        /// </remarks>
        public void SaveToDisk()
        {
            // NOTE(review): the separator is a hard-coded backslash rather than Path.Combine;
            // kept as-is so the location matches whatever code reads the file back.
            string indexDirectory = Path.GetDirectoryName(pathToIndexTo);
            string fileTablePath  = indexDirectory + @"\file.dat";

            // Persist the index first, then the docId -> file table.
            var indexWriter = new Serializer <InvertedIndex>(pathToIndexTo);
            indexWriter.Serialize(InvertedIndex.GetInstance());

            var fileTableWriter = new Serializer <FileMatch>(fileTablePath);
            fileTableWriter.Serialize(FileMatch.GetInstance());
        }
Esempio n. 2
0
        /// <summary>
        /// Provides the shared inverted index, creating it on first use.
        /// </summary>
        /// <remarks>
        /// NOTE(review): no locking is performed here, so concurrent first calls are not
        /// guarded against — confirm callers are single-threaded.
        /// </remarks>
        /// <returns>The single <c>InvertedIndex</c> instance held in <c>myIndex</c>.</returns>
        public static InvertedIndex GetInstance()
        {
            // Create the instance lazily; every later call reuses the same object,
            // so all indexed words end up in one index.
            if (myIndex == null)
            {
                myIndex = new InvertedIndex();
            }

            return myIndex;
        }
Esempio n. 3
0
        /// <summary>
        /// Extracts the text of <paramref name="file"/>, tokenizes it and adds every
        /// indexable token to the shared inverted index under the current <c>docId</c>.
        /// </summary>
        /// <remarks>
        /// Files that do not exist, or whose extension is not in <c>extensions</c>, are ignored.
        /// Stop words and tokens containing a digit are skipped; the remaining tokens are stemmed.
        /// The file is registered in <c>FileMatch</c> and <c>docId</c> is advanced even when
        /// text extraction failed with one of the handled exceptions.
        /// </remarks>
        /// <param name="file">The file to index.</param>
        private void ProcessFile(FileInfo file)
        {
            if (!file.Exists || !extensions.Contains(file.Extension))
            {
                return;
            }

            // Characters that delimit tokens; \u2022 is the Unicode bullet symbol.
            char[] delimiters =
            {
                ' ', '\u2022', '’', '\"', '“', '!', '\'', '\\', '/', '_', '(', ')', '-', ',', ':', '?', ';', '.',
                '\r', '\n', '|'
            };

            try
            {
                string text;
                switch (file.Extension)
                {
                    case ".html":
                    case ".htm":
                    case ".xml":
                        // Markup files: extract the raw text, then strip the tags.
                        text = RemoveAllTags(ParserFactory.CreateText(new ParserContext(file.FullName)).Parse());
                        break;

                    case ".pptx":
                        // PowerPoint files go through the dedicated extractor.
                        text = ExtractPptxText(file);
                        break;

                    default:
                        // Everything else is handled by the generic text parser.
                        text = ParserFactory.CreateText(new ParserContext(file.FullName)).Parse();
                        break;
                }

                // Lower-case, split on the delimiters and drop empty entries.
                string[] tokens = text.ToLower().Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
                foreach (string token in tokens)
                {
                    // Only tokens that are neither stop words nor contain a digit are indexed.
                    bool indexable = !stopwords.Contains(token) && !Regex.IsMatch(token, "\\d+");
                    if (indexable)
                    {
                        // Stem the token, then record it with the document id and running word position.
                        InvertedIndex index = InvertedIndex.GetInstance();
                        string        stem  = stemmer.StemWord(token.Trim());
                        index.Add(stem, new InvertedIndex.Tuple(docId, wordPosition++));
                    }
                }
            }
            catch (Exception ex) when(ex is IOException || ex is NullReferenceException || ex is ZipException)
            {
                MessageBox.Show(@"Please close all programs using the files you want to search.");
            }
            catch (InvalidDataException)
            {
                MessageBox.Show(@"Invalid file format.");
            }

            // Register the file under its document id and move on to the next id.
            FileMatch.GetInstance().Add(docId, file);
            docId++;
        }
Esempio n. 4
0
 /// <summary>
 /// Creates a ranking over a set of matching documents.
 /// </summary>
 /// <param name="terms">The terms in the query.</param>
 /// <param name="resultsFound">The documents found for the query.</param>
 /// <param name="invertedIndex">The inverted index.</param>
 public Ranking(string[] terms, SortedSet <int> resultsFound, InvertedIndex invertedIndex)
 {
     // Plain field initialization; the arguments are stored as given, without copying.
     this.invertedIndex = invertedIndex;
     this.resultsFound  = resultsFound;
     this.terms         = terms;
 }
 /// <summary>
 /// Creates a query and loads the inverted index it runs against.
 /// </summary>
 /// <param name="queryString">The string to be queried.</param>
 /// <param name="queryPath">Path the inverted index is deserialized from.</param>
 public Query(string queryString, string queryPath)
 {
     this.queryString = queryString;

     // Read the previously serialized index back from queryPath.
     var indexReader = new Serializer <InvertedIndex>(queryPath);
     this.invertedIndex = indexReader.Deserialize();
 }