示例#1
0
        /// <summary>
        /// Extracts the text of <paramref name="file"/>, splits it into words and adds every
        /// stemmed word that is neither a stop word nor contains a digit to the inverted index
        /// under the current <c>docId</c>. Afterwards the file is registered in
        /// <c>FileMatch</c> and <c>docId</c> is advanced.
        /// </summary>
        /// <param name="file">
        /// The file to index. Nothing happens when the file does not exist or when its extension
        /// is not contained in <c>extensions</c>.
        /// </param>
        /// <remarks>
        /// <see cref="IOException"/>, <see cref="NullReferenceException"/>, <c>ZipException</c> and
        /// <see cref="InvalidDataException"/> raised while reading or indexing are reported to the
        /// user through a message box instead of being propagated. The file is still registered
        /// and <c>docId</c> still advances in that case.
        /// </remarks>
        private void ProcessFile(FileInfo file)
        {
            if (!file.Exists || !extensions.Contains(file.Extension))
            {
                return;
            }

            // \u2022 is the unicode for a bullet symbol.
            var separators = new[]
            {
                ' ', '\u2022', '’', '\"', '“', '!', '\'', '\\', '/', '_', '(', ')', '-', ',', ':', '?', ';', '.',
                '\r', '\n', '|'
            };

            try
            {
                // Route on the extension without regard to case so that e.g. ".HTML" or ".PPTX"
                // is handled by the same branch as its lower-case spelling.
                string extension = file.Extension;
                bool isMarkup = string.Equals(extension, ".html", StringComparison.OrdinalIgnoreCase)
                                || string.Equals(extension, ".htm", StringComparison.OrdinalIgnoreCase)
                                || string.Equals(extension, ".xml", StringComparison.OrdinalIgnoreCase);

                string document;
                if (isMarkup)
                {
                    // Markup files: parse to text first, then strip the tags.
                    ITextParser parser = ParserFactory.CreateText(new ParserContext(file.FullName));
                    string textWithTags = parser.Parse();
                    document = RemoveAllTags(textWithTags);
                }
                else if (string.Equals(extension, ".pptx", StringComparison.OrdinalIgnoreCase))
                {
                    document = ExtractPptxText(file);
                }
                else
                {
                    ITextParser parser = ParserFactory.CreateText(new ParserContext(file.FullName));
                    document = parser.Parse();
                }

                // Lower-case with the invariant culture so index terms do not depend on the
                // machine's regional settings, then split and drop empty entries.
                foreach (var token in document.ToLowerInvariant()
                                              .Split(separators, StringSplitOptions.RemoveEmptyEntries))
                {
                    // Tabs and other whitespace are not separators, so trim BEFORE filtering:
                    // otherwise "\tthe" slips past the stop-word check and a whitespace-only
                    // token would be indexed as an empty term.
                    var word = token.Trim();
                    if (word.Length == 0)
                    {
                        continue;
                    }

                    // Remove stop words and any word containing a digit.
                    if (stopwords.Contains(word) || Regex.IsMatch(word, "\\d+"))
                    {
                        continue;
                    }

                    // Stem the word before adding it to the inverted index; the position counter
                    // only advances for words that are actually indexed.
                    InvertedIndex.GetInstance()
                                 .Add(stemmer.StemWord(word), new InvertedIndex.Tuple(docId, wordPosition++));
                }
            }
            catch (Exception e) when (e is IOException || e is NullReferenceException || e is ZipException)
            {
                MessageBox.Show(@"Please close all programs using the files you want to search.");
            }
            catch (InvalidDataException)
            {
                MessageBox.Show(@"Invalid file format.");
            }

            // Registered even when extraction failed, so docId values stay aligned with FileMatch.
            FileMatch.GetInstance().Add(docId, file);
            docId++;
        }