Ejemplo n.º 1
0
        /// <summary>
        /// Parses the file at <paramref name="path"/> and returns its words as an array.
        /// The extracted text is lower-cased, newlines/tabs are turned into spaces, and the
        /// result is passed through <c>RemovePunctuation</c> before being split on whitespace.
        /// </summary>
        /// <param name="path">Path of the file handed to the parser context.</param>
        /// <param name="option">
        /// "txt" to use the text parser (.txt), or "document" to use the document parser
        /// (.docx, .pdf). Any other value (including null) produces an empty result.
        /// </param>
        /// <returns>
        /// The whitespace-separated words of the document, or an empty array when parsing
        /// failed or <paramref name="option"/> was not recognised.
        /// </returns>
        public string[] GetText(string path, string option)
        {
            string raw = null;

            try
            {
                ParserContext context = new ParserContext(path);
                if (option == "txt")
                {
                    ITextParser parser = ParserFactory.CreateText(context);
                    raw = parser.Parse().ToString();
                }
                else if (option == "document")
                {
                    IDocumentParser parser = ParserFactory.CreateDocument(context);
                    raw = parser.Parse().ToString();
                }
            }
            catch (Exception e)
            {
                // Best-effort: report the failure and fall through to the empty result below.
                Console.WriteLine("Exception found");
                Console.WriteLine(e.Message);
            }

            // Nothing was extracted (parse failure or unsupported option). Previously this
            // path dereferenced a null string and crashed right after logging the error.
            if (raw == null)
            {
                return new string[0];
            }

            string text = raw.ToLower().Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
            text = RemovePunctuation(text);
            if (text == null)
            {
                return new string[0];
            }

            // A null separator array makes Split break on any whitespace character.
            return text.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses SampleDoc.docx with the document parser and verifies both the number of
        /// paragraphs and the text of each paragraph, in order.
        /// </summary>
        public void TestParseSimpleDocumentFromWord()
        {
            // Expected paragraph texts, indexed exactly as they appear in the parsed document.
            string[] expectedParagraphs =
            {
                "I am a test document",
                "This is page 1",
                "I am Calibri (Body) in font size 11",
                "\n",
                "This is page two",
                "It’s Arial Black in 16 point",
                "It’s also in blue",
            };

            string samplePath = TestDataSample.GetWordPath("SampleDoc.docx");
            ToxyDocument parsed = ParserFactory.CreateDocument(new ParserContext(samplePath)).Parse();

            Assert.AreEqual(expectedParagraphs.Length, parsed.Paragraphs.Count);
            for (int i = 0; i < expectedParagraphs.Length; i++)
            {
                Assert.AreEqual(expectedParagraphs[i], parsed.Paragraphs[i].Text);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses simple-table.docx with the document parser and verifies both the number of
        /// paragraphs and the text of each paragraph, in order. The assertions expect the two
        /// body paragraphs first, followed by the six table cells.
        /// </summary>
        public void TestParseDocumentWithTable()
        {
            // Expected paragraph texts, indexed exactly as they appear in the parsed document.
            string[] expectedParagraphs =
            {
                "This is a Word document that was created using Word 97 – SR2.  It contains a paragraph, a table consisting of 2 rows and 3 columns and a final paragraph.",
                "This text is below the table.",
                "Cell 1,1",
                "Cell 1,2",
                "Cell 1,3",
                "Cell 2,1",
                "Cell 2,2",
                "Cell 2,3",
            };

            string samplePath = TestDataSample.GetWordPath("simple-table.docx");
            ToxyDocument parsed = ParserFactory.CreateDocument(new ParserContext(samplePath)).Parse();

            Assert.AreEqual(expectedParagraphs.Length, parsed.Paragraphs.Count);
            for (int i = 0; i < expectedParagraphs.Length; i++)
            {
                Assert.AreEqual(expectedParagraphs[i], parsed.Paragraphs[i].Text);
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Extracts the text of the document at <paramref name="filePath"/> using the
        /// document parser.
        /// </summary>
        /// <remarks>
        /// When the parser throws <see cref="InvalidDataException"/>, the file is copied to
        /// "<paramref name="filePath"/>.<paramref name="extension"/>" and parsing is retried
        /// once on that copy. The retry is skipped when the path already ends with the
        /// extension, so a file that is rejected even with the extension cannot trigger an
        /// endless chain of copies.
        /// </remarks>
        /// <param name="filePath">Path of the file to parse.</param>
        /// <param name="extension">Extension (without the leading dot) appended to the copy used for the retry.</param>
        /// <returns>The string form of the parsed document, or null when extraction failed.</returns>
        public string ExtractText(string filePath, string extension)
        {
            ParserContext c = new ParserContext(filePath);

            try
            {
                IDocumentParser parser = ParserFactory.CreateDocument(c);
                ToxyDocument    result = parser.Parse();
                return(result.ToString());
            }
            catch (InvalidDataException)
            {
                // Already carries the extension: copying again would only produce
                // "file.ext.ext" and recurse forever, so give up here.
                if (filePath.EndsWith("." + extension, StringComparison.OrdinalIgnoreCase))
                {
                    Console.Error.WriteLine($"'{filePath}' could not be read even with the '{extension}' extension.");
                    return(null);
                }

                Console.Error.WriteLine($"'{filePath}' is supported but don't have the required extension.");
                var newFilePath = $"{filePath}.{extension}";
                Console.Error.WriteLine($"Creating a copy in '{newFilePath}' and using that to read.");

                try
                {
                    // Overwrite a copy left behind by an earlier run instead of throwing IOException.
                    File.Copy(filePath, newFilePath, true);
                }
                catch (Exception copyError)
                {
                    // Exceptions thrown inside a catch block are not seen by the sibling
                    // catch (Exception) below, so report copy failures here the same way.
                    Console.Error.WriteLine("{0} Exception caught error with {1}.", copyError, newFilePath);
                    return(null);
                }

                return(ExtractText(newFilePath, extension));
            }
            catch (Exception e)
            {
                Console.Error.WriteLine("{0} Exception caught error with {1}.", e, filePath);
                return(null);
            }
        }