/// <summary>
/// Parses the file at <paramref name="path"/> and returns its words.
/// The parsed text is lower-cased, line breaks and tabs are replaced by spaces,
/// punctuation is stripped via <c>RemovePunctuation</c>, and the result is split
/// on whitespace with empty entries discarded.
/// </summary>
/// <param name="path">Path of the file to parse.</param>
/// <param name="option">
/// "txt" to use the text parser (e.g. .txt) or "document" to use the document
/// parser (e.g. .docx, .pdf).
/// </param>
/// <returns>
/// The words of the document, or an empty array when the file could not be
/// parsed or <paramref name="option"/> is not one of the supported values.
/// </returns>
public string[] GetText(string path, string option)
{
    string text = null;
    try
    {
        ParserContext context = new ParserContext(path);
        string raw = null;

        if (option.Equals("txt"))
        {
            ITextParser parser = ParserFactory.CreateText(context);
            raw = parser.Parse().ToString();
        }
        else if (option.Equals("document"))
        {
            IDocumentParser parser = ParserFactory.CreateDocument(context);
            raw = parser.Parse().ToString();
        }
        else
        {
            Console.WriteLine("Unsupported option: " + option);
        }

        if (raw != null)
        {
            // Flatten every kind of line break / tab into a plain space so the
            // final whitespace split sees a single-line string.
            text = raw.ToLower().Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Exception found");
        Console.WriteLine(e.Message);
    }

    // Parsing failed (already logged above) or the option was not recognised:
    // there is nothing to split, so return "no words" instead of dereferencing null.
    if (text == null)
    {
        return new string[0];
    }

    text = RemovePunctuation(text);

    // A null separator array makes Split treat any whitespace as a delimiter.
    string[] words = text.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
    return words;
}
// Parses SampleDoc.docx with the document parser and checks that the paragraphs
// come back with the expected count, order and text.
public void TestParseSimpleDocumentFromWord()
{
    string[] expectedParagraphs =
    {
        "I am a test document",
        "This is page 1",
        "I am Calibri (Body) in font size 11",
        "\n",
        "This is page two",
        "It’s Arial Black in 16 point",
        "It’s also in blue"
    };

    string samplePath = TestDataSample.GetWordPath("SampleDoc.docx");
    ParserContext parserContext = new ParserContext(samplePath);
    IDocumentParser documentParser = ParserFactory.CreateDocument(parserContext);

    ToxyDocument parsed = documentParser.Parse();

    Assert.AreEqual(expectedParagraphs.Length, parsed.Paragraphs.Count);
    for (int index = 0; index < expectedParagraphs.Length; index++)
    {
        Assert.AreEqual(expectedParagraphs[index], parsed.Paragraphs[index].Text);
    }
}
// Parses simple-table.docx with the document parser and checks the paragraph
// list: the two body paragraphs come first, followed by the six table cells.
public void TestParseDocumentWithTable()
{
    string[] expectedParagraphs =
    {
        "This is a Word document that was created using Word 97 – SR2. It contains a paragraph, a table consisting of 2 rows and 3 columns and a final paragraph.",
        "This text is below the table.",
        "Cell 1,1",
        "Cell 1,2",
        "Cell 1,3",
        "Cell 2,1",
        "Cell 2,2",
        "Cell 2,3"
    };

    string samplePath = TestDataSample.GetWordPath("simple-table.docx");
    ParserContext parserContext = new ParserContext(samplePath);
    IDocumentParser documentParser = ParserFactory.CreateDocument(parserContext);

    ToxyDocument parsed = documentParser.Parse();

    Assert.AreEqual(expectedParagraphs.Length, parsed.Paragraphs.Count);
    for (int index = 0; index < expectedParagraphs.Length; index++)
    {
        Assert.AreEqual(expectedParagraphs[index], parsed.Paragraphs[index].Text);
    }
}
/// <summary>
/// Parses the file at <paramref name="filePath"/> with the document parser and
/// returns the string form of the parsed document.
/// If parsing throws <see cref="InvalidDataException"/>, the file is copied to
/// "<paramref name="filePath"/>.<paramref name="extension"/>" and parsing is
/// retried once on that copy.
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <param name="extension">
/// Extension (without the leading dot) appended to the copy used for the retry.
/// </param>
/// <returns>
/// The extracted text, or <c>null</c> when parsing fails, the retry fails, or
/// the copy for the retry cannot be created. Failures are written to standard error.
/// </returns>
public string ExtractText(string filePath, string extension)
{
    return ExtractTextCore(filePath, extension, true);
}

// Does the actual parsing; allowRetry is true only for the first attempt so the
// copy-and-retry fallback can never recurse more than once.
private string ExtractTextCore(string filePath, string extension, bool allowRetry)
{
    ParserContext c = new ParserContext(filePath);
    try
    {
        IDocumentParser parser = ParserFactory.CreateDocument(c);
        ToxyDocument result = parser.Parse();
        return result.ToString();
    }
    catch (InvalidDataException) when (allowRetry)
    {
        Console.Error.WriteLine($"'{filePath}' is supported but don't have the required extension.");
        var newFilePath = $"{filePath}.{extension}";
        Console.Error.WriteLine($"Creating a copy in '{newFilePath}' and using that to read.");

        try
        {
            // Deliberately not overwriting: an existing file at the target path
            // may be unrelated, and destroying it would be worse than giving up.
            File.Copy(filePath, newFilePath);
        }
        catch (Exception copyError)
        {
            // Exceptions raised inside a catch block are not seen by the sibling
            // catch (Exception) below, so handle copy failures here.
            Console.Error.WriteLine("{0} Exception caught error with {1}.", copyError, newFilePath);
            return null;
        }

        return ExtractTextCore(newFilePath, extension, false);
    }
    catch (Exception e)
    {
        // Also reached for InvalidDataException on the retry (allowRetry == false).
        Console.Error.WriteLine("{0} Exception caught error with {1}.", e, filePath);
        return null;
    }
}