/// <summary>
/// Reads the PDF file at <c>Context.Path</c> and returns a document that
/// contains one paragraph per line of extracted text, in page order.
/// </summary>
/// <returns>A <see cref="ToxyDocument"/> whose paragraphs hold the extracted lines.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (PdfReader reader = new PdfReader(this.Context.Path))
    {
        // PDF page numbers are 1-based here.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            // A new strategy is created for every page. The strategy object keeps
            // the text chunks it has already received, so sharing one instance
            // across pages makes each later page's result repeat the earlier pages.
            ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
            string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);

            foreach (string theLine in thePage.Split('\n'))
            {
                ToxyParagraph para = new ToxyParagraph();
                para.Text = theLine;
                rdoc.Paragraphs.Add(para);
            }
        }
    }

    return rdoc;
}
/// <summary>
/// Extracts the text of every page of the PDF at <c>Context.Path</c> and
/// stores each text line as a separate paragraph of the returned document.
/// </summary>
/// <returns>A <see cref="ToxyDocument"/> with one paragraph per extracted line, in page order.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (PdfReader reader = new PdfReader(this.Context.Path))
    {
        int pageCount = reader.NumberOfPages;

        // Pages are addressed starting at 1.
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // Use a dedicated strategy per page: a reused strategy instance still
            // holds the chunks of the pages processed before, which would duplicate
            // their text in the output of every following page.
            ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
            string thePage = PdfTextExtractor.GetTextFromPage(reader, pageNumber, its);

            string[] theLines = thePage.Split('\n');
            foreach (string theLine in theLines)
            {
                ToxyParagraph para = new ToxyParagraph();
                para.Text = theLine;
                rdoc.Paragraphs.Add(para);
            }
        }
    }

    return rdoc;
}
/// <summary>
/// Opens the PDF at <c>Context.Path</c> read-only and turns every text item
/// returned by <c>ExtractText()</c> for each page into a paragraph.
/// </summary>
/// <returns>A <see cref="ToxyDocument"/> holding the extracted text items in page order.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    bool fileIsPresent = File.Exists(Context.Path);
    if (!fileIsPresent)
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument result = new ToxyDocument();

    using (Stream input = File.OpenRead(Context.Path))
    {
        using (PdfDocument pdf = PdfReader.Open(input, PdfDocumentOpenMode.ReadOnly))
        {
            // Pages are indexed from zero in this API.
            for (int pageIndex = 0; pageIndex < pdf.PageCount; pageIndex++)
            {
                foreach (var item in pdf.Pages[pageIndex].ExtractText())
                {
                    ToxyParagraph paragraph = new ToxyParagraph { Text = item };
                    result.Paragraphs.Add(paragraph);
                }
            }
        }
    }

    return result;
}
/// <summary>
/// Reads the binary Word document at <c>Context.Path</c> and converts every
/// paragraph of its main range into a <see cref="ToxyParagraph"/>.
/// </summary>
/// <remarks>
/// The optional context properties <c>ExtractHeader</c> and <c>ExtractFooter</c>
/// (evaluated with <c>Utility.IsTrue</c>) control whether <c>Header</c> and
/// <c>Footer</c> of the result are filled in.
/// </remarks>
/// <returns>A <see cref="ToxyDocument"/> with one paragraph (text and style index) per source paragraph.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    bool extractHeader = false;
    if (Context.Properties.ContainsKey("ExtractHeader"))
    {
        extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
    }

    bool extractFooter = false;
    if (Context.Properties.ContainsKey("ExtractFooter"))
    {
        extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (FileStream stream = File.OpenRead(Context.Path))
    {
        HWPFDocument worddoc = new HWPFDocument(stream);

        if (extractHeader)
        {
            // Fetch the range once instead of once for the null check and once for the text.
            var headerRange = worddoc.GetHeaderStoryRange();
            if (headerRange != null)
            {
                rdoc.Header = headerRange.Text;
            }
        }

        if (extractFooter)
        {
            // NOTE(review): the footer is populated from the footnote range, exactly as
            // before; confirm that this is the intended source for footer text.
            var footnoteRange = worddoc.GetFootnoteRange();
            if (footnoteRange != null)
            {
                rdoc.Footer = footnoteRange.Text;
            }
        }

        // Obtain the document range a single time; asking for it on every
        // iteration rebuilt the range object twice per paragraph.
        var body = worddoc.GetRange();
        int paragraphCount = body.NumParagraphs;
        for (int i = 0; i < paragraphCount; i++)
        {
            Paragraph para = body.GetParagraph(i);

            ToxyParagraph p = new ToxyParagraph();
            p.Text = para.Text;
            p.StyleID = para.GetStyleIndex().ToString();
            rdoc.Paragraphs.Add(p);
        }
    }

    return rdoc;
}
/// <summary>
/// Parses the binary Word document at <c>Context.Path</c> into a
/// <see cref="ToxyDocument"/>, one <see cref="ToxyParagraph"/> per paragraph
/// of the document's main range.
/// </summary>
/// <remarks>
/// <c>Header</c> is filled only when the context property <c>ExtractHeader</c>
/// is true, and <c>Footer</c> only when <c>ExtractFooter</c> is true (both
/// evaluated with <c>Utility.IsTrue</c>).
/// </remarks>
/// <returns>The populated <see cref="ToxyDocument"/>.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    bool extractHeader = false;
    if (Context.Properties.ContainsKey("ExtractHeader"))
    {
        extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
    }

    bool extractFooter = false;
    if (Context.Properties.ContainsKey("ExtractFooter"))
    {
        extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (FileStream stream = File.OpenRead(Context.Path))
    {
        HWPFDocument worddoc = new HWPFDocument(stream);

        if (extractHeader)
        {
            // One lookup serves both the null check and the text read.
            var headerStory = worddoc.GetHeaderStoryRange();
            if (headerStory != null)
            {
                rdoc.Header = headerStory.Text;
            }
        }

        if (extractFooter)
        {
            // NOTE(review): footer text comes from the footnote range, unchanged from
            // the previous implementation; verify this is the desired range.
            var footnotes = worddoc.GetFootnoteRange();
            if (footnotes != null)
            {
                rdoc.Footer = footnotes.Text;
            }
        }

        // The overall range is requested once and reused; the previous loop
        // requested it again for the bound check and for each paragraph.
        var wholeDocument = worddoc.GetRange();
        int total = wholeDocument.NumParagraphs;
        for (int i = 0; i < total; i++)
        {
            Paragraph para = wholeDocument.GetParagraph(i);

            ToxyParagraph p = new ToxyParagraph();
            p.Text = para.Text;
            p.StyleID = para.GetStyleIndex().ToString();
            rdoc.Paragraphs.Add(p);
        }
    }

    return rdoc;
}
/// <summary>
/// Parses <c>SampleDoc.docx</c> through the parser factory and verifies the
/// number of paragraphs and the text of each one.
/// </summary>
public void TestParseSimpleDocumentFromWord()
{
    // Expected paragraph texts, in document order.
    string[] expectedTexts =
    {
        "I am a test document",
        "This is page 1",
        "I am Calibri (Body) in font size 11",
        "\n",
        "This is page two",
        "It’s Arial Black in 16 point",
        "It’s also in blue"
    };

    string samplePath = TestDataSample.GetWordPath("SampleDoc.docx");
    ParserContext context = new ParserContext(samplePath);
    IDocumentParser parser = ParserFactory.CreateDocument(context);

    ToxyDocument doc = parser.Parse();

    Assert.AreEqual(7, doc.Paragraphs.Count);
    for (int index = 0; index < expectedTexts.Length; index++)
    {
        Assert.AreEqual(expectedTexts[index], doc.Paragraphs[index].Text);
    }
}
/// <summary>
/// Parses <c>simple-table.docx</c> and verifies that the two body paragraphs
/// come first, followed by the six table cell texts in row order.
/// </summary>
public void TestParseDocumentWithTable()
{
    // Expected paragraph texts: body paragraphs first, then the table cells.
    string[] expectedTexts =
    {
        "This is a Word document that was created using Word 97 – SR2. It contains a paragraph, a table consisting of 2 rows and 3 columns and a final paragraph.",
        "This text is below the table.",
        "Cell 1,1",
        "Cell 1,2",
        "Cell 1,3",
        "Cell 2,1",
        "Cell 2,2",
        "Cell 2,3"
    };

    string samplePath = TestDataSample.GetWordPath("simple-table.docx");
    ParserContext context = new ParserContext(samplePath);
    IDocumentParser parser = ParserFactory.CreateDocument(context);

    ToxyDocument doc = parser.Parse();

    Assert.AreEqual(8, doc.Paragraphs.Count);
    for (int index = 0; index < expectedTexts.Length; index++)
    {
        Assert.AreEqual(expectedTexts[index], doc.Paragraphs[index].Text);
    }
}
/// <summary>
/// Reads the OOXML Word document at <c>Context.Path</c> and collects its
/// paragraphs: first the document's <c>Paragraphs</c>, then the paragraphs of
/// every table cell, walking tables, rows and cells in order.
/// </summary>
/// <returns>A <see cref="ToxyDocument"/> whose paragraphs carry the text and style of each source paragraph.</returns>
public ToxyDocument Parse()
{
    ToxyDocument result = new ToxyDocument();

    using (FileStream input = File.OpenRead(Context.Path))
    {
        XWPFDocument word = new XWPFDocument(input);

        // Paragraphs exposed directly by the document.
        foreach (var bodyParagraph in word.Paragraphs)
        {
            string bodyText = bodyParagraph.ParagraphText;
            ToxyParagraph converted = new ToxyParagraph
            {
                Text = bodyText,
                StyleID = bodyParagraph.Style
            };
            result.Paragraphs.Add(converted);
        }

        // Paragraphs nested inside table cells are appended afterwards.
        foreach (var table in word.Tables)
        {
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var cellParagraph in cell.Paragraphs)
                    {
                        string cellText = cellParagraph.ParagraphText;
                        ToxyParagraph converted = new ToxyParagraph
                        {
                            Text = cellText,
                            StyleID = cellParagraph.Style
                        };
                        result.Paragraphs.Add(converted);
                    }
                }
            }
        }
    }

    return result;
}
/// <summary>
/// Parses the file at <paramref name="filePath"/> and returns the string form
/// of the parsed document.
/// </summary>
/// <remarks>
/// When parsing throws <see cref="InvalidDataException"/>, the file is copied
/// to <c>filePath.extension</c> and parsing is retried on the copy. The retry
/// happens only while <paramref name="filePath"/> does not already end with
/// that extension, so a file that keeps failing cannot trigger an endless
/// chain of copies.
/// </remarks>
/// <param name="filePath">Path of the file to extract text from.</param>
/// <param name="extension">Extension (without the leading dot) appended to the copy used for the retry.</param>
/// <returns>The extracted text, or <c>null</c> when extraction fails.</returns>
public string ExtractText(string filePath, string extension)
{
    ParserContext c = new ParserContext(filePath);
    try
    {
        IDocumentParser parser = ParserFactory.CreateDocument(c);
        ToxyDocument result = parser.Parse();
        return result.ToString();
    }
    catch (InvalidDataException invalidData)
    {
        // Retrying only makes sense when renaming can change the outcome. If the
        // path already carries the extension (or no extension was supplied), a
        // further copy would fail the same way and recurse without bound.
        bool canRetry = !string.IsNullOrEmpty(extension)
            && !filePath.EndsWith("." + extension, StringComparison.OrdinalIgnoreCase);
        if (!canRetry)
        {
            Console.Error.WriteLine("{0} Exception caught error with {1}.", invalidData, filePath);
            return null;
        }

        Console.Error.WriteLine($"'{filePath}' is supported but don't have the required extension.");
        var newFilePath = $"{filePath}.{extension}";
        Console.Error.WriteLine($"Creating a copy in '{newFilePath}' and using that to read.");

        try
        {
            File.Copy(filePath, newFilePath);
        }
        catch (Exception copyError)
        {
            // An exception thrown inside this handler is not seen by the general
            // catch below, so report it here and keep returning null on failure.
            Console.Error.WriteLine("{0} Exception caught error with {1}.", copyError, filePath);
            return null;
        }

        return ExtractText(newFilePath, extension);
    }
    catch (Exception e)
    {
        Console.Error.WriteLine("{0} Exception caught error with {1}.", e, filePath);
        return null;
    }
}
/// <summary>
/// Loads the PDF at <c>Context.Path</c> in read-only mode and adds one
/// paragraph to the result for every text item that <c>ExtractText()</c>
/// yields for each page.
/// </summary>
/// <returns>A <see cref="ToxyDocument"/> containing the extracted text items, page by page.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    if (File.Exists(Context.Path) == false)
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument parsed = new ToxyDocument();

    using (Stream source = File.OpenRead(Context.Path))
    using (PdfDocument pdfDocument = PdfReader.Open(source, PdfDocumentOpenMode.ReadOnly))
    {
        int pageIndex = 0;
        while (pageIndex < pdfDocument.PageCount)
        {
            var pageTexts = pdfDocument.Pages[pageIndex].ExtractText();
            foreach (var pageText in pageTexts)
            {
                parsed.Paragraphs.Add(new ToxyParagraph { Text = pageText });
            }

            pageIndex++;
        }
    }

    return parsed;
}
/// <summary>
/// Reads the OOXML Word document at <c>Context.Path</c> and returns its
/// paragraphs: the document's <c>Paragraphs</c> first, then the paragraphs of
/// every table cell in table, row and cell order.
/// </summary>
/// <remarks>
/// When the context property <c>ExtractHeader</c> is true, the text of every
/// entry in <c>HeaderList</c> is written to <c>Header</c>, one entry per line.
/// <c>ExtractFooter</c> does the same for <c>FooterList</c> and <c>Footer</c>.
/// </remarks>
/// <returns>The populated <see cref="ToxyDocument"/>.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    // A missing option counts as "false".
    bool extractHeader = Context.Properties.ContainsKey("ExtractHeader")
        && Utility.IsTrue(Context.Properties["ExtractHeader"]);
    bool extractFooter = Context.Properties.ContainsKey("ExtractFooter")
        && Utility.IsTrue(Context.Properties["ExtractFooter"]);

    ToxyDocument result = new ToxyDocument();

    using (FileStream input = File.OpenRead(Context.Path))
    {
        XWPFDocument word = new XWPFDocument(input);

        if (extractHeader && word.HeaderList != null)
        {
            StringBuilder headerText = new StringBuilder();
            foreach (var header in word.HeaderList)
            {
                headerText.AppendLine(header.Text);
            }

            result.Header = headerText.ToString();
        }

        if (extractFooter && word.FooterList != null)
        {
            StringBuilder footerText = new StringBuilder();
            foreach (var footer in word.FooterList)
            {
                footerText.AppendLine(footer.Text);
            }

            result.Footer = footerText.ToString();
        }

        // Paragraphs exposed directly by the document.
        foreach (var bodyParagraph in word.Paragraphs)
        {
            string bodyText = bodyParagraph.ParagraphText;
            ToxyParagraph converted = new ToxyParagraph
            {
                Text = bodyText,
                StyleID = bodyParagraph.Style
            };
            result.Paragraphs.Add(converted);
        }

        // Table content is appended after the document paragraphs.
        foreach (var table in word.Tables)
        {
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var cellParagraph in cell.Paragraphs)
                    {
                        string cellText = cellParagraph.ParagraphText;
                        ToxyParagraph converted = new ToxyParagraph
                        {
                            Text = cellText,
                            StyleID = cellParagraph.Style
                        };
                        result.Paragraphs.Add(converted);
                    }
                }
            }
        }
    }

    return result;
}
/// <summary>
/// Parses the OOXML Word document at <c>Context.Path</c> into a
/// <see cref="ToxyDocument"/>. The document's <c>Paragraphs</c> are converted
/// first, followed by the paragraphs found in each table cell.
/// </summary>
/// <remarks>
/// <c>Header</c> and <c>Footer</c> are filled from <c>HeaderList</c> and
/// <c>FooterList</c> (one entry per line) only when the context properties
/// <c>ExtractHeader</c> and <c>ExtractFooter</c> respectively are true.
/// </remarks>
/// <returns>The populated <see cref="ToxyDocument"/>.</returns>
/// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
public ToxyDocument Parse()
{
    bool sourceExists = File.Exists(Context.Path);
    if (!sourceExists)
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    // Each flag stays false unless its property is present and true.
    bool wantsHeader = Context.Properties.ContainsKey("ExtractHeader")
        ? Utility.IsTrue(Context.Properties["ExtractHeader"])
        : false;
    bool wantsFooter = Context.Properties.ContainsKey("ExtractFooter")
        ? Utility.IsTrue(Context.Properties["ExtractFooter"])
        : false;

    ToxyDocument output = new ToxyDocument();

    using (FileStream fileStream = File.OpenRead(Context.Path))
    {
        XWPFDocument document = new XWPFDocument(fileStream);

        if (wantsHeader && document.HeaderList != null)
        {
            StringBuilder headerLines = new StringBuilder();
            foreach (var headerPart in document.HeaderList)
            {
                headerLines.AppendLine(headerPart.Text);
            }

            output.Header = headerLines.ToString();
        }

        if (wantsFooter && document.FooterList != null)
        {
            StringBuilder footerLines = new StringBuilder();
            foreach (var footerPart in document.FooterList)
            {
                footerLines.AppendLine(footerPart.Text);
            }

            output.Footer = footerLines.ToString();
        }

        // Document paragraphs come first in the result.
        foreach (var paragraph in document.Paragraphs)
        {
            string paragraphText = paragraph.ParagraphText;
            ToxyParagraph item = new ToxyParagraph();
            item.Text = paragraphText;
            item.StyleID = paragraph.Style;
            output.Paragraphs.Add(item);
        }

        // Then every paragraph inside every cell of every table row.
        foreach (var table in document.Tables)
        {
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var paragraph in cell.Paragraphs)
                    {
                        string paragraphText = paragraph.ParagraphText;
                        ToxyParagraph item = new ToxyParagraph();
                        item.Text = paragraphText;
                        item.StyleID = paragraph.Style;
                        output.Paragraphs.Add(item);
                    }
                }
            }
        }
    }

    return output;
}