/// <summary>
/// Reads the PDF at <c>Context.Path</c> page by page and returns its text as a
/// <see cref="ToxyDocument"/> containing one paragraph per line of page text.
/// </summary>
/// <returns>
/// A document whose <c>Paragraphs</c> hold each page's text split on <c>'\n'</c>, in page order.
/// </returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (PdfReader reader = new PdfReader(this.Context.Path))
    {
        // iTextSharp page numbers are 1-based.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            // A fresh strategy is created for every page. The location strategy keeps the
            // text chunks it has been fed, so sharing one instance across pages makes each
            // later page's result also contain the text of all earlier pages.
            ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
            string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);

            foreach (string theLine in thePage.Split('\n'))
            {
                ToxyParagraph para = new ToxyParagraph();
                para.Text = theLine;
                rdoc.Paragraphs.Add(para);
            }
        }
    }
    return rdoc;
}
/// <summary>
/// Extracts the text of every page of the PDF located at <c>Context.Path</c> and
/// returns it as a <see cref="ToxyDocument"/>, adding one paragraph per text line.
/// </summary>
/// <returns>
/// A document whose <c>Paragraphs</c> contain the page text split on <c>'\n'</c>, page after page.
/// </returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (PdfReader reader = new PdfReader(this.Context.Path))
    {
        // Pages are addressed starting at 1.
        for (int i = 1; i <= reader.NumberOfPages; i++)
        {
            // The extraction strategy must not be shared between pages: it retains the
            // chunks collected so far, which would make page N's text include pages 1..N-1.
            ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();
            string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);

            string[] theLines = thePage.Split('\n');
            foreach (var theLine in theLines)
            {
                ToxyParagraph para = new ToxyParagraph();
                para.Text = theLine;
                rdoc.Paragraphs.Add(para);
            }
        }
    }
    return rdoc;
}
/// <summary>
/// Opens the PDF at <c>Context.Path</c> read-only and collects the text returned by
/// <c>ExtractText()</c> for each page into a <see cref="ToxyDocument"/>.
/// </summary>
/// <returns>A document with one paragraph per extracted text item, in page order.</returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    bool fileMissing = !File.Exists(Context.Path);
    if (fileMissing)
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument result = new ToxyDocument();
    using (Stream input = File.OpenRead(Context.Path))
    {
        using (PdfDocument pdf = PdfReader.Open(input, PdfDocumentOpenMode.ReadOnly))
        {
            int pageIndex = 0;
            while (pageIndex < pdf.PageCount)
            {
                // Each item yielded by ExtractText() becomes its own paragraph.
                foreach (var fragment in pdf.Pages[pageIndex].ExtractText())
                {
                    result.Paragraphs.Add(new ToxyParagraph { Text = fragment });
                }
                pageIndex++;
            }
        }
    }
    return result;
}
/// <summary>
/// Reads the binary Word document at <c>Context.Path</c> and returns its paragraphs as a
/// <see cref="ToxyDocument"/>. Header and footer text are included only when the
/// <c>ExtractHeader</c> / <c>ExtractFooter</c> entries of <c>Context.Properties</c> are
/// present and evaluate to true via <c>Utility.IsTrue</c>.
/// </summary>
/// <returns>
/// A document with one paragraph per paragraph of the overall range; each paragraph's
/// <c>StyleID</c> is the string form of its style index.
/// </returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    bool extractHeader = false;
    if (Context.Properties.ContainsKey("ExtractHeader"))
    {
        extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
    }

    bool extractFooter = false;
    if (Context.Properties.ContainsKey("ExtractFooter"))
    {
        extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (FileStream stream = File.OpenRead(Context.Path))
    {
        HWPFDocument worddoc = new HWPFDocument(stream);

        if (extractHeader)
        {
            // Fetch the range once instead of once for the null check and again for the text.
            var headerRange = worddoc.GetHeaderStoryRange();
            if (headerRange != null)
            {
                rdoc.Header = headerRange.Text;
            }
        }

        if (extractFooter)
        {
            // NOTE(review): Footer is filled from GetFootnoteRange(), which by its name is the
            // footnote range rather than a page footer - confirm this is the intended source.
            var footnoteRange = worddoc.GetFootnoteRange();
            if (footnoteRange != null)
            {
                rdoc.Footer = footnoteRange.Text;
            }
        }

        // The overall range and its paragraph count do not change while iterating,
        // so they are obtained once rather than on every loop pass.
        var bodyRange = worddoc.GetRange();
        int paragraphCount = bodyRange.NumParagraphs;
        for (int i = 0; i < paragraphCount; i++)
        {
            Paragraph para = bodyRange.GetParagraph(i);

            ToxyParagraph p = new ToxyParagraph();
            p.Text = para.Text;
            p.StyleID = para.GetStyleIndex().ToString();
            rdoc.Paragraphs.Add(p);
        }
    }
    return rdoc;
}
/// <summary>
/// Parses the binary Word document at <c>Context.Path</c> into a <see cref="ToxyDocument"/>.
/// The <c>ExtractHeader</c> and <c>ExtractFooter</c> entries of <c>Context.Properties</c>
/// (interpreted with <c>Utility.IsTrue</c>; absent means false) control whether
/// <c>Header</c> and <c>Footer</c> are populated.
/// </summary>
/// <returns>
/// A document holding one paragraph per paragraph of the overall range, with
/// <c>StyleID</c> set to the paragraph's style index rendered as a string.
/// </returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    bool extractHeader = false;
    if (Context.Properties.ContainsKey("ExtractHeader"))
    {
        extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
    }

    bool extractFooter = false;
    if (Context.Properties.ContainsKey("ExtractFooter"))
    {
        extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
    }

    ToxyDocument rdoc = new ToxyDocument();
    using (FileStream stream = File.OpenRead(Context.Path))
    {
        HWPFDocument worddoc = new HWPFDocument(stream);

        if (extractHeader)
        {
            // One call serves both the null check and the text read.
            var headerRange = worddoc.GetHeaderStoryRange();
            if (headerRange != null)
            {
                rdoc.Header = headerRange.Text;
            }
        }

        if (extractFooter)
        {
            // NOTE(review): the footer text comes from GetFootnoteRange(); the name suggests
            // footnotes, not page footers - verify against the intended behavior.
            var footnoteRange = worddoc.GetFootnoteRange();
            if (footnoteRange != null)
            {
                rdoc.Footer = footnoteRange.Text;
            }
        }

        // Resolve the overall range and paragraph count once, outside the loop.
        var bodyRange = worddoc.GetRange();
        int paragraphCount = bodyRange.NumParagraphs;
        for (int i = 0; i < paragraphCount; i++)
        {
            Paragraph para = bodyRange.GetParagraph(i);

            ToxyParagraph p = new ToxyParagraph();
            p.Text = para.Text;
            p.StyleID = para.GetStyleIndex().ToString();
            rdoc.Paragraphs.Add(p);
        }
    }
    return rdoc;
}
/// <summary>
/// Reads the Word (OOXML) document at <c>Context.Path</c> and returns its text as a
/// <see cref="ToxyDocument"/>: first every entry of the document's <c>Paragraphs</c>,
/// then every paragraph found inside the cells of the document's <c>Tables</c>.
/// </summary>
/// <returns>
/// A document whose paragraphs carry the source paragraph's text and style.
/// </returns>
public ToxyDocument Parse()
{
    ToxyDocument result = new ToxyDocument();
    using (FileStream input = File.OpenRead(Context.Path))
    {
        XWPFDocument docx = new XWPFDocument(input);

        // Entries of the document's Paragraphs collection.
        foreach (var bodyParagraph in docx.Paragraphs)
        {
            result.Paragraphs.Add(new ToxyParagraph
            {
                Text = bodyParagraph.ParagraphText,
                StyleID = bodyParagraph.Style
            });
        }

        // Paragraphs nested in tables are appended after the ones above.
        foreach (var table in docx.Tables)
        {
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var cellParagraph in cell.Paragraphs)
                    {
                        result.Paragraphs.Add(new ToxyParagraph
                        {
                            Text = cellParagraph.ParagraphText,
                            StyleID = cellParagraph.Style
                        });
                    }
                }
            }
        }
    }
    return result;
}
/// <summary>
/// Loads the PDF at <c>Context.Path</c> in read-only mode and turns every text item
/// produced by each page's <c>ExtractText()</c> into a paragraph of the result.
/// </summary>
/// <returns>A <see cref="ToxyDocument"/> with the extracted text in page order.</returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (File.Exists(Context.Path) == false)
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    ToxyDocument output = new ToxyDocument();
    using (Stream source = File.OpenRead(Context.Path))
    using (PdfDocument pdfDocument = PdfReader.Open(source, PdfDocumentOpenMode.ReadOnly))
    {
        for (int pageNo = 0; pageNo < pdfDocument.PageCount; pageNo++)
        {
            var page = pdfDocument.Pages[pageNo];
            foreach (var piece in page.ExtractText())
            {
                ToxyParagraph paragraph = new ToxyParagraph { Text = piece };
                output.Paragraphs.Add(paragraph);
            }
        }
    }
    return output;
}
/// <summary>
/// Reads the Word (OOXML) document at <c>Context.Path</c> into a <see cref="ToxyDocument"/>.
/// Header and footer text are gathered only when the <c>ExtractHeader</c> /
/// <c>ExtractFooter</c> entries of <c>Context.Properties</c> exist and
/// <c>Utility.IsTrue</c> accepts them.
/// </summary>
/// <returns>
/// A document containing the entries of the document's <c>Paragraphs</c> followed by the
/// paragraphs of all table cells, each with its text and style; <c>Header</c>/<c>Footer</c>
/// hold one line per header/footer when requested.
/// </returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (!File.Exists(Context.Path))
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    // A missing property means "do not extract".
    bool wantHeader = Context.Properties.ContainsKey("ExtractHeader")
        && Utility.IsTrue(Context.Properties["ExtractHeader"]);
    bool wantFooter = Context.Properties.ContainsKey("ExtractFooter")
        && Utility.IsTrue(Context.Properties["ExtractFooter"]);

    ToxyDocument result = new ToxyDocument();
    using (FileStream input = File.OpenRead(Context.Path))
    {
        XWPFDocument docx = new XWPFDocument(input);

        if (wantHeader)
        {
            if (docx.HeaderList != null)
            {
                StringBuilder headerText = new StringBuilder();
                foreach (var header in docx.HeaderList)
                {
                    headerText.AppendLine(header.Text);
                }
                result.Header = headerText.ToString();
            }
        }

        if (wantFooter)
        {
            if (docx.FooterList != null)
            {
                StringBuilder footerText = new StringBuilder();
                foreach (var footer in docx.FooterList)
                {
                    footerText.AppendLine(footer.Text);
                }
                result.Footer = footerText.ToString();
            }
        }

        // Entries of the document's Paragraphs collection come first...
        foreach (var bodyParagraph in docx.Paragraphs)
        {
            result.Paragraphs.Add(new ToxyParagraph
            {
                Text = bodyParagraph.ParagraphText,
                StyleID = bodyParagraph.Style
            });
        }

        // ...then every paragraph found inside table cells.
        foreach (var table in docx.Tables)
        {
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var cellParagraph in cell.Paragraphs)
                    {
                        result.Paragraphs.Add(new ToxyParagraph
                        {
                            Text = cellParagraph.ParagraphText,
                            StyleID = cellParagraph.Style
                        });
                    }
                }
            }
        }
    }
    return result;
}
/// <summary>
/// Parses the Word (OOXML) document at <c>Context.Path</c>. The <c>ExtractHeader</c> and
/// <c>ExtractFooter</c> entries of <c>Context.Properties</c> (evaluated with
/// <c>Utility.IsTrue</c>, absent meaning false) decide whether header and footer text
/// are collected.
/// </summary>
/// <returns>
/// A <see cref="ToxyDocument"/> with the entries of the document's <c>Paragraphs</c> first
/// and table-cell paragraphs after them; <c>Header</c> and <c>Footer</c> contain one line
/// per header/footer when requested.
/// </returns>
/// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
public ToxyDocument Parse()
{
    if (File.Exists(Context.Path) == false)
    {
        throw new FileNotFoundException("File " + Context.Path + " is not found");
    }

    bool includeHeader = Context.Properties.ContainsKey("ExtractHeader")
        ? Utility.IsTrue(Context.Properties["ExtractHeader"])
        : false;
    bool includeFooter = Context.Properties.ContainsKey("ExtractFooter")
        ? Utility.IsTrue(Context.Properties["ExtractFooter"])
        : false;

    ToxyDocument parsed = new ToxyDocument();
    using (FileStream fileStream = File.OpenRead(Context.Path))
    {
        XWPFDocument document = new XWPFDocument(fileStream);

        if (includeHeader && document.HeaderList != null)
        {
            // One line per header, joined into a single string.
            StringBuilder headerBuilder = new StringBuilder();
            foreach (var headerPart in document.HeaderList)
            {
                headerBuilder.AppendLine(headerPart.Text);
            }
            parsed.Header = headerBuilder.ToString();
        }

        if (includeFooter && document.FooterList != null)
        {
            // One line per footer, joined into a single string.
            StringBuilder footerBuilder = new StringBuilder();
            foreach (var footerPart in document.FooterList)
            {
                footerBuilder.AppendLine(footerPart.Text);
            }
            parsed.Footer = footerBuilder.ToString();
        }

        foreach (var docParagraph in document.Paragraphs)
        {
            ToxyParagraph converted = new ToxyParagraph
            {
                Text = docParagraph.ParagraphText,
                StyleID = docParagraph.Style
            };
            parsed.Paragraphs.Add(converted);
        }

        // Table content is appended after the paragraphs collected above.
        foreach (var table in document.Tables)
        {
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.GetTableCells())
                {
                    foreach (var innerParagraph in cell.Paragraphs)
                    {
                        ToxyParagraph converted = new ToxyParagraph
                        {
                            Text = innerParagraph.ParagraphText,
                            StyleID = innerParagraph.Style
                        };
                        parsed.Paragraphs.Add(converted);
                    }
                }
            }
        }
    }
    return parsed;
}