Example #1
2
        /// <summary>
        /// Reads the PDF at <c>Context.Path</c> and returns its text as one paragraph per extracted line.
        /// </summary>
        /// <returns>
        /// A <see cref="ToxyDocument"/> whose paragraphs are the <c>'\n'</c>-separated lines of each page,
        /// appended in page order.
        /// </returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (PdfReader reader = new PdfReader(this.Context.Path))
            {
                // iText page numbers are 1-based.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // A strategy instance keeps the text chunks it has already received, so it must not be
                    // shared between pages: reusing one makes every page repeat the text of the pages before it.
                    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();

                    string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);
                    foreach (var theLine in thePage.Split('\n'))
                    {
                        ToxyParagraph para = new ToxyParagraph();
                        para.Text = theLine;
                        rdoc.Paragraphs.Add(para);
                    }
                }
            }

            return rdoc;
        }
Example #2
0
        /// <summary>
        /// Extracts the text of the PDF at <c>Context.Path</c>, producing one paragraph per line of page text.
        /// </summary>
        /// <returns>
        /// A <see cref="ToxyDocument"/> containing, for every page in order, one <see cref="ToxyParagraph"/>
        /// per <c>'\n'</c>-separated line of that page's extracted text.
        /// </returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (PdfReader reader = new PdfReader(this.Context.Path))
            {
                // iText page numbers start at 1.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // Create the strategy inside the loop: a LocationTextExtractionStrategy retains the chunks
                    // from earlier calls, so a shared instance would return pages 1..i when asked for page i.
                    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();

                    string   thePage  = PdfTextExtractor.GetTextFromPage(reader, i, its);
                    string[] theLines = thePage.Split('\n');
                    foreach (var theLine in theLines)
                    {
                        ToxyParagraph para = new ToxyParagraph();
                        para.Text = theLine;
                        rdoc.Paragraphs.Add(para);
                    }
                }
            }

            return rdoc;
        }
Example #3
0
        /// <summary>
        /// Opens the PDF at <c>Context.Path</c> read-only and collects the text items of every page.
        /// </summary>
        /// <returns>
        /// A <see cref="ToxyDocument"/> with one paragraph per item returned by <c>ExtractText()</c>,
        /// in page order.
        /// </returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            bool sourceExists = File.Exists(Context.Path);
            if (!sourceExists)
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument result = new ToxyDocument();

            using (Stream input = File.OpenRead(Context.Path))
            {
                using (PdfDocument pdf = PdfReader.Open(input, PdfDocumentOpenMode.ReadOnly))
                {
                    // Pages are addressed by zero-based index here.
                    for (int pageIndex = 0; pageIndex < pdf.PageCount; pageIndex++)
                    {
                        foreach (var fragment in pdf.Pages[pageIndex].ExtractText())
                        {
                            ToxyParagraph item = new ToxyParagraph { Text = fragment };
                            result.Paragraphs.Add(item);
                        }
                    }
                }
            }

            return result;
        }
        /// <summary>
        /// Reads the Word document at <c>Context.Path</c> through <c>HWPFDocument</c> and converts each
        /// paragraph of its main range into a <see cref="ToxyParagraph"/>.
        /// </summary>
        /// <remarks>
        /// The <c>"ExtractHeader"</c> and <c>"ExtractFooter"</c> entries of <c>Context.Properties</c>, when
        /// present and true according to <c>Utility.IsTrue</c>, additionally fill <c>Header</c> and
        /// <c>Footer</c> on the returned document.
        /// </remarks>
        /// <returns>The populated <see cref="ToxyDocument"/>.</returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            bool extractHeader = false;

            if (Context.Properties.ContainsKey("ExtractHeader"))
            {
                extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
            }
            bool extractFooter = false;

            if (Context.Properties.ContainsKey("ExtractFooter"))
            {
                extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (FileStream stream = File.OpenRead(Context.Path))
            {
                HWPFDocument worddoc = new HWPFDocument(stream);

                if (extractHeader)
                {
                    // Fetch the range once instead of once for the null test and again for the text.
                    var headerRange = worddoc.GetHeaderStoryRange();
                    if (headerRange != null)
                    {
                        rdoc.Header = headerRange.Text;
                    }
                }

                if (extractFooter)
                {
                    // NOTE(review): the footer is filled from GetFootnoteRange(); confirm that footnote
                    // text, rather than the page footer, is what callers expect here.
                    var footnoteRange = worddoc.GetFootnoteRange();
                    if (footnoteRange != null)
                    {
                        rdoc.Footer = footnoteRange.Text;
                    }
                }

                // The document is not modified in this loop, so the main range and its paragraph count
                // are obtained once rather than being re-requested on every iteration.
                var mainRange      = worddoc.GetRange();
                int paragraphCount = mainRange.NumParagraphs;
                for (int i = 0; i < paragraphCount; i++)
                {
                    Paragraph     para = mainRange.GetParagraph(i);
                    ToxyParagraph p    = new ToxyParagraph();
                    p.Text    = para.Text;
                    p.StyleID = para.GetStyleIndex().ToString();
                    rdoc.Paragraphs.Add(p);
                }
            }

            return rdoc;
        }
Example #5
0
        /// <summary>
        /// Parses the Word document at <c>Context.Path</c> with <c>HWPFDocument</c>, emitting one
        /// <see cref="ToxyParagraph"/> (text plus style index) per paragraph of the main range.
        /// </summary>
        /// <remarks>
        /// <c>Header</c> and <c>Footer</c> of the result are only filled when <c>Context.Properties</c>
        /// contains <c>"ExtractHeader"</c> / <c>"ExtractFooter"</c> and <c>Utility.IsTrue</c> accepts the value.
        /// </remarks>
        /// <returns>The populated <see cref="ToxyDocument"/>.</returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            bool extractHeader = false;
            if (Context.Properties.ContainsKey("ExtractHeader"))
            {
                extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
            }
            bool extractFooter = false;
            if (Context.Properties.ContainsKey("ExtractFooter"))
            {
                extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (FileStream stream = File.OpenRead(Context.Path))
            {
                HWPFDocument worddoc = new HWPFDocument(stream);

                if (extractHeader)
                {
                    // One call serves both the null test and the text read.
                    var headerRange = worddoc.GetHeaderStoryRange();
                    if (headerRange != null)
                    {
                        rdoc.Header = headerRange.Text;
                    }
                }

                if (extractFooter)
                {
                    // NOTE(review): Footer is sourced from GetFootnoteRange(), i.e. the footnote range;
                    // verify this is intended rather than the page footer.
                    var footnoteRange = worddoc.GetFootnoteRange();
                    if (footnoteRange != null)
                    {
                        rdoc.Footer = footnoteRange.Text;
                    }
                }

                // Nothing in the loop changes the document, so ask for the range and its paragraph
                // count a single time instead of on every pass.
                var mainRange = worddoc.GetRange();
                int paragraphCount = mainRange.NumParagraphs;
                for (int i = 0; i < paragraphCount; i++)
                {
                    Paragraph para = mainRange.GetParagraph(i);
                    ToxyParagraph p = new ToxyParagraph();
                    p.Text = para.Text;
                    p.StyleID = para.GetStyleIndex().ToString();
                    rdoc.Paragraphs.Add(p);
                }
            }

            return rdoc;
        }
Example #6
0
        /// <summary>
        /// Loads the file at <c>Context.Path</c> as an <c>XWPFDocument</c> and flattens it into paragraphs.
        /// </summary>
        /// <returns>
        /// A <see cref="ToxyDocument"/> holding first every entry of the document's <c>Paragraphs</c>
        /// collection, then the paragraphs of every cell of every row of every entry in <c>Tables</c>.
        /// Each paragraph carries its text and style.
        /// </returns>
        public ToxyDocument Parse()
        {
            ToxyDocument result = new ToxyDocument();

            using (FileStream input = File.OpenRead(Context.Path))
            {
                XWPFDocument source = new XWPFDocument(input);

                // Pass 1: the document's own paragraph collection.
                foreach (var bodyParagraph in source.Paragraphs)
                {
                    string content = bodyParagraph.ParagraphText;
                    ToxyParagraph item = new ToxyParagraph { Text = content, StyleID = bodyParagraph.Style };
                    result.Paragraphs.Add(item);
                }

                // Pass 2: paragraphs nested in tables, walked table -> row -> cell.
                foreach (var table in source.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        foreach (var cell in row.GetTableCells())
                        {
                            foreach (var cellParagraph in cell.Paragraphs)
                            {
                                string content = cellParagraph.ParagraphText;
                                ToxyParagraph item = new ToxyParagraph { Text = content, StyleID = cellParagraph.Style };
                                result.Paragraphs.Add(item);
                            }
                        }
                    }
                }
            }

            return result;
        }
Example #7
0
        /// <summary>
        /// Reads the PDF at <c>Context.Path</c> in read-only mode and turns each text item of each page
        /// into a paragraph.
        /// </summary>
        /// <returns>
        /// A <see cref="ToxyDocument"/> whose paragraphs follow page order, one per item yielded by
        /// <c>ExtractText()</c>.
        /// </returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            bool sourceExists = File.Exists(Context.Path);
            if (!sourceExists)
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument result = new ToxyDocument();

            using (Stream input = File.OpenRead(Context.Path))
            {
                using (PdfDocument pdf = PdfReader.Open(input, PdfDocumentOpenMode.ReadOnly))
                {
                    // Zero-based page index.
                    for (int pageIndex = 0; pageIndex < pdf.PageCount; pageIndex++)
                    {
                        foreach (var fragment in pdf.Pages[pageIndex].ExtractText())
                        {
                            ToxyParagraph item = new ToxyParagraph { Text = fragment };
                            result.Paragraphs.Add(item);
                        }
                    }
                }
            }

            return result;
        }
Example #8
0
        /// <summary>
        /// Loads the file at <c>Context.Path</c> as an <c>XWPFDocument</c> and converts it to a
        /// <see cref="ToxyDocument"/>, optionally including header and footer text.
        /// </summary>
        /// <remarks>
        /// Headers are collected when <c>Context.Properties</c> has an <c>"ExtractHeader"</c> entry that
        /// <c>Utility.IsTrue</c> accepts; footers likewise for <c>"ExtractFooter"</c>. Each header/footer
        /// text is appended as its own line.
        /// </remarks>
        /// <returns>
        /// The document's <c>Paragraphs</c> entries followed by all paragraphs found in table cells,
        /// each with text and style.
        /// </returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            bool sourceExists = File.Exists(Context.Path);
            if (!sourceExists)
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            // Both switches default to off when the property is absent.
            bool wantHeader = Context.Properties.ContainsKey("ExtractHeader")
                              && Utility.IsTrue(Context.Properties["ExtractHeader"]);
            bool wantFooter = Context.Properties.ContainsKey("ExtractFooter")
                              && Utility.IsTrue(Context.Properties["ExtractFooter"]);

            ToxyDocument result = new ToxyDocument();

            using (FileStream input = File.OpenRead(Context.Path))
            {
                XWPFDocument source = new XWPFDocument(input);

                if (wantHeader && source.HeaderList != null)
                {
                    StringBuilder headerText = new StringBuilder();
                    foreach (var header in source.HeaderList)
                    {
                        headerText.AppendLine(header.Text);
                    }
                    result.Header = headerText.ToString();
                }

                if (wantFooter && source.FooterList != null)
                {
                    StringBuilder footerText = new StringBuilder();
                    foreach (var footer in source.FooterList)
                    {
                        footerText.AppendLine(footer.Text);
                    }
                    result.Footer = footerText.ToString();
                }

                // Pass 1: the document's own paragraph collection.
                foreach (var bodyParagraph in source.Paragraphs)
                {
                    string content = bodyParagraph.ParagraphText;
                    ToxyParagraph item = new ToxyParagraph { Text = content, StyleID = bodyParagraph.Style };
                    result.Paragraphs.Add(item);
                }

                // Pass 2: paragraphs nested in tables, walked table -> row -> cell.
                foreach (var table in source.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        foreach (var cell in row.GetTableCells())
                        {
                            foreach (var cellParagraph in cell.Paragraphs)
                            {
                                string content = cellParagraph.ParagraphText;
                                ToxyParagraph item = new ToxyParagraph { Text = content, StyleID = cellParagraph.Style };
                                result.Paragraphs.Add(item);
                            }
                        }
                    }
                }
            }

            return result;
        }
Example #9
0
        /// <summary>
        /// Converts the file at <c>Context.Path</c>, read as an <c>XWPFDocument</c>, into a
        /// <see cref="ToxyDocument"/> of paragraphs with optional header and footer text.
        /// </summary>
        /// <remarks>
        /// The <c>"ExtractHeader"</c> and <c>"ExtractFooter"</c> keys of <c>Context.Properties</c> are
        /// evaluated with <c>Utility.IsTrue</c>; a missing key leaves the corresponding part out.
        /// Header and footer texts are joined one per line.
        /// </remarks>
        /// <returns>
        /// A document containing the entries of <c>Paragraphs</c> first, then every paragraph inside
        /// the cells of <c>Tables</c>, each with its text and style.
        /// </returns>
        /// <exception cref="FileNotFoundException">Thrown when <c>Context.Path</c> does not exist.</exception>
        public ToxyDocument Parse()
        {
            bool sourceExists = File.Exists(Context.Path);
            if (!sourceExists)
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            // A missing property means the corresponding part is not extracted.
            bool wantHeader = Context.Properties.ContainsKey("ExtractHeader")
                              && Utility.IsTrue(Context.Properties["ExtractHeader"]);
            bool wantFooter = Context.Properties.ContainsKey("ExtractFooter")
                              && Utility.IsTrue(Context.Properties["ExtractFooter"]);

            ToxyDocument result = new ToxyDocument();

            using (FileStream input = File.OpenRead(Context.Path))
            {
                XWPFDocument source = new XWPFDocument(input);

                if (wantHeader && source.HeaderList != null)
                {
                    StringBuilder headerText = new StringBuilder();
                    foreach (var header in source.HeaderList)
                    {
                        headerText.AppendLine(header.Text);
                    }
                    result.Header = headerText.ToString();
                }

                if (wantFooter && source.FooterList != null)
                {
                    StringBuilder footerText = new StringBuilder();
                    foreach (var footer in source.FooterList)
                    {
                        footerText.AppendLine(footer.Text);
                    }
                    result.Footer = footerText.ToString();
                }

                // Entries of the document's paragraph collection come first.
                foreach (var bodyParagraph in source.Paragraphs)
                {
                    string content = bodyParagraph.ParagraphText;
                    ToxyParagraph item = new ToxyParagraph { Text = content, StyleID = bodyParagraph.Style };
                    result.Paragraphs.Add(item);
                }

                // Then paragraphs found inside tables: table -> row -> cell -> paragraph.
                foreach (var table in source.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        foreach (var cell in row.GetTableCells())
                        {
                            foreach (var cellParagraph in cell.Paragraphs)
                            {
                                string content = cellParagraph.ParagraphText;
                                ToxyParagraph item = new ToxyParagraph { Text = content, StyleID = cellParagraph.Style };
                                result.Paragraphs.Add(item);
                            }
                        }
                    }
                }
            }

            return result;
        }