Example #1
2
        /// <summary>
        /// Reads the PDF at <c>Context.Path</c> and returns a document whose paragraphs
        /// are the extracted text lines of every page, in page order.
        /// </summary>
        /// <returns>A <see cref="ToxyDocument"/> holding one paragraph per '\n'-separated line of page text.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
                throw new FileNotFoundException("File " + Context.Path + " is not found");

            ToxyDocument rdoc = new ToxyDocument();

            using (PdfReader reader = new PdfReader(this.Context.Path))
            {
                // PDF page numbers are 1-based.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // A location strategy keeps every text chunk it has been fed, so each page
                    // gets its own instance; a shared one would repeat the text of earlier pages.
                    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();

                    string thePage = PdfTextExtractor.GetTextFromPage(reader, i, its);
                    string[] theLines = thePage.Split('\n');
                    foreach (var theLine in theLines)
                    {
                        ToxyParagraph para = new ToxyParagraph();
                        para.Text = theLine;
                        rdoc.Paragraphs.Add(para);
                    }
                }
            }
            return rdoc;
        }
Example #2
0
        /// <summary>
        /// Extracts the text of the PDF at <c>Context.Path</c>, page by page, and stores
        /// each extracted line as a paragraph of the returned document.
        /// </summary>
        /// <returns>A <see cref="ToxyDocument"/> with one paragraph for every '\n'-separated line of each page.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (PdfReader reader = new PdfReader(this.Context.Path))
            {
                // Pages are addressed from 1 to NumberOfPages inclusive.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // The strategy is created per page on purpose: a location strategy retains
                    // the chunks of everything it has rendered, so reusing it across pages
                    // would make every page's text include all previous pages.
                    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.LocationTextExtractionStrategy();

                    string   thePage  = PdfTextExtractor.GetTextFromPage(reader, i, its);
                    string[] theLines = thePage.Split('\n');
                    foreach (var theLine in theLines)
                    {
                        ToxyParagraph para = new ToxyParagraph();
                        para.Text = theLine;
                        rdoc.Paragraphs.Add(para);
                    }
                }
            }
            return(rdoc);
        }
Example #3
0
        /// <summary>
        /// Opens the PDF at <c>Context.Path</c> read-only and collects the text pieces
        /// returned by <c>ExtractText()</c> for each page as paragraphs.
        /// </summary>
        /// <returns>A <see cref="ToxyDocument"/> with one paragraph per extracted text item, in page order.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument result = new ToxyDocument();

            using (Stream input = File.OpenRead(Context.Path))
            {
                using (PdfDocument pdf = PdfReader.Open(input, PdfDocumentOpenMode.ReadOnly))
                {
                    // Pages are indexed from zero here.
                    for (int pageIndex = 0; pageIndex < pdf.PageCount; pageIndex++)
                    {
                        foreach (var fragment in pdf.Pages[pageIndex].ExtractText())
                        {
                            result.Paragraphs.Add(new ToxyParagraph { Text = fragment });
                        }
                    }
                }
            }

            return result;
        }
        /// <summary>
        /// Parses the binary Word document at <c>Context.Path</c> into a
        /// <see cref="ToxyDocument"/>: one paragraph (text plus style index) per paragraph
        /// of the main range, with header and footer text filled in on request.
        /// </summary>
        /// <remarks>
        /// The "ExtractHeader" and "ExtractFooter" entries of <c>Context.Properties</c>,
        /// interpreted by <c>Utility.IsTrue</c>, switch header/footer extraction on; both
        /// default to off when the key is absent.
        /// </remarks>
        /// <returns>The populated document.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            bool extractHeader = false;

            if (Context.Properties.ContainsKey("ExtractHeader"))
            {
                extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
            }
            bool extractFooter = false;

            if (Context.Properties.ContainsKey("ExtractFooter"))
            {
                extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (FileStream stream = File.OpenRead(Context.Path))
            {
                HWPFDocument worddoc = new HWPFDocument(stream);

                if (extractHeader)
                {
                    // Fetch the range once and reuse it for both the null check and the text.
                    var headerRange = worddoc.GetHeaderStoryRange();
                    if (headerRange != null)
                    {
                        rdoc.Header = headerRange.Text;
                    }
                }
                if (extractFooter)
                {
                    // NOTE(review): Footer is filled from the footnote range, as before;
                    // confirm that footnotes (rather than page footers) are what is wanted.
                    var footnoteRange = worddoc.GetFootnoteRange();
                    if (footnoteRange != null)
                    {
                        rdoc.Footer = footnoteRange.Text;
                    }
                }

                // The main range and its paragraph count do not change while iterating,
                // so obtain them once instead of on every pass through the loop.
                var mainRange = worddoc.GetRange();
                int paragraphCount = mainRange.NumParagraphs;
                for (int i = 0; i < paragraphCount; i++)
                {
                    Paragraph     para = mainRange.GetParagraph(i);
                    ToxyParagraph p    = new ToxyParagraph();
                    p.Text    = para.Text;
                    p.StyleID = para.GetStyleIndex().ToString();
                    rdoc.Paragraphs.Add(p);
                }
            }
            return(rdoc);
        }
Example #5
0
        /// <summary>
        /// Converts the binary Word document at <c>Context.Path</c> into a
        /// <see cref="ToxyDocument"/>. Every paragraph of the main range becomes a
        /// paragraph carrying its text and style index; header and footer text are
        /// included only when requested.
        /// </summary>
        /// <remarks>
        /// Header/footer extraction is enabled through the "ExtractHeader" and
        /// "ExtractFooter" keys of <c>Context.Properties</c> (evaluated with
        /// <c>Utility.IsTrue</c>); a missing key means "do not extract".
        /// </remarks>
        /// <returns>The populated document.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
                throw new FileNotFoundException("File " + Context.Path + " is not found");

            bool extractHeader = false;
            if (Context.Properties.ContainsKey("ExtractHeader"))
            {
                extractHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
            }
            bool extractFooter = false;
            if (Context.Properties.ContainsKey("ExtractFooter"))
            {
                extractFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
            }

            ToxyDocument rdoc = new ToxyDocument();

            using (FileStream stream = File.OpenRead(Context.Path))
            {
                HWPFDocument worddoc = new HWPFDocument(stream);

                if (extractHeader)
                {
                    // One call serves both the null check and the text read.
                    var headerRange = worddoc.GetHeaderStoryRange();
                    if (headerRange != null)
                    {
                        rdoc.Header = headerRange.Text;
                    }
                }
                if (extractFooter)
                {
                    // NOTE(review): the footer text comes from the footnote range, unchanged
                    // from the previous behavior; verify this is the intended source.
                    var footnoteRange = worddoc.GetFootnoteRange();
                    if (footnoteRange != null)
                    {
                        rdoc.Footer = footnoteRange.Text;
                    }
                }

                // Loop-invariant: take the main range and its paragraph count once.
                var mainRange = worddoc.GetRange();
                int paragraphCount = mainRange.NumParagraphs;
                for (int i = 0; i < paragraphCount; i++)
                {
                    Paragraph para = mainRange.GetParagraph(i);
                    ToxyParagraph p = new ToxyParagraph();
                    p.Text = para.Text;
                    p.StyleID = para.GetStyleIndex().ToString();
                    rdoc.Paragraphs.Add(p);
                }
            }
            return rdoc;
        }
Example #6
0
        /// <summary>
        /// Parses SampleDoc.docx through the parser factory and checks the paragraph
        /// count and the text of every paragraph, in order.
        /// </summary>
        public void TestParseSimpleDocumentFromWord()
        {
            var sampleContext = new ParserContext(TestDataSample.GetWordPath("SampleDoc.docx"));
            ToxyDocument parsed = ParserFactory.CreateDocument(sampleContext).Parse();

            // Expected paragraph texts, listed in document order.
            string[] expectedTexts =
            {
                "I am a test document",
                "This is page 1",
                "I am Calibri (Body) in font size 11",
                "\n",
                "This is page two",
                "It’s Arial Black in 16 point",
                "It’s also in blue",
            };

            Assert.AreEqual(expectedTexts.Length, parsed.Paragraphs.Count);
            for (int index = 0; index < expectedTexts.Length; index++)
            {
                Assert.AreEqual(expectedTexts[index], parsed.Paragraphs[index].Text);
            }
        }
Example #7
0
        /// <summary>
        /// Parses simple-table.docx and checks that the two body paragraphs come first,
        /// followed by the six table cells in row-major order.
        /// </summary>
        public void TestParseDocumentWithTable()
        {
            var tableContext = new ParserContext(TestDataSample.GetWordPath("simple-table.docx"));
            ToxyDocument parsed = ParserFactory.CreateDocument(tableContext).Parse();

            // Expected paragraph texts in the order the parser emits them.
            string[] expectedTexts =
            {
                "This is a Word document that was created using Word 97 – SR2.  It contains a paragraph, a table consisting of 2 rows and 3 columns and a final paragraph.",
                "This text is below the table.",
                "Cell 1,1",
                "Cell 1,2",
                "Cell 1,3",
                "Cell 2,1",
                "Cell 2,2",
                "Cell 2,3",
            };

            Assert.AreEqual(expectedTexts.Length, parsed.Paragraphs.Count);
            for (int index = 0; index < expectedTexts.Length; index++)
            {
                Assert.AreEqual(expectedTexts[index], parsed.Paragraphs[index].Text);
            }
        }
Example #8
0
        /// <summary>
        /// Reads the .docx file at <c>Context.Path</c> and returns its paragraphs:
        /// first every paragraph in <c>Paragraphs</c> of the document, then the
        /// paragraphs of every table cell (tables, rows and cells in their listed order).
        /// </summary>
        /// <returns>A <see cref="ToxyDocument"/> whose paragraphs carry text and style id.</returns>
        public ToxyDocument Parse()
        {
            ToxyDocument result = new ToxyDocument();

            using (FileStream input = File.OpenRead(Context.Path))
            {
                XWPFDocument word = new XWPFDocument(input);

                // Body paragraphs come first.
                foreach (var bodyParagraph in word.Paragraphs)
                {
                    result.Paragraphs.Add(new ToxyParagraph
                    {
                        Text = bodyParagraph.ParagraphText,
                        StyleID = bodyParagraph.Style
                    });
                }

                // Table content is appended afterwards, cell by cell.
                foreach (var table in word.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        foreach (var cell in row.GetTableCells())
                        {
                            foreach (var cellParagraph in cell.Paragraphs)
                            {
                                result.Paragraphs.Add(new ToxyParagraph
                                {
                                    Text = cellParagraph.ParagraphText,
                                    StyleID = cellParagraph.Style
                                });
                            }
                        }
                    }
                }
            }

            return result;
        }
Example #9
0
        /// <summary>
        /// Parses the file at <paramref name="filePath"/> with the parser returned by
        /// <c>ParserFactory.CreateDocument</c> and returns the parsed document's string form.
        /// When parsing raises <see cref="InvalidDataException"/>, the file is copied to
        /// "<paramref name="filePath"/>.<paramref name="extension"/>" and the copy is parsed instead.
        /// </summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <param name="extension">Extension, without the leading dot, appended to the copy's name.</param>
        /// <returns>
        /// The result of <c>ToString()</c> on the parsed document, or <c>null</c> when parsing
        /// fails and no retry is possible.
        /// </returns>
        /// <exception cref="IOException">
        /// Thrown by <see cref="File.Copy(string, string)"/> when the copy cannot be created,
        /// for example because the destination already exists; errors raised while copying
        /// are not converted into a <c>null</c> result.
        /// </exception>
        public string ExtractText(string filePath, string extension)
        {
            ParserContext c = new ParserContext(filePath);

            try
            {
                IDocumentParser parser = ParserFactory.CreateDocument(c);
                ToxyDocument    result = parser.Parse();
                return(result.ToString());
            }
            catch (InvalidDataException e)
            {
                // The retry can only help when appending the extension changes the file name's
                // ending. If the path already ends with it, a further copy would fail the same
                // way and the method would recurse (and create copies) without end.
                if (filePath.EndsWith("." + extension, StringComparison.Ordinal))
                {
                    Console.Error.WriteLine("{0} Exception caught error with {1}.", e, filePath);
                    return(null);
                }

                Console.Error.WriteLine($"'{filePath}' is supported but don't have the required extension.");
                var newFilePath = $"{filePath}.{extension}";
                Console.Error.WriteLine($"Creating a copy in '{newFilePath}' and using that to read.");
                File.Copy(filePath, newFilePath);
                return(ExtractText(newFilePath, extension));
            }
            catch (Exception e)
            {
                Console.Error.WriteLine("{0} Exception caught error with {1}.", e, filePath);
                return(null);
            }
        }
Example #10
0
        /// <summary>
        /// Opens the PDF at <c>Context.Path</c> in read-only mode and turns every text
        /// item returned by <c>ExtractText()</c> on each page into a paragraph.
        /// </summary>
        /// <returns>A <see cref="ToxyDocument"/> containing the extracted items in page order.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            ToxyDocument result = new ToxyDocument();

            using (Stream input = File.OpenRead(Context.Path))
            {
                using (PdfDocument pdf = PdfReader.Open(input, PdfDocumentOpenMode.ReadOnly))
                {
                    // Zero-based page index.
                    for (int pageIndex = 0; pageIndex < pdf.PageCount; pageIndex++)
                    {
                        foreach (var fragment in pdf.Pages[pageIndex].ExtractText())
                        {
                            result.Paragraphs.Add(new ToxyParagraph { Text = fragment });
                        }
                    }
                }
            }

            return result;
        }
Example #11
0
        /// <summary>
        /// Parses the .docx file at <c>Context.Path</c>. The result holds the document's
        /// paragraphs followed by the paragraphs of all table cells, plus the concatenated
        /// header and footer texts when the "ExtractHeader" / "ExtractFooter" entries of
        /// <c>Context.Properties</c> are truthy according to <c>Utility.IsTrue</c>.
        /// </summary>
        /// <returns>The populated <see cref="ToxyDocument"/>.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            // Both options stay off unless their key is present in the properties.
            bool wantHeader = false;
            if (Context.Properties.ContainsKey("ExtractHeader"))
            {
                wantHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
            }

            bool wantFooter = false;
            if (Context.Properties.ContainsKey("ExtractFooter"))
            {
                wantFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
            }

            ToxyDocument result = new ToxyDocument();

            using (FileStream input = File.OpenRead(Context.Path))
            {
                XWPFDocument word = new XWPFDocument(input);

                if (wantHeader && word.HeaderList != null)
                {
                    // Each header's text goes on its own line.
                    StringBuilder headerText = new StringBuilder();
                    foreach (var header in word.HeaderList)
                    {
                        headerText.AppendLine(header.Text);
                    }
                    result.Header = headerText.ToString();
                }

                if (wantFooter && word.FooterList != null)
                {
                    // Each footer's text goes on its own line.
                    StringBuilder footerText = new StringBuilder();
                    foreach (var footer in word.FooterList)
                    {
                        footerText.AppendLine(footer.Text);
                    }
                    result.Footer = footerText.ToString();
                }

                // Body paragraphs first ...
                foreach (var bodyParagraph in word.Paragraphs)
                {
                    result.Paragraphs.Add(new ToxyParagraph
                    {
                        Text = bodyParagraph.ParagraphText,
                        StyleID = bodyParagraph.Style
                    });
                }

                // ... then the paragraphs of every table cell.
                foreach (var table in word.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        foreach (var cell in row.GetTableCells())
                        {
                            foreach (var cellParagraph in cell.Paragraphs)
                            {
                                result.Paragraphs.Add(new ToxyParagraph
                                {
                                    Text = cellParagraph.ParagraphText,
                                    StyleID = cellParagraph.Style
                                });
                            }
                        }
                    }
                }
            }

            return result;
        }
Example #12
0
        /// <summary>
        /// Builds a <see cref="ToxyDocument"/> from the .docx file at <c>Context.Path</c>:
        /// optional header and footer text (controlled by the "ExtractHeader" and
        /// "ExtractFooter" properties), the document's paragraphs, and finally the
        /// paragraphs found inside table cells.
        /// </summary>
        /// <returns>The populated document.</returns>
        /// <exception cref="FileNotFoundException">No file exists at <c>Context.Path</c>.</exception>
        public ToxyDocument Parse()
        {
            if (!File.Exists(Context.Path))
            {
                throw new FileNotFoundException("File " + Context.Path + " is not found");
            }

            // A missing key leaves the corresponding option disabled.
            bool includeHeader = false;
            if (Context.Properties.ContainsKey("ExtractHeader"))
            {
                includeHeader = Utility.IsTrue(Context.Properties["ExtractHeader"]);
            }

            bool includeFooter = false;
            if (Context.Properties.ContainsKey("ExtractFooter"))
            {
                includeFooter = Utility.IsTrue(Context.Properties["ExtractFooter"]);
            }

            ToxyDocument output = new ToxyDocument();

            using (FileStream source = File.OpenRead(Context.Path))
            {
                XWPFDocument docx = new XWPFDocument(source);

                if (includeHeader && docx.HeaderList != null)
                {
                    // One line per header.
                    StringBuilder headerLines = new StringBuilder();
                    foreach (var header in docx.HeaderList)
                    {
                        headerLines.AppendLine(header.Text);
                    }
                    output.Header = headerLines.ToString();
                }

                if (includeFooter && docx.FooterList != null)
                {
                    // One line per footer.
                    StringBuilder footerLines = new StringBuilder();
                    foreach (var footer in docx.FooterList)
                    {
                        footerLines.AppendLine(footer.Text);
                    }
                    output.Footer = footerLines.ToString();
                }

                // Paragraphs of the document body.
                foreach (var bodyParagraph in docx.Paragraphs)
                {
                    output.Paragraphs.Add(new ToxyParagraph
                    {
                        Text = bodyParagraph.ParagraphText,
                        StyleID = bodyParagraph.Style
                    });
                }

                // Paragraphs nested in tables are appended after the body.
                foreach (var table in docx.Tables)
                {
                    foreach (var row in table.Rows)
                    {
                        foreach (var cell in row.GetTableCells())
                        {
                            foreach (var cellParagraph in cell.Paragraphs)
                            {
                                output.Paragraphs.Add(new ToxyParagraph
                                {
                                    Text = cellParagraph.ParagraphText,
                                    StyleID = cellParagraph.Style
                                });
                            }
                        }
                    }
                }
            }

            return output;
        }