XWPFWordExtractor, Npoi.Core.XWPF.Extractor C# (CSharp) 코드 예제들

예제 #1

0

파일 보기

        /// <summary>
        /// Extracts the text of the "table_footnotes.docx" sample and checks that it
        /// contains the word "snoska".
        /// </summary>
        public void TestTableFootnotes()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("table_footnotes.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                Assert.IsTrue(extractor.Text.Contains("snoska"));
            }
            finally
            {
                // Close the extractor even when the assertion fails.
                extractor.Close();
            }
        }

예제 #2

0

파일 보기

        /// <summary>
        /// Extracts the text of "sample.docx" and checks that it is non-empty, starts and
        /// ends with the expected sentences, and contains exactly three newline characters.
        /// </summary>
        public void TestGetSimpleText()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("sample.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                Assert.IsTrue(text.Length > 0);

                // Check contents. Ordinal comparison keeps the prefix/suffix checks exact
                // and independent of the current culture.
                Assert.IsTrue(text.StartsWith(
                                  "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio.",
                                  StringComparison.Ordinal
                                  ));
                Assert.IsTrue(text.EndsWith(
                                  "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n",
                                  StringComparison.Ordinal
                                  ));

                // Check number of paragraphs by counting the newline characters
                int ps = 0;

                foreach (char c in text)
                {
                    if (c == '\n')
                    {
                        ps++;
                    }
                }
                Assert.AreEqual(3, ps);
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #3

0

파일 보기

        /// <summary>
        /// Extracts the text of "TestDocument.docx" twice: first with hyperlink fetching
        /// switched off, then switched on, and compares each result with the expected text.
        /// </summary>
        public void TestGetWithHyperlinks()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("TestDocument.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                // Now check contents
                extractor.SetFetchHyperlinks(false);
                Assert.AreEqual(
                    "This is a test document.\nThis bit is in bold and italic\n" +
                    "Back to normal\n" +
                    "This contains BOLD, ITALIC and BOTH, as well as RED and YELLOW text.\n" +
                    "We have a hyperlink here, and another.\n",
                    extractor.Text
                    );

                // One hyperlink is a real one, one is just to the top of page
                extractor.SetFetchHyperlinks(true);
                Assert.AreEqual(
                    "This is a test document.\nThis bit is in bold and italic\n" +
                    "Back to normal\n" +
                    "This contains BOLD, ITALIC and BOTH, as well as RED and YELLOW text.\n" +
                    "We have a hyperlink <http://poi.apache.org/> here, and another.\n",
                    extractor.Text
                    );
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #4

0

파일 보기

        /// <summary>
        /// Extracts the text of "drawing.docx" and checks that the result is non-empty.
        /// </summary>
        public void TestDrawings()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("drawing.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                Assert.IsTrue(text.Length > 0);
            }
            finally
            {
                // Close the extractor even when the assertion fails.
                extractor.Close();
            }
        }

예제 #5

0

파일 보기

        /// <summary>
        /// Extracts the text of "delins.docx" and checks that it contains the phrases
        /// "pendant worn" and "extremely well".
        /// </summary>
        public void TestInsertedDeletedText()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("delins.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                // Read the text once and run both checks against the same value.
                string text = extractor.Text;

                Assert.IsTrue(text.Contains("pendant worn"));
                Assert.IsTrue(text.Contains("extremely well"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #6

0

파일 보기

        /// <summary>
        /// Extracts the text of "FldSimple.docx" and checks that it is non-empty and
        /// contains the string "FldSimple.docx".
        /// </summary>
        public void TestFldSimpleContent()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("FldSimple.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                Assert.IsTrue(text.Length > 0);
                Assert.IsTrue(text.Contains("FldSimple.docx"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #7

0

파일 보기

        /// <summary>
        /// Extracts the text of the macro-enabled sample "45690.docm" and checks that it
        /// contains "2004", "2008" and "(120 ".
        /// </summary>
        public void TestDOCMFiles()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("45690.docm");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                // Read the text once and run all checks against the same value.
                string text = extractor.Text;

                Assert.IsTrue(text.Contains("2004"));
                Assert.IsTrue(text.Contains("2008"));
                Assert.IsTrue(text.Contains("(120 "));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #8

0

파일 보기

        /// <summary>
        /// Extracts the text of "Headers.docx" and checks that it contains
        /// "Section 1", "Section 2" and "Section 3".
        /// </summary>
        public void TestParagraphHeader()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("Headers.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                // Read the text once and run all checks against the same value.
                string text = extractor.Text;

                Assert.IsTrue(text.Contains("Section 1"));
                Assert.IsTrue(text.Contains("Section 2"));
                Assert.IsTrue(text.Contains("Section 3"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #9

0

파일 보기

        /// <summary>
        /// Extracts the text of "endnotes.docx" and checks that it contains "XXX" and the
        /// inline endnote reference marker "[endnoteRef:2]" in the expected position.
        /// </summary>
        public void TestEndnotes()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("endnotes.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                string text = extractor.Text;

                Assert.IsTrue(text.Contains("XXX"));
                Assert.IsTrue(text.Contains("tilaka [endnoteRef:2]or 'tika'"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #10

0

파일 보기

        /// <summary>
        /// Extracts the text of "footnotes.docx" and checks that it contains "snoska" and
        /// the inline footnote reference marker "[footnoteRef:1]" in the expected position.
        /// </summary>
        public void TestFootnotes()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("footnotes.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                // Both checks run against the same extracted value.
                Assert.IsTrue(text.Contains("snoska"));
                Assert.IsTrue(text.Contains("Eto ochen prostoy[footnoteRef:1] text so snoskoy"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #11

0

파일 보기

        /// <summary>
        /// Extracts the text of "FieldCodes.docx" and checks that it is non-empty and
        /// contains neither "AUTHOR" nor "CREATEDATE".
        /// </summary>
        public void TestNoFieldCodes()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("FieldCodes.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                Assert.IsTrue(text.Length > 0);
                Assert.IsFalse(text.Contains("AUTHOR"));
                Assert.IsFalse(text.Contains("CREATEDATE"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #12

0

파일 보기

        /// <summary>
        /// Regression check for sample "55733.docx": reading the extractor's text must
        /// complete without throwing. The extracted value itself is not inspected.
        /// </summary>
        public void TestBug55733()
        {
            XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("55733.docx");
            XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

            // Reading the property is the whole test; the result is intentionally unused.
            string extracted = wordExtractor.Text;

            wordExtractor.Close();
        }

예제 #13

0

파일 보기

        /// <summary>
        /// Extracts the text of "form_footnotes.docx" and checks that it contains
        /// "testdoc" and "test phrase"; the failure message includes the extracted text.
        /// </summary>
        public void TestFormFootnotes()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("form_footnotes.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                Assert.IsTrue(text.Contains("testdoc"), "Unable to find expected word in text\n" + text);
                Assert.IsTrue(text.Contains("test phrase"), "Unable to find expected word in text\n" + text);
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #14

0

파일 보기

        /// <summary>
        /// Extracts the text of "WithTabs.docx" and checks that it contains "a", a tab and
        /// "b" individually, and the complete sequence "a\tb\n".
        /// </summary>
        public void TestDocTabs()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("WithTabs.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                // Read the text once and run all checks against the same value.
                string text = extractor.Text;

                // Check bits
                Assert.IsTrue(text.Contains("a"));
                Assert.IsTrue(text.Contains("\t"));
                Assert.IsTrue(text.Contains("b"));

                // Now check the first paragraph in total
                Assert.IsTrue(text.Contains("a\tb\n"));
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #15

0

파일 보기

        /// <summary>
        /// Extracts the text of "ExternalEntityInText.docx" and checks that, once trimmed,
        /// it equals the expected sentence ending in an empty pair of quotes.
        /// </summary>
        public void TestFile()
        {
            XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("ExternalEntityInText.docx");
            XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

            String extracted = wordExtractor.Text;
            Assert.IsTrue(extracted.Length > 0);

            // The text after the colon must be empty: content from the POI web site
            // should not appear in the extracted text.
            String trimmed = extracted.Trim();
            Assert.AreEqual("Here should not be the POI web site: \"\"", trimmed);

            wordExtractor.Close();
        }

예제 #16

0

파일 보기

        /// <summary>
        /// Extracts the text of "checkboxes.docx" and compares it with the expected output,
        /// in which checkboxes appear as "|_|" and "|X|".
        /// </summary>
        public void TestCheckboxes()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("checkboxes.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                Assert.AreEqual("This is a small test for checkboxes \nunchecked: |_| \n" +
                                "Or checked: |X|\n\n\n\n\n" +
                                "Test a checkbox within a textbox: |_| -> |X|\n\n\n" +
                                "In Table:\n|_|\t|X|\n\n\n" +
                                "In Sequence:\n|X||_||X|\n", extractor.Text);
            }
            finally
            {
                // Close the extractor even when the assertion fails.
                extractor.Close();
            }
        }

예제 #17

0

파일 보기

        /// <summary>
        /// Checks the extracted text, including header and footer lines, for two samples:
        /// "ThreeColHeadFoot.docx" and "DiffFirstPageHeadFoot.docx". For the second sample
        /// the expected text contains two header lines and two footer lines.
        /// </summary>
        public void TestHeadersFooters()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("ThreeColHeadFoot.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                Assert.AreEqual(
                    "First header column!\tMid header\tRight header!\n" +
                    "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
                    "\n" +
                    "HEADING TEXT\n" +
                    "\n" +
                    "More on page one\n" +
                    "\n\n" +
                    "End of page 1\n\n\n" +
                    "This is page two. It also has a three column heading, and a three column footer.\n" +
                    "Footer Left\tFooter Middle\tFooter Right\n",
                    extractor.Text
                    );
            }
            finally
            {
                // Close the first extractor even when the assertion fails.
                extractor.Close();
            }

            // Now another file, expect multiple headers
            //  and multiple footers
            doc       = XWPFTestDataSamples.OpenSampleDocument("DiffFirstPageHeadFoot.docx");
            extractor = new XWPFWordExtractor(doc);

            try
            {
                Assert.AreEqual(
                    "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
                    "First header column!\tMid header\tRight header!\n" +
                    "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
                    "\n" +
                    "HEADING TEXT\n" +
                    "\n" +
                    "More on page one\n" +
                    "\n\n" +
                    "End of page 1\n\n\n" +
                    "This is page two. It also has a three column heading, and a three column footer.\n" +
                    "The footer of the first page\n" +
                    "Footer Left\tFooter Middle\tFooter Right\n",
                    extractor.Text
                    );
            }
            finally
            {
                // Close the second extractor even when the assertion fails.
                extractor.Close();
            }
        }

예제 #18

0

파일 보기

        /// <summary>
        /// Extracts the text of "IllustrativeCases.docx" and checks that it is non-empty,
        /// starts and ends with the expected fragments, contains an expected sentence with
        /// euro signs, and holds exactly 134 newline characters.
        /// </summary>
        public void TestGetComplexText()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("IllustrativeCases.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            try
            {
                String text = extractor.Text;

                Assert.IsTrue(text.Length > 0);

                char euro = '\u20ac';

                // Dump the tail for diagnostics. Clamp the start index so that a text shorter
                // than 40 characters does not throw here and hide the assertions below.
                Debug.WriteLine("'" + text.Substring(Math.Max(0, text.Length - 40)) + "'");

                // Check contents. Ordinal comparison keeps the prefix/suffix checks exact
                // and independent of the current culture.
                Assert.IsTrue(text.StartsWith(
                                  "  \n(V) ILLUSTRATIVE CASES\n\n",
                                  StringComparison.Ordinal
                                  ));
                Assert.IsTrue(text.Contains(
                                  "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n"
                                  ));
                Assert.IsTrue(text.EndsWith(
                                  "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n",
                                  StringComparison.Ordinal
                                  ));

                // Check number of paragraphs by counting the newline characters
                int ps = 0;

                foreach (char c in text)
                {
                    if (c == '\n')
                    {
                        ps++;
                    }
                }
                Assert.AreEqual(134, ps);
            }
            finally
            {
                // Close the extractor even when an assertion fails.
                extractor.Close();
            }
        }

예제 #19

0

파일 보기

        /// <summary>
        /// Checks extraction of structured-document-tag content from two samples.
        /// For "Bug54849.docx" every target string must occur in the lower-cased text.
        /// For "Bug54771a.docx" each target must occur exactly once, and "test\n" exactly twice.
        /// </summary>
        public void TestSimpleControlContent()
        {
            XWPFDocument doc = XWPFTestDataSamples.OpenSampleDocument("Bug54849.docx");

            String[] targs = new String[] {
                "header_rich_text",
                "rich_text",
                "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
                "plain_text_no_newlines",
                "plain_text_with_newlines1\nplain_text_with_newlines2\n",
                "watermelon\n",
                "dirt\n",
                "4/16/2013\n",
                "rich_text_in_cell",
                "abc",
                "rich_text_in_paragraph_in_cell",
                "footer_rich_text",
                "footnote_sdt",
                "endnote_sdt"
            };
            XWPFWordExtractor ex = new XWPFWordExtractor(doc);
            String            s;

            try
            {
                // Invariant lower-casing: the targets are plain lower-case literals, so the
                // result must not depend on the current culture's casing rules.
                s = ex.Text.ToLowerInvariant();
                int hits = 0;

                foreach (String targ in targs)
                {
                    // Ordinal search: exact match, unaffected by culture-specific comparison.
                    bool hitted = s.IndexOf(targ, StringComparison.Ordinal) > -1;
                    if (hitted)
                    {
                        hits++;
                    }
                    Assert.IsTrue(hitted, "controlled content loading-" + targ);
                }
                Assert.AreEqual(targs.Length, hits, "controlled content loading hit count");
            }
            finally
            {
                // Close the first extractor even when an assertion fails.
                ex.Close();
            }

            doc   = XWPFTestDataSamples.OpenSampleDocument("Bug54771a.docx");
            targs = new String[] {
                "bb",
                "test subtitle\n",
                "test user\n",
            };
            ex = new XWPFWordExtractor(doc);

            try
            {
                s = ex.Text.ToLowerInvariant();

                //At one point in development there were three copies of the text.
                //This ensures that there is only one copy.
                foreach (String targ in targs)
                {
                    // Escape the target so it is matched as literal text rather than
                    // interpreted as a regular-expression pattern. Every entry of a
                    // MatchCollection is a successful match, so Count is the hit count.
                    int hit = Regex.Matches(s, Regex.Escape(targ)).Count;
                    Assert.AreEqual(1, hit, "controlled content loading-" + targ);
                }

                //"test\n" appears twice: once as the "title" and once in the text.
                //This also happens when you save this document as text from MSWord.
                int testHits = Regex.Matches(s, Regex.Escape("test\n")).Count;
                Assert.AreEqual(2, testHits, "test<N>");
            }
            finally
            {
                // Close the second extractor even when an assertion fails.
                ex.Close();
            }
        }

C# (CSharp) Npoi.Core.XWPF.Extractor XWPFWordExtractor 예제들