Пример #1
0
        // Smoke test for the "ProblemExtracting.doc" sample: reads every text view the
        // extractor offers. There are no assertions; the test passes as long as none of
        // the reads throws.
        public void TestProblemMetadata()
        {
            WordExtractor problemExtractor = new WordExtractor(
                POIDataSamples.GetDocumentInstance().OpenResourceAsStream("ProblemExtracting.doc"));

            // The results are deliberately unused; only the absence of an error matters.
            string fullText = problemExtractor.Text;
            string[] paragraphs = problemExtractor.ParagraphText;
            string pieceText = problemExtractor.TextFromPieces;
        }
Пример #2
0
        // Prepares the shared fixture state: stores the sample file names that the
        // individual tests read, opens extractors for the two main sample documents,
        // and builds the concatenated ("splat'd out") form of the expected paragraphs.
        public void SetUp()
        {

            String filename = "test2.doc";
            String filename2 = "test.doc";
            filename3 = "excel_with_embeded.xls";
            filename4 = "ThreeColHeadFoot.doc";
            filename5 = "HeaderFooterUnicode.doc";
            filename6 = "footnote.doc";
            POIDataSamples docTests = POIDataSamples.GetDocumentInstance();
            extractor = new WordExtractor(docTests.OpenResourceAsStream(filename));
            extractor2 = new WordExtractor(docTests.OpenResourceAsStream(filename2));

            // Build splat'd out text version.
            // Assign rather than append: with "+=" the block would keep growing if this
            // method runs more than once on the same fixture instance (for example once
            // per test), and comparisons against it would then fail.
            // NOTE(review): assumes p_text1 is a string array - confirm against the field.
            p_text1_block = string.Concat(p_text1);
        }
Пример #3
0
        // Checks the header/footer stories of the "HeaderFooterProblematic.doc" sample,
        // first through HeaderStories and then through WordExtractor.
        // NOTE(review): 49936 in the name presumably refers to a bug tracker entry - confirm.
        public void TestProblemHeaderStories49936()
        {
            HWPFDocument problematicDoc = HWPFTestDataSamples.OpenSampleFile("HeaderFooterProblematic.doc");
            HeaderStories stories = new HeaderStories(problematicDoc);

            // Headers: only the even header is non-empty (a single carriage return).
            Assert.AreEqual("", stories.FirstHeader);
            Assert.AreEqual("\r", stories.EvenHeader);
            Assert.AreEqual("", stories.OddHeader);

            // Footers: all three are expected to be empty.
            Assert.AreEqual("", stories.FirstFooter);
            Assert.AreEqual("", stories.EvenFooter);
            Assert.AreEqual("", stories.OddFooter);

            // Same document seen through the extractor.
            WordExtractor storyExtractor = new WordExtractor(problematicDoc);
            Assert.AreEqual("\n", storyExtractor.HeaderText);
            Assert.AreEqual("", storyExtractor.FooterText);
        }
Пример #4
0
        // Extracts "Bug48075.doc" and checks that the text begins with the expected
        // Cyrillic word, i.e. that the first paragraph is present in the output.
        public void TestFirstParagraphFix()
        {
            // Expected opening word, kept as Unicode escapes exactly as in the assertion.
            String expectedStart = "\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435";

            extractor = new WordExtractor(
                POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Bug48075.doc"));

            String extracted = extractor.Text;
            Assert.IsTrue(extracted.StartsWith(expectedStart));
        }
Пример #5
0
        // Extracts "rasp.doc" and checks that two runs of five identical Cyrillic
        // capital letters both appear in the extracted text.
        // NOTE(review): the name suggests the sample is a fast-saved document - confirm.
        public void TestFastSaved()
        {
            String firstRun = "\u0425\u0425\u0425\u0425\u0425";
            String secondRun = "\u0423\u0423\u0423\u0423\u0423";

            extractor = new WordExtractor(
                POIDataSamples.GetDocumentInstance().OpenResourceAsStream("rasp.doc"));

            String extracted = extractor.Text;
            Assert.IsTrue(extracted.Contains(firstRun));
            Assert.IsTrue(extracted.Contains(secondRun));
        }
Пример #6
0
        // "Word6.doc" must be rejected by WordExtractor with OldWordFileFormatException,
        // while Word6Extractor must read it and return its single paragraph.
        public void TestWord6()
        {
            String sampleName = "Word6.doc";

            // Too old for the default extractor: construction has to throw.
            try
            {
                extractor = new WordExtractor(
                    POIDataSamples.GetDocumentInstance().OpenResourceAsStream(sampleName));
                Assert.Fail();
            }
            catch (OldWordFileFormatException)
            {
                // Expected: the default extractor refuses this file.
            }

            Word6Extractor oldFormatExtractor = new Word6Extractor(
                POIDataSamples.GetDocumentInstance().OpenResourceAsStream(sampleName));

            String fullText = oldFormatExtractor.Text;
            Assert.IsTrue(fullText.Contains("The quick brown fox jumps over the lazy dog"));

            // Exactly one paragraph, terminated with CR LF.
            String[] paragraphs = oldFormatExtractor.ParagraphText;
            Assert.AreEqual(1, paragraphs.Length);
            Assert.AreEqual("The quick brown fox jumps over the lazy dog\r\n", paragraphs[0]);
        }
Пример #7
0
        // "Word95.doc" must be rejected by WordExtractor with OldWordFileFormatException,
        // while Word6Extractor must read it and return the seven expected paragraphs.
        public void TestWord95()
        {
            String sampleName = "Word95.doc";

            // Too old for the default extractor: construction has to throw.
            try
            {
                extractor = new WordExtractor(
                    POIDataSamples.GetDocumentInstance().OpenResourceAsStream(sampleName));
                Assert.Fail();
            }
            catch (OldWordFileFormatException)
            {
                // Expected: the default extractor refuses this file.
            }

            // Can work with the special one
            Word6Extractor oldFormatExtractor = new Word6Extractor(
                POIDataSamples.GetDocumentInstance().OpenResourceAsStream(sampleName));
            String fullText = oldFormatExtractor.Text;

            // Fragments that must each occur somewhere in the full text.
            String[] expectedFragments =
            {
                "The quick brown fox jumps over the lazy dog",
                "Paragraph 2",
                "Paragraph 3. Has some RED text and some BLUE BOLD text in it",
                "Last (4th) paragraph"
            };
            foreach (String fragment in expectedFragments)
            {
                Assert.IsTrue(fullText.Contains(fragment));
            }

            // Paragraph-by-paragraph view: content paragraphs alternate with empty ones.
            String[] expectedParagraphs =
            {
                "The quick brown fox jumps over the lazy dog\r\n",
                "\r\n",
                "Paragraph 2\r\n",
                "\r\n",
                "Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n",
                "\r\n",
                "Last (4th) paragraph.\r\n"
            };
            String[] paragraphs = oldFormatExtractor.ParagraphText;
            Assert.AreEqual(7, paragraphs.Length);
            for (int index = 0; index < expectedParagraphs.Length; index++)
            {
                Assert.AreEqual(expectedParagraphs[index], paragraphs[index]);
            }
        }
Пример #8
0
        // Opens the sample named by the filename6 field and checks that the extractor's
        // comment text, concatenated, contains the marker "TestComment".
        public void TestComments()
        {
            HWPFDocument sampleDoc = HWPFTestDataSamples.OpenSampleFile(filename6);
            extractor = new WordExtractor(sampleDoc);

            // Concatenate all comment fragments and search the combined text.
            StringBuilder joined = new StringBuilder();
            foreach (String fragment in extractor.CommentsText)
            {
                joined.Append(fragment);
            }

            Assert.IsTrue(joined.ToString().Contains("TestComment"));
        }
Пример #9
0
        // Verifies footer extraction for the samples named by filename4 (non-Unicode)
        // and filename5 (Unicode): FooterText must match exactly, and the start of the
        // footer must also appear in the full extracted text.
        public void TestWithFooter()
        {
            // Non-unicode
            HWPFDocument doc = HWPFTestDataSamples.OpenSampleFile(filename4);
            extractor = new WordExtractor(doc);

            Assert.AreEqual("Footer Left\tFooter Middle Footer Right\n", extractor.FooterText);

            // Contains(string) is an ordinal search. IndexOf(string) is culture-sensitive,
            // which would make the result depend on the culture of the test machine.
            String text = extractor.Text;
            Assert.IsTrue(text.Contains("Footer Left"));

            // Unicode
            doc = HWPFTestDataSamples.OpenSampleFile(filename5);
            extractor = new WordExtractor(doc);

            Assert.AreEqual("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor
                    .FooterText);
            text = extractor.Text;
            Assert.IsTrue(text.Contains("The footer, with"));
        }
Пример #10
0
        // Verifies header extraction for the samples named by filename4 (non-Unicode)
        // and filename5 (Unicode): HeaderText must match exactly, and the start of the
        // header must also appear in the full extracted text.
        public void TestWithHeader()
        {
            // Non-unicode
            HWPFDocument doc = HWPFTestDataSamples.OpenSampleFile(filename4);
            extractor = new WordExtractor(doc);

            Assert.AreEqual("First header column!\tMid header Right header!\n", extractor.HeaderText);

            // Contains(string) is an ordinal search. IndexOf(string) is culture-sensitive,
            // which would make the result depend on the culture of the test machine.
            String text = extractor.Text;
            Assert.IsTrue(text.Contains("First header column!"));

            // Unicode
            doc = HWPFTestDataSamples.OpenSampleFile(filename5);
            extractor = new WordExtractor(doc);

            Assert.AreEqual("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor
                    .HeaderText);
            text = extractor.Text;
            Assert.IsTrue(text.Contains("This is a simple header"));
        }
Пример #11
0
        // Opens the spreadsheet sample named by the filename3 field, looks up two
        // embedded directories in it, and checks the extracted text and the summary
        // title/subject of the Word document stored in each.
        public void TestExtractFromEmbeded()
        {
            POIFSFileSystem fileSystem = new POIFSFileSystem(
                POIDataSamples.GetSpreadSheetInstance().OpenResourceAsStream(filename3));

            DirectoryNode firstDir = (DirectoryNode)fileSystem.Root.GetEntry("MBD0000A3B7");
            DirectoryNode secondDir = (DirectoryNode)fileSystem.Root.GetEntry("MBD0000A3B2");

            // Both directories should contain the "1Table" and "WordDocument" entries.
            Assert.IsNotNull(firstDir.GetEntry("1Table"));
            Assert.IsNotNull(firstDir.GetEntry("WordDocument"));

            Assert.IsNotNull(secondDir.GetEntry("1Table"));
            Assert.IsNotNull(secondDir.GetEntry("WordDocument"));

            // First embedded document
            HWPFDocument firstDoc = new HWPFDocument(firstDir, fileSystem);
            WordExtractor firstExtractor = new WordExtractor(firstDoc);

            Assert.IsNotNull(firstExtractor.Text);
            Assert.IsTrue(firstExtractor.Text.Length > 20);
            Assert.AreEqual(
                "I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
                firstExtractor.Text);
            Assert.AreEqual("Sample Doc 1", firstExtractor.SummaryInformation.Title);
            Assert.AreEqual("Sample Test", firstExtractor.SummaryInformation.Subject);

            // Second embedded document
            HWPFDocument secondDoc = new HWPFDocument(secondDir, fileSystem);
            WordExtractor secondExtractor = new WordExtractor(secondDoc);

            Assert.IsNotNull(secondExtractor.Text);
            Assert.IsTrue(secondExtractor.Text.Length > 20);
            Assert.AreEqual(
                "I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
                secondExtractor.Text);
            Assert.AreEqual("Sample Doc 2", secondExtractor.SummaryInformation.Title);
            Assert.AreEqual("Another Sample Test", secondExtractor.SummaryInformation.Subject);
        }