// Smoke test: opens "ProblemExtracting.doc" and reads each text view once.
// The values read are not inspected; the test passes as long as nothing throws.
public void TestProblemMetadata()
{
    WordExtractor problemExtractor = new WordExtractor(
        POIDataSamples.GetDocumentInstance().OpenResourceAsStream("ProblemExtracting.doc"));

    // Check it gives text without error
    string wholeText = problemExtractor.Text;
    string[] paragraphs = problemExtractor.ParagraphText;
    string pieceText = problemExtractor.TextFromPieces;
}
// Fixture set-up: records the sample file names used by the tests, opens the
// two main sample documents into the shared extractors, and builds the
// flattened ("splat'd out") form of the expected paragraph text.
public void SetUp()
{
    String filename = "test2.doc";
    String filename2 = "test.doc";
    filename3 = "excel_with_embeded.xls";
    filename4 = "ThreeColHeadFoot.doc";
    filename5 = "HeaderFooterUnicode.doc";
    filename6 = "footnote.doc";

    POIDataSamples docTests = POIDataSamples.GetDocumentInstance();
    extractor = new WordExtractor(docTests.OpenResourceAsStream(filename));
    extractor2 = new WordExtractor(docTests.OpenResourceAsStream(filename2));

    // Build splat'd out text version.
    // Assign instead of appending: the previous "+=" loop kept growing the
    // field on every call, which corrupts the expected text whenever this
    // method runs more than once on the same fixture instance.
    // NOTE(review): assumes p_text1_block starts out empty/null - confirm at
    // the field declaration.
    p_text1_block = String.Concat(p_text1);
}
// Checks the header/footer stories of "HeaderFooterProblematic.doc", both
// through HeaderStories directly and through WordExtractor's combined
// HeaderText / FooterText.
public void TestProblemHeaderStories49936()
{
    HWPFDocument problematicDoc = HWPFTestDataSamples.OpenSampleFile("HeaderFooterProblematic.doc");

    // Raw stories: only the even header carries any content (a lone "\r").
    HeaderStories stories = new HeaderStories(problematicDoc);
    Assert.AreEqual("", stories.FirstHeader);
    Assert.AreEqual("\r", stories.EvenHeader);
    Assert.AreEqual("", stories.OddHeader);
    Assert.AreEqual("", stories.FirstFooter);
    Assert.AreEqual("", stories.EvenFooter);
    Assert.AreEqual("", stories.OddFooter);

    // Extractor view of the same document.
    WordExtractor docExtractor = new WordExtractor(problematicDoc);
    Assert.AreEqual("\n", docExtractor.HeaderText);
    Assert.AreEqual("", docExtractor.FooterText);
}
// Verifies that the text extracted from "Bug48075.doc" begins with the
// expected Cyrillic word (the escaped literal below spells "Приложение").
public void TestFirstParagraphFix()
{
    extractor = new WordExtractor(
        POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Bug48075.doc")
    );

    String text = extractor.Text;

    // Compare ordinally: the expectation is an exact character sequence, and
    // the default StartsWith(string) overload is culture-sensitive, so its
    // result could otherwise depend on the culture the tests run under.
    Assert.IsTrue(text.StartsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435", StringComparison.Ordinal));
}
// Extracts "rasp.doc" and checks that two runs of repeated Cyrillic letters
// (five U+0425 and five U+0423 characters) are present in the text.
public void TestFastSaved()
{
    POIDataSamples samples = POIDataSamples.GetDocumentInstance();
    extractor = new WordExtractor(samples.OpenResourceAsStream("rasp.doc"));

    String extracted = extractor.Text;

    Assert.IsTrue(extracted.Contains("\u0425\u0425\u0425\u0425\u0425"));
    Assert.IsTrue(extracted.Contains("\u0423\u0423\u0423\u0423\u0423"));
}
// "Word6.doc" must be rejected by WordExtractor with an
// OldWordFileFormatException, but be readable through Word6Extractor.
public void TestWord6()
{
    // Too old for the default
    try
    {
        extractor = new WordExtractor(
            POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word6.doc"));
        Assert.Fail();
    }
    catch (OldWordFileFormatException)
    {
        // Expected: the regular extractor refuses this file.
    }

    // The dedicated extractor is expected to cope with it.
    Word6Extractor oldFormatExtractor = new Word6Extractor(
        POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word6.doc"));

    String fullText = oldFormatExtractor.Text;
    Assert.IsTrue(fullText.Contains("The quick brown fox jumps over the lazy dog"));

    String[] paragraphs = oldFormatExtractor.ParagraphText;
    Assert.AreEqual(1, paragraphs.Length);
    Assert.AreEqual("The quick brown fox jumps over the lazy dog\r\n", paragraphs[0]);
}
// "Word95.doc" must be rejected by WordExtractor with an
// OldWordFileFormatException, but Word6Extractor must return its text and
// its seven paragraphs exactly.
public void TestWord95()
{
    // Too old for the default
    try
    {
        extractor = new WordExtractor(
            POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word95.doc"));
        Assert.Fail();
    }
    catch (OldWordFileFormatException)
    {
        // Expected: the regular extractor refuses this file.
    }

    // Can work with the special one
    Word6Extractor oldFormatExtractor = new Word6Extractor(
        POIDataSamples.GetDocumentInstance().OpenResourceAsStream("Word95.doc"));

    // Phrases that must appear somewhere in the full text, checked in order.
    String[] expectedPhrases = new String[]
    {
        "The quick brown fox jumps over the lazy dog",
        "Paragraph 2",
        "Paragraph 3. Has some RED text and some BLUE BOLD text in it",
        "Last (4th) paragraph"
    };
    String fullText = oldFormatExtractor.Text;
    foreach (String phrase in expectedPhrases)
    {
        Assert.IsTrue(fullText.Contains(phrase));
    }

    // Exact paragraph-by-paragraph expectation (content paragraphs are
    // separated by empty "\r\n" paragraphs).
    String[] expectedParagraphs = new String[]
    {
        "The quick brown fox jumps over the lazy dog\r\n",
        "\r\n",
        "Paragraph 2\r\n",
        "\r\n",
        "Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n",
        "\r\n",
        "Last (4th) paragraph.\r\n"
    };
    String[] paragraphs = oldFormatExtractor.ParagraphText;
    Assert.AreEqual(expectedParagraphs.Length, paragraphs.Length);
    for (int index = 0; index < expectedParagraphs.Length; index++)
    {
        Assert.AreEqual(expectedParagraphs[index], paragraphs[index]);
    }
}
// Opens the sample named by filename6, joins every entry of the extractor's
// CommentsText and checks that the result contains "TestComment".
public void TestComments()
{
    HWPFDocument commentedDoc = HWPFTestDataSamples.OpenSampleFile(filename6);
    extractor = new WordExtractor(commentedDoc);

    String[] comments = extractor.CommentsText;

    // Flatten all comment entries into a single string for the search.
    StringBuilder joined = new StringBuilder();
    foreach (String comment in comments)
    {
        joined.Append(comment);
    }

    Assert.IsTrue(joined.ToString().Contains("TestComment"));
}
// Checks footer extraction for the samples named by filename4 (non-Unicode)
// and filename5 (Unicode): FooterText must match exactly, and the footer
// must also show up in the document's full text.
public void TestWithFooter()
{
    // Non-unicode
    HWPFDocument doc = HWPFTestDataSamples.OpenSampleFile(filename4);
    extractor = new WordExtractor(doc);

    Assert.AreEqual("Footer Left\tFooter Middle Footer Right\n", extractor.FooterText);

    // Contains(string) is an ordinal search; the previous IndexOf(string)
    // call was culture-sensitive, so its result could depend on the culture
    // the tests run under. This also matches the other tests in this file.
    String text = extractor.Text;
    Assert.IsTrue(text.Contains("Footer Left"));

    // Unicode
    doc = HWPFTestDataSamples.OpenSampleFile(filename5);
    extractor = new WordExtractor(doc);

    Assert.AreEqual("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor.FooterText);

    text = extractor.Text;
    Assert.IsTrue(text.Contains("The footer, with"));
}
// Checks header extraction for the samples named by filename4 (non-Unicode)
// and filename5 (Unicode): HeaderText must match exactly, and the header
// must also show up in the document's full text.
public void TestWithHeader()
{
    // Non-unicode
    HWPFDocument doc = HWPFTestDataSamples.OpenSampleFile(filename4);
    extractor = new WordExtractor(doc);

    Assert.AreEqual("First header column!\tMid header Right header!\n", extractor.HeaderText);

    // Contains(string) is an ordinal search; the previous IndexOf(string)
    // call was culture-sensitive, so its result could depend on the culture
    // the tests run under. This also matches the other tests in this file.
    String text = extractor.Text;
    Assert.IsTrue(text.Contains("First header column!"));

    // Unicode
    doc = HWPFTestDataSamples.OpenSampleFile(filename5);
    extractor = new WordExtractor(doc);

    Assert.AreEqual("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor.HeaderText);

    text = extractor.Text;
    Assert.IsTrue(text.Contains("This is a simple header"));
}
// Opens the spreadsheet sample named by filename3, locates the two embedded
// Word documents by directory name, and checks the text and summary
// information extracted from each.
public void TestExtractFromEmbeded()
{
    POIFSFileSystem fileSystem = new POIFSFileSystem(
        POIDataSamples.GetSpreadSheetInstance().OpenResourceAsStream(filename3));

    DirectoryNode firstDir = (DirectoryNode)fileSystem.Root.GetEntry("MBD0000A3B7");
    DirectoryNode secondDir = (DirectoryNode)fileSystem.Root.GetEntry("MBD0000A3B2");

    // Should have WordDocument and 1Table
    Assert.IsNotNull(firstDir.GetEntry("1Table"));
    Assert.IsNotNull(firstDir.GetEntry("WordDocument"));

    Assert.IsNotNull(secondDir.GetEntry("1Table"));
    Assert.IsNotNull(secondDir.GetEntry("WordDocument"));

    // Check each in turn
    HWPFDocument embeddedDoc = new HWPFDocument(firstDir, fileSystem);
    WordExtractor embeddedExtractor = new WordExtractor(embeddedDoc);

    Assert.IsNotNull(embeddedExtractor.Text);
    Assert.IsTrue(embeddedExtractor.Text.Length > 20);
    Assert.AreEqual(
        "I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
        embeddedExtractor.Text);
    Assert.AreEqual("Sample Doc 1", embeddedExtractor.SummaryInformation.Title);
    Assert.AreEqual("Sample Test", embeddedExtractor.SummaryInformation.Subject);

    embeddedDoc = new HWPFDocument(secondDir, fileSystem);
    embeddedExtractor = new WordExtractor(embeddedDoc);

    Assert.IsNotNull(embeddedExtractor.Text);
    Assert.IsTrue(embeddedExtractor.Text.Length > 20);
    Assert.AreEqual(
        "I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
        embeddedExtractor.Text);
    Assert.AreEqual("Sample Doc 2", embeddedExtractor.SummaryInformation.Title);
    Assert.AreEqual("Another Sample Test", embeddedExtractor.SummaryInformation.Subject);
}