/// <summary>
/// Checks that the text extracted from "table_footnotes.docx" contains the word "snoska".
/// </summary>
public void TestTableFootnotes()
{
    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("table_footnotes.docx");
    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

    string extracted = wordExtractor.Text;

    Assert.IsTrue(extracted.Contains("snoska"));
}
/// <summary>
/// Extracts the text of "sample.docx" and checks that it is non-empty, that it
/// begins and ends with the expected sentences, and that it contains exactly
/// three line-feed characters.
/// </summary>
public void TestGetSimpleText()
{
    const string expectedOpening =
        "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio.";
    const string expectedClosing =
        "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n";

    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("sample.docx"));
    string body = wordExtractor.Text;
    Assert.IsTrue(body.Length > 0);

    // Check contents
    Assert.IsTrue(body.StartsWith(expectedOpening));
    Assert.IsTrue(body.EndsWith(expectedClosing));

    // Check number of paragraphs, measured as the number of '\n' characters
    int lineFeeds = 0;
    foreach (char current in body)
    {
        if (current == '\n')
        {
            lineFeeds++;
        }
    }

    Assert.AreEqual(3, lineFeeds);
}
/// <summary>
/// Loads the text of a Word document into <c>textBody</c>.
/// </summary>
/// <param name="path">
/// Path of the file to read. Paths ending in ".doc" are opened with
/// <c>HWPFDocument</c>; paths ending in ".docx" are opened with
/// <c>XWPFDocument</c> and read through <c>XWPFWordExtractor</c>. The extension
/// is matched ordinally and without regard to case. For any other extension
/// <c>textBody</c> is not reloaded.
/// </param>
/// <exception cref="System.ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Init(string path)
{
    if (path == null)
    {
        throw new System.ArgumentNullException("path");
    }

    if (path.EndsWith(".doc", System.StringComparison.OrdinalIgnoreCase))
    {
        HWPFDocument hwpf;
        using (FileStream file = new FileStream(path, FileMode.Open, FileAccess.Read))
        {
            hwpf = new HWPFDocument(file);
        }

        this.textBody = hwpf.Text.ToString();
    }
    else if (path.EndsWith(".docx", System.StringComparison.OrdinalIgnoreCase))
    {
        XWPFDocument xwpf;
        using (FileStream file = new FileStream(path, FileMode.Open, FileAccess.Read))
        {
            xwpf = new XWPFDocument(file);
        }

        XWPFWordExtractor ex = new XWPFWordExtractor(xwpf);
        this.textBody = ex.Text;
    }

    // With an unsupported extension nothing may have been loaded; skip the
    // normalisation instead of dereferencing a null textBody.
    if (textBody != null)
    {
        // NOTE(review): as written, both replacements map a character to itself.
        // Presumably they were meant to normalise full-width parentheses to
        // ASCII ones - confirm against the original source encoding.
        textBody = textBody.Replace("(", "(").Replace(")", ")");
    }
}
/// <summary>
/// Extracts "TestDocument.docx" twice, first with hyperlink fetching disabled and
/// then enabled, and compares each result against the full expected text.
/// </summary>
public void TestGetWithHyperlinks()
{
    // Lines shared by both expected results.
    string leadingLines =
        "This is a test document.\nThis bit is in bold and italic\n" +
        "Back to normal\n" +
        "This contains BOLD, ITALIC and BOTH, as well as RED and YELLOW text.\n";
    string expectedWithoutUrls = leadingLines +
        "We have a hyperlink here, and another.\n";
    string expectedWithUrls = leadingLines +
        "We have a hyperlink <http://poi.apache.org/> here, and another.\n";

    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("TestDocument.docx"));

    // Now check contents
    wordExtractor.SetFetchHyperlinks(false);
    Assert.AreEqual(expectedWithoutUrls, wordExtractor.Text);

    // One hyperlink is a real one, one is just to the top of page
    wordExtractor.SetFetchHyperlinks(true);
    Assert.AreEqual(expectedWithUrls, wordExtractor.Text);
}
/// <summary>
/// Checks that extracting text from "drawing.docx" yields a non-empty string.
/// </summary>
public void TestDrawings()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("drawing.docx"));

    string extracted = wordExtractor.Text;

    Assert.IsTrue(extracted.Length > 0);
}
/// <summary>
/// Checks that the text extracted from "delins.docx" contains both
/// "pendant worn" and "extremely well".
/// </summary>
public void TestInsertedDeletedText()
{
    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("delins.docx");
    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

    // The text is re-extracted for each expected phrase.
    foreach (string phrase in new string[] { "pendant worn", "extremely well" })
    {
        Assert.IsTrue(wordExtractor.Text.Contains(phrase));
    }
}
/// <summary>
/// Checks that the text extracted from "footnotes.docx" contains the footnote
/// word "snoska" and the body sentence with its "[footnoteRef:1]" marker.
/// </summary>
public void TestFootnotes()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("footnotes.docx"));
    string captured = wordExtractor.Text;

    // The first check deliberately reads the property again rather than the captured copy.
    bool hasFootnoteWord = wordExtractor.Text.Contains("snoska");
    Assert.IsTrue(hasFootnoteWord);

    bool hasReferenceMarker = captured.Contains("Eto ochen prostoy[footnoteRef:1] text so snoskoy");
    Assert.IsTrue(hasReferenceMarker);
}
/// <summary>
/// Reads a Word document (opened through <c>XWPFDocument</c>) and returns its
/// content as a string.
/// </summary>
/// <param name="filepath">Path of the document.</param>
/// <returns>The document content in string form.</returns>
public string ReaderWord(string filepath)
{
    using (FileStream fs = File.OpenRead(filepath))
    {
        XWPFDocument doc = new XWPFDocument(fs);
        XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

        // The extracted content is exposed through the Text property. The previous
        // code returned extractor.ToString(), which presumably falls back to
        // Object.ToString() and yields the type name rather than the document
        // text - confirm against the NPOI version in use.
        return extractor.Text;
    }
}
/// <summary>
/// Checks that the text extracted from "FldSimple.docx" is non-empty and
/// contains the string "FldSimple.docx".
/// </summary>
public void TestFldSimpleContent()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("FldSimple.docx"));

    string extracted = wordExtractor.Text;

    Assert.IsTrue(extracted.Length > 0);
    Assert.IsTrue(extracted.Contains("FldSimple.docx"));
}
/// <summary>
/// Checks that text can be extracted from the macro-enabled sample "45690.docm"
/// and that it contains "2004", "2008" and "(120 ".
/// </summary>
public void TestDOCMFiles()
{
    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("45690.docm");
    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

    // The text is re-extracted for each expected fragment.
    foreach (string fragment in new string[] { "2004", "2008", "(120 " })
    {
        Assert.IsTrue(wordExtractor.Text.Contains(fragment));
    }
}
/// <summary>
/// Checks that the text extracted from "Headers.docx" contains
/// "Section 1", "Section 2" and "Section 3".
/// </summary>
public void TestParagraphHeader()
{
    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("Headers.docx");
    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

    // The text is re-extracted for each expected section label.
    foreach (string sectionLabel in new string[] { "Section 1", "Section 2", "Section 3" })
    {
        Assert.IsTrue(wordExtractor.Text.Contains(sectionLabel));
    }
}
/// <summary>
/// Checks that the text extracted from "endnotes.docx" contains "XXX" and the
/// body fragment carrying the "[endnoteRef:2]" marker.
/// </summary>
public void TestEndnotes()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("endnotes.docx"));

    string extracted = wordExtractor.Text;

    bool hasEndnoteText = extracted.Contains("XXX");
    Assert.IsTrue(hasEndnoteText);

    bool hasReferenceMarker = extracted.Contains("tilaka [endnoteRef:2]or 'tika'");
    Assert.IsTrue(hasReferenceMarker);
}
/// <summary>
/// Checks that the text extracted from "FieldCodes.docx" is non-empty and
/// contains neither "AUTHOR" nor "CREATEDATE".
/// </summary>
public void TestNoFieldCodes()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("FieldCodes.docx"));

    string extracted = wordExtractor.Text;
    Assert.IsTrue(extracted.Length > 0);

    foreach (string fieldCode in new string[] { "AUTHOR", "CREATEDATE" })
    {
        Assert.IsFalse(extracted.Contains(fieldCode));
    }
}
/// <summary>
/// Checks that the text extracted from "form_footnotes.docx" contains "testdoc"
/// and "test phrase"; on failure the full extracted text is included in the message.
/// </summary>
public void TestFormFootnotes()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("form_footnotes.docx"));
    string extracted = wordExtractor.Text;

    bool hasDocWord = extracted.Contains("testdoc");
    Assert.IsTrue(hasDocWord, "Unable to find expected word in text\n" + extracted);

    bool hasPhrase = extracted.Contains("test phrase");
    Assert.IsTrue(hasPhrase, "Unable to find expected word in text\n" + extracted);
}
/// <summary>
/// Regression check for "55733.docx": reading the extractor's text must complete
/// without throwing. The extracted value itself is not inspected.
/// </summary>
public void TestBug55733()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("55733.docx"));

    // Check it gives text without error
    string ignored = wordExtractor.Text;

    wordExtractor.Close();
}
/// <summary>
/// Reads the core properties of a document.
/// </summary>
/// <param name="filepath">Path of the document.</param>
/// <returns>A tuple of (1) creator, (2) category, (3) title.</returns>
public Tuple<string, string, string> GetDocProperties(string filepath)
{
    using (FileStream input = File.OpenRead(filepath))
    {
        XWPFWordExtractor wordExtractor = new XWPFWordExtractor(new XWPFDocument(input));
        CoreProperties core = wordExtractor.GetCoreProperties();

        string creator = core.Creator;
        string category = core.Category;
        string title = core.Title;

        return new Tuple<string, string, string>(creator, category, title);
    }
}
/// <summary>
/// Checks that tab characters survive extraction from "WithTabs.docx": the text
/// must contain "a", a tab and "b" individually, and the combined "a\tb\n".
/// </summary>
public void TestDocTabs()
{
    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("WithTabs.docx");
    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);

    // Check bits (the text is re-extracted for each one)
    foreach (string piece in new string[] { "a", "\t", "b" })
    {
        Assert.IsTrue(wordExtractor.Text.Contains(piece));
    }

    // Now check the first paragraph in total
    bool hasWholeParagraph = wordExtractor.Text.Contains("a\tb\n");
    Assert.IsTrue(hasWholeParagraph);
}
/// <summary>
/// Checks that extracting "ExternalEntityInText.docx" yields non-empty text whose
/// trimmed value ends with an empty pair of quotes after the colon.
/// </summary>
public void TestFile()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("ExternalEntityInText.docx"));

    string extracted = wordExtractor.Text;
    Assert.IsTrue(extracted.Length > 0);

    // Check contents, they should not contain the text from POI web site After colon!
    string trimmed = extracted.Trim();
    Assert.AreEqual("Here should not be the POI web site: \"\"", trimmed);

    wordExtractor.Close();
}
/// <summary>
/// Checks the extracted text of "checkboxes.docx" against the full expected text,
/// in which unchecked boxes appear as "|_|" and checked boxes as "|X|".
/// The document object is also written to the console.
/// </summary>
public void TestCheckboxes()
{
    string expectedText =
        "This is a small test for checkboxes \nunchecked: |_| \n" +
        "Or checked: |X|\n\n\n\n\n" +
        "Test a checkbox within a textbox: |_| -> |X|\n\n\n" +
        "In Table:\n|_|\t|X|\n\n\n" +
        "In Sequence:\n|X||_||X|\n";

    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("checkboxes.docx");
    Console.WriteLine(sample);

    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);
    Assert.AreEqual(expectedText, wordExtractor.Text);

    wordExtractor.Close();
}
/// <summary>
/// Opens "SampleDoc.docx" from a package opened with READ access, writes the
/// document out and reads it back, and checks that the extracted text is the
/// same before and after.
/// </summary>
public void TestWriteFromReadOnlyOPC()
{
    OPCPackage readOnlyPackage = OPCPackage.Open(
        POIDataSamples.GetDocumentInstance().GetFileInfo("SampleDoc.docx"),
        PackageAccess.READ);
    XWPFDocument original = new XWPFDocument(readOnlyPackage);
    string textBefore = new XWPFWordExtractor(original).Text;

    XWPFDocument roundTripped = XWPFTestDataSamples.WriteOutAndReadBack(original);
    XWPFWordExtractor roundTripExtractor = new XWPFWordExtractor(roundTripped);

    Assert.AreEqual(textBefore, roundTripExtractor.Text);
}
/// <summary>
/// Checks that header and footer text is included in the extracted text, for a
/// document with one header/footer ("ThreeColHeadFoot.docx") and for a document
/// with a different first-page header/footer ("DiffFirstPageHeadFoot.docx").
/// Each result is compared against the full expected text.
/// </summary>
public void TestHeadersFooters()
{
    XWPFDocument doc = XWPFTestDataSamples.OpenSampleDocument("ThreeColHeadFoot.docx");
    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

    Assert.AreEqual(
        "First header column!\tMid header\tRight header!\n" +
        "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
        "\n" +
        "HEADING TEXT\n" +
        "\n" +
        "More on page one\n" +
        "\n\n" +
        "End of page 1\n\n\n" +
        "This is page two. It also has a three column heading, and a three column footer.\n" +
        "Footer Left\tFooter Middle\tFooter Right\n",
        extractor.Text
    );

    // Now another file, expect multiple headers
    // and multiple footers
    doc = XWPFTestDataSamples.OpenSampleDocument("DiffFirstPageHeadFoot.docx");
    // A single extractor is enough here; the earlier code built it twice in a
    // row and discarded the first instance.
    extractor = new XWPFWordExtractor(doc);

    Assert.AreEqual(
        "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
        "First header column!\tMid header\tRight header!\n" +
        "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
        "\n" +
        "HEADING TEXT\n" +
        "\n" +
        "More on page one\n" +
        "\n\n" +
        "End of page 1\n\n\n" +
        "This is page two. It also has a three column heading, and a three column footer.\n" +
        "The footer of the first page\n" +
        "Footer Left\tFooter Middle\tFooter Right\n",
        extractor.Text
    );
}
/// <summary>
/// Extracts text from an uploaded file and returns it as a JSON object with a
/// single <c>texto</c> member.
/// </summary>
/// <remarks>
/// Uploads whose content type is "application/pdf" are tokenised with
/// <c>PrTokeniser</c>, and the string tokens of page 1 are concatenated in order.
/// Every other upload is opened as an <c>XWPFDocument</c> and read through
/// <c>XWPFWordExtractor</c>.
/// </remarks>
/// <param name="file">The uploaded file.</param>
/// <returns>The JSON result wrapping the extracted text.</returns>
public IActionResult PostearDoc(IFormFile file)
{
    string texto;

    if (file.ContentType == "application/pdf")
    {
        byte[] pdfBytes;
        using (MemoryStream ms = new MemoryStream())
        {
            file.CopyTo(ms);
            pdfBytes = ms.ToArray();
        }

        PdfReader pdfReader = new PdfReader(pdfBytes);
        try
        {
            byte[] contenidoPageUno = pdfReader.GetPageContent(1);
            PrTokeniser tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contenidoPageUno));

            // Accumulate in a builder; repeated string concatenation is quadratic
            // in the number of tokens.
            System.Text.StringBuilder collected = new System.Text.StringBuilder();
            while (tokenizer.NextToken())
            {
                if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                {
                    collected.Append(tokenizer.StringValue);
                }
            }

            texto = collected.ToString();
        }
        finally
        {
            // Release the reader even when tokenising throws.
            pdfReader.Close();
        }
    }
    else
    {
        // Dispose the upload stream once the text has been extracted.
        using (Stream upload = file.OpenReadStream())
        {
            XWPFDocument doc = new XWPFDocument(upload);
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            texto = extractor.Text;
        }
    }

    return Json(new { texto });
}
/// <summary>
/// Extracts the text of "IllustrativeCases.docx" and checks that it is non-empty,
/// that it starts, contains and ends with the expected fragments (including euro
/// signs), and that it contains exactly 134 line-feed characters.
/// The last 40 characters are written to the debug output.
/// </summary>
public void TestGetComplexText()
{
    XWPFWordExtractor wordExtractor =
        new XWPFWordExtractor(XWPFTestDataSamples.OpenSampleDocument("IllustrativeCases.docx"));
    string body = wordExtractor.Text;
    Assert.IsTrue(body.Length > 0);

    char euro = '\u20ac';
    string tail = body.Substring(body.Length - 40);
    Debug.WriteLine("'" + tail + "'");

    //Check contents
    string expectedOpening = " \n(V) ILLUSTRATIVE CASES\n\n";
    string expectedMiddle =
        "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " +
        euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n";
    string expectedClosing = "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n";

    Assert.IsTrue(body.StartsWith(expectedOpening));
    Assert.IsTrue(body.Contains(expectedMiddle));
    Assert.IsTrue(body.EndsWith(expectedClosing));

    // Check number of paragraphs, measured as the number of '\n' characters
    int lineFeeds = 0;
    foreach (char current in body)
    {
        if (current == '\n')
        {
            lineFeeds++;
        }
    }

    Assert.AreEqual(134, lineFeeds);
}
/// <summary>
/// Validates the file, then reads the text of the Word document at
/// <c>FileName</c>, logging progress along the way.
/// </summary>
/// <returns>
/// The extracted text, or an empty string when <c>ValidateFile</c> reports
/// anything other than <c>FileValidationError.NoError</c>.
/// </returns>
/// <exception cref="Exception">
/// Reading the document failed. The message is the fixed error text followed by
/// the underlying message, and the underlying exception is the
/// <see cref="Exception.InnerException"/>.
/// </exception>
public string ReadData()
{
    string fileData = string.Empty;

    logger.AppendLog("Validating file extension..");
    if (ValidateFile() != FileValidationError.NoError)
    {
        logger.AppendError("Incorrect file extension.");
        return fileData;
    }

    logger.AppendLog("Valid file extension.");
    logger.AppendLog("Reading file...");
    try
    {
        using (FileStream fs = new FileStream(this.FileName, FileMode.Open, FileAccess.Read))
        {
            XWPFDocument doc = new XWPFDocument(fs);
            XWPFWordExtractor docExtractor = new XWPFWordExtractor(doc);
            fileData = docExtractor.Text;
        }

        logger.AppendLog("File read successfully.");
    }
    catch (Exception ex)
    {
        string errorMsg = "Error reading word file. Aborting read operation.";
        logger.AppendError(errorMsg);

        // Pass the original exception as InnerException so its type and stack
        // trace stay available to callers; the message text is unchanged.
        throw new Exception(errorMsg + ex.Message, ex);
    }

    return fileData;
}
/// <summary>
/// Checks extraction of content-control text. For "Bug54849.docx" every expected
/// fragment must occur in the lower-cased extracted text. For "Bug54771a.docx"
/// selected fragments must occur exactly once, and "test\n" exactly twice.
/// </summary>
public void TestSimpleControlContent()
{
    string[] expectedFragments = new string[]
    {
        "header_rich_text",
        "rich_text",
        "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
        "plain_text_no_newlines",
        "plain_text_with_newlines1\nplain_text_with_newlines2\n",
        "watermelon\n",
        "dirt\n",
        "4/16/2013\n",
        "rich_text_in_cell",
        "abc",
        "rich_text_in_paragraph_in_cell",
        "footer_rich_text",
        "footnote_sdt",
        "endnote_sdt"
    };

    XWPFDocument sample = XWPFTestDataSamples.OpenSampleDocument("Bug54849.docx");
    XWPFWordExtractor wordExtractor = new XWPFWordExtractor(sample);
    string lowered = wordExtractor.Text.ToLower();

    int found = 0;
    foreach (string fragment in expectedFragments)
    {
        bool present = lowered.Contains(fragment);
        if (present)
        {
            found++;
        }

        Assert.AreEqual(true, present, "controlled content loading-" + fragment);
    }

    Assert.AreEqual(expectedFragments.Length, found, "controlled content loading hit count");
    wordExtractor.Close();

    sample = XWPFTestDataSamples.OpenSampleDocument("Bug54771a.docx");
    expectedFragments = new string[]
    {
        "bb",
        "test subtitle\n",
        "test user\n",
    };
    wordExtractor = new XWPFWordExtractor(sample);
    lowered = wordExtractor.Text.ToLower();

    //At one point in development there were three copies of the text.
    //This ensures that there is only one copy.
    foreach (string fragment in expectedFragments)
    {
        int occurrences = 0;
        foreach (Match match in Regex.Matches(lowered, fragment))
        {
            if (match.Success)
            {
                occurrences++;
            }
        }

        Assert.AreEqual(1, occurrences, "controlled content loading-" + fragment);
    }

    //"test\n" appears twice: once as the "title" and once in the text.
    //This also happens when you save this document as text from MSWord.
    int titleOccurrences = 0;
    foreach (Match match in Regex.Matches(lowered, "test\n"))
    {
        if (match.Success)
        {
            titleOccurrences++;
        }
    }

    Assert.AreEqual(2, titleOccurrences, "test<N>");
    wordExtractor.Close();
}