XWPFWordExtractor C# (CSharp)のコード例

コード例 #1

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from the "table_footnotes.docx" sample
        /// contains the word "snoska".
        /// </summary>
        public void TestTableFootnotes()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("table_footnotes.docx"));
            string extracted = extractor.Text;

            Assert.IsTrue(extracted.Contains("snoska"));
        }

コード例 #2

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Extracts the text of "sample.docx" and checks that it is non-empty,
        /// starts and ends with the expected sentences, and contains exactly
        /// three line feeds.
        /// </summary>
        public void TestGetSimpleText()
        {
            const string expectedStart =
                "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio.";
            const string expectedEnd =
                "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n";

            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("sample.docx"));
            String text = extractor.Text;

            Assert.IsTrue(text.Length > 0);

            // Verify the leading and trailing content
            Assert.IsTrue(text.StartsWith(expectedStart));
            Assert.IsTrue(text.EndsWith(expectedEnd));

            // Line feeds are counted as the measure of the number of paragraphs
            int lineFeeds = 0;
            foreach (char c in text)
            {
                if (c == '\n')
                {
                    lineFeeds++;
                }
            }
            Assert.AreEqual(3, lineFeeds);
        }

コード例 #3

0

ファイルを表示

ファイル: SubjectParserCheck.cs プロジェクト: liaxu/autoreview

        /// <summary>
        /// Loads the text of the Word document at <paramref name="path"/> into
        /// <c>textBody</c> and replaces full-width parentheses with ASCII ones.
        /// </summary>
        /// <param name="path">
        /// Path to a ".doc" or ".docx" file. The extension is matched
        /// case-insensitively; any other extension leaves <c>textBody</c> as it was.
        /// </param>
        public void Init(string path)
        {
            // Ordinal, case-insensitive matching so that e.g. "REPORT.DOCX" is accepted
            // and the result does not depend on the current culture.
            if (path.EndsWith(".doc", System.StringComparison.OrdinalIgnoreCase))
            {
                HWPFDocument hwpf;
                using (FileStream file = new FileStream(path, FileMode.Open, FileAccess.Read))
                {
                    hwpf = new HWPFDocument(file);
                }

                this.textBody = hwpf.Text.ToString();
            }
            else if (path.EndsWith(".docx", System.StringComparison.OrdinalIgnoreCase))
            {
                XWPFDocument xwpf;
                using (FileStream file = new FileStream(path, FileMode.Open, FileAccess.Read))
                {
                    xwpf = new XWPFDocument(file);
                }

                XWPFWordExtractor ex = new XWPFWordExtractor(xwpf);
                this.textBody = ex.Text;
            }

            // With an unsupported extension nothing was loaded, so textBody may still
            // be null; skip normalisation instead of failing with a NullReferenceException.
            if (textBody != null)
            {
                textBody = textBody.Replace("（", "(").Replace("）", ")");
            }
        }

コード例 #4

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Extracts "TestDocument.docx" twice, first with hyperlink fetching
        /// disabled and then enabled, and compares each result with the
        /// expected full text.
        /// </summary>
        public void TestGetWithHyperlinks()
        {
            // Lines that are identical in both expectations
            const string sharedLines =
                "This is a test document.\nThis bit is in bold and italic\n" +
                "Back to normal\n" +
                "This contains BOLD, ITALIC and BOTH, as well as RED and YELLOW text.\n";
            const string expectedWithoutLinks =
                sharedLines + "We have a hyperlink here, and another.\n";
            const string expectedWithLinks =
                sharedLines + "We have a hyperlink <http://poi.apache.org/> here, and another.\n";

            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("TestDocument.docx"));

            // Hyperlink targets switched off: only the link text appears
            extractor.SetFetchHyperlinks(false);
            Assert.AreEqual(expectedWithoutLinks, extractor.Text);

            // One hyperlink is a real one, one is just to the top of page
            extractor.SetFetchHyperlinks(true);
            Assert.AreEqual(expectedWithLinks, extractor.Text);
        }

コード例 #5

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that extracting "drawing.docx" yields non-empty text.
        /// </summary>
        public void TestDrawings()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("drawing.docx"));

            String extracted = extractor.Text;
            Assert.IsTrue(extracted.Length > 0);
        }

コード例 #6

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "delins.docx" contains both
        /// expected phrases.
        /// </summary>
        public void TestInsertedDeletedText()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("delins.docx"));

            // The text is re-read from the extractor for every phrase
            string[] expectedPhrases = { "pendant worn", "extremely well" };
            foreach (string phrase in expectedPhrases)
            {
                Assert.IsTrue(extractor.Text.Contains(phrase));
            }
        }

コード例 #7

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "footnotes.docx" contains the
        /// word "snoska" and the sentence carrying the inline
        /// "[footnoteRef:1]" marker.
        /// </summary>
        public void TestFootnotes()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("footnotes.docx"));
            String firstRead = extractor.Text;

            // This check reads the property again rather than using the cached copy
            bool containsFootnoteWord = extractor.Text.Contains("snoska");
            Assert.IsTrue(containsFootnoteWord);

            bool containsInlineReference =
                firstRead.Contains("Eto ochen prostoy[footnoteRef:1] text so snoskoy");
            Assert.IsTrue(containsInlineReference);
        }

コード例 #8

0

ファイルを表示

 /// <summary>
 /// Reads a Word (.docx) document and returns its content as a string.
 /// </summary>
 /// <param name="filepath">Path of the document to read.</param>
 /// <returns>The document content as extracted text.</returns>
 public string ReaderWord(string filepath)
 {
     using (FileStream fs = File.OpenRead(filepath))
     {
         XWPFDocument      doc       = new XWPFDocument(fs);
         XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

         // The extracted content is exposed through the Text property; ToString()
         // is not the content accessor and, unless overridden, yields the type name.
         return(extractor.Text);
     }
 }

コード例 #9

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "FldSimple.docx" is non-empty
        /// and contains the string "FldSimple.docx".
        /// </summary>
        public void TestFldSimpleContent()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("FldSimple.docx"));
            String extracted = extractor.Text;

            bool hasContent = extracted.Length > 0;
            Assert.IsTrue(hasContent);

            bool mentionsFileName = extracted.Contains("FldSimple.docx");
            Assert.IsTrue(mentionsFileName);
        }

コード例 #10

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from the macro-enabled sample
        /// "45690.docm" contains each of the expected fragments.
        /// </summary>
        public void TestDOCMFiles()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("45690.docm"));

            // The text is re-read from the extractor for every fragment
            string[] expectedFragments = { "2004", "2008", "(120 " };
            foreach (string fragment in expectedFragments)
            {
                Assert.IsTrue(extractor.Text.Contains(fragment));
            }
        }

コード例 #11

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "Headers.docx" contains
        /// "Section 1", "Section 2" and "Section 3".
        /// </summary>
        public void TestParagraphHeader()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("Headers.docx"));

            // The text is re-read from the extractor for every heading
            string[] expectedHeadings = { "Section 1", "Section 2", "Section 3" };
            foreach (string heading in expectedHeadings)
            {
                Assert.IsTrue(extractor.Text.Contains(heading));
            }
        }

コード例 #12

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "endnotes.docx" contains "XXX"
        /// and the phrase carrying the inline "[endnoteRef:2]" marker.
        /// </summary>
        public void TestEndnotes()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("endnotes.docx"));
            string extracted = extractor.Text;

            string[] expectedFragments = { "XXX", "tilaka [endnoteRef:2]or 'tika'" };
            foreach (string fragment in expectedFragments)
            {
                Assert.IsTrue(extracted.Contains(fragment));
            }
        }

コード例 #13

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "FieldCodes.docx" is non-empty
        /// and contains neither "AUTHOR" nor "CREATEDATE".
        /// </summary>
        public void TestNoFieldCodes()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("FieldCodes.docx"));
            String extracted = extractor.Text;

            Assert.IsTrue(extracted.Length > 0);

            // None of these strings may appear in the output
            string[] forbidden = { "AUTHOR", "CREATEDATE" };
            foreach (string fieldCode in forbidden)
            {
                Assert.IsFalse(extracted.Contains(fieldCode));
            }
        }

コード例 #14

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "form_footnotes.docx" contains
        /// "testdoc" and "test phrase"; on failure the full extracted text is
        /// included in the assertion message.
        /// </summary>
        public void TestFormFootnotes()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("form_footnotes.docx"));
            String extracted = extractor.Text;

            // Same diagnostic for both checks: the message followed by the whole text
            string failureMessage = "Unable to find expected word in text\n" + extracted;

            string[] expectedWords = { "testdoc", "test phrase" };
            foreach (string word in expectedWords)
            {
                Assert.IsTrue(extracted.Contains(word), failureMessage);
            }
        }

コード例 #15

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Reads the text of "55733.docx" and closes the extractor; the test
        /// has no assertions and passes as long as nothing throws.
        /// </summary>
        public void TestBug55733()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("55733.docx"));

            // Reading the property is the check itself: it must complete without error
            string extracted = extractor.Text;

            extractor.Close();
        }

コード例 #16

0

ファイルを表示

 /// <summary>
 /// Reads the core properties of a Word (.docx) document.
 /// </summary>
 /// <param name="filepath">Path of the document to read.</param>
 /// <returns>A tuple of (creator, category, title).</returns>
 public Tuple <string, string, string> GetDocProperties(string filepath)
 {
     using (FileStream stream = File.OpenRead(filepath))
     {
         XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(stream));
         CoreProperties    core      = extractor.GetCoreProperties();

         string creator  = core.Creator;
         string category = core.Category;
         string title    = core.Title;

         return new Tuple <string, string, string>(creator, category, title);
     }
 }

コード例 #17

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks that the text extracted from "WithTabs.docx" contains "a",
        /// a tab and "b" individually, and then the complete sequence
        /// "a\tb\n".
        /// </summary>
        public void TestDocTabs()
        {
            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("WithTabs.docx"));

            // Individual pieces first, then the first paragraph in total;
            // the text is re-read from the extractor for every check.
            string[] expectedFragments = { "a", "\t", "b", "a\tb\n" };
            foreach (string fragment in expectedFragments)
            {
                Assert.IsTrue(extractor.Text.Contains(fragment));
            }
        }

コード例 #18

0

ファイルを表示

        /// <summary>
        /// Extracts "ExternalEntityInText.docx" and checks that the trimmed
        /// text is exactly the sentence followed by an empty pair of quotes.
        /// </summary>
        public void TestFile()
        {
            const string expectedTrimmed = "Here should not be the POI web site: \"\"";

            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("ExternalEntityInText.docx"));
            String extracted = extractor.Text;

            Assert.IsTrue(extracted.Length > 0);

            // Check contents, they should not contain the text from POI web site after the colon!
            Assert.AreEqual(expectedTrimmed, extracted.Trim());

            extractor.Close();
        }

コード例 #19

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Extracts "checkboxes.docx" and compares the result with the exact
        /// expected text, in which checkboxes appear as "|_|" and "|X|".
        /// </summary>
        public void TestCheckboxes()
        {
            const string expectedText =
                "This is a small test for checkboxes \nunchecked: |_| \n" +
                "Or checked: |X|\n\n\n\n\n" +
                "Test a checkbox within a textbox: |_| -> |X|\n\n\n" +
                "In Table:\n|_|\t|X|\n\n\n" +
                "In Sequence:\n|X||_||X|\n";

            XWPFDocument document = XWPFTestDataSamples.OpenSampleDocument("checkboxes.docx");

            // Write the document to the console before extracting from it
            Console.WriteLine(document);

            XWPFWordExtractor extractor = new XWPFWordExtractor(document);
            Assert.AreEqual(expectedText, extractor.Text);

            extractor.Close();
        }

コード例 #20

0

ファイルを表示

        /// <summary>
        /// Opens "SampleDoc.docx" through a package opened with READ access,
        /// writes the document out and reads it back, and checks that the
        /// extracted text is the same before and after.
        /// </summary>
        public void TestWriteFromReadOnlyOPC()
        {
            OPCPackage readOnlyPackage = OPCPackage.Open(
                POIDataSamples.GetDocumentInstance().GetFileInfo("SampleDoc.docx"),
                PackageAccess.READ);

            XWPFDocument original   = new XWPFDocument(readOnlyPackage);
            String       textBefore = new XWPFWordExtractor(original).Text;

            XWPFDocument roundTripped = XWPFTestDataSamples.WriteOutAndReadBack(original);
            String       textAfter    = new XWPFWordExtractor(roundTripped).Text;

            Assert.AreEqual(textBefore, textAfter);
        }

コード例 #21

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Compares the full extracted text of two samples with the expected
        /// output: "ThreeColHeadFoot.docx" (one header and one footer) and
        /// "DiffFirstPageHeadFoot.docx" (an additional first-page header and
        /// footer). In both expectations header text comes first and footer
        /// text last.
        /// </summary>
        public void TestHeadersFooters()
        {
            XWPFDocument      doc       = XWPFTestDataSamples.OpenSampleDocument("ThreeColHeadFoot.docx");
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);

            Assert.AreEqual(
                "First header column!\tMid header\tRight header!\n" +
                "This is a sample word document. It has two pages. It has a three column heading, and a three column footer\n" +
                "\n" +
                "HEADING TEXT\n" +
                "\n" +
                "More on page one\n" +
                "\n\n" +
                "End of page 1\n\n\n" +
                "This is page two. It also has a three column heading, and a three column footer.\n" +
                "Footer Left\tFooter Middle\tFooter Right\n",
                extractor.Text
                );

            // Now another file, expect multiple headers
            //  and multiple footers
            doc       = XWPFTestDataSamples.OpenSampleDocument("DiffFirstPageHeadFoot.docx");
            extractor = new XWPFWordExtractor(doc);

            Assert.AreEqual(
                "I am the header on the first page, and I" + '\u2019' + "m nice and simple\n" +
                "First header column!\tMid header\tRight header!\n" +
                "This is a sample word document. It has two pages. It has a simple header and footer, which is different to all the other pages.\n" +
                "\n" +
                "HEADING TEXT\n" +
                "\n" +
                "More on page one\n" +
                "\n\n" +
                "End of page 1\n\n\n" +
                "This is page two. It also has a three column heading, and a three column footer.\n" +
                "The footer of the first page\n" +
                "Footer Left\tFooter Middle\tFooter Right\n",
                extractor.Text
                );
        }

コード例 #22

0

ファイルを表示

        /// <summary>
        /// Extracts text from an uploaded file and returns it as JSON.
        /// A file whose content type is "application/pdf" has the string
        /// tokens of its first page concatenated; any other file is read as a
        /// Word (.docx) document.
        /// </summary>
        /// <param name="file">The uploaded file.</param>
        /// <returns>A JSON result with a single <c>texto</c> property holding the extracted text.</returns>
        public IActionResult PostearDoc(IFormFile file)
        {
            string texto;

            if (file.ContentType == "application/pdf")
            {
                byte[] que;
                using (MemoryStream ms = new MemoryStream())
                {
                    file.CopyTo(ms);
                    que = ms.ToArray();
                }

                PdfReader pdfReader = new PdfReader(que);
                try
                {
                    byte[] contenidoPageUno = pdfReader.GetPageContent(1);

                    PrTokeniser tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contenidoPageUno));

                    // Collect the string tokens and join them once at the end,
                    // instead of re-allocating the whole text for every token.
                    List <string> strList = new List <string>();

                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PrTokeniser.TK_STRING)
                        {
                            strList.Add(tokenizer.StringValue);
                        }
                    }

                    texto = String.Concat(strList);
                }
                finally
                {
                    // Close the reader even when tokenising throws
                    pdfReader.Close();
                }
            }
            else
            {
                // Dispose the upload stream once the text has been extracted
                using (Stream stream = file.OpenReadStream())
                {
                    XWPFDocument      doc       = new XWPFDocument(stream);
                    XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
                    texto = extractor.Text;
                }
            }

            return(Json(new { texto }));
        }

コード例 #23

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Extracts "IllustrativeCases.docx" and checks the beginning, a
        /// passage containing euro signs, the ending, and the total number of
        /// line feeds (134).
        /// </summary>
        public void TestGetComplexText()
        {
            const char euro = '\u20ac';

            XWPFWordExtractor extractor = new XWPFWordExtractor(
                XWPFTestDataSamples.OpenSampleDocument("IllustrativeCases.docx"));
            String text = extractor.Text;

            Assert.IsTrue(text.Length > 0);

            // Dump the last 40 characters to the debug output
            Debug.WriteLine("'" + text.Substring(text.Length - 40) + "'");

            // Check contents
            string expectedStart  = "  \n(V) ILLUSTRATIVE CASES\n\n";
            string expectedMiddle =
                "As well as gaining " + euro + "90 from child benefit increases, he will also receive the early childhood supplement of " + euro + "250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n";
            string expectedEnd    = "11.4%\t\t90\t\t\t\t\t250\t\t1,310\t\n\n \n\n\n";

            Assert.IsTrue(text.StartsWith(expectedStart));
            Assert.IsTrue(text.Contains(expectedMiddle));
            Assert.IsTrue(text.EndsWith(expectedEnd));

            // Line feeds are counted as the measure of the number of paragraphs
            int lineFeeds = 0;
            foreach (char c in text)
            {
                if (c == '\n')
                {
                    lineFeeds++;
                }
            }
            Assert.AreEqual(134, lineFeeds);
        }

コード例 #24

0

ファイルを表示

ファイル: WordReader.cs プロジェクト: BhatShil23/FileConverter

        /// <summary>
        /// Validates the file and, when validation passes, reads it as a
        /// Word (.docx) document and returns the extracted text. Progress and
        /// errors are written to <c>logger</c>.
        /// </summary>
        /// <returns>
        /// The extracted text, or an empty string when validation fails.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when reading fails; the original error is available as
        /// <see cref="Exception.InnerException"/>.
        /// </exception>
        public string ReadData()
        {
            string fileData = string.Empty;

            logger.AppendLog("Validating file extension..");

            if (ValidateFile() != FileValidationError.NoError)
            {
                logger.AppendError("Incorrect file extension.");
                return(fileData);
            }

            logger.AppendLog("Valid file extension.");
            logger.AppendLog("Reading file...");

            try
            {
                using (FileStream fs = new FileStream(this.FileName,
                                                      FileMode.Open, FileAccess.Read))
                {
                    XWPFDocument      doc          = new XWPFDocument(fs);
                    XWPFWordExtractor docExtractor = new XWPFWordExtractor(doc);
                    fileData = docExtractor.Text;
                }

                logger.AppendLog("File read successfully.");
            }
            catch (Exception ex)
            {
                string errorMsg = "Error reading word file. Aborting read operation.";
                logger.AppendError(errorMsg);

                // Same message as before, but keep the original exception attached so
                // its type and stack trace are not lost.
                throw new Exception(errorMsg + ex.Message, ex);
            }

            return(fileData);
        }

コード例 #25

0

ファイルを表示

ファイル: TestXWPFWordExtractor.cs プロジェクト: zzy092/npoi

        /// <summary>
        /// Checks extraction of controlled content. Every expected fragment
        /// must occur in the lower-cased text of "Bug54849.docx". In
        /// "Bug54771a.docx" each listed fragment must occur exactly once and
        /// "test\n" exactly twice.
        /// </summary>
        public void TestSimpleControlContent()
        {
            XWPFDocument doc = XWPFTestDataSamples.OpenSampleDocument("Bug54849.docx");

            String[] targs =
            {
                "header_rich_text",
                "rich_text",
                "rich_text_pre_table\nrich_text_cell1\t\t\t\n\t\t\t\n\t\t\t\n\nrich_text_post_table",
                "plain_text_no_newlines",
                "plain_text_with_newlines1\nplain_text_with_newlines2\n",
                "watermelon\n",
                "dirt\n",
                "4/16/2013\n",
                "rich_text_in_cell",
                "abc",
                "rich_text_in_paragraph_in_cell",
                "footer_rich_text",
                "footnote_sdt",
                "endnote_sdt"
            };
            XWPFWordExtractor ex = new XWPFWordExtractor(doc);
            String            s  = ex.Text.ToLower();

            // Every fragment has to be present; also keep a running total of the hits
            int found = 0;
            foreach (String target in targs)
            {
                bool present = s.Contains(target);
                if (present)
                {
                    found++;
                }
                Assert.AreEqual(true, present, "controlled content loading-" + target);
            }
            Assert.AreEqual(targs.Length, found, "controlled content loading hit count");

            ex.Close();

            doc   = XWPFTestDataSamples.OpenSampleDocument("Bug54771a.docx");
            targs = new String[]
            {
                "bb",
                "test subtitle\n",
                "test user\n",
            };
            ex = new XWPFWordExtractor(doc);
            s  = ex.Text.ToLower();

            //At one point in development there were three copies of the text.
            //This ensures that there is only one copy.
            foreach (String target in targs)
            {
                // Every entry of a MatchCollection is a successful match,
                // so the collection size is the number of hits.
                int occurrences = Regex.Matches(s, target).Count;
                Assert.AreEqual(1, occurrences, "controlled content loading-" + target);
            }

            //"test\n" appears twice: once as the "title" and once in the text.
            //This also happens when you save this document as text from MSWord.
            int titleOccurrences = Regex.Matches(s, "test\n").Count;
            Assert.AreEqual(2, titleOccurrences, "test<N>");

            ex.Close();
        }

C# (CSharp) XWPFWordExtractorの例