/**
         * Extracts the text belonging to a piece of marked content and writes
         * it, XML-escaped, to the output writer.
         *
         * The identifier is handled according to its type:
         *  - a PdfNumber is used as marked-content ID and the matching text is
         *    extracted from the page's content;
         *  - a PdfArray has every element processed in turn, with a line break
         *    written between consecutive elements;
         *  - a PdfDictionary is treated as a marked-content reference and is
         *    resolved through its MCID and PG entries.
         * Any other identifier (including null) is silently ignored.
         *
         * @param tag
         *            the name of the tag (only passed along on recursion)
         * @param obj
         *            an identifier to find the marked content
         * @param page
         *            a page dictionary; also used as the fallback page for
         *            marked-content references that carry no PG entry
         */
        public void ParseTag(String tag, PdfObject obj, PdfDictionary page)
        {
            // if the identifier is a number, we can extract the content right away
            PdfNumber mcid = obj as PdfNumber;
            if (mcid != null)
            {
                // The content stream is only needed for a numeric identifier, so it is
                // looked up here rather than on every call.
                // NOTE(review): this reads CONTENTS as a single stream; pages whose
                // contents are stored as an array of streams are presumably not
                // covered by this lookup - confirm against the PdfDictionary API.
                PRStream stream = (PRStream)page.GetAsStream(PdfName.CONTENTS);

                RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                FilteredTextRenderListener listener =
                    new FilteredTextRenderListener(strategy, new RenderFilter[] { filter });
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                processor.ProcessContent(PdfReader.GetStreamBytes(stream),
                                         page.GetAsDict(PdfName.RESOURCES));
                outp.Write(SimpleXMLParser.EscapeXML(listener.GetResultantText(), true));
                return;
            }

            // if the identifier is an array, we call the ParseTag method
            // recursively for each of its elements
            PdfArray arr = obj as PdfArray;
            if (arr != null)
            {
                int n = arr.Size;
                for (int i = 0; i < n; i++)
                {
                    ParseTag(tag, arr[i], page);
                    // separate the elements, but do not end with a trailing line break
                    if (i < n - 1)
                    {
                        outp.WriteLine();
                    }
                }
                return;
            }

            // if the identifier is a dictionary, we get the marked-content ID and
            // the page from the dictionary
            PdfDictionary mcr = obj as PdfDictionary;
            if (mcr != null)
            {
                // PG is optional in a marked-content reference; when it is missing
                // the content lives on the page we were already given, so keep
                // using that page instead of recursing with a null page.
                PdfDictionary mcrPage = mcr.GetAsDict(PdfName.PG) ?? page;
                ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), mcrPage);
            }
        }
Example #2
0
 /**
  * Extracts the text belonging to a piece of marked content and writes
  * it, XML-escaped, to the output writer.
  *
  * The identifier is handled according to its type:
  *  - a PdfNumber is used as marked-content ID and the matching text is
  *    extracted from the page's content;
  *  - a PdfArray has every element processed in turn, with a line break
  *    written between consecutive elements;
  *  - a PdfDictionary is treated as a marked-content reference and is
  *    resolved through its MCID and PG entries.
  * Any other identifier (including null) is silently ignored.
  *
  * @param tag
  *            the name of the tag (only passed along on recursion)
  * @param obj
  *            an identifier to find the marked content
  * @param page
  *            a page dictionary; also used as the fallback page for
  *            marked-content references that carry no PG entry
  */
 public void ParseTag(String tag, PdfObject obj, PdfDictionary page) {
     // if the identifier is a number, we can extract the content right away
     if (obj is PdfNumber) {
         PdfNumber mcid = (PdfNumber) obj;
         // The content stream is only needed in this branch, so it is looked
         // up here rather than on every call.
         // NOTE(review): this reads CONTENTS as a single stream; pages whose
         // contents are stored as an array of streams are presumably not
         // covered by this lookup - confirm against the PdfDictionary API.
         PRStream stream = (PRStream) page.GetAsStream(PdfName.CONTENTS);
         RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
         ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
         FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, new RenderFilter[]{filter});
         PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
         processor.ProcessContent(PdfReader.GetStreamBytes(stream),
                 page.GetAsDict(PdfName.RESOURCES));
         outp.Write(SimpleXMLParser.EscapeXML(listener.GetResultantText(), true));
     }
     // if the identifier is an array, we call the ParseTag method
     // recursively for each of its elements
     else if (obj is PdfArray) {
         PdfArray arr = (PdfArray) obj;
         int n = arr.Size;
         for (int i = 0; i < n; i++) {
             ParseTag(tag, arr[i], page);
             // separate the elements, but do not end with a trailing line break
             if (i < n - 1) {
                 outp.WriteLine();
             }
         }
     }
     // if the identifier is a dictionary, we get the marked-content ID and
     // the page from the dictionary
     else if (obj is PdfDictionary) {
         PdfDictionary mcr = (PdfDictionary) obj;
         // PG is optional in a marked-content reference; when it is missing
         // the content lives on the page we were already given, so keep
         // using that page instead of recursing with a null page.
         PdfDictionary mcrPage = mcr.GetAsDict(PdfName.PG) ?? page;
         ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), mcrPage);
     }
 }