ProcessContent() public method

public ProcessContent ( byte[] contentBytes, PdfDictionary resources ) : void
contentBytes byte[]
resources PdfDictionary
return void
        /**
         * Downloads the chart PDF for the given day, runs every page through an
         * SBTextRenderer and, when pages were collected, hands them to a DataHandler
         * whose extractPdfData method is run on a worker thread that is joined
         * before returning.
         *
         * @param dateTime the day whose chart is requested
         * @param m_dbConnection the SQLite connection passed on to the DataHandler
         */
        public static void collectDataforDay(DateTime dateTime, SQLiteConnection m_dbConnection)
        {
            // The request URL carries the date as MM/DD/YYYY with zero-padded month and day.
            string date = dateTime.Month.ToString("D2") + "/" + dateTime.Day.ToString("D2") + "/" + dateTime.Year;
            string url = "http://www.equibase.com/premium/eqbPDFChartPlus.cfm?RACE=A&BorP=P&TID=SAR&CTRY=USA&DT=" + date + "&DAY=D&STYLE=EQB";

            PdfReader reader;
            try
            {
                reader = new PdfReader(url);
            }
            catch (Exception)
            {
                // The first attempt failed: prompt the operator, wait for two key
                // presses, then retry once. A second failure propagates to the caller.
                Console.WriteLine("CAPTCHA TIME");
                Console.ReadKey();
                Console.ReadKey();

                reader = new PdfReader(url);
            }

            try
            {
                StringBuilder builder = new StringBuilder();

                for (int x = 1; x <= reader.NumberOfPages; x++)
                {
                    IRenderListener listener = new SBTextRenderer(builder);
                    PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                    PdfDictionary pageDic = reader.GetPageN(x);
                    PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                    processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);
                }

                // NOTE(review): 'pages' is declared outside this method and is not filled
                // here; presumably SBTextRenderer populates it - confirm.
                if (pages.Count != 0)
                {
                    DataHandler handler = new DataHandler(dateTime, pages, m_dbConnection);
                    Thread thread = new Thread(new ThreadStart(handler.extractPdfData));

                    thread.Start();
                    thread.Join();
                    pages.Clear();
                }
                else
                {
                    // If there were no races on this particular day, simply skip it! :D
                    Console.WriteLine("Invalid Date: " + date);
                }
            }
            finally
            {
                // Release the reader on every path, including "no pages" and exceptions.
                reader.Dispose();
            }
        }
            /**
             * Processes the content of a form XObject stream with the given processor:
             * saves the graphics state, applies the stream's optional /Matrix to the
             * current transformation matrix, processes the stream content with the
             * stream's own resources, and restores the graphics state.
             */
            public void HandleXObject(PdfContentStreamProcessor processor, PdfStream stream, PdfIndirectReference refi)
            {
                PdfDictionary formResources = stream.GetAsDict(PdfName.RESOURCES);

                // The content bytes are read before the graphics state is pushed, so a
                // failure while reading them does not leave the state stack unbalanced.
                byte[] formContent = ContentByteUtils.GetContentBytesFromContentObject(stream);
                PdfArray formMatrixArray = stream.GetAsArray(PdfName.MATRIX);

                new PushGraphicsState().Invoke(processor, null, null);

                if (formMatrixArray != null)
                {
                    // The six matrix entries are read in order, index 0 through 5.
                    Matrix formMatrix = new Matrix(
                        formMatrixArray.GetAsNumber(0).FloatValue,
                        formMatrixArray.GetAsNumber(1).FloatValue,
                        formMatrixArray.GetAsNumber(2).FloatValue,
                        formMatrixArray.GetAsNumber(3).FloatValue,
                        formMatrixArray.GetAsNumber(4).FloatValue,
                        formMatrixArray.GetAsNumber(5).FloatValue);

                    processor.Gs().ctm = formMatrix.Multiply(processor.Gs().ctm);
                }

                processor.ProcessContent(formContent, formResources);

                new PopGraphicsState().Invoke(processor, null, null);
            }
        /**
         * Feeds the content stream of a single page, together with that page's
         * resource dictionary, to a new PdfContentStreamProcessor that reports to
         * the supplied listener.
         * @param <E> the type of the renderListener - this makes it easy to chain calls
         * @param pageNumber the page number to process
         * @param renderListener the listener that will receive render callbacks
         * @return the provided renderListener
         * @throws IOException if operations on the reader fail
         */

        public E ProcessContent <E>(int pageNumber, E renderListener) where E : IRenderListener
        {
            PdfDictionary pageResources = reader.GetPageN(pageNumber).GetAsDict(PdfName.RESOURCES);
            PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(renderListener);
            byte[] pageContent = ContentByteUtils.GetContentBytesForPage(reader, pageNumber);

            contentProcessor.ProcessContent(pageContent, pageResources);
            return renderListener;
        }
        /**
         * Processes content from the specified page number using the specified listener.
         * Also allows registration of custom ContentOperators
         * @param <E> the type of the renderListener - this makes it easy to chain calls
         * @param pageNumber the page number to process
         * @param renderListener the listener that will receive render callbacks
         * @param additionalContentOperators an optional dictionary of custom IContentOperators for rendering instructions;
         *        may be null, in which case no additional operators are registered
         * @return the provided renderListener
         * @throws IOException if operations on the reader fail
         */
        public virtual E ProcessContent <E>(int pageNumber, E renderListener, IDictionary <string, IContentOperator> additionalContentOperators) where E : IRenderListener
        {
            PdfDictionary pageDic      = reader.GetPageN(pageNumber);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(renderListener);

            // The dictionary is documented as optional, so a null argument must not throw.
            if (additionalContentOperators != null)
            {
                foreach (KeyValuePair <string, IContentOperator> entry in additionalContentOperators)
                {
                    processor.RegisterContentOperator(entry.Key, entry.Value);
                }
            }
            processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, pageNumber), resourcesDic);
            return(renderListener);
        }
 /**
  * Processes the first page of a PDF with a MyTextRenderListener that is
  * constructed around a writer for the destination file.
  *
  * @param src path of the PDF to read
  * @param dest path of the file to create (an existing file is overwritten)
  */
 public void extractSnippets(String src, String dest)
 {
     // The using block closes the writer and its file stream even if processing throws.
     using (TextWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
     {
         PdfReader reader = new PdfReader(src);
         try
         {
             IRenderListener listener = new MyTextRenderListener(output);
             PdfContentStreamProcessor processor =
                 new PdfContentStreamProcessor(listener);
             PdfDictionary pageDic = reader.GetPageN(1);
             PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
             processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);
             output.Flush();
         }
         finally
         {
             // Always release the reader, including on failure.
             reader.Close();
         }
     }
 }
 /**
  * Processes the first page of WeirdHyphens.pdf with a MyTextRenderListener
  * constructed around a list of text chunks, and checks the chunk at index 18.
  */
 public virtual void WeirdHyphensTest() {
     PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "WeirdHyphens.pdf");
     try {
         List<String> textChunks = new List<String>();
         IRenderListener listener = new MyTextRenderListener(textChunks);
         PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
         PdfDictionary pageDic = reader.GetPageN(1);
         PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
         processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);
         /**
          * This assertion makes sure that encoding has been read properly from FontDescriptor.
          * If not the value will be "\u0000 14".
          */
         Assert.AreEqual("\u0096 14", textChunks[18]);
     } finally {
         // Close the reader even when processing or the assertion fails.
         reader.Close();
     }
 }
        /**
         * Loads the named test resource as a PDF, reads the content bytes of the
         * requested page through ReadContentBytes and feeds them, with the page's
         * resource dictionary, to a processor that reports to _renderListener.
         *
         * @param resourceName name of the PDF resource under TEST_RESOURCES_PATH
         * @param pageNumber number of the page to process
         */
        private void ProcessBytes(
            string resourceName,
            int pageNumber)
        {
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, resourceName);
            try
            {
                PdfDictionary pageDictionary = pdfReader.GetPageN(pageNumber);

                PdfDictionary resourceDictionary = pageDictionary.GetAsDict(PdfName.RESOURCES);

                PdfObject contentObject = pageDictionary.Get(PdfName.CONTENTS);
                byte[] contentBytes = ReadContentBytes(contentObject);
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(_renderListener);
                processor.ProcessContent(contentBytes, resourceDictionary);
            }
            finally
            {
                // The reader is local to this method; release it once processing is finished.
                pdfReader.Close();
            }
        }
        /**
         * Processes the first page of type3font_text.pdf with a
         * TextPositionRenderListener and checks that component FIRST_ELEMENT_INDEX
         * of the start and end point of the first reported line segment matches
         * the reference segment within a tolerance of 0.5.
         */
        public void TestType3FontWidth() {
            String inFile = "type3font_text.pdf";
            LineSegment origLineSegment = new LineSegment(new Vector(20.3246f, 769.4974f, 1.0f), new Vector(151.22923f, 769.4974f, 1.0f));

            PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, inFile);
            try {
                TextPositionRenderListener renderListener = new TextPositionRenderListener();
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(renderListener);

                PdfDictionary pageDic = reader.GetPageN(FIRST_PAGE);
                PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, FIRST_PAGE), resourcesDic);

                // Expected value first, actual second, so a failure message reports them the right way round.
                Assert.AreEqual(origLineSegment.GetStartPoint()[FIRST_ELEMENT_INDEX],
                    renderListener.LineSegments[FIRST_ELEMENT_INDEX].GetStartPoint()[FIRST_ELEMENT_INDEX], 1/2f);

                Assert.AreEqual(origLineSegment.GetEndPoint()[FIRST_ELEMENT_INDEX],
                    renderListener.LineSegments[FIRST_ELEMENT_INDEX].GetEndPoint()[FIRST_ELEMENT_INDEX], 1/2f);
            } finally {
                // Close the reader even when an assertion fails.
                reader.Close();
            }
        }
        /**
         * Searches for a tag in a page and writes the text of the matching marked
         * content, XML-escaped, to outp.
         *
         * @param tag
         *            the name of the tag
         * @param obj
         *            an identifier to find the marked content: a number (MCID),
         *            an array of identifiers, or a dictionary holding an MCID
         * @param page
         *            a page dictionary
         * @throws IOException
         */
        public void ParseTag(String tag, PdfObject obj, PdfDictionary page)
        {
            // if the identifier is a number, we can extract the content right away
            if (obj is PdfNumber)
            {
                PdfNumber                  mcid      = (PdfNumber)obj;
                RenderFilter               filter    = new MarkedContentRenderFilter(mcid.IntValue);
                ITextExtractionStrategy    strategy  = new SimpleTextExtractionStrategy();
                FilteredTextRenderListener listener  = new FilteredTextRenderListener(strategy, new RenderFilter[] { filter });
                PdfContentStreamProcessor  processor = new PdfContentStreamProcessor(
                    listener);
                // The page content is read only in this branch, where it is needed.
                // GetPageContent also copes with /Contents being an array of streams,
                // which a direct GetAsStream lookup does not.
                processor.ProcessContent(PdfReader.GetPageContent(page), page
                                         .GetAsDict(PdfName.RESOURCES));
                outp.Write(SimpleXMLParser.EscapeXML(listener.GetResultantText(), true));
            }
            // if the identifier is an array, we call the parseTag method
            // recursively
            else if (obj is PdfArray)
            {
                PdfArray arr = (PdfArray)obj;
                int      n   = arr.Size;
                for (int i = 0; i < n; i++)
                {
                    ParseTag(tag, arr[i], page);
                    if (i < n - 1)
                    {
                        outp.WriteLine();
                    }
                }
            }
            // if the identifier is a dictionary, we get the resources from the
            // dictionary
            else if (obj is PdfDictionary)
            {
                PdfDictionary mcr     = (PdfDictionary)obj;
                PdfDictionary mcrPage = mcr.GetAsDict(PdfName.PG);
                // /Pg is optional in a marked-content reference; without it the
                // content lives on the page that was passed in.
                ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), mcrPage != null ? mcrPage : page);
            }
        }
Example #10
0
 /**
  * Handles identifiers that are plain MCID numbers by extracting the matching
  * marked content from the page with a MyMarkedContentRenderFilter and writing
  * the XML-escaped text to outp; every other identifier type is delegated to
  * the base implementation.
  */
 public override void ParseTag(String tag, PdfObject obj, PdfDictionary page) {
     // Anything that is not a number goes to the base class.
     if (!(obj is PdfNumber)) {
         base.ParseTag(tag, obj, page);
         return;
     }

     int markedContentId = ((PdfNumber) obj).IntValue;
     RenderFilter mcidFilter = new MyMarkedContentRenderFilter(markedContentId);
     ITextExtractionStrategy extraction = new SimpleTextExtractionStrategy();
     FilteredTextRenderListener textListener = new FilteredTextRenderListener(extraction, mcidFilter);
     PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(textListener);

     byte[] pageContent = PdfReader.GetPageContent(page);
     contentProcessor.ProcessContent(pageContent, page.GetAsDict(PdfName.RESOURCES));

     outp.Write(XMLUtil.EscapeXML(textListener.GetResultantText(), true));
 }
Example #11
0
        /**
         * Writes a single underlined paragraph consisting mostly of spaces to
         * OUTSPTRIMCT, reads the file back, extracts the text of page 1 with a
         * MyTextRenderListener and checks that the extracted text is 60 characters long.
         */
        public virtual void SpaceTrimColumnTextTest() {
            Document doc = new Document(PageSize.A4, 50, 30, 50, 30);
            PdfWriter writer = PdfWriter.GetInstance(doc, new FileStream(OUTSPTRIMCT, FileMode.Create));
            doc.Open();

            Phrase under = new Phrase();
            under.Font = new Font(Font.FontFamily.TIMES_ROMAN, 12, Font.UNDERLINE);
            under.Add(new Chunk(" 1                                                      1                                                                                                                             9      "));

            Paragraph underlineTest = new Paragraph(under);
            underlineTest.KeepTogether = true;
            doc.Add(underlineTest);

            doc.Close();
            writer.Close();

            PdfReader reader = new PdfReader(OUTSPTRIMCT);
            try {
                MyTextRenderListener listener = new MyTextRenderListener();
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                PdfDictionary pageDic = reader.GetPageN(1);
                PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);
                // AreEqual reports the actual length on failure, unlike IsTrue(x == 60).
                Assert.AreEqual(60, listener.GetText().Length, "Unexpected text length");
            } finally {
                // Release the handle on the generated file even when the assertion fails.
                reader.Close();
            }
        }
            /**
             * Renders a form XObject: pushes a graphics state, pre-multiplies the
             * current transformation matrix by the stream's /Matrix when one is
             * present, processes the stream's content with the stream's resources,
             * then pops the graphics state again.
             */
            public void HandleXObject(PdfContentStreamProcessor processor, PdfStream stream, PdfIndirectReference refi) {
                PdfDictionary xobjectResources = stream.GetAsDict(PdfName.RESOURCES);

                // Reading the bytes happens ahead of the push so that a read failure
                // cannot leave an extra entry on the graphics state stack.
                byte[] xobjectBytes = ContentByteUtils.GetContentBytesFromContentObject(stream);
                PdfArray matrixEntries = stream.GetAsArray(PdfName.MATRIX);

                new PushGraphicsState().Invoke(processor, null, null);

                if (matrixEntries != null) {
                    // Collect the six matrix components, index 0 through 5.
                    float[] m = new float[6];
                    for (int i = 0; i < m.Length; i++) {
                        m[i] = matrixEntries.GetAsNumber(i).FloatValue;
                    }
                    Matrix formMatrix = new Matrix(m[0], m[1], m[2], m[3], m[4], m[5]);

                    processor.Gs().ctm = formMatrix.Multiply(processor.Gs().ctm);
                }

                processor.ProcessContent(xobjectBytes, xobjectResources);

                new PopGraphicsState().Invoke(processor, null, null);
            }
Example #13
0
 /**
  * Searches for a tag in a page and writes the text of the matching marked
  * content, XML-escaped, to outp.
  * 
  * @param tag
  *            the name of the tag
  * @param obj
  *            an identifier to find the marked content: a number (MCID),
  *            an array of identifiers, or a dictionary holding an MCID
  * @param page
  *            a page dictionary
  * @throws IOException
  */
 public void ParseTag(String tag, PdfObject obj, PdfDictionary page) {
     // if the identifier is a number, we can extract the content right away
     if (obj is PdfNumber) {
         PdfNumber mcid = (PdfNumber) obj;
         RenderFilter filter = new MarkedContentRenderFilter(mcid.IntValue);
         ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
         FilteredTextRenderListener listener = new FilteredTextRenderListener(strategy, new RenderFilter[]{filter});
         PdfContentStreamProcessor processor = new PdfContentStreamProcessor(
                 listener);
         // The page content is read only in this branch, where it is needed.
         // GetPageContent also copes with /Contents being an array of streams,
         // which a direct GetAsStream lookup does not.
         processor.ProcessContent(PdfReader.GetPageContent(page), page
                 .GetAsDict(PdfName.RESOURCES));
         outp.Write(SimpleXMLParser.EscapeXML(listener.GetResultantText(), true));
     }
     // if the identifier is an array, we call the parseTag method
     // recursively
     else if (obj is PdfArray) {
         PdfArray arr = (PdfArray) obj;
         int n = arr.Size;
         for (int i = 0; i < n; i++) {
             ParseTag(tag, arr[i], page);
             if (i < n - 1) {
                 outp.WriteLine();
             }
         }
     }
     // if the identifier is a dictionary, we get the resources from the
     // dictionary
     else if (obj is PdfDictionary) {
         PdfDictionary mcr = (PdfDictionary) obj;
         PdfDictionary mcrPage = mcr.GetAsDict(PdfName.PG);
         // /Pg is optional in a marked-content reference; without it the
         // content lives on the page that was passed in.
         ParseTag(tag, mcr.GetDirectObject(PdfName.MCID), mcrPage != null ? mcrPage : page);
     }
 }
        /**
         * Cleans up one page: reads the page's content bytes, removes the /Contents
         * entry from the page dictionary, and re-processes the saved content through
         * a PdfCleanUpRenderListener built around the stamper's under-content canvas
         * and a filter created from the given locations. Afterwards the cleaned
         * locations are colored and, when redact annotations are tracked, the
         * page's redact annotations are deleted.
         *
         * @param pageNum number of the page to clean
         * @param cleanUpLocations locations to clean on that page; nothing is done when empty
         */
        private void CleanUpPage(int pageNum, IList<PdfCleanUpLocation> cleanUpLocations) {
            if (cleanUpLocations.Count == 0) {
                return;
            }

            PdfReader stamperReader = pdfStamper.Reader;
            PdfDictionary pageDict = stamperReader.GetPageN(pageNum);
            PdfContentByte underContent = pdfStamper.GetUnderContent(pageNum);

            // Grab the original content first; the /Contents entry is removed right after.
            byte[] originalContent = ContentByteUtils.GetContentBytesForPage(stamperReader, pageNum);
            pageDict.Remove(PdfName.CONTENTS);

            underContent.SaveState();

            PdfCleanUpRenderListener cleanUpListener =
                new PdfCleanUpRenderListener(pdfStamper, CreateFilter(cleanUpLocations));
            cleanUpListener.RegisterNewContext(stamperReader.GetPageResources(pageDict), underContent);

            PdfContentStreamProcessor streamProcessor = new PdfContentStreamProcessor(cleanUpListener);
            PdfCleanUpContentOperator.PopulateOperators(streamProcessor, cleanUpListener);
            streamProcessor.ProcessContent(originalContent, pageDict.GetAsDict(PdfName.RESOURCES));
            cleanUpListener.PopContext();

            underContent.RestoreState();

            ColorCleanedLocations(underContent, cleanUpLocations);

            // A non-null reference list means we are in "extract locations from redact annots" mode.
            if (redactAnnotIndirRefs != null) {
                DeleteRedactAnnots(pageNum);
            }
        }
Example #15
0
// ---------------------------------------------------------------------------    
    /**
     * Extracts text from the first page of a PDF document.
     * @param src the original PDF document
     * @return the string form of the text held by the render listener after
     *         page 1 has been processed
     */
    public string ExtractText(byte[] src) {
      PdfReader reader = new PdfReader(src);
      try {
        MyTextRenderListener listener = new MyTextRenderListener();
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        PdfDictionary pageDic = reader.GetPageN(1);
        PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
        processor.ProcessContent(
          ContentByteUtils.GetContentBytesForPage(reader, 1), 
          resourcesDic
        );
        return listener.Text.ToString();
      } finally {
        // Release the reader whether or not processing succeeded.
        reader.Close();
      }
    }