Пример #1
0
        /**
         * Writes a one-page document containing a single underlined paragraph whose chunk
         * is padded with long runs of spaces, reads the file back and checks the length of
         * the text reported by a MyTextRenderListener for page 1.
         */
        public virtual void SpaceTrimColumnTextTest()
        {
            Document  doc    = new Document(PageSize.A4, 50, 30, 50, 30);
            PdfWriter writer = PdfWriter.GetInstance(doc, new FileStream(OUTSPTRIMCT, FileMode.Create));

            doc.Open();

            Phrase under = new Phrase();

            under.Font = new Font(Font.FontFamily.TIMES_ROMAN, 12, Font.UNDERLINE);
            under.Add(new Chunk(" 1                                                      1                                                                                                                             9      "));

            Paragraph underlineTest = new Paragraph(under);

            underlineTest.KeepTogether = true;
            doc.Add(underlineTest);

            doc.Close();
            writer.Close();

            PdfReader reader = new PdfReader(OUTSPTRIMCT);

            try
            {
                MyTextRenderListener      listener     = new MyTextRenderListener();
                PdfContentStreamProcessor processor    = new PdfContentStreamProcessor(listener);
                PdfDictionary             pageDic      = reader.GetPageN(1);
                PdfDictionary             resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);

                // AreEqual reports the actual length on failure, unlike IsTrue(a == b).
                Assert.AreEqual(60, listener.GetText().Length, "Unexpected text length");
            }
            finally
            {
                // Release the reader even when processing or the assertion fails.
                reader.Close();
            }
        }
        /**
         * Edits the immediate contents of one page, i.e. its content stream.
         * It deliberately does not descend into form xobjects, patterns, or annotations.
         *
         * @param pdfStamper the stamper whose reader supplies the page and whose
         *                   under-content receives the edited output
         * @param pageNum    the number of the page to edit
         */
        public void EditPage(PdfStamper pdfStamper, int pageNum)
        {
            var sourceReader = pdfStamper.Reader;
            var pageDict     = sourceReader.GetPageN(pageNum);

            // Read the original bytes first: the CONTENTS entry is removed right after.
            var originalContent = ContentByteUtils.GetContentBytesForPage(sourceReader, pageNum);
            pageDict.Remove(PdfName.CONTENTS);

            var pageResources = pageDict.GetAsDict(PdfName.RESOURCES);
            var targetCanvas  = pdfStamper.GetUnderContent(pageNum);

            EditContent(originalContent, pageResources, targetCanvas);
        }
Пример #3
0
        /**
         * Runs the content of one page through a content stream processor that
         * forwards render callbacks to the supplied listener.
         * @param <E> the concrete listener type; returning it makes call chaining easy
         * @param pageNumber the number of the page whose content is processed
         * @param renderListener the listener that receives the render callbacks
         * @return the same renderListener instance that was passed in
         */
        virtual public E ProcessContent <E>(int pageNumber, E renderListener) where E : IRenderListener
        {
            PdfDictionary resources = reader.GetPageN(pageNumber).GetAsDict(PdfName.RESOURCES);

            PdfContentStreamProcessor contentProcessor = new PdfContentStreamProcessor(renderListener);
            byte[] contentBytes = ContentByteUtils.GetContentBytesForPage(reader, pageNumber);

            contentProcessor.ProcessContent(contentBytes, resources);
            return renderListener;
        }
Пример #4
0
        /**
         * Processes the content stream of one page with the given extraction strategy.
         * @param reader   the reader that supplies the page dictionary and content bytes
         * @param page     the number of the page to process
         * @param strategy the strategy handed to the content stream processor
         */
        public static void ProcessContentPage(PdfReader reader, int page, Test_iTextSharp.ITextExtractionStrategy strategy)
        {
            // The PdfReaderContentParser the original created here was never used; it has been dropped.
            PdfDictionary pageDic      = reader.GetPageN(page);
            PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

            Test_iTextSharp.PdfContentStreamProcessor processor = new Test_iTextSharp.PdfContentStreamProcessor(strategy);
            byte[] bytes = ContentByteUtils.GetContentBytesForPage(reader, page);

            processor.ProcessContent(bytes, resourcesDic);
        }
        /**
         * Extracts the text collected by a MyTextRenderListener from one page of a PDF file.
         * @param pdfFilename path of the PDF file to open
         * @param pageNumber  the number of the page to extract
         * @return the listener's accumulated text for that page
         */
        public static string ExtractText(string pdfFilename, int pageNumber)
        {
            PdfReader reader = new PdfReader(pdfFilename);

            try
            {
                MyTextRenderListener      listener     = new MyTextRenderListener();
                PdfContentStreamProcessor processor    = new PdfContentStreamProcessor(listener);
                PdfDictionary             pageDic      = reader.GetPageN(pageNumber);
                PdfDictionary             resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

                // Content bytes must come from the requested page; the original always read
                // page 1 while taking the resources from pageNumber.
                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, pageNumber), resourcesDic);
                return listener.Text.ToString();
            }
            finally
            {
                // The reader was previously never released.
                reader.Close();
            }
        }
        /**
         * Runs the first page of the PDF at the given path through a content stream
         * processor backed by a DummyRenderListner; any processing error surfaces as
         * an exception to the caller.
         * @param path path of the PDF file to check
         */
        private void CheckPageContent(String path)
        {
            PdfReader pdfReader = new PdfReader(path);

            try
            {
                PdfDictionary pageDic = pdfReader.GetPageN(1);

                IRenderListener           dummy     = new DummyRenderListner();
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(dummy);

                PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(pdfReader, 1), resourcesDic);
            }
            finally
            {
                // Close the reader even when processing throws.
                pdfReader.Close();
            }
        }
Пример #7
0
        /**
         * Processes page 1 of the source PDF with a MyTextRenderListener that writes
         * to the destination file.
         * @param src  path of the PDF file to read
         * @param dest path of the file to create (an existing file is overwritten)
         */
        public void extractSnippets(String src, String dest)
        {
            // Disposing the writer flushes it and closes the underlying file stream,
            // including when reading or processing the PDF throws.
            using (TextWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
            {
                PdfReader reader = new PdfReader(src);

                try
                {
                    IRenderListener           listener  = new MyTextRenderListener(output);
                    PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);

                    PdfDictionary pageDic      = reader.GetPageN(1);
                    PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

                    processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);
                    output.Flush();
                }
                finally
                {
                    reader.Close();
                }
            }
        }
        /**
         * Downloads the chart PDF for the given day, runs every page through an
         * SBTextRenderer and, if the shared `pages` collection is non-empty afterwards,
         * hands it to a DataHandler together with the database connection.
         * @param dateTime       the day whose chart is requested
         * @param m_dbConnection the connection passed on to the DataHandler
         */
        public static void collectDataforDay(DateTime dateTime, SQLiteConnection m_dbConnection)
        {
            // Month and day are zero-padded to two digits; the year is written as-is.
            string date = string.Format(System.Globalization.CultureInfo.InvariantCulture,
                                        "{0:00}/{1:00}/{2}", dateTime.Month, dateTime.Day, dateTime.Year);
            string url  = "http://www.equibase.com/premium/eqbPDFChartPlus.cfm?RACE=A&BorP=P&TID=SAR&CTRY=USA&DT=" + date + "&DAY=D&STYLE=EQB";

            PdfReader reader;

            try
            {
                reader = new PdfReader(url);
            }
            catch (Exception)
            {
                // Best-effort retry kept from the original: any failure to open is treated as a
                // captcha prompt, the user presses two keys, and the download is attempted once more.
                Console.WriteLine("CAPTCHA TIME");
                Console.ReadKey();
                Console.ReadKey();

                reader = new PdfReader(url);
            }

            try
            {
                StringBuilder builder = new StringBuilder();

                for (int x = 1; x <= reader.NumberOfPages; x++)
                {
                    IRenderListener           listener     = new SBTextRenderer(builder);
                    PdfContentStreamProcessor processor    = new PdfContentStreamProcessor(listener);
                    PdfDictionary             pageDic      = reader.GetPageN(x);
                    PdfDictionary             resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                    processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);
                }

                if (pages.Count != 0)
                {
                    DataHandler handler = new DataHandler(dateTime, pages, m_dbConnection);
                    Thread      thread  = new Thread(new ThreadStart(handler.extractPdfData));

                    // The worker is joined immediately, so extraction finishes before pages is cleared.
                    thread.Start();
                    thread.Join();
                    pages.Clear();
                }
                else
                {
                    // If there were no races on this particular day, simply skip it.
                    Console.WriteLine("Invalid Date: " + date);
                }
            }
            finally
            {
                // Previously the reader was disposed only in the non-empty branch.
                reader.Dispose();
            }
        }
Пример #9
0
        /**
         * Processes page 1 of WeirdHyphens.pdf with a MyTextRenderListener that collects
         * text chunks and checks the chunk at index 18.
         */
        virtual public void WeirdHyphensTest()
        {
            PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "WeirdHyphens.pdf");

            try
            {
                List <String>             textChunks   = new List <String>();
                IRenderListener           listener     = new MyTextRenderListener(textChunks);
                PdfContentStreamProcessor processor    = new PdfContentStreamProcessor(listener);
                PdfDictionary             pageDic      = reader.GetPageN(1);
                PdfDictionary             resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, 1), resourcesDic);

                /**
                 * This assertion makes sure that the encoding has been read properly from the FontDescriptor.
                 * If not, the value will be "\u0000 14".
                 */
                Assert.AreEqual("\u0096 14", textChunks[18]);
            }
            finally
            {
                // Close the reader even when the assertion fails.
                reader.Close();
            }
        }
Пример #10
0
        /**
         * Processes the first page of type3font_text.pdf with a TextPositionRenderListener and
         * compares the first component of the start and end points of the first reported line
         * segment against reference values, with a tolerance of 0.5.
         */
        public void TestType3FontWidth()
        {
            String      inFile          = "type3font_text.pdf";
            LineSegment origLineSegment = new LineSegment(new Vector(20.3246f, 769.4974f, 1.0f), new Vector(151.22923f, 769.4974f, 1.0f));

            PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, inFile);

            try
            {
                TextPositionRenderListener renderListener = new TextPositionRenderListener();
                PdfContentStreamProcessor  processor      = new PdfContentStreamProcessor(renderListener);

                PdfDictionary pageDic      = reader.GetPageN(FIRST_PAGE);
                PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);

                processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, FIRST_PAGE), resourcesDic);

                // Expected value first, actual second, so failure messages label them correctly.
                // The tolerance-based comparison itself is symmetric, so pass/fail is unchanged.
                Assert.AreEqual(origLineSegment.GetStartPoint()[FIRST_ELEMENT_INDEX],
                                renderListener.LineSegments[FIRST_ELEMENT_INDEX].GetStartPoint()[FIRST_ELEMENT_INDEX], 1 / 2f);

                Assert.AreEqual(origLineSegment.GetEndPoint()[FIRST_ELEMENT_INDEX],
                                renderListener.LineSegments[FIRST_ELEMENT_INDEX].GetEndPoint()[FIRST_ELEMENT_INDEX], 1 / 2f);
            }
            finally
            {
                // The reader was previously never released.
                reader.Close();
            }
        }
Пример #11
0
        /**
         * Applies the given clean-up locations to one page. The page's original content is
         * replayed through a PdfCleanUpRenderListener onto the stamper's under-content canvas,
         * after which the cleaned locations are colored. Does nothing for an empty location list.
         * @param pageNum          the number of the page to clean
         * @param cleanUpLocations the locations to clean on that page
         */
        private void CleanUpPage(int pageNum, IList <PdfCleanUpLocation> cleanUpLocations)
        {
            if (cleanUpLocations.Count == 0)
            {
                return;
            }

            PdfReader      sourceReader = pdfStamper.Reader;
            PdfDictionary  pageDict     = sourceReader.GetPageN(pageNum);
            PdfContentByte underCanvas  = pdfStamper.GetUnderContent(pageNum);

            // Grab the original bytes before the CONTENTS entry is removed from the page.
            byte[] originalContent = ContentByteUtils.GetContentBytesForPage(sourceReader, pageNum);
            pageDict.Remove(PdfName.CONTENTS);

            underCanvas.SaveState();

            PdfCleanUpRenderListener cleanUpListener =
                new PdfCleanUpRenderListener(pdfStamper, CreateFilter(cleanUpLocations));

            cleanUpListener.RegisterNewContext(sourceReader.GetPageResources(pageDict), underCanvas);

            PdfContentStreamProcessor streamProcessor = new PdfContentStreamProcessor(cleanUpListener);

            PdfCleanUpContentOperator.PopulateOperators(streamProcessor, cleanUpListener);
            streamProcessor.ProcessContent(originalContent, pageDict.GetAsDict(PdfName.RESOURCES));
            cleanUpListener.PopContext();

            underCanvas.RestoreState();

            ColorCleanedLocations(underCanvas, cleanUpLocations);

            // A non-null reference list means we are in "extract locations from redact annots" mode.
            bool extractedFromRedactAnnots = redactAnnotIndirRefs != null;

            if (extractedFromRedactAnnots)
            {
                DeleteRedactAnnots(pageNum);
            }
        }
Пример #12
0
        // Click handler for the Run button.
        // Reads the PDF named in TxtUrl, appends the text of every page to one StringBuilder
        // (also recording a per-page string in listPages and a header prefix in listHeaders),
        // then scans the accumulated text for start/end markers, copies the characters in
        // between into a second builder while skipping "TOTA"..."USG" spans, strips the
        // collected header text and passes the result to LlenarGrid.
        // NOTE(review): builder.ToString() is re-evaluated for almost every character access
        // below, which re-creates the full string each time; caching it once would be cheaper.
        // NOTE(review): none of the multi-character look-aheads ([i + 1], [i + 2], ...) is
        // bounds-checked, so a marker prefix at the very end of the text can throw.
        private void RunBtn_Click(object sender, EventArgs e) //Step one: receives the PDF => StringBuilder
        {
            if (TxtUrl.Text == "")                            //Nothing has been selected yet
            {
                MessageBox.Show("Please upload a PDF file first !!");
            }
            else
            {
                progressBar1.Visible = true;
                this.timer1.Start();


                PdfReader     reader  = new PdfReader(TxtUrl.Text.ToString()); //Reads the file at the given location
                StringBuilder builder = new StringBuilder();                   //The document text is accumulated in a StringBuilder

                for (int x = 1; x <= reader.NumberOfPages; x++)
                {
                    //PdfDictionary page = reader.GetPageN(x);
                    IRenderListener           listener     = new SBTextRenderer(builder);
                    PdfContentStreamProcessor processor    = new PdfContentStreamProcessor(listener);
                    PdfDictionary             pageDic      = reader.GetPageN(x);
                    PdfDictionary             resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                    processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);

                    // builder holds the text of all pages so far; the previous entry is removed from it
                    // to obtain this page's own text.
                    // NOTE(review): listPages[x - 2] presumes listPages was empty when the handler
                    // started - confirm it is cleared between clicks.
                    if (listPages.Count != 0)
                    {
                        listPages.Add(builder.ToString().Replace(listPages[x - 2].ToString(), ""));
                    }
                    else
                    {
                        listPages.Add(builder.ToString());
                    }


                    // After the last page: for each page string, store everything up to 7 characters
                    // past the first "es Da" occurrence in listHeaders.
                    if (x == reader.NumberOfPages)
                    {
                        foreach (string pagina in listPages)
                        {
                            for (int p = 0; p < pagina.Length; p++)
                            {
                                if (pagina[p].ToString() == "e" && pagina[p + 1].ToString() == "s" && pagina[p + 2].ToString() == " " && pagina[p + 3].ToString() == "D" && pagina[p + 4].ToString() == "a")
                                {
                                    int index = p + 7;

                                    listHeaders.Add(pagina.Substring(0, index));
                                    break;
                                }
                            }
                        }
                    }
                }//End of the PDF-to-string conversion loop

                StringBuilder comenzar = new StringBuilder();
                int           Inicio   = 0;//Will hold the position in the PDF text where copying characters can start
                int           final    = 0;

                for (int i = 0; i < builder.ToString().Length; i++)
                {
                    //"ATP" is the word at which saving characters will start
                    char uno  = 'A';
                    char dos  = 'T';
                    char tres = 'P';

                    encabezado = encabezado + builder.ToString()[i];//This variable stores the header so it can be removed from the following pages



                    // Start marker: "ATP" or "Base:", only while banderaInicio is still false
                    // (&& binds tighter than ||, so each alternative carries its own flag check).
                    if (builder.ToString()[i] == uno && builder.ToString()[i + 1] == dos && builder.ToString()[i + 2] == tres && banderaInicio == false || builder.ToString()[i] == 'B' && builder.ToString()[i + 1] == 'a' && builder.ToString()[i + 2] == 's' && builder.ToString()[i + 3] == 'e' && builder.ToString()[i + 4] == ':' && banderaInicio == false)
                    {
                        Inicio = i + 2; //Marks where saving starts; adds two for the "TP" characters of "ATP"
                                        //  break;//Since the place to start copying has been found, the loop is exited
                        banderaInicio = true;
                        //Here the airport name is detected
                        for (int r = i; r < builder.ToString().Length; r++)
                        {
                            string testErase = builder.ToString()[r].ToString();

                            if (builder.ToString()[r] == ':')
                            {
                                inicioAirport = r + 1;
                            }
                            else if (Regex.IsMatch(builder.ToString()[r].ToString(), @"^\d+$"))
                            {
                                finalAirport = r;
                            }
                            if (inicioAirport != 0 && finalAirport != 0 && airport == "")
                            {
                                airport = builder.ToString().Substring(inicioAirport, (finalAirport - inicioAirport));
                                break;
                            }
                        }
                    }


                    // NOTE(review): this looks like a leftover debugging hook; because the end-marker
                    // checks below hang off it as else-if branches, they are skipped when i == 2900.
                    if (i == 2900)
                    {
                        int y = 0;
                    }

                    // End markers: "UCT 221" or "TTO 221" stop the scan and fix the copy end position.
                    else if (builder.ToString()[i] == 'U' && builder.ToString()[i + 1] == 'C' && builder.ToString()[i + 2] == 'T' && builder.ToString()[i + 3] == ' ' && builder.ToString()[i + 4] == '2' && builder.ToString()[i + 5] == '2' && builder.ToString()[i + 6] == '1')
                    {
                        final = i;
                        break;
                    }
                    else if (builder.ToString()[i] == 'T' && builder.ToString()[i + 1] == 'T' && builder.ToString()[i + 2] == 'O' && builder.ToString()[i + 3] == ' ' && builder.ToString()[i + 4] == '2' && builder.ToString()[i + 5] == '2' && builder.ToString()[i + 6] == '1')
                    {
                        final = i;
                        break;
                    }
                }//End of the loop that detects where copying starts

                bool bandera = false;
                // for (int i = Inicio; i < builder.ToString().Length; i++)
                for (int i = Inicio; i < final; i++)
                {
                    if (bandera == false)                                   //Checks whether the text starts with zero (any digit) so that saving can begin
                    {
                        string auxiliar = builder.ToString()[i].ToString(); //Can be deleted, only used for testing
                        if (builder.ToString()[i].ToString() == "0" || Regex.IsMatch(builder.ToString()[i].ToString(), @"^\d+$"))
                        {
                            bandera = true;
                        }
                    }

                    if (bandera == true)//A leading digit was found, so characters are saved from here on
                    {
                        char a = 'T';
                        char b = 'O';
                        char c = 'T';
                        char d = 'A';

                        string auxiliar2 = builder.ToString()[i].ToString();//Testing only, can be removed

                        // On "TOTA": jump ahead to the next "USG" found from 18 characters further on.
                        if (builder.ToString()[i] == a && builder.ToString()[i + 1] == b && builder.ToString()[i + 2] == c && builder.ToString()[i + 3] == d)
                        {
                            int aux = i + 18;//Original note (copied from the start-marker code): "adds two for the TP of ATP"; the code actually skips 18 characters

                            for (int y = aux; y < builder.ToString().Length; y++)
                            {
                                string auxiliar3 = builder.ToString()[y].ToString();//Can be deleted, testing only

                                if (builder.ToString()[y] == 'U' && builder.ToString()[y + 1] == 'S' && builder.ToString()[y + 2] == 'G')
                                {
                                    i = y + 2;
                                    break;
                                }
                            }
                        }
                        // "Airport" ends the copy altogether.
                        else if (builder.ToString()[i] == 'A' && builder.ToString()[i + 1] == 'i' && builder.ToString()[i + 2] == 'r' && builder.ToString()[i + 3] == 'p' && builder.ToString()[i + 4] == 'o' && builder.ToString()[i + 5] == 'r' && builder.ToString()[i + 6] == 't')
                        {
                            break;
                        }
                        else
                        {
                            string cadena = builder.ToString()[i].ToString();
                            comenzar.Append(cadena);
                        }
                    }
                }
                encabezado = encabezado.Replace("DateA", "Date");
                //textBox1.Text = encabezado;
                //textBox2.Text = comenzar.ToString().Replace(encabezado, "");
                string nuevo = comenzar.ToString().Replace(encabezado, "");
                LlenarGrid(nuevo.ToString());
            }
        }
Пример #13
0
        //static const Regex.Replace(s, @"\t|\n|\r", "");
        /**
         * Console entry point. For every page of the hard-coded PDF it prints the crop box
         * width and height, feeds the page content to an SBTextRenderer, runs a
         * LocationTextExtractionStrategyEx2 over the page and prints each value of the
         * strategy's Columbs collection.
         * @param args command-line arguments (not used)
         */
        static void Main(string[] args)
        {
            string fileName = @"D:\mobi.pdf";

            // Open the PDF document; the using block disposes the reader on every exit path.
            using (PdfReader reader = new PdfReader(fileName))
            {
                var sb = new TextRenderEx();

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    var size = reader.GetCropBox(page);
                    Console.WriteLine(size.Width);
                    Console.WriteLine(size.Height);

                    IRenderListener           listener     = new SBTextRenderer(sb);
                    PdfContentStreamProcessor processor    = new PdfContentStreamProcessor(listener);
                    PdfDictionary             pageDic      = reader.GetPageN(page);
                    PdfDictionary             resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                    processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, page), resourcesDic);

                    // The extraction call is kept for its effect on the strategy; the returned
                    // text was never used by the original code.
                    var its = new LocationTextExtractionStrategyEx2();
                    PdfTextExtractor.GetTextFromPage(reader, page, its);

                    foreach (var t in its.Columbs.Values)
                    {
                        Console.WriteLine(t.ToString());
                    }
                }
            }
        }